diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml
index 800430c923..1ed35a9720 100644
--- a/.JuliaFormatter.toml
+++ b/.JuliaFormatter.toml
@@ -1,15 +1,15 @@
-indent = 4
-margin = 92
-whitespace_typedefs = true
-whitespace_ops_in_indices = true
-remove_extra_newlines = true
-always_use_return = true
-whitespace_in_kwargs = false
-align_assignment = true
-align_struct_field = true
-align_conditional = true
-align_pair_arrow = true
-align_matrix = true
-trailing_comma = false
-annotate_untyped_fields_with_any=false
-format_docstrings = true
+indent = 4
+margin = 92
+whitespace_typedefs = true
+whitespace_ops_in_indices = true
+remove_extra_newlines = true
+always_use_return = true
+whitespace_in_kwargs = false
+align_assignment = true
+align_struct_field = true
+align_conditional = true
+align_pair_arrow = true
+align_matrix = true
+trailing_comma = false
+annotate_untyped_fields_with_any=false
+format_docstrings = true
diff --git a/.clang-format b/.clang-format
index 45561abccc..51d8ad7f92 100644
--- a/.clang-format
+++ b/.clang-format
@@ -1,54 +1,58 @@
-Language: Cpp
-BasedOnStyle: LLVM
-Standard: Latest
-
-AccessModifierOffset: -2
-AlignAfterOpenBracket: Align
-AlignArrayOfStructures: None
-AlignEscapedNewlines: Left
-AlignTrailingComments:
-  Kind: Always
-  OverEmptyLines: 0
-AllowShortEnumsOnASingleLine: false
-AllowShortBlocksOnASingleLine: Never
-AllowShortCaseLabelsOnASingleLine: false
-AllowShortFunctionsOnASingleLine: Inline
-AllowShortLambdasOnASingleLine: All
-AllowShortIfStatementsOnASingleLine: Never
-AllowShortLoopsOnASingleLine: false
-AlwaysBreakTemplateDeclarations: Yes
-BreakBeforeBraces: Allman
-BinPackArguments: true
-BinPackParameters: true
-ColumnLimit: 92
-ConstructorInitializerIndentWidth: 2
-ContinuationIndentWidth: 4
-IncludeCategories:
-  - Regex: '^<.*/.*/.*\.hpp>'
-    Priority: 6
-  - Regex: '^<.*/.*\.hpp>'
-    Priority: 5
-  - Regex: '^<.*\.hpp>'
-    Priority: 4
-  - Regex: '^<.*/.*\.h>'
-    Priority: 3
-  - Regex: '^<.*\.h>'
-    Priority: 2
-  - Regex: '^<.*/.*>'
-    Priority: 1
-  - Regex: '^<.*>'
-    Priority: 0
-  - Regex: '^".*"'
-    Priority: 7
-  - Regex: '.*'
-    Priority: 8
-IndentCaseBlocks: true
-IndentCaseLabels: true
-IndentWidth: 2
-PackConstructorInitializers: BinPack
-QualifierAlignment: Left
-ShortNamespaceLines: 0
-SpacesBeforeTrailingComments: 2
-StatementMacros: ['PalacePragmaOmp']
-TypenameMacros: ['CEED_QFUNCTION']
-UseTab: Never
+Language: Cpp
+BasedOnStyle: LLVM
+Standard: Latest
+
+AccessModifierOffset: -2
+AlignAfterOpenBracket: Align
+AlignArrayOfStructures: None
+AlignEscapedNewlines: Left
+AlignTrailingComments:
+  Kind: Always
+  OverEmptyLines: 0
+AllowShortEnumsOnASingleLine: false
+AllowShortBlocksOnASingleLine: Never
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: Inline
+AllowShortLambdasOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: Never
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakTemplateDeclarations: Yes
+BreakBeforeBraces: Allman
+BinPackArguments: true
+BinPackParameters: true
+ColumnLimit: 92
+ConstructorInitializerIndentWidth: 2
+ContinuationIndentWidth: 4
+IncludeCategories:
+  - Regex: '^<.*/.*/.*\.hpp>'
+    Priority: 6
+  - Regex: '^<.*/.*\.hpp>'
+    Priority: 5
+  - Regex: '^<.*\.hpp>'
+    Priority: 4
+  - Regex: '^<.*/.*\.h>'
+    Priority: 3
+  - Regex: '^<.*\.h>'
+    Priority: 2
+  - Regex: '^<.*/.*>'
+    Priority: 1
+  - Regex: '^<.*>'
+    Priority: 0
+  - Regex: '^".*"'
+    Priority: 7
+  - Regex: '.*'
+    Priority: 8
+IndentCaseBlocks: true
+IndentCaseLabels: true
+IndentWidth: 2
+PackConstructorInitializers: BinPack
+QualifierAlignment: Left
+ShortNamespaceLines: 0
+SpacesBeforeTrailingComments: 2
+StatementMacros: ['PalacePragmaOmp',
+                  'PalacePragmaDiagnosticPush',
+                  'PalacePragmaDiagnosticPop',
+                  'PalacePragmaDiagnosticDisableDeprecated',
+                  'PalacePragmaDiagnosticDisableUnused']
+TypenameMacros: ['CEED_QFUNCTION']
+UseTab: Never
diff --git a/.clang-tidy b/.clang-tidy
index 4772ac5a6e..d1295a13b2 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -1,36 +1,38 @@
-Checks: >
-  -*,
-  clang-analyzer-*
-  clang-diagnostic-*,
-  modernize-*,
-  -modernize-use-default-member-init,
-  -modernize-use-nodiscard,
-  -modernize-use-trailing-return-type,
-  -modernize-avoid-c-arrays,
-  -modernize-use-using,
-  cppcoreguidelines-*,
-  -cppcoreguidelines-avoid-c-arrays,
-  -cppcoreguidelines-avoid-magic-numbers,
-  -cppcoreguidelines-init-variables,
-  -cppcoreguidelines-macro-usage,
-  -cppcoreguidelines-non-private-member-variables-in-classes,
-  -cppcoreguidelines-pro-*,
-  bugprone-*,
-  -bugprone-easily-swappable-parameters,
-  -bugprone-reserved-identifier,
-  readability-*,
-  -readability-convert-member-functions-to-static,
-  -readability-else-after-return,
-  -readability-function-cognitive-complexity,
-  -readability-identifier-length,
-  -readability-implicit-bool-conversion,
-  -readability-isolate-declaration,
-  -readability-magic-numbers,
-  -readability-named-parameter,
-  -readability-redundant-*,
-  -readability-string-compare,
-  performance-*,
-  mpi-*,
-  openmp-*'
-HeaderFilterRegex: 'palace/drivers|palace/fem|palace/linalg|palace/utils'
-FormatStyle: 'file'
+Checks: >
+  -*,
+  clang-analyzer-*
+  clang-diagnostic-*,
+  modernize-*,
+  -modernize-use-default-member-init,
+  -modernize-use-nodiscard,
+  -modernize-use-trailing-return-type,
+  -modernize-avoid-c-arrays,
+  -modernize-use-using,
+  cppcoreguidelines-*,
+  -cppcoreguidelines-avoid-c-arrays,
+  -cppcoreguidelines-avoid-magic-numbers,
+  -cppcoreguidelines-init-variables,
+  -cppcoreguidelines-macro-usage,
+  -cppcoreguidelines-non-private-member-variables-in-classes,
+  -cppcoreguidelines-pro-*,
+  bugprone-*,
+  -bugprone-easily-swappable-parameters,
+  -bugprone-reserved-identifier,
+  readability-*,
+  -readability-math-missing-parentheses,
+  -readability-convert-member-functions-to-static,
+  -readability-else-after-return,
+  -readability-function-cognitive-complexity,
+  -readability-identifier-length,
+  -readability-implicit-bool-conversion,
+  -readability-isolate-declaration,
+  -readability-magic-numbers,
+  -readability-named-parameter,
+  -readability-redundant-*,
+  -readability-string-compare,
+  -readability-uppercase-literal-suffix,
+  performance-*,
+  mpi-*,
+  openmp-*'
+HeaderFilterRegex: 'palace/drivers|palace/fem|palace/linalg|palace/utils'
+FormatStyle: 'file'
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000000..f969c02967
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1 @@
+*.msh binary
diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
index 00e128bdaf..8f491da12a 100644
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -1,16 +1,16 @@
----
-name: Bug report
-about: Create a report to help us improve
-title: ''
-labels: 'bug'
-assignees: ''
----
-
-*Description: A clear and concise description of what the bug is.*
-
-*To reproduce: Please provide minimal example that reproduces the error. For existing
-examples, please provide a link.*
-
-*Error message: Paste the complete error message or log, if applicable.*
-
-*Environment: Any environment details, such as operating system, compiler, etc.*
+---
+name: Bug report
+about: Create a report to help us improve
+title: ''
+labels: 'bug'
+assignees: ''
+---
+
+*Description: A clear and concise description of what the bug is.*
+
+*To reproduce: Please provide minimal example that reproduces the error. For existing
+examples, please provide a link.*
+
+*Error message: Paste the complete error message or log, if applicable.*
+
+*Environment: Any environment details, such as operating system, compiler, etc.*
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
index e316b001fe..a8e0bea5c3 100644
--- a/.github/ISSUE_TEMPLATE/feature_request.md
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -1,11 +1,11 @@
----
-name: Feature request
-about: Suggest an idea for the project
-title: ''
-labels: 'enhancement'
-assignees: ''
----
-
-*Description: A clear and concise description of what the feature is.*
-
-*Add any other context or screenshots about the feature request here.*
+---
+name: Feature request
+about: Suggest an idea for the project
+title: ''
+labels: 'enhancement'
+assignees: ''
+---
+
+*Description: A clear and concise description of what the feature is.*
+
+*Add any other context or screenshots about the feature request here.*
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index b95b53ff1b..14968b00a7 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1,6 +1,6 @@
-*Description of changes:*
-
-*Issue #, if available:*
-
-By submitting this pull request, I confirm that you can use, modify, copy, and redistribute
-this contribution, under the terms of your choice.
+*Description of changes:*
+
+*Issue #, if available:*
+
+By submitting this pull request, I confirm that you can use, modify, copy, and redistribute
+this contribution, under the terms of your choice.
diff --git a/.github/workflows/build-and-test-linux-aarch64.yml b/.github/workflows/build-and-test-linux-aarch64.yml index b7ee0304de..71b80e33f2 100644 --- a/.github/workflows/build-and-test-linux-aarch64.yml +++ b/.github/workflows/build-and-test-linux-aarch64.yml @@ -1,112 +1,112 @@ -name: Build and Test (Linux, Arm64) - -on: - push: - branches: - - main - pull_request: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - build-and-test-linux-arm64: - strategy: - fail-fast: false - matrix: - include: # Only a single simple build test for now - - compiler: clang - mpi: mpich - math-libs: openblas - build-shared: static - with-64bit-int: int32 - with-openmp: serial - with-solver: superlu - with-eigensolver: arpack - - runs-on: palace_ubuntu-latest_16-core - steps: - - uses: actions/checkout@v3 - with: - submodules: 'recursive' - - - name: Hardware setup, build, and test - uses: uraimo/run-on-arch-action@v2 - id: runcmd - with: - arch: aarch64 - distro: ubuntu_latest - env: | - CMAKE_BUILD_TYPE: Debug # Speed up builds for run-on-arch - NUM_PROC_BUILD_MAX: '32' - NUM_PROC_TEST_MAX: '8' - run: | - # Install dependencies - apt-get update -q - apt-get install -y build-essential clang cmake curl gfortran git lld \ - libmpich-dev pkg-config python3 wget - - # Install Julia - curl -fsSL https://install.julialang.org | sh -s -- -y - export PATH=~/.juliaup/bin:$PATH - - # Install math libraries (OpenBLAS) - if [[ "${{ matrix.math-libs }}" == 'openblas' ]]; then - if [[ "${{ matrix.with-openmp }}" == 'openmp' ]]; then - apt-get install -y libopenblas-openmp-dev - else - apt-get install -y libopenblas-serial-dev - fi - fi - - # Install math libraries (Arm Performance Libraries) - if [[ "${{ matrix.math-libs }}" == 'armpl' ]]; then - wget https://developer.arm.com/-/media/Files/downloads/hpc/arm-performance-libraries/22-0-2/Ubuntu20.04/arm-performance-libraries_22.0.2_Ubuntu-20.04_gcc-11.2.tar - tar -xf arm-performance-libraries* && rm -rf arm-performance-libraries*.tar - ./arm-performance-libraries*/arm-performance-libraries*.sh -a -i /opt/arm - export ARMPL_DIR=/opt/arm/armpl_22.0.2_gcc-11.2 - export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+LD_LIBRARY_PATH:}$ARMPL_DIR/lib" - fi - - # Configure environment for build - if [[ "${{ matrix.compiler }}" == 'clang' ]]; then - export CC=clang - export CXX=clang++ - export FC=gfortran-11 - export LDFLAGS='-fuse-ld=lld' - elif [[ "${{ matrix.compiler }}" == 'gcc' ]]; then - export CC=gcc-11 - export CXX=g++-11 - export FC=gfortran-11 - fi - export NUM_PROC_BUILD=$(nproc 2> /dev/null || sysctl -n hw.ncpu) - if [[ "$NUM_PROC_BUILD" -gt "$NUM_PROC_BUILD_MAX" ]]; then - NUM_PROC_BUILD=$NUM_PROC_BUILD_MAX - fi - - [[ "${{ matrix.build-shared }}" == 'shared' ]] && BUILD_SHARED='ON' || BUILD_SHARED='OFF' - [[ "${{ matrix.with-64bit-int }}" == 'int64' ]] && WITH_INT64='ON' || WITH_INT64='OFF' - [[ "${{ matrix.with-openmp }}" == 'openmp' ]] && WITH_OPENMP='ON' || WITH_OPENMP='OFF' - - [[ "${{ matrix.with-solver }}" == 'superlu' ]] && WITH_SUPERLU='ON' || WITH_SUPERLU='OFF' - [[ "${{ matrix.with-solver }}" == 'strumpack' ]] && WITH_STRUMPACK='ON' || WITH_STRUMPACK='OFF' - [[ "${{ matrix.with-solver }}" == 'mumps' ]] && WITH_MUMPS='ON' || WITH_MUMPS='OFF' - [[ "${{ matrix.with-eigensolver }}" == 'slepc' ]] && WITH_SLEPC='ON' || WITH_SLEPC='OFF' - [[ "${{ matrix.with-eigensolver }}" == 'arpack' ]] && WITH_ARPACK='ON' || WITH_ARPACK='OFF' - - # Build and install - mkdir palace-build && cd palace-build - cmake .. 
\ - -DCMAKE_INSTALL_PREFIX=/opt/palace \ - -DBUILD_SHARED_LIBS=$BUILD_SHARED \ - -DPALACE_WITH_64BIT_INT=$WITH_INT64 \ - -DPALACE_WITH_OPENMP=$WITH_OPENMP \ - -DPALACE_WITH_SUPERLU=$WITH_SUPERLU \ - -DPALACE_WITH_STRUMPACK=$WITH_STRUMPACK \ - -DPALACE_WITH_MUMPS=$WITH_MUMPS \ - -DPALACE_WITH_SLEPC=$WITH_SLEPC \ - -DPALACE_WITH_ARPACK=$WITH_ARPACK - make -j$NUM_PROC_BUILD - - # XX TODO: Disable tests for now since Julia precompilation fails +name: Build and Test (Linux, Arm64) + +on: + push: + branches: + - main + pull_request: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build-and-test-linux-arm64: + strategy: + fail-fast: false + matrix: + include: # Only a single simple build test for now + - compiler: clang + mpi: mpich + math-libs: openblas + build-shared: static + with-64bit-int: int32 + with-openmp: serial + with-solver: superlu + with-eigensolver: arpack + + runs-on: palace_ubuntu-latest_16-core + steps: + - uses: actions/checkout@v3 + with: + submodules: 'recursive' + + - name: Hardware setup, build, and test + uses: uraimo/run-on-arch-action@v2 + id: runcmd + with: + arch: aarch64 + distro: ubuntu_latest + env: | + CMAKE_BUILD_TYPE: Debug # Speed up builds for run-on-arch + NUM_PROC_BUILD_MAX: '32' + NUM_PROC_TEST_MAX: '8' + run: | + # Install dependencies + apt-get update -q + apt-get install -y build-essential clang cmake curl gfortran git lld \ + libmpich-dev pkg-config python3 wget + + # Install Julia + curl -fsSL https://install.julialang.org | sh -s -- -y + export PATH=~/.juliaup/bin:$PATH + + # Install math libraries (OpenBLAS) + if [[ "${{ matrix.math-libs }}" == 'openblas' ]]; then + if [[ "${{ matrix.with-openmp }}" == 'openmp' ]]; then + apt-get install -y libopenblas-openmp-dev + else + apt-get install -y libopenblas-serial-dev + fi + fi + + # Install math libraries (Arm Performance Libraries) + if [[ "${{ matrix.math-libs }}" == 'armpl' ]]; then + wget https://developer.arm.com/-/media/Files/downloads/hpc/arm-performance-libraries/22-0-2/Ubuntu20.04/arm-performance-libraries_22.0.2_Ubuntu-20.04_gcc-11.2.tar + tar -xf arm-performance-libraries* && rm -rf arm-performance-libraries*.tar + ./arm-performance-libraries*/arm-performance-libraries*.sh -a -i /opt/arm + export ARMPL_DIR=/opt/arm/armpl_22.0.2_gcc-11.2 + export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+LD_LIBRARY_PATH:}$ARMPL_DIR/lib" + fi + + # Configure environment for build + if [[ "${{ matrix.compiler }}" == 'clang' ]]; then + export CC=clang + export CXX=clang++ + export FC=gfortran-11 + export LDFLAGS='-fuse-ld=lld' + elif [[ "${{ matrix.compiler }}" == 'gcc' ]]; then + export CC=gcc-11 + export CXX=g++-11 + export FC=gfortran-11 + fi + export NUM_PROC_BUILD=$(nproc 2> /dev/null || sysctl -n hw.ncpu) + if [[ "$NUM_PROC_BUILD" -gt "$NUM_PROC_BUILD_MAX" ]]; then + NUM_PROC_BUILD=$NUM_PROC_BUILD_MAX + fi + + [[ "${{ matrix.build-shared }}" == 'shared' ]] && BUILD_SHARED='ON' || BUILD_SHARED='OFF' + [[ "${{ matrix.with-64bit-int }}" == 'int64' ]] && WITH_INT64='ON' || WITH_INT64='OFF' + [[ "${{ matrix.with-openmp }}" == 'openmp' ]] && WITH_OPENMP='ON' || WITH_OPENMP='OFF' + + [[ "${{ matrix.with-solver }}" == 'superlu' ]] && WITH_SUPERLU='ON' || WITH_SUPERLU='OFF' + [[ "${{ matrix.with-solver }}" == 'strumpack' ]] && WITH_STRUMPACK='ON' || WITH_STRUMPACK='OFF' + [[ "${{ matrix.with-solver }}" == 'mumps' ]] && WITH_MUMPS='ON' || WITH_MUMPS='OFF' + [[ "${{ matrix.with-eigensolver }}" == 'slepc' ]] && WITH_SLEPC='ON' || WITH_SLEPC='OFF' + [[ "${{ 
matrix.with-eigensolver }}" == 'arpack' ]] && WITH_ARPACK='ON' || WITH_ARPACK='OFF' + + # Build and install + mkdir palace-build && cd palace-build + cmake .. \ + -DCMAKE_INSTALL_PREFIX=/opt/palace \ + -DBUILD_SHARED_LIBS=$BUILD_SHARED \ + -DPALACE_WITH_64BIT_INT=$WITH_INT64 \ + -DPALACE_WITH_OPENMP=$WITH_OPENMP \ + -DPALACE_WITH_SUPERLU=$WITH_SUPERLU \ + -DPALACE_WITH_STRUMPACK=$WITH_STRUMPACK \ + -DPALACE_WITH_MUMPS=$WITH_MUMPS \ + -DPALACE_WITH_SLEPC=$WITH_SLEPC \ + -DPALACE_WITH_ARPACK=$WITH_ARPACK + make -j$NUM_PROC_BUILD + + # XX TODO: Disable tests for now since Julia precompilation fails diff --git a/.github/workflows/build-and-test-linux.yml b/.github/workflows/build-and-test-linux.yml index 4394a32911..ba956219f1 100644 --- a/.github/workflows/build-and-test-linux.yml +++ b/.github/workflows/build-and-test-linux.yml @@ -1,241 +1,241 @@ -name: Build and Test (Linux) - -on: - push: - branches: - - main - pull_request: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - build-and-test-linux: - strategy: - fail-fast: false - matrix: - include: # Pairwise testing - - compiler: gcc - mpi: mpich - math-libs: aocl - build-shared: static - with-64bit-int: int64 - with-openmp: openmp - with-solver: strumpack - with-eigensolver: arpack - - - compiler: clang - mpi: mpich - math-libs: aocl - build-shared: shared - with-64bit-int: int64 - with-openmp: serial - with-solver: superlu - with-eigensolver: arpack - - - compiler: gcc - mpi: openmpi - math-libs: openblas - build-shared: shared - with-64bit-int: int32 - with-openmp: openmp - with-solver: strumpack - with-eigensolver: slepc - - - compiler: intel - mpi: intelmpi - math-libs: intelmkl - build-shared: static - with-64bit-int: int64 - with-openmp: openmp - with-solver: superlu - with-eigensolver: slepc - - - compiler: intel - mpi: intelmpi - math-libs: intelmkl - build-shared: static - with-64bit-int: int32 - with-openmp: serial - with-solver: strumpack - with-eigensolver: arpack - - - compiler: intel - mpi: intelmpi - math-libs: intelmkl - build-shared: shared - with-64bit-int: int32 - with-openmp: openmp - with-solver: strumpack - with-eigensolver: slepc - - - compiler: gcc - mpi: openmpi - math-libs: aocl - build-shared: static - with-64bit-int: int32 - with-openmp: serial - with-solver: strumpack - with-eigensolver: slepc - - - compiler: clang - mpi: mpich - math-libs: aocl - build-shared: static - with-64bit-int: int32 - with-openmp: serial - with-solver: superlu - with-eigensolver: slepc - - runs-on: palace_ubuntu-latest_16-core - steps: - - uses: actions/checkout@v3 - with: - submodules: 'recursive' - - - uses: mpi4py/setup-mpi@v1 - with: - mpi: ${{ matrix.mpi }} - - - name: Configure Clang compiler - if: matrix.compiler == 'clang' - run: | - sudo apt-get install -y clang lld - - - name: Configure Intel oneAPI compiler - if: matrix.compiler == 'intel' - run: | - sudo apt-get install -y intel-oneapi-compiler-dpcpp-cpp \ - intel-oneapi-compiler-fortran - - - name: Install math libraries (OpenBLAS) - if: matrix.math-libs == 'openblas' - run: | - if [[ "${{ matrix.with-openmp }}" == 'openmp' ]]; then - sudo apt-get install -y libopenblas-openmp-dev - else - sudo apt-get install -y libopenblas-serial-dev - fi - - - name: Install math libraries (Intel oneAPI MKL) - if: matrix.math-libs == 'intelmkl' - run: | - sudo apt-get install -y intel-oneapi-mkl intel-oneapi-mkl-devel - - - name: Install math libraries (AOCL) - if: matrix.math-libs == 'aocl' - run: | - wget 
https://download.amd.com/developer/eula/aocl/aocl-4-1/aocl-linux-gcc-4.1.0_1_amd64.deb - sudo apt-get install -y ./aocl-linux-gcc-4.1.0_1_amd64.deb - rm aocl-linux-gcc-*.deb - - - name: Build Palace - env: - CMAKE_BUILD_TYPE: Release - NUM_PROC_BUILD_MAX: '32' - run: | - # Configure environment - if [[ "${{ matrix.compiler }}" == 'intel' ]] || \ - [[ "${{ matrix.mpi }}" == 'intelmpi' ]] || \ - [[ "${{ matrix.math-libs }}" == 'intelmkl' ]]; then - source /opt/intel/oneapi/setvars.sh # Sets PATH, MKLROOT - if [[ "${{ matrix.compiler }}" == 'intel' ]]; then - export CC=icx - export CXX=icpx - export FC=ifx - fi - elif [[ "${{ matrix.compiler }}" == 'clang' ]]; then - export CC=clang - export CXX=clang++ - export FC=gfortran-11 - export LDFLAGS='-fuse-ld=lld' - elif [[ "${{ matrix.compiler }}" == 'gcc' ]]; then - export CC=gcc-11 - export CXX=g++-11 - export FC=gfortran-11 - fi - if [[ "${{ matrix.math-libs }}" == 'aocl' ]]; then - export AOCLROOT=/opt/AMD/aocl/aocl-linux-gcc-4.1.0/gcc - export LD_LIBRARY_PATH=$AOCLROOT/lib:$LD_LIBRARY_PATH - fi - export NUM_PROC_BUILD=$(nproc 2> /dev/null || sysctl -n hw.ncpu) - if [[ "$NUM_PROC_BUILD" -gt "$NUM_PROC_BUILD_MAX" ]]; then - NUM_PROC_BUILD=$NUM_PROC_BUILD_MAX - fi - - [[ "${{ matrix.build-shared }}" == 'shared' ]] && BUILD_SHARED='ON' || BUILD_SHARED='OFF' - [[ "${{ matrix.with-64bit-int }}" == 'int64' ]] && WITH_INT64='ON' || WITH_INT64='OFF' - [[ "${{ matrix.with-openmp }}" == 'openmp' ]] && WITH_OPENMP='ON' || WITH_OPENMP='OFF' - - [[ "${{ matrix.with-solver }}" == 'superlu' ]] && WITH_SUPERLU='ON' || WITH_SUPERLU='OFF' - [[ "${{ matrix.with-solver }}" == 'strumpack' ]] && WITH_STRUMPACK='ON' || WITH_STRUMPACK='OFF' - [[ "${{ matrix.with-solver }}" == 'mumps' ]] && WITH_MUMPS='ON' || WITH_MUMPS='OFF' - [[ "${{ matrix.with-eigensolver }}" == 'slepc' ]] && WITH_SLEPC='ON' || WITH_SLEPC='OFF' - [[ "${{ matrix.with-eigensolver }}" == 'arpack' ]] && WITH_ARPACK='ON' || WITH_ARPACK='OFF' - - # Build and install (with unit tests) - mkdir palace-build && cd palace-build - cmake .. 
\ - -DCMAKE_INSTALL_PREFIX=$(pwd)/../palace-install \ - -DBUILD_SHARED_LIBS=$BUILD_SHARED \ - -DPALACE_WITH_64BIT_INT=$WITH_INT64 \ - -DPALACE_WITH_OPENMP=$WITH_OPENMP \ - -DPALACE_WITH_SUPERLU=$WITH_SUPERLU \ - -DPALACE_WITH_STRUMPACK=$WITH_STRUMPACK \ - -DPALACE_WITH_MUMPS=$WITH_MUMPS \ - -DPALACE_WITH_SLEPC=$WITH_SLEPC \ - -DPALACE_WITH_ARPACK=$WITH_ARPACK - make -j$NUM_PROC_BUILD palace-tests - - - name: Run unit tests - env: - NUM_PROC_TEST_MAX: '2' - run: | - # Configure environment - if [[ "${{ matrix.compiler }}" == 'intel' ]] || \ - [[ "${{ matrix.mpi }}" == 'intelmpi' ]] || \ - [[ "${{ matrix.math-libs }}" == 'intelmkl' ]]; then - source /opt/intel/oneapi/setvars.sh # Sets PATH, MKLROOT - fi - if [[ "${{ matrix.with-openmp }}" == 'true' ]]; then - export OMP_NUM_THREADS=2 - else - export OMP_NUM_THREADS=1 - fi - export LD_LIBRARY_PATH=$(pwd)/palace-install/lib:$LD_LIBRARY_PATH - export LD_LIBRARY_PATH=$(pwd)/palace-install/lib64:$LD_LIBRARY_PATH - cd $(pwd)/palace-build/palace-build/test/unit - - # Run tests - mpirun -np $NUM_PROC_TEST_MAX ./unit-tests --skip-benchmarks - - - name: Run regression tests for examples/ - env: - NUM_PROC_TEST_MAX: '8' - run: | - # Configure environment - if [[ "${{ matrix.compiler }}" == 'intel' ]] || \ - [[ "${{ matrix.mpi }}" == 'intelmpi' ]] || \ - [[ "${{ matrix.math-libs }}" == 'intelmkl' ]]; then - source /opt/intel/oneapi/setvars.sh # Sets PATH, MKLROOT - fi - if [[ "${{ matrix.math-libs }}" == 'aocl' ]]; then - export AOCLROOT=/opt/AMD/aocl/aocl-linux-gcc-4.1.0/gcc - export LD_LIBRARY_PATH=$AOCLROOT/lib:$LD_LIBRARY_PATH - fi - export NUM_PROC_TEST=$(nproc 2> /dev/null || sysctl -n hw.ncpu) - if [[ "$NUM_PROC_TEST" -gt "$NUM_PROC_TEST_MAX" ]]; then - NUM_PROC_TEST=$NUM_PROC_TEST_MAX - fi - if [[ "${{ matrix.with-openmp }}" == 'true' ]]; then - NUM_PROC_TEST=$(( NUM_PROC_TEST / 2 )) - export OMP_NUM_THREADS=2 - else - export OMP_NUM_THREADS=1 - fi - export PATH=$(pwd)/palace-install/bin:$PATH - - # Run tests - julia --project=test/examples -e 'using Pkg; Pkg.instantiate()' - julia --project=test/examples --color=yes test/examples/runtests.jl +name: Build and Test (Linux) + +on: + push: + branches: + - main + pull_request: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build-and-test-linux: + strategy: + fail-fast: false + matrix: + include: # Pairwise testing + - compiler: gcc + mpi: mpich + math-libs: aocl + build-shared: static + with-64bit-int: int64 + with-openmp: openmp + with-solver: strumpack + with-eigensolver: arpack + + - compiler: clang + mpi: mpich + math-libs: aocl + build-shared: shared + with-64bit-int: int64 + with-openmp: serial + with-solver: superlu + with-eigensolver: arpack + + - compiler: gcc + mpi: openmpi + math-libs: openblas + build-shared: shared + with-64bit-int: int32 + with-openmp: openmp + with-solver: strumpack + with-eigensolver: slepc + + - compiler: intel + mpi: intelmpi + math-libs: intelmkl + build-shared: static + with-64bit-int: int64 + with-openmp: openmp + with-solver: superlu + with-eigensolver: slepc + + - compiler: intel + mpi: intelmpi + math-libs: intelmkl + build-shared: static + with-64bit-int: int32 + with-openmp: serial + with-solver: strumpack + with-eigensolver: arpack + + - compiler: intel + mpi: intelmpi + math-libs: intelmkl + build-shared: shared + with-64bit-int: int32 + with-openmp: openmp + with-solver: strumpack + with-eigensolver: slepc + + - compiler: gcc + mpi: openmpi + math-libs: aocl + build-shared: static + with-64bit-int: 
int32 + with-openmp: serial + with-solver: strumpack + with-eigensolver: slepc + + - compiler: clang + mpi: mpich + math-libs: aocl + build-shared: static + with-64bit-int: int32 + with-openmp: serial + with-solver: superlu + with-eigensolver: slepc + + runs-on: palace_ubuntu-latest_16-core + steps: + - uses: actions/checkout@v3 + with: + submodules: 'recursive' + + - uses: mpi4py/setup-mpi@v1 + with: + mpi: ${{ matrix.mpi }} + + - name: Configure Clang compiler + if: matrix.compiler == 'clang' + run: | + sudo apt-get install -y clang lld + + - name: Configure Intel oneAPI compiler + if: matrix.compiler == 'intel' + run: | + sudo apt-get install -y intel-oneapi-compiler-dpcpp-cpp \ + intel-oneapi-compiler-fortran + + - name: Install math libraries (OpenBLAS) + if: matrix.math-libs == 'openblas' + run: | + if [[ "${{ matrix.with-openmp }}" == 'openmp' ]]; then + sudo apt-get install -y libopenblas-openmp-dev + else + sudo apt-get install -y libopenblas-serial-dev + fi + + - name: Install math libraries (Intel oneAPI MKL) + if: matrix.math-libs == 'intelmkl' + run: | + sudo apt-get install -y intel-oneapi-mkl intel-oneapi-mkl-devel + + - name: Install math libraries (AOCL) + if: matrix.math-libs == 'aocl' + run: | + wget https://download.amd.com/developer/eula/aocl/aocl-4-1/aocl-linux-gcc-4.1.0_1_amd64.deb + sudo apt-get install -y ./aocl-linux-gcc-4.1.0_1_amd64.deb + rm aocl-linux-gcc-*.deb + + - name: Build Palace + env: + CMAKE_BUILD_TYPE: Release + NUM_PROC_BUILD_MAX: '32' + run: | + # Configure environment + if [[ "${{ matrix.compiler }}" == 'intel' ]] || \ + [[ "${{ matrix.mpi }}" == 'intelmpi' ]] || \ + [[ "${{ matrix.math-libs }}" == 'intelmkl' ]]; then + source /opt/intel/oneapi/setvars.sh # Sets PATH, MKLROOT + if [[ "${{ matrix.compiler }}" == 'intel' ]]; then + export CC=icx + export CXX=icpx + export FC=ifx + fi + elif [[ "${{ matrix.compiler }}" == 'clang' ]]; then + export CC=clang + export CXX=clang++ + export FC=gfortran-11 + export LDFLAGS='-fuse-ld=lld' + elif [[ "${{ matrix.compiler }}" == 'gcc' ]]; then + export CC=gcc-11 + export CXX=g++-11 + export FC=gfortran-11 + fi + if [[ "${{ matrix.math-libs }}" == 'aocl' ]]; then + export AOCLROOT=/opt/AMD/aocl/aocl-linux-gcc-4.1.0/gcc + export LD_LIBRARY_PATH=$AOCLROOT/lib:$LD_LIBRARY_PATH + fi + export NUM_PROC_BUILD=$(nproc 2> /dev/null || sysctl -n hw.ncpu) + if [[ "$NUM_PROC_BUILD" -gt "$NUM_PROC_BUILD_MAX" ]]; then + NUM_PROC_BUILD=$NUM_PROC_BUILD_MAX + fi + + [[ "${{ matrix.build-shared }}" == 'shared' ]] && BUILD_SHARED='ON' || BUILD_SHARED='OFF' + [[ "${{ matrix.with-64bit-int }}" == 'int64' ]] && WITH_INT64='ON' || WITH_INT64='OFF' + [[ "${{ matrix.with-openmp }}" == 'openmp' ]] && WITH_OPENMP='ON' || WITH_OPENMP='OFF' + + [[ "${{ matrix.with-solver }}" == 'superlu' ]] && WITH_SUPERLU='ON' || WITH_SUPERLU='OFF' + [[ "${{ matrix.with-solver }}" == 'strumpack' ]] && WITH_STRUMPACK='ON' || WITH_STRUMPACK='OFF' + [[ "${{ matrix.with-solver }}" == 'mumps' ]] && WITH_MUMPS='ON' || WITH_MUMPS='OFF' + [[ "${{ matrix.with-eigensolver }}" == 'slepc' ]] && WITH_SLEPC='ON' || WITH_SLEPC='OFF' + [[ "${{ matrix.with-eigensolver }}" == 'arpack' ]] && WITH_ARPACK='ON' || WITH_ARPACK='OFF' + + # Build and install (with unit tests) + mkdir palace-build && cd palace-build + cmake .. 
\ + -DCMAKE_INSTALL_PREFIX=$(pwd)/../palace-install \ + -DBUILD_SHARED_LIBS=$BUILD_SHARED \ + -DPALACE_WITH_64BIT_INT=$WITH_INT64 \ + -DPALACE_WITH_OPENMP=$WITH_OPENMP \ + -DPALACE_WITH_SUPERLU=$WITH_SUPERLU \ + -DPALACE_WITH_STRUMPACK=$WITH_STRUMPACK \ + -DPALACE_WITH_MUMPS=$WITH_MUMPS \ + -DPALACE_WITH_SLEPC=$WITH_SLEPC \ + -DPALACE_WITH_ARPACK=$WITH_ARPACK + make -j$NUM_PROC_BUILD palace-tests + + - name: Run unit tests + env: + NUM_PROC_TEST_MAX: '2' + run: | + # Configure environment + if [[ "${{ matrix.compiler }}" == 'intel' ]] || \ + [[ "${{ matrix.mpi }}" == 'intelmpi' ]] || \ + [[ "${{ matrix.math-libs }}" == 'intelmkl' ]]; then + source /opt/intel/oneapi/setvars.sh # Sets PATH, MKLROOT + fi + if [[ "${{ matrix.with-openmp }}" == 'true' ]]; then + export OMP_NUM_THREADS=2 + else + export OMP_NUM_THREADS=1 + fi + export LD_LIBRARY_PATH=$(pwd)/palace-install/lib:$LD_LIBRARY_PATH + export LD_LIBRARY_PATH=$(pwd)/palace-install/lib64:$LD_LIBRARY_PATH + cd $(pwd)/palace-build/palace-build/test/unit + + # Run tests + mpirun -np $NUM_PROC_TEST_MAX ./unit-tests --skip-benchmarks + + - name: Run regression tests for examples/ + env: + NUM_PROC_TEST_MAX: '8' + run: | + # Configure environment + if [[ "${{ matrix.compiler }}" == 'intel' ]] || \ + [[ "${{ matrix.mpi }}" == 'intelmpi' ]] || \ + [[ "${{ matrix.math-libs }}" == 'intelmkl' ]]; then + source /opt/intel/oneapi/setvars.sh # Sets PATH, MKLROOT + fi + if [[ "${{ matrix.math-libs }}" == 'aocl' ]]; then + export AOCLROOT=/opt/AMD/aocl/aocl-linux-gcc-4.1.0/gcc + export LD_LIBRARY_PATH=$AOCLROOT/lib:$LD_LIBRARY_PATH + fi + export NUM_PROC_TEST=$(nproc 2> /dev/null || sysctl -n hw.ncpu) + if [[ "$NUM_PROC_TEST" -gt "$NUM_PROC_TEST_MAX" ]]; then + NUM_PROC_TEST=$NUM_PROC_TEST_MAX + fi + if [[ "${{ matrix.with-openmp }}" == 'true' ]]; then + NUM_PROC_TEST=$(( NUM_PROC_TEST / 2 )) + export OMP_NUM_THREADS=2 + else + export OMP_NUM_THREADS=1 + fi + export PATH=$(pwd)/palace-install/bin:$PATH + + # Run tests + julia --project=test/examples -e 'using Pkg; Pkg.instantiate()' + julia --project=test/examples --color=yes test/examples/runtests.jl diff --git a/.github/workflows/build-and-test-macos.yml b/.github/workflows/build-and-test-macos.yml index b637e8a7c2..52f535cf1b 100644 --- a/.github/workflows/build-and-test-macos.yml +++ b/.github/workflows/build-and-test-macos.yml @@ -1,147 +1,147 @@ -name: Build and Test (macOS) - -on: - push: - branches: - - main - pull_request: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - build-and-test-macos: - strategy: - fail-fast: false - matrix: - include: # Pairwise testing - - compiler: clang - mpi: openmpi - math-libs: openblas - build-shared: shared - with-64bit-int: int32 - with-openmp: serial - with-solver: superlu - with-eigensolver: slepc - - - compiler: gcc - mpi: openmpi - math-libs: openblas - build-shared: static - with-64bit-int: int64 - with-openmp: openmp - with-solver: superlu - with-eigensolver: arpack - - - compiler: gcc - mpi: mpich - math-libs: openblas - build-shared: static - with-64bit-int: int32 - with-openmp: openmp - with-solver: strumpack - with-eigensolver: slepc - - runs-on: macos-latest-xl - steps: - - uses: actions/checkout@v3 - with: - submodules: 'recursive' - - - uses: mpi4py/setup-mpi@v1 - with: - mpi: ${{ matrix.mpi }} - - - name: Install pkg-config - run: | - brew install pkg-config - - - name: Install math libraries (OpenBLAS) - if: matrix.math-libs == 'openblas' - run: | - brew install openblas - - - name: Build 
Palace - env: - CMAKE_BUILD_TYPE: Release - NUM_PROC_BUILD_MAX: '32' - run: | - # Configure environment - if [[ "${{ matrix.compiler }}" == 'clang' ]]; then - export CC=$(brew --prefix llvm@15)/bin/clang - export CXX=$(brew --prefix llvm@15)/bin/clang++ - export FC=gfortran-11 - elif [[ "${{ matrix.compiler }}" == 'gcc' ]]; then - export CC=gcc-11 - export CXX=g++-11 - export FC=gfortran-11 - fi - if [[ "${{ matrix.math-libs }}" == 'openblas' ]]; then - export OPENBLAS_DIR=/usr/local/opt/openblas - fi - export NUM_PROC_BUILD=$(nproc 2> /dev/null || sysctl -n hw.ncpu) - if [[ "$NUM_PROC_BUILD" -gt "$NUM_PROC_BUILD_MAX" ]]; then - NUM_PROC_BUILD=$NUM_PROC_BUILD_MAX - fi - - [[ "${{ matrix.build-shared }}" == 'shared' ]] && BUILD_SHARED='ON' || BUILD_SHARED='OFF' - [[ "${{ matrix.with-64bit-int }}" == 'int64' ]] && WITH_INT64='ON' || WITH_INT64='OFF' - [[ "${{ matrix.with-openmp }}" == 'openmp' ]] && WITH_OPENMP='ON' || WITH_OPENMP='OFF' - - [[ "${{ matrix.with-solver }}" == 'superlu' ]] && WITH_SUPERLU='ON' || WITH_SUPERLU='OFF' - [[ "${{ matrix.with-solver }}" == 'strumpack' ]] && WITH_STRUMPACK='ON' || WITH_STRUMPACK='OFF' - [[ "${{ matrix.with-solver }}" == 'mumps' ]] && WITH_MUMPS='ON' || WITH_MUMPS='OFF' - [[ "${{ matrix.with-eigensolver }}" == 'slepc' ]] && WITH_SLEPC='ON' || WITH_SLEPC='OFF' - [[ "${{ matrix.with-eigensolver }}" == 'arpack' ]] && WITH_ARPACK='ON' || WITH_ARPACK='OFF' - - # Build and install (with unit tests) - mkdir palace-build && cd palace-build - cmake .. \ - -DCMAKE_INSTALL_PREFIX=$(pwd)/../palace-install \ - -DBUILD_SHARED_LIBS=$BUILD_SHARED \ - -DPALACE_WITH_64BIT_INT=$WITH_INT64 \ - -DPALACE_WITH_OPENMP=$WITH_OPENMP \ - -DPALACE_WITH_SUPERLU=$WITH_SUPERLU \ - -DPALACE_WITH_STRUMPACK=$WITH_STRUMPACK \ - -DPALACE_WITH_MUMPS=$WITH_MUMPS \ - -DPALACE_WITH_SLEPC=$WITH_SLEPC \ - -DPALACE_WITH_ARPACK=$WITH_ARPACK - make -j$NUM_PROC_BUILD palace-tests - - - name: Run unit tests - env: - NUM_PROC_TEST_MAX: '2' - run: | - # Configure environment - if [[ "${{ matrix.with-openmp }}" == 'true' ]]; then - export OMP_NUM_THREADS=2 - else - export OMP_NUM_THREADS=1 - fi - export DYLD_LIBRARY_PATH=$(pwd)/palace-install/lib:$DYLD_LIBRARY_PATH - cd $(pwd)/palace-build/palace-build/test/unit - - # Run tests - mpirun -np $NUM_PROC_TEST_MAX ./unit-tests --skip-benchmarks - - - name: Run regression tests for examples/ - env: - NUM_PROC_TEST_MAX: '8' - run: | - # Configure environment - export NUM_PROC_TEST=$(nproc 2> /dev/null || sysctl -n hw.ncpu) - if [[ "$NUM_PROC_TEST" -gt "$NUM_PROC_TEST_MAX" ]]; then - NUM_PROC_TEST=$NUM_PROC_TEST_MAX - fi - if [[ "${{ matrix.with-openmp }}" == 'true' ]]; then - NUM_PROC_TEST=$(( NUM_PROC_TEST / 2 )) - export OMP_NUM_THREADS=2 - else - export OMP_NUM_THREADS=1 - fi - export PATH=$(pwd)/palace-install/bin:$PATH - - # Run tests - julia --project=test/examples -e 'using Pkg; Pkg.instantiate()' - julia --project=test/examples --color=yes test/examples/runtests.jl +name: Build and Test (macOS) + +on: + push: + branches: + - main + pull_request: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build-and-test-macos: + strategy: + fail-fast: false + matrix: + include: # Pairwise testing + - compiler: clang + mpi: openmpi + math-libs: openblas + build-shared: shared + with-64bit-int: int32 + with-openmp: serial + with-solver: superlu + with-eigensolver: slepc + + - compiler: gcc + mpi: openmpi + math-libs: openblas + build-shared: static + with-64bit-int: int64 + with-openmp: openmp + with-solver: 
superlu + with-eigensolver: arpack + + - compiler: gcc + mpi: mpich + math-libs: openblas + build-shared: static + with-64bit-int: int32 + with-openmp: openmp + with-solver: strumpack + with-eigensolver: slepc + + runs-on: macos-latest-xl + steps: + - uses: actions/checkout@v3 + with: + submodules: 'recursive' + + - uses: mpi4py/setup-mpi@v1 + with: + mpi: ${{ matrix.mpi }} + + - name: Install pkg-config + run: | + brew install pkg-config + + - name: Install math libraries (OpenBLAS) + if: matrix.math-libs == 'openblas' + run: | + brew install openblas + + - name: Build Palace + env: + CMAKE_BUILD_TYPE: Release + NUM_PROC_BUILD_MAX: '32' + run: | + # Configure environment + if [[ "${{ matrix.compiler }}" == 'clang' ]]; then + export CC=$(brew --prefix llvm@15)/bin/clang + export CXX=$(brew --prefix llvm@15)/bin/clang++ + export FC=gfortran-11 + elif [[ "${{ matrix.compiler }}" == 'gcc' ]]; then + export CC=gcc-11 + export CXX=g++-11 + export FC=gfortran-11 + fi + if [[ "${{ matrix.math-libs }}" == 'openblas' ]]; then + export OPENBLAS_DIR=/usr/local/opt/openblas + fi + export NUM_PROC_BUILD=$(nproc 2> /dev/null || sysctl -n hw.ncpu) + if [[ "$NUM_PROC_BUILD" -gt "$NUM_PROC_BUILD_MAX" ]]; then + NUM_PROC_BUILD=$NUM_PROC_BUILD_MAX + fi + + [[ "${{ matrix.build-shared }}" == 'shared' ]] && BUILD_SHARED='ON' || BUILD_SHARED='OFF' + [[ "${{ matrix.with-64bit-int }}" == 'int64' ]] && WITH_INT64='ON' || WITH_INT64='OFF' + [[ "${{ matrix.with-openmp }}" == 'openmp' ]] && WITH_OPENMP='ON' || WITH_OPENMP='OFF' + + [[ "${{ matrix.with-solver }}" == 'superlu' ]] && WITH_SUPERLU='ON' || WITH_SUPERLU='OFF' + [[ "${{ matrix.with-solver }}" == 'strumpack' ]] && WITH_STRUMPACK='ON' || WITH_STRUMPACK='OFF' + [[ "${{ matrix.with-solver }}" == 'mumps' ]] && WITH_MUMPS='ON' || WITH_MUMPS='OFF' + [[ "${{ matrix.with-eigensolver }}" == 'slepc' ]] && WITH_SLEPC='ON' || WITH_SLEPC='OFF' + [[ "${{ matrix.with-eigensolver }}" == 'arpack' ]] && WITH_ARPACK='ON' || WITH_ARPACK='OFF' + + # Build and install (with unit tests) + mkdir palace-build && cd palace-build + cmake .. 
\ + -DCMAKE_INSTALL_PREFIX=$(pwd)/../palace-install \ + -DBUILD_SHARED_LIBS=$BUILD_SHARED \ + -DPALACE_WITH_64BIT_INT=$WITH_INT64 \ + -DPALACE_WITH_OPENMP=$WITH_OPENMP \ + -DPALACE_WITH_SUPERLU=$WITH_SUPERLU \ + -DPALACE_WITH_STRUMPACK=$WITH_STRUMPACK \ + -DPALACE_WITH_MUMPS=$WITH_MUMPS \ + -DPALACE_WITH_SLEPC=$WITH_SLEPC \ + -DPALACE_WITH_ARPACK=$WITH_ARPACK + make -j$NUM_PROC_BUILD palace-tests + + - name: Run unit tests + env: + NUM_PROC_TEST_MAX: '2' + run: | + # Configure environment + if [[ "${{ matrix.with-openmp }}" == 'true' ]]; then + export OMP_NUM_THREADS=2 + else + export OMP_NUM_THREADS=1 + fi + export DYLD_LIBRARY_PATH=$(pwd)/palace-install/lib:$DYLD_LIBRARY_PATH + cd $(pwd)/palace-build/palace-build/test/unit + + # Run tests + mpirun -np $NUM_PROC_TEST_MAX ./unit-tests --skip-benchmarks + + - name: Run regression tests for examples/ + env: + NUM_PROC_TEST_MAX: '8' + run: | + # Configure environment + export NUM_PROC_TEST=$(nproc 2> /dev/null || sysctl -n hw.ncpu) + if [[ "$NUM_PROC_TEST" -gt "$NUM_PROC_TEST_MAX" ]]; then + NUM_PROC_TEST=$NUM_PROC_TEST_MAX + fi + if [[ "${{ matrix.with-openmp }}" == 'true' ]]; then + NUM_PROC_TEST=$(( NUM_PROC_TEST / 2 )) + export OMP_NUM_THREADS=2 + else + export OMP_NUM_THREADS=1 + fi + export PATH=$(pwd)/palace-install/bin:$PATH + + # Run tests + julia --project=test/examples -e 'using Pkg; Pkg.instantiate()' + julia --project=test/examples --color=yes test/examples/runtests.jl diff --git a/.github/workflows/docs-cleanup.yml b/.github/workflows/docs-cleanup.yml index 1d6da4df53..c6405e9210 100644 --- a/.github/workflows/docs-cleanup.yml +++ b/.github/workflows/docs-cleanup.yml @@ -1,26 +1,26 @@ -name: Documentation Preview Cleanup - -on: - pull_request: - types: [closed] - -jobs: - cleanup-preview-docs: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - with: - ref: gh-pages - - - name: Delete preview and history - env: - PRNUM: ${{ github.event.number }} - run: | - if [[ -d previews/PR$PRNUM ]]; then - git config user.name "Documenter.jl" - git config user.email "documenter@juliadocs.github.io" - git rm -rf previews/PR$PRNUM - git commit -m "delete preview" - git branch gh-pages-new $(echo "delete history" | git commit-tree HEAD^{tree}) - git push --force origin gh-pages-new:gh-pages - fi +name: Documentation Preview Cleanup + +on: + pull_request: + types: [closed] + +jobs: + cleanup-preview-docs: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + with: + ref: gh-pages + + - name: Delete preview and history + env: + PRNUM: ${{ github.event.number }} + run: | + if [[ -d previews/PR$PRNUM ]]; then + git config user.name "Documenter.jl" + git config user.email "documenter@juliadocs.github.io" + git rm -rf previews/PR$PRNUM + git commit -m "delete preview" + git branch gh-pages-new $(echo "delete history" | git commit-tree HEAD^{tree}) + git push --force origin gh-pages-new:gh-pages + fi diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 2fac5a90bf..87b8c809c2 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -1,29 +1,29 @@ -name: Documentation - -on: - push: - branches: - - main - tags: '*' - pull_request: - -jobs: - build-docs: - permissions: - contents: write - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - - name: Build and deploy - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - julia --project=docs -e 'using Pkg; Pkg.instantiate()' - julia --project=docs --color=yes docs/make.jl - - - uses: actions/upload-artifact@v3 - with: - 
name: docs - path: docs/build/ - retention-days: 7 +name: Documentation + +on: + push: + branches: + - main + tags: '*' + pull_request: + +jobs: + build-docs: + permissions: + contents: write + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Build and deploy + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + julia --project=docs -e 'using Pkg; Pkg.instantiate()' + julia --project=docs --color=yes docs/make.jl + + - uses: actions/upload-artifact@v3 + with: + name: docs + path: docs/build/ + retention-days: 7 diff --git a/.github/workflows/singularity.yml b/.github/workflows/singularity.yml index 951367af6f..7ac2b9b9b0 100644 --- a/.github/workflows/singularity.yml +++ b/.github/workflows/singularity.yml @@ -1,49 +1,49 @@ -name: Singularity - -on: - push: - branches: - - main - tags: '*' - pull_request: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -env: - CONTAINER_NAME: palace.sif - -jobs: - build-and-test-singularity: - runs-on: palace_ubuntu-latest_16-core - steps: - - uses: actions/checkout@v3 - - - uses: singularityhub/install-singularity@main - - - name: Build container - run: | - sudo singularity build --bind ${{ github.workspace }}:/opt/palace-src:ro \ - $CONTAINER_NAME singularity/singularity.def - - - name: Run tests - env: - NUM_PROC_TEST_MAX: '8' - run: | - # Configure environment - export NUM_PROC_TEST=$(nproc 2> /dev/null || sysctl -n hw.ncpu) - if [[ "$NUM_PROC_TEST" -gt "$NUM_PROC_TEST_MAX" ]]; then - NUM_PROC_TEST=$NUM_PROC_TEST_MAX - fi - export PALACE_TEST="singularity run $(pwd)/$CONTAINER_NAME" - - # Run tests - julia --project=test/examples -e 'using Pkg; Pkg.instantiate()' - julia --project=test/examples --color=yes test/examples/runtests.jl - - - uses: actions/upload-artifact@v3 - with: - name: ${{ env.CONTAINER_NAME }} - path: ${{ env.CONTAINER_NAME }} - retention-days: 1 +name: Singularity + +on: + push: + branches: + - main + tags: '*' + pull_request: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +env: + CONTAINER_NAME: palace.sif + +jobs: + build-and-test-singularity: + runs-on: palace_ubuntu-latest_16-core + steps: + - uses: actions/checkout@v3 + + - uses: singularityhub/install-singularity@main + + - name: Build container + run: | + sudo singularity build --bind ${{ github.workspace }}:/opt/palace-src:ro \ + $CONTAINER_NAME singularity/singularity.def + + - name: Run tests + env: + NUM_PROC_TEST_MAX: '8' + run: | + # Configure environment + export NUM_PROC_TEST=$(nproc 2> /dev/null || sysctl -n hw.ncpu) + if [[ "$NUM_PROC_TEST" -gt "$NUM_PROC_TEST_MAX" ]]; then + NUM_PROC_TEST=$NUM_PROC_TEST_MAX + fi + export PALACE_TEST="singularity run $(pwd)/$CONTAINER_NAME" + + # Run tests + julia --project=test/examples -e 'using Pkg; Pkg.instantiate()' + julia --project=test/examples --color=yes test/examples/runtests.jl + + - uses: actions/upload-artifact@v3 + with: + name: ${{ env.CONTAINER_NAME }} + path: ${{ env.CONTAINER_NAME }} + retention-days: 1 diff --git a/.github/workflows/spack.yml b/.github/workflows/spack.yml index b52c6d2a5a..4b874cdbf9 100644 --- a/.github/workflows/spack.yml +++ b/.github/workflows/spack.yml @@ -1,64 +1,64 @@ -name: Spack - -on: - push: - branches: - - main - pull_request: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - build-and-test-spack: - strategy: - fail-fast: false - matrix: - include: - - compiler: gcc - mpi: openmpi - - runs-on: 
palace_ubuntu-latest_16-core - steps: - - uses: actions/checkout@v3 - - - uses: mpi4py/setup-mpi@v1 - with: - mpi: ${{ matrix.mpi }} - - - name: Configure Clang compiler - if: matrix.compiler == 'clang' - run: | - sudo apt-get install -y clang lld - - - uses: vsoch/spack-package-action/install@main - - - name: Build Palace - run: | - # Clean up Android SDK install (confuses Spack MKL link line?) - sudo rm -rf $ANDROID_HOME - - # Set up Spack to use external packages (MPI, etc.) - . /opt/spack/share/spack/setup-env.sh - spack external find --all - - # Build and install - spack repo add spack/local - spack dev-build local.palace@develop%${{ matrix.compiler }} ^petsc~hdf5 ^intel-oneapi-mkl - - - name: Run tests - env: - NUM_PROC_TEST_MAX: '8' - run: | - # Configure environment - export NUM_PROC_TEST=$(nproc 2> /dev/null || sysctl -n hw.ncpu) - if [[ "$NUM_PROC_TEST" -gt "$NUM_PROC_TEST_MAX" ]]; then - NUM_PROC_TEST=$NUM_PROC_TEST_MAX - fi - . /opt/spack/share/spack/setup-env.sh - spack load palace - - # Run tests - julia --project=test/examples -e 'using Pkg; Pkg.instantiate()' - julia --project=test/examples --color=yes test/examples/runtests.jl +name: Spack + +on: + push: + branches: + - main + pull_request: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build-and-test-spack: + strategy: + fail-fast: false + matrix: + include: + - compiler: gcc + mpi: openmpi + + runs-on: palace_ubuntu-latest_16-core + steps: + - uses: actions/checkout@v3 + + - uses: mpi4py/setup-mpi@v1 + with: + mpi: ${{ matrix.mpi }} + + - name: Configure Clang compiler + if: matrix.compiler == 'clang' + run: | + sudo apt-get install -y clang lld + + - uses: vsoch/spack-package-action/install@main + + - name: Build Palace + run: | + # Clean up Android SDK install (confuses Spack MKL link line?) + sudo rm -rf $ANDROID_HOME + + # Set up Spack to use external packages (MPI, etc.) + . /opt/spack/share/spack/setup-env.sh + spack external find --all + + # Build and install + spack repo add spack/local + spack dev-build local.palace@develop%${{ matrix.compiler }} ^petsc~hdf5 ^intel-oneapi-mkl + + - name: Run tests + env: + NUM_PROC_TEST_MAX: '8' + run: | + # Configure environment + export NUM_PROC_TEST=$(nproc 2> /dev/null || sysctl -n hw.ncpu) + if [[ "$NUM_PROC_TEST" -gt "$NUM_PROC_TEST_MAX" ]]; then + NUM_PROC_TEST=$NUM_PROC_TEST_MAX + fi + . 
/opt/spack/share/spack/setup-env.sh + spack load palace + + # Run tests + julia --project=test/examples -e 'using Pkg; Pkg.instantiate()' + julia --project=test/examples --color=yes test/examples/runtests.jl diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml index 74df816dd6..3db1e5cdba 100644 --- a/.github/workflows/style.yml +++ b/.github/workflows/style.yml @@ -1,59 +1,59 @@ -name: Style - -on: - push: - branches: - - main - pull_request: - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - check-style: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - # Install more recent clang-format from LLVM (match Homebrew v16) - - name: Install clang-format - run: | - wget -O- https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add - - sudo add-apt-repository 'deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-16 main' - sudo apt-get update -q - sudo apt-get install -y clang-format-16 - - - name: Check style - run: | - ./scripts/format-source --clang-format clang-format-16 - if [[ `git status -s | wc -l` -ne 0 ]]; then - echo 'Error: Commit is not formatted!' - echo 'Run '\`'./scripts/format-source'\`' in the source root directory' - echo 'Summary of required changes:' - echo "`git diff --stat`" - echo 'Repository status:' - echo "`git status`" - exit 1 - else - echo 'Commit is correctly formatted' - exit 0 - fi - - check-config: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - - name: Check JSON Schema - run: | - OUTPUT=$(find examples -name *.json -maxdepth 2 -print0 | xargs -0 -n1 ./scripts/validate-config) - if echo $OUTPUT | grep -q 'Validation failed'; then - echo 'Error: Configuration file validation failed!' - echo 'Summary of output:' - echo $OUTPUT - exit 1 - else - echo 'Configuration file validation passed' - exit 0 - fi +name: Style + +on: + push: + branches: + - main + pull_request: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + check-style: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + # Install more recent clang-format from LLVM (match Homebrew v16) + - name: Install clang-format + run: | + wget -O- https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add - + sudo add-apt-repository 'deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-16 main' + sudo apt-get update -q + sudo apt-get install -y clang-format-16 + + - name: Check style + run: | + ./scripts/format-source --clang-format clang-format-16 + if [[ `git status -s | wc -l` -ne 0 ]]; then + echo 'Error: Commit is not formatted!' + echo 'Run '\`'./scripts/format-source'\`' in the source root directory' + echo 'Summary of required changes:' + echo "`git diff --stat`" + echo 'Repository status:' + echo "`git status`" + exit 1 + else + echo 'Commit is correctly formatted' + exit 0 + fi + + check-config: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Check JSON Schema + run: | + OUTPUT=$(find examples -name *.json -maxdepth 2 -print0 | xargs -0 -n1 ./scripts/validate-config) + if echo $OUTPUT | grep -q 'Validation failed'; then + echo 'Error: Configuration file validation failed!' 
+ echo 'Summary of output:' + echo $OUTPUT + exit 1 + else + echo 'Configuration file validation passed' + exit 0 + fi diff --git a/.gitignore b/.gitignore index 9238a4a2f1..47516de3fc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,11 @@ -build*/ -examples/**/postpro*/ -examples/**/log*/ -spack/local/packages/palace/__pycache__/ -test/examples/ref/**/*.json -.*.swp -*.sif -.DS_Store -.gitlab-ci-local -Manifest.toml +build*/ +examples/**/postpro*/ +examples/**/log*/ +spack_repo/local/packages/palace/__pycache__/ +test/examples/ref/**/*.json +.*.swp +*.sif +.DS_Store +.gitlab-ci-local +Manifest.toml +.vscode/ diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 6ef2e6f997..86fcd8843f 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,302 +1,302 @@ -stages: - - build - - check - - deploy - -variables: - GIT_SUBMODULE_STRATEGY: recursive - CMAKE_BUILD_TYPE: Release - NUM_PROC_BUILD_MAX: '32' - NUM_PROC_TEST_MAX: '8' - -default: - interruptible: true - image: public.ecr.aws/lts/ubuntu:latest # Ubuntu 22.04 - tags: - - arch:amd64 - - size:2xlarge - before_script: - # Install build dependencies, set default number of cores for building and testing - - | - if [[ "$CI_JOB_STAGE" == "build" ]]; then - apt-get update -q - apt-get install -y build-essential cmake curl gfortran git pkg-config python3 wget - if [[ "$(nproc)" -gt "$NUM_PROC_BUILD_MAX" ]]; then - export NUM_PROC_BUILD=$NUM_PROC_BUILD_MAX - else - export NUM_PROC_BUILD=$(nproc) - fi - if [[ "$(nproc)" -gt "$NUM_PROC_TEST_MAX" ]]; then - export NUM_PROC_TEST=$NUM_PROC_TEST_MAX - else - export NUM_PROC_TEST=$(nproc) - fi - fi - # Install Julia - - | - curl -fsSL https://install.julialang.org | sh -s -- -y - export PATH=~/.juliaup/bin:$PATH - -test-build: - stage: build - script: - - apt-get install -y libmpich-dev libopenblas-serial-dev - - mkdir build && cd build - - cmake .. - -DPALACE_WITH_STRUMPACK=ON - -DPALACE_WITH_MUMPS=ON - -DPALACE_WITH_ARPACK=ON - - make -j$NUM_PROC_BUILD - - cd .. - - export PATH=$(pwd)/build/bin:$PATH - - julia --project=test/examples -e 'using Pkg; Pkg.instantiate()' - - julia --project=test/examples --color=yes test/examples/runtests.jl - artifacts: - paths: - - build - expire_in: 1 day - -test-build-clang: - stage: build - script: - - apt-get install -y clang lld libmpich-dev libopenblas-serial-dev - - export LDFLAGS='-fuse-ld=lld' - - mkdir build && cd build - - cmake .. - -DCMAKE_CXX_COMPILER=clang++ - -DCMAKE_C_COMPILER=clang - -DPALACE_WITH_STRUMPACK=ON - -DPALACE_WITH_MUMPS=ON - -DPALACE_WITH_ARPACK=ON - - make -j$NUM_PROC_BUILD - - cd .. - - export PATH=$(pwd)/build/bin:$PATH - - julia --project=test/examples -e 'using Pkg; Pkg.instantiate()' - - julia --project=test/examples --color=yes test/examples/runtests.jl - -test-build-mpicc: - stage: build - script: - - apt-get install -y libmpich-dev libopenblas-serial-dev - - mkdir build && cd build - - cmake .. - -DCMAKE_CXX_COMPILER=mpicxx - -DCMAKE_C_COMPILER=mpicc - -DCMAKE_Fortran_COMPILER=mpif90 - -DPALACE_WITH_STRUMPACK=ON - -DPALACE_WITH_MUMPS=ON - -DPALACE_WITH_ARPACK=ON - - make -j$NUM_PROC_BUILD - - cd .. - - export PATH=$(pwd)/build/bin:$PATH - - julia --project=test/examples -e 'using Pkg; Pkg.instantiate()' - - julia --project=test/examples --color=yes test/examples/runtests.jl - -test-build-shared: - stage: build - script: - - apt-get install -y libmpich-dev libopenblas-serial-dev - - mkdir build && cd build - - cmake .. 
- -DBUILD_SHARED_LIBS=ON - -DPALACE_WITH_STRUMPACK=ON - -DPALACE_WITH_MUMPS=ON - -DPALACE_WITH_ARPACK=ON - - make -j$NUM_PROC_BUILD - - cd .. - - export PATH=$(pwd)/build/bin:$PATH - - julia --project=test/examples -e 'using Pkg; Pkg.instantiate()' - - julia --project=test/examples --color=yes test/examples/runtests.jl - -test-build-openmp: - stage: build - script: - - apt-get install -y libmpich-dev libopenblas-serial-dev - - mkdir build && cd build - - cmake .. - -DPALACE_WITH_OPENMP=ON - - make -j$NUM_PROC_BUILD - - cd .. - - export PATH=$(pwd)/build/bin:$PATH - - julia --project=test/examples -e 'using Pkg; Pkg.instantiate()' - - julia --project=test/examples --color=yes test/examples/runtests.jl - -test-build-bigint: - stage: build - script: - - apt-get install -y libmpich-dev libopenblas-serial-dev - - mkdir build && cd build - - cmake .. - -DPALACE_WITH_64BIT_INT=ON - - make -j$NUM_PROC_BUILD - - cd .. - - export PATH=$(pwd)/build/bin:$PATH - - julia --project=test/examples -e 'using Pkg; Pkg.instantiate()' - - julia --project=test/examples --color=yes test/examples/runtests.jl - -test-build-openmpi: - stage: build - script: - - apt-get install -y libopenmpi-dev libopenblas-serial-dev - - export OMPI_ALLOW_RUN_AS_ROOT=1 && export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 - - export OMPI_MCA_btl_vader_single_copy_mechanism=none - - mkdir build && cd build - - cmake .. - - make -j$NUM_PROC_BUILD - - cd .. - - export PATH=$(pwd)/build/bin:$PATH - - julia --project=test/examples -e 'using Pkg; Pkg.instantiate()' - - julia --project=test/examples --color=yes test/examples/runtests.jl - -test-build-intel-oneapi: - stage: build - image: intel/oneapi-hpckit:latest # Intel oneAPI image from Docker Hub - script: - # Image has MKLROOT and environment configured by default - - mkdir build && cd build - - cmake .. - -DCMAKE_CXX_COMPILER=icpx - -DCMAKE_C_COMPILER=icx - - make -j$NUM_PROC_BUILD - - cd .. - - export PATH=$(pwd)/build/bin:$PATH - - julia --project=test/examples -e 'using Pkg; Pkg.instantiate()' - - julia --project=test/examples --color=yes test/examples/runtests.jl - -test-build-aocl: - stage: build - script: - # Install AMD Optimizing CPU Libraries (AOCL) - - apt-get install -y libmpich-dev - - wget https://download.amd.com/developer/eula/aocl/aocl-4-1/aocl-linux-gcc-4.1.0_1_amd64.deb - - apt-get install -y ./aocl-linux-gcc-4.1.0_1_amd64.deb - - rm aocl-linux-gcc-*.deb - - export AOCLROOT=/opt/AMD/aocl/aocl-linux-gcc-4.1.0/gcc - - export LD_LIBRARY_PATH=$AOCLROOT/lib:$LD_LIBRARY_PATH - - mkdir build && cd build - - cmake .. - - make -j$NUM_PROC_BUILD - - cd .. - - export PATH=$(pwd)/build/bin:$PATH - - julia --project=test/examples -e 'using Pkg; Pkg.instantiate()' - - julia --project=test/examples --color=yes test/examples/runtests.jl - -test-build-arm64: - stage: build - tags: - - arch:arm64 - - size:2xlarge - script: - - apt-get install -y libmpich-dev libopenblas-serial-dev - - mkdir build && cd build - - cmake .. - - make -j$NUM_PROC_BUILD - - cd .. - - export PATH=$(pwd)/build/bin:$PATH - - julia --project=test/examples -e 'using Pkg; Pkg.instantiate()' - - julia --project=test/examples --color=yes test/examples/runtests.jl - -test-build-arm64-shared: - stage: build - tags: - - arch:arm64 - - size:2xlarge - script: - - apt-get install -y libmpich-dev libopenblas-serial-dev - - mkdir build && cd build - - cmake .. - -DBUILD_SHARED_LIBS=ON - - make -j$NUM_PROC_BUILD - - cd .. 
- - export PATH=$(pwd)/build/bin:$PATH - - julia --project=test/examples -e 'using Pkg; Pkg.instantiate()' - - julia --project=test/examples --color=yes test/examples/runtests.jl - -test-build-arm64-armpl: - stage: build - tags: - - arch:arm64 - - size:2xlarge - script: - # Install Arm Performance Libraries - - apt-get install -y libmpich-dev - - wget https://developer.arm.com/-/media/Files/downloads/hpc/arm-performance-libraries/22-0-2/Ubuntu20.04/arm-performance-libraries_22.0.2_Ubuntu-20.04_gcc-11.2.tar - - tar -xf arm-performance-libraries* && rm -rf arm-performance-libraries*.tar - - ./arm-performance-libraries*/arm-performance-libraries*.sh -a -i /opt/arm - - export ARMPL_DIR=/opt/arm/armpl_22.0.2_gcc-11.2 - - export LD_LIBRARY_PATH=$ARMPL_DIR/lib:$LD_LIBRARY_PATH - - mkdir build && cd build - - cmake .. - - make -j$NUM_PROC_BUILD - - cd .. - - export PATH=$(pwd)/build/bin:$PATH - - julia --project=test/examples -e 'using Pkg; Pkg.instantiate()' - - julia --project=test/examples --color=yes test/examples/runtests.jl - -test-format: - stage: check - tags: - - size:medium - variables: - GIT_SUBMODULE_STRATEGY: none - script: - # Install more recent clang-format from LLVM (match Homebrew v16) - - apt-get install -y gpg - - wget -O- https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - - - | - echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-16 main" \ - | tee /etc/apt/sources.list.d/llvm.list - - apt-get update -q && apt-get install -y clang-format-16 - - ./scripts/format-source --clang-format clang-format-16 - - |- - if [[ `git status -s | wc -l` -ne 0 ]]; then - echo 'Error: Commit is not formatted!' - echo 'Run '\`'./scripts/format-source'\`' in the source root directory' - echo 'Summary of required changes:' - echo "`git diff --stat`" - echo 'Repository status:' - echo "`git status`" - exit 1 - else - echo 'Commit is correctly formatted' - exit 0 - fi - dependencies: [] - -test-config: - stage: check - tags: - - size:medium - variables: - GIT_SUBMODULE_STRATEGY: none - script: - - OUTPUT=$(find examples -name *.json -exec ./scripts/validate-config {} \;) - - |- - if echo $OUTPUT | grep -q 'Validation failed'; then - echo 'Error: Configuration file validation failed!' 
- echo 'Summary of output:' - echo $OUTPUT - exit 1 - else - echo 'Configuration file validation passed' - exit 0 - fi - dependencies: [] - -pages: - stage: deploy - tags: - - size:medium - variables: - GIT_SUBMODULE_STRATEGY: none - script: - - julia --project=docs -e 'using Pkg; Pkg.instantiate()' - - julia --project=docs --color=yes docs/make.jl - - mv docs/build public - dependencies: [] - artifacts: - paths: - - public - expire_in: never - only: - - main +stages: + - build + - check + - deploy + +variables: + GIT_SUBMODULE_STRATEGY: recursive + CMAKE_BUILD_TYPE: Release + NUM_PROC_BUILD_MAX: '32' + NUM_PROC_TEST_MAX: '8' + +default: + interruptible: true + image: public.ecr.aws/lts/ubuntu:latest # Ubuntu 22.04 + tags: + - arch:amd64 + - size:2xlarge + before_script: + # Install build dependencies, set default number of cores for building and testing + - | + if [[ "$CI_JOB_STAGE" == "build" ]]; then + apt-get update -q + apt-get install -y build-essential cmake curl gfortran git pkg-config python3 wget + if [[ "$(nproc)" -gt "$NUM_PROC_BUILD_MAX" ]]; then + export NUM_PROC_BUILD=$NUM_PROC_BUILD_MAX + else + export NUM_PROC_BUILD=$(nproc) + fi + if [[ "$(nproc)" -gt "$NUM_PROC_TEST_MAX" ]]; then + export NUM_PROC_TEST=$NUM_PROC_TEST_MAX + else + export NUM_PROC_TEST=$(nproc) + fi + fi + # Install Julia + - | + curl -fsSL https://install.julialang.org | sh -s -- -y + export PATH=~/.juliaup/bin:$PATH + +test-build: + stage: build + script: + - apt-get install -y libmpich-dev libopenblas-serial-dev + - mkdir build && cd build + - cmake .. + -DPALACE_WITH_STRUMPACK=ON + -DPALACE_WITH_MUMPS=ON + -DPALACE_WITH_ARPACK=ON + - make -j$NUM_PROC_BUILD + - cd .. + - export PATH=$(pwd)/build/bin:$PATH + - julia --project=test/examples -e 'using Pkg; Pkg.instantiate()' + - julia --project=test/examples --color=yes test/examples/runtests.jl + artifacts: + paths: + - build + expire_in: 1 day + +test-build-clang: + stage: build + script: + - apt-get install -y clang lld libmpich-dev libopenblas-serial-dev + - export LDFLAGS='-fuse-ld=lld' + - mkdir build && cd build + - cmake .. + -DCMAKE_CXX_COMPILER=clang++ + -DCMAKE_C_COMPILER=clang + -DPALACE_WITH_STRUMPACK=ON + -DPALACE_WITH_MUMPS=ON + -DPALACE_WITH_ARPACK=ON + - make -j$NUM_PROC_BUILD + - cd .. + - export PATH=$(pwd)/build/bin:$PATH + - julia --project=test/examples -e 'using Pkg; Pkg.instantiate()' + - julia --project=test/examples --color=yes test/examples/runtests.jl + +test-build-mpicc: + stage: build + script: + - apt-get install -y libmpich-dev libopenblas-serial-dev + - mkdir build && cd build + - cmake .. + -DCMAKE_CXX_COMPILER=mpicxx + -DCMAKE_C_COMPILER=mpicc + -DCMAKE_Fortran_COMPILER=mpif90 + -DPALACE_WITH_STRUMPACK=ON + -DPALACE_WITH_MUMPS=ON + -DPALACE_WITH_ARPACK=ON + - make -j$NUM_PROC_BUILD + - cd .. + - export PATH=$(pwd)/build/bin:$PATH + - julia --project=test/examples -e 'using Pkg; Pkg.instantiate()' + - julia --project=test/examples --color=yes test/examples/runtests.jl + +test-build-shared: + stage: build + script: + - apt-get install -y libmpich-dev libopenblas-serial-dev + - mkdir build && cd build + - cmake .. + -DBUILD_SHARED_LIBS=ON + -DPALACE_WITH_STRUMPACK=ON + -DPALACE_WITH_MUMPS=ON + -DPALACE_WITH_ARPACK=ON + - make -j$NUM_PROC_BUILD + - cd .. 
+ - export PATH=$(pwd)/build/bin:$PATH + - julia --project=test/examples -e 'using Pkg; Pkg.instantiate()' + - julia --project=test/examples --color=yes test/examples/runtests.jl + +test-build-openmp: + stage: build + script: + - apt-get install -y libmpich-dev libopenblas-serial-dev + - mkdir build && cd build + - cmake .. + -DPALACE_WITH_OPENMP=ON + - make -j$NUM_PROC_BUILD + - cd .. + - export PATH=$(pwd)/build/bin:$PATH + - julia --project=test/examples -e 'using Pkg; Pkg.instantiate()' + - julia --project=test/examples --color=yes test/examples/runtests.jl + +test-build-bigint: + stage: build + script: + - apt-get install -y libmpich-dev libopenblas-serial-dev + - mkdir build && cd build + - cmake .. + -DPALACE_WITH_64BIT_INT=ON + - make -j$NUM_PROC_BUILD + - cd .. + - export PATH=$(pwd)/build/bin:$PATH + - julia --project=test/examples -e 'using Pkg; Pkg.instantiate()' + - julia --project=test/examples --color=yes test/examples/runtests.jl + +test-build-openmpi: + stage: build + script: + - apt-get install -y libopenmpi-dev libopenblas-serial-dev + - export OMPI_ALLOW_RUN_AS_ROOT=1 && export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 + - export OMPI_MCA_btl_vader_single_copy_mechanism=none + - mkdir build && cd build + - cmake .. + - make -j$NUM_PROC_BUILD + - cd .. + - export PATH=$(pwd)/build/bin:$PATH + - julia --project=test/examples -e 'using Pkg; Pkg.instantiate()' + - julia --project=test/examples --color=yes test/examples/runtests.jl + +test-build-intel-oneapi: + stage: build + image: intel/oneapi-hpckit:latest # Intel oneAPI image from Docker Hub + script: + # Image has MKLROOT and environment configured by default + - mkdir build && cd build + - cmake .. + -DCMAKE_CXX_COMPILER=icpx + -DCMAKE_C_COMPILER=icx + - make -j$NUM_PROC_BUILD + - cd .. + - export PATH=$(pwd)/build/bin:$PATH + - julia --project=test/examples -e 'using Pkg; Pkg.instantiate()' + - julia --project=test/examples --color=yes test/examples/runtests.jl + +test-build-aocl: + stage: build + script: + # Install AMD Optimizing CPU Libraries (AOCL) + - apt-get install -y libmpich-dev + - wget https://download.amd.com/developer/eula/aocl/aocl-5-0/aocl-linux-gcc-5.0.0_1_amd64.deb + - apt-get install -y ./aocl-linux-gcc-5.0.0_1_amd64.deb + - rm aocl-linux-gcc-*.deb + - export AOCLROOT=/opt/AMD/aocl/aocl-linux-gcc-5.0.0/gcc + - export LD_LIBRARY_PATH=$AOCLROOT/lib:$LD_LIBRARY_PATH + - mkdir build && cd build + - cmake .. + - make -j$NUM_PROC_BUILD + - cd .. + - export PATH=$(pwd)/build/bin:$PATH + - julia --project=test/examples -e 'using Pkg; Pkg.instantiate()' + - julia --project=test/examples --color=yes test/examples/runtests.jl + +test-build-arm64: + stage: build + tags: + - arch:arm64 + - size:2xlarge + script: + - apt-get install -y libmpich-dev libopenblas-serial-dev + - mkdir build && cd build + - cmake .. + - make -j$NUM_PROC_BUILD + - cd .. + - export PATH=$(pwd)/build/bin:$PATH + - julia --project=test/examples -e 'using Pkg; Pkg.instantiate()' + - julia --project=test/examples --color=yes test/examples/runtests.jl + +test-build-arm64-shared: + stage: build + tags: + - arch:arm64 + - size:2xlarge + script: + - apt-get install -y libmpich-dev libopenblas-serial-dev + - mkdir build && cd build + - cmake .. + -DBUILD_SHARED_LIBS=ON + - make -j$NUM_PROC_BUILD + - cd .. 
+ - export PATH=$(pwd)/build/bin:$PATH + - julia --project=test/examples -e 'using Pkg; Pkg.instantiate()' + - julia --project=test/examples --color=yes test/examples/runtests.jl + +test-build-arm64-armpl: + stage: build + tags: + - arch:arm64 + - size:2xlarge + script: + # Install Arm Performance Libraries + - apt-get install -y libmpich-dev + - wget https://developer.arm.com/-/media/Files/downloads/hpc/arm-performance-libraries/22-0-2/Ubuntu20.04/arm-performance-libraries_22.0.2_Ubuntu-20.04_gcc-11.2.tar + - tar -xf arm-performance-libraries* && rm -rf arm-performance-libraries*.tar + - ./arm-performance-libraries*/arm-performance-libraries*.sh -a -i /opt/arm + - export ARMPL_DIR=/opt/arm/armpl_22.0.2_gcc-11.2 + - export LD_LIBRARY_PATH=$ARMPL_DIR/lib:$LD_LIBRARY_PATH + - mkdir build && cd build + - cmake .. + - make -j$NUM_PROC_BUILD + - cd .. + - export PATH=$(pwd)/build/bin:$PATH + - julia --project=test/examples -e 'using Pkg; Pkg.instantiate()' + - julia --project=test/examples --color=yes test/examples/runtests.jl + +test-format: + stage: check + tags: + - size:medium + variables: + GIT_SUBMODULE_STRATEGY: none + script: + # Install more recent clang-format from LLVM (match Homebrew v16) + - apt-get install -y gpg + - wget -O- https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - + - | + echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-16 main" \ + | tee /etc/apt/sources.list.d/llvm.list + - apt-get update -q && apt-get install -y clang-format-16 + - ./scripts/format-source --clang-format clang-format-16 + - |- + if [[ `git status -s | wc -l` -ne 0 ]]; then + echo 'Error: Commit is not formatted!' + echo 'Run '\`'./scripts/format-source'\`' in the source root directory' + echo 'Summary of required changes:' + echo "`git diff --stat`" + echo 'Repository status:' + echo "`git status`" + exit 1 + else + echo 'Commit is correctly formatted' + exit 0 + fi + dependencies: [] + +test-config: + stage: check + tags: + - size:medium + variables: + GIT_SUBMODULE_STRATEGY: none + script: + - OUTPUT=$(find examples -name *.json -exec ./scripts/validate-config {} \;) + - |- + if echo $OUTPUT | grep -q 'Validation failed'; then + echo 'Error: Configuration file validation failed!' + echo 'Summary of output:' + echo $OUTPUT + exit 1 + else + echo 'Configuration file validation passed' + exit 0 + fi + dependencies: [] + +pages: + stage: deploy + tags: + - size:medium + variables: + GIT_SUBMODULE_STRATEGY: none + script: + - julia --project=docs -e 'using Pkg; Pkg.instantiate()' + - julia --project=docs --color=yes docs/make.jl + - mv docs/build public + dependencies: [] + artifacts: + paths: + - public + expire_in: never + only: + - main diff --git a/CHANGELOG.md b/CHANGELOG.md index 27051e1039..4a8737c47f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,293 +1,525 @@ - -# Changelog - -> Note: *Palace* is under active initial development, pre-v1.0. Functionality and interfaces -> may change rapidly as development progresses. - -The format of this changelog is based on -[Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to -[Semantic Versioning](https://semver.org/). - -## In progress - - - Changed implementation of complex-valued linear algebra to use new `ComplexVector` and - `ComplexOperator` types, which are based on the underlying `mfem::Vector` and - `mfem::Operator` classes, instead of PETSc. PETSc is now fully optional and only - required when SLEPc eigenvalue solver support is requested. 
Krylov solvers for real- and - complex-valued linear systems are implemented via the built-in `IterativeSolver` - classes. - - Changed implementation of PROMs for adaptive fast frequency sweep to use the Eigen - library for sequential dense linear algebra. - - Changed implementation of numeric wave ports to use MFEM's `SubMesh` functionality. As - of [#3379](https://github.com/mfem/mfem/pull/3379) in MFEM, this has full ND and RT - basis support. For now, support for nonconforming mesh boundaries is limited. - - Added support for operator partial assembly for high-order finite element spaces based - on libCEED for non-tensor product element meshes. This option is disabled by default, - but can be activated using `config["Solver"]["PartialAssemblyOrder"]` set to some number - less than or equal to `"Order"`. - - Added `config["Solver"]["Device"]` and `config["Solver"]["Backend"]` options for runtime - configuration of the MFEM device (CPU or GPU) and corresponding libCEED backend, with - suitable defaults for users. - - Added support for non axis aligned lumped ports and current sources. Key words `"X"`, - `"Y"`, `"Z"` and `"R"`, with optional prefix `"+"` or `"-"` still work, but now - directions can be specified as vectors with 3 components. Users will be warned, and - ultimately errored, if the specified directions do not agree with axis directions - discovered from the geometry. - - Added flux-based error estimation, reported in `error-estimate.csv`. This computes the - difference between the numerical gradient (electrostatics) or curl (otherwise) of the - solution, and a smoother approximation obtained through a global mass matrix inversion. - The results are reported in `error-estimates.csv` within the `"Output"` folder. - - Added Adaptive Mesh Refinement (AMR), specified in the `config["Model"]["Refinement"]`, - for all problem types aside from transient. To enable AMR, a user must specify - `"MaxIts"`, while all other options have reasonable defaults. Nonconformal(all mesh - types) and conformal (simplex meshes) refinement are supported. - - Added output of lumped port voltage and current for eigenmode simulations. - - Added dimensionalized output for energies, voltages, currents, and field values based on - a choice of the characteristic magnetic field strength used for nondimensionalization. - - Fixed bugs for simulations using tetrahedral meshes associated with unexpected mesh - toplogy changes during parallel mesh construction. - - Added improved `Timer` and `BlockTimer` classes with more timing categories for - reporting simulation runtime. - - Added build dependencies on [libCEED](https://github.com/CEED/libCEED) and - [LIBXSMM](https://github.com/libxsmm/libxsmm) to support operator partial assembly (CPU- - based for now). - - Added unit test framework for all integrators based on - [Catch2](https://github.com/catchorg/Catch2), which also includes some automated - benchmarking capabilities for operator assembly and application. - - Added improved OpenMP support in `palace` wrapper script and CI tests. - - Added Apptainer/Singularity container build definition for Palace. - - Fixed bugs related to thread-safety for OpenMP builds and parallel tetrahedral meshes in - the upstream MFEM library. - -## [0.11.2] - 2023-07-14 - - - Fixed a regression bug affecting meshes which have domain elements which are not - assigned material properties in the configuration file. - - Changed layout and names of `palace/` source directory for better organization. 
- - Added many updates to build system: Removed use of Git submodules to download - dependencies relying instead directly on CMake's ExternalProject, patch GSLIB dependency - for shared library builds, add CI tests with ARPACK-NG instead of SLEPc, update all - dependency versions including MFEM to incorporate bug fixes and improvements. This - affects the Spack package recipe, so a new recipe is distributed as part of Palace in - in `spack/` which will keep compatibility with `main` while changes are upstreamed to - the built-in Spack repository. - - Added a basic Makefile with some useful targets for development. - -## [0.11.1] - 2023-05-03 - - - Fixed a bug for interface dielectric loss postprocessing including when using - `--dry-run`. - - Fixed a regression bug affecting second-order absorbing boundary conditions. - - Fixed a bug when the number of processors was large enough that some subdomains own no - true dofs. - - Fixed memory bug in destruction of SuperLU solver. - - Fixed some typos in the documentation. - - Fixed a bug in the cavity convergence study example for the coarsest hexahedral mesh - case, as well as some other minor Julia fixes in the `examples/` and `scripts/` - directories. - - Added updates to superbuild including better ARPACK support, though still experimental - compared to SLEPc. - - Added updated submodules for superbuild. - -## [0.11.0] - 2023-01-26 - - - Initial public release on GitHub. - - Added support for anisotropic materials through the use of `"MaterialAxes"` under - `"Materials"`. These material axes allow for specifying symmetric material property - tensors through a sum of weighted outer products of normal vectors. This can be used in - conjunction with scalar material properties or multiple different anisotropic - properties, though all properties are required to use the same set of basis vectors. - Demonstration given for the coplanar waveguide example which now utilizes a sapphire - substrate. - - Added to default postprocessing outputs: Bulk and interface dielectric loss writes - energy-participation ratios in addition to quality factors associated with lossy - regions, IO coupling $\kappa$ in addition to quality factor for eigenmode - simulations, lumped element energy for transient simulations similar to eigenmode and - frequency domain driven simulations. - - Changed configuration file syntax to simplify support for multielement lumped ports, - where each index for a multielement port should now use the `"Elements"` object to - describe the attributes for each port element. - - Changed configuration file keywords to better reflect meaning: - `config["Boundaries"]["Current"]` => `config["Boundaries"]["SurfaceCurrent"]`, and - `"UseGMG"`, `"UsePCShifted"`, `"MGCycleIts"`, and `"MGSmoothIts"` under - `config["Solver"]["Linear"]`. - - Changed geometric multigrid implementation to generalize across problem types (added - for electrostatics) and use Jacobi-preconditioned Chebyshev smoothing with optional - auxiliary space smoothing at each multigrid level as well. The auxiliary space matrices - are constructed directly in H1 now to allow for matrix-free/partial-assembly support - eventually. Geometric multigrid is turned on by default. - - Added structured simulation metadata output in the form of a JSON file `palace.json` - written to the directory specified by `config["Problem"]["Output"]`. - - Added JSON Schema files for the configuration file format as well as a helper script - to check a provided configuration file against the schema. 
- - Added optional interface to GSLIB library which enables `"Probe"` functionality to - sample the computed electric and magnetic fields at points in space. - - Added preliminary support for builds using Spack. This includes an option in the build - system for user-supplied dependencies as an alternative to the superbuild - configuration. - - Added updated submodules for superbuild, fixing a bug in SuperLU_DIST solver causing - communication hangs for certain numbers of processors. - -## [0.10.0] - 2022-10-04 - - - Added interfaces to ARPACK and FEAST eigenvalue solvers. - - Added divergence-free projection for eliminating spurious DC modes for eigenmode - solves. - - Added option for visualizing fields on mesh boundaries, output alongside the full 3D - solution fields for ParaVeiew visualization. - - Added real and imaginary fields output for complex-valued phasors, and electric and - magnetic energy density fields. - - Added convergence study for the cavity example application, and example Julia code for - automated mesh generation, running of the solver, and postprocessing. - - Fixed bugs in mesh preprocessing related to precision of nodal coordinates when - distributing a serial mesh to each process. - - Added option for `"Driven"` and `"Transient"` simulations to accelerate postprocessing - by only considering port boundaries. - - Added `-dry-run`/`--dry-run` command line option to check configuration file for errors - and exit. - - Changed dependency build process to rely less on PETSc build system, no longer give - option to build BLAS/LAPACK libraries from source, added detection for OpenBLAS and AMD - Optimizing CPU Libraries (AOCL) blis/libflame. - - Fixed issues when compiling with Intel or Clang compilers. - - Added CI test builds to test various build options and added easier default build - options in CMake configuration. - - Added `clang-format` and `JuliaFormatter` configurations for automated C++, Julia, and - Markdown code formatting. - -## [0.9.0] - 2022-07-11 - - - Added region-based mesh refinement with box or sphere refinement regions. - - Added new sparse direct solver interfaces to SuperLU_DIST and STRUMPACK libraries, to - complement the existing MUMPS interface. - - Changed configuration file keywords for linear solver and preconditioner parameters to - make more options available to the user and clarify them. - - Added check that all external boundary surface attributes must have an associated - boundary condition specified in the configuration file (adds new `"PMC"` and - `"ZeroCharge"` boundaries for natural boundary conditions). - - Added advanced configuration options for GMRES/FGMRES/SLEPc eigenvalue solver - orthogonalization, making CGS2 (classical GS with iterative refinement) the default. - -## [0.8.0] - 2022-06-06 - - - Added new comprehensive example applications and detailed tutorials in the - documentation. - - Added CI pipeline for build and regression testing. - - Fixed bugs for shared library builds, which are now the default for dependency builds. - - Added complex lumped port voltage and current output for computing impedance and - admittance parameters from frequency domain driven simulations. - - Fixed a bug in mesh parsers for COMSOL and Nastran formats for high-order hexahedra, - prism, and pyramid element types. - - Changed adaptive frequency sweep implementation to introduce optimizations which, in - particular, speed up driven simulations with wave ports. 
- - Fixed bug related to domain dielectric postprocessing and surface dielectric loss - postprocessing now automatically detects correct side for substrate-air interfaces - (always evaluates on vacuum side). - -## [0.7.0] - 2022-03-25 - - - Added adaptive fast frequency sweep implementation for frequency domain driven - simulations, activated with `config["Solver"]["Driven"]["AdaptiveTol"] > 0.0`. - - Changed ASCII file postprocessing to write out .csv files instead of generic whitespace - delimited .txt for easier parsing. - - Changed field output postprocessing for visualization to save the mesh in the original - mesh length units (rather than nondimensionalized ones). - - Added electric and magnetic field energy output to files as function of mode, frequency, - time, or solution step, in `domain-E.csv`. - - Added option to specify circuit R/L/C parameters for lumped port boundaries as an - alternative to the surface Rs/Ls/Cs ones. - - Added improved and expanded documentation. - - Added new MFEM features, including support for ND and RT spaces on wedge or prism - elements and bug fixes. - -## [0.6.0] - 2022-02-17 - - - Changed implementation of boundary conditions and excitations to do a better job - separating spatial discretization and algebraic operators. This is also in preparation - for future additions of boundary conditions which are not quadratic in the frequency - such as wave ports, as well as linear algebra improvements. - - Added a `config["Boundaries"]["Current"]` configuration file parameter to specify a - surface current excitation, and removed the lumped port `"Termination"` option as it is - no longer relevant. Magnetostatics simulations use this `"Current"` excitation rather - than `"Port"` now. - - Changed lumped port and surface current excitations to now use a unit circuit voltage - and circuit current excitation, rather than unit electric field or surface current - density. - - Added numeric wave port boundaries for frequency domain driven simulations. To - differentiate, lumped port boundaries are now specified using the keyword - `"LumpedPort"` instead of just `"Port"` in the configuration file. - - Fixed farfield boundaries to account for non-vacuum materials at the boundary and added - second-order option for absorbing boundary condition (ABC). - - Added finite conductivity surface impedance boundary condition, with optional thickness - correction. - - Added dielectric loss calculation specializations for metal-air, metal-substrate, and - substrate-air interfaces in accordance with - [this paper](https://aip.scitation.org/doi/10.1063/1.3637047). - - Changed implementation of frequency domain linear algebra to eliminate the use of PETSc - matrix types, relying instead on wrappers of the real and imaginary Hypre matrices - making up a complex operator. - - Added geometric multigrid AMS solvers (in two variants) using h- or p-coarsening, which - employs either a specialized smoother for Nedelec space coarsening or geometric - coarsening for the auxiliary space solves. - - Changed build process to include PETSc as a submodule and configure as part the standard - CMake build process. - -## [0.5.0] - 2021-11-08 - - - Changed postprocessing implementation which leads to some configuration file changes, - namely `config["Domains"]["Postprocessing"]` and - `config["Boundaries"]["Postprocessing"]`. - - Changed some filenames for postprocessed quantities to be more intuitive. - - Added dielectric (bulk and interface) loss calculations for all simulation types. 
- Correctly handles two-sided internal surfaces using `"Side"` specification. - - Added mutual capacitance matrix extraction to electrostatic solver. - - Added built-in mesh support for COMSOL (`.mphbin` and `.mphtxt`) and Nastran (`.nas` and - `.bdf`) mesh formats. - - Removed Gmsh dependency for mesh format conversion. - - Added some improvements to the default build flags and dependency build options. - -## [0.4.0] - 2021-09-27 - - - Added `"Capacitance"` and `"Inductance"` options to extract surface charges and fluxes - in frequency and time domain simulation types. - - Changed `"Transient"` solver implementation to use real-valued scalars (user must use a - domain electrical conductivity as opposed to a loss tangent for loss modeling), for a - roughly 2x performance improvement. - - Added new MFEM features, including upgrades to high-order Nedelec element spaces on - tetrahedra which no longer require global dof reordering. - - Changed system matrix assembly to use Hypre P^T A P within MFEM and keep the full dense - element matrices when the coefficient is nonzero (`skip_zeros = 0` always). The - stiffness and mass matrices thus always share the same sparsity pattern now. - -## [0.3.0] - 2021-09-02 - - - Added electrostatic solver for capacitance matrix extraction. - - Added `"Termination"` option for ports to allow for open or short circuiting but still - enabling port postprocessing. - - Changed timer to separate out time spent for disk I/O during initialization and - postprocessing. - - Added options to job wrappers for new instance types and subnet ID option. - - Added changes from Hypre and MFEM dependencies to include improvements and bug fixes - from those libraries. - -## [0.2.0] - 2021-06-25 - - - Added a proper [changelog](./CHANGELOG.md) for the project. - - Added time domain solver with various excitation options to accompany existing frequency - domain driven and eigenmode simulation types. - - Added wrapper scripts in [`bin/`](./bin) for launching parallel simulations either - interactively or as batch jobs using PBS. - - Changed JSON configuration file keywords for frequency and time intervals. - - Added domain conductivity models for normal and superconducting metals. - - Fixed a bug for coaxial lumped port postprocessing. - - Added visualization of surface currents on domain boundaries to accompany existing - $\bm {E}$- and $\bm{B}$-field visualization. - - Changed default length scale for nondimensionalization. - - Changed default filenames for port-related postprocessed quantities. - - Fixed many smaller issues and added various other improvements. - -## [0.1.0] - 2020-11-03 - - - Initial development release. + + +# Changelog + +> Note: *Palace* is under active initial development, pre-v1.0. Functionality and interfaces +> may change rapidly as development progresses. + +The format of this changelog is based on +[Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to +[Semantic Versioning](https://semver.org/). + +## In progress + +#### Bug Fixes + + - Fixed a bug in the computation of the waveport maximum propagation constant in parallel simulations + [PR 580](https://github.com/awslabs/palace/pull/580). + +## [0.15.0] - 2025-12-2 + +#### New Features + + - Added support for nonlinear eigenvalue problems arising from frequency-dependent + boundary conditions. Two nonlinear eigensolvers are now available and can be specified + by setting the `config["Solver"]["Eigenmode"]["NonlinearType"]` option to `"Hybrid"` + (default) or `"SLP"`. 
The nonlinear eigensolver will automatically be used if + frequency-dependent boundary conditions are used. [PR + 467](https://github.com/awslabs/palace/pull/467). + - Added support for extraction of electric fields in the radiative zone. + Consult the + [documentation](https://awslabs.github.io/palace/dev/features/farfield) for + additional information. [PR + 449](https://github.com/awslabs/palace/pull/449). + - Added support for more granular tests and for computing the testing coverage. + Consult the + [documentation](https://awslabs.github.io/palace/dev/developer/testing/) for + additional information. [PR + 398](https://github.com/awslabs/palace/pull/398), [PR + 480](https://github.com/awslabs/palace/pull/480). + - Added option to export field solutions as MFEM grid functions to visualize with + [GLVis](https://glvis.org). Output formats can be specified in `config["Problem"]["OutputFormats"]`. + [PR 518](https://github.com/awslabs/palace/pull/518). + - Added an option to drop small entries (below machine epsilon) from the matrix used in the sparse + direct solver. This can be specified with `config["Solver"]["Linear"]["DropSmallEntries"]`. + [PR 476](https://github.com/awslabs/palace/pull/476). + +#### Interface Changes + + - `config["Boundaries"]["Periodic"]` is now a dictionary where all periodic boundary + pairs, built into the mesh file or not, should be specified in + `config["Boundaries"]["Periodic"]["BoundaryPairs"]` and a single global Floquet wave + vector can be specified in `config["Boundaries"]["Periodic"]["FloquetWaveVector"]`. [PR + 471](https://github.com/awslabs/palace/pull/471). + - Changed Paraview and GLVis output fields from nondimensional to SI units + [PR 532](https://github.com/awslabs/palace/pull/532). + - The name of the unit test executable is now `palace-unit-tests` (instead of `unit-tests`) + [PR 549](https://github.com/awslabs/palace/pull/549). + +#### Bug Fixes + + - Changed wave port eigenproblem shift and sorting to fix an issue with the mode ordering. + The first mode now has the largest propagation constant, closest to the TEM limit, and + subsequent modes are ordered by decreasing propagation constant. [PR + 448](https://github.com/awslabs/palace/pull/448). + - Fixed an issue where Gmsh meshes with built-in periodicity (specified in the mesh file) + were failing. [PR + 471](https://github.com/awslabs/palace/pull/471). + - Fixed bug where a mesh from a previous nonconformal adaptation could not be loaded to + use in a non-amr simulation. [PR + 497](https://github.com/awslabs/palace/pull/497). + - Fixed bug where `"CrackInternalBoundaryElements"` would result in incorrect results for + some lumped port boundary conditions. [PR + 505](https://github.com/awslabs/palace/pull/505). + - Changed the sign of the Floquet phase factor from $\exp(+ik\cdot x)$ to $\exp(-ik\cdot + x)$. [PR 510](https://github.com/awslabs/palace/pull/510) and [PR + 526](https://github.com/awslabs/palace/pull/526). + - Update EM constants to CODATA Recommended Values of the Fundamental Physical Constants + 2022 [PR 525](https://github.com/awslabs/palace/pull/525). + - Scaled Rs/Ls/Cs of impedance boundary conditions affected by mesh cracking, fixing bug + where `"CrackInternalBoundaryElements"` would lead to incorrect results. [PR + 544](https://github.com/awslabs/palace/pull/544). + - Fixed Paraview/MFEM output for transient simulations. [PR 561](https://github.com/awslabs/palace/pull/561). + - Fixed units of energy. 
Results were reported as joules while the values were actually in nanojoules; results
+ are now correctly in joules. [PR
+ 541](https://github.com/awslabs/palace/pull/541).
+ - Fixed a bug in transient solver implicit formulation affecting the magnetic flux density fields computed with
+ `"GeneralizedAlpha"` or `"RungeKutta"` transient solver types (`config["Solver"]["Transient"]["Type"]`)
+ [PR 568](https://github.com/awslabs/palace/pull/568).
+
+## [0.14.0] - 2025-08-20
+
+ - Added `--version` command line flag for displaying Palace version information.
+ - Fixed a small regression bug for boundary postprocessing when specifying
+ `"Side": "LargerRefractiveIndex"`, introduced as part of v0.13.0.
+ - Added an improvement to numeric wave ports to avoid targeting evanescent modes at
+ higher operating frequencies. Also finite conductivity boundaries
+ (`config["Boundaries"]["Conductivity"]`) are automatically marked as PEC for the wave
+ port mode solve (previously these were marked as PMC unless specified under
+ `"WavePortPEC"`).
+ - Fixed a bug in divergence-free projection for problems without essential or mixed
+ boundary conditions.
+ - Added `"MakeSimplex"` and `"MakeHexahedral"` mesh options to convert an input mesh to
+ all tetrahedra or all hexahedra. Also adds `"SerialUniformLevels"` option to
+ `config["Model"]["Refinement"]` for testing or debugging.
+ - Added `config["Model"]["CrackInternalBoundaryElements"]` which will separate or "crack" the mesh
+ along all internal boundaries. This improves the performance of error estimation and AMR
+ as the recovered smooth fields do not enforce additional erroneous continuity at
+ internal boundaries. This will change the default behaviour in the case of internal
+ impedance boundary conditions, and can be disabled by setting this option to false.
+ - Added support for exact periodic boundary conditions; these can be specified as part of
+ the mesh file (where supported by the format) or by specification of `"DonorAttributes"`
+ and `"ReceiverAttributes"` in `config["Boundaries"]["Periodic"]`, which will attempt to
+ match the mesh on the boundaries specified by the donor and receiver attributes. This is
+ only possible if the meshes on the donor and receiver match exactly; non-matching meshes
+ are not supported.
+ - Exposed linear solver and eigenvalue solver configuration options for the wave port
+ subproblem. These can now be specified as part of the `config["Boundaries"]["WavePort"]`
+ configuration. The defaults align with the previously hardcoded values.
+ - Nonconformal adaptation is now supported for WavePort boundary conditions. This was
+ achieved through a patch applied to MFEM to support `mfem::ParSubMesh` on external
+ nonconformal surface subdomains.
+ - Added adaptive time-stepping capability for transient simulations. The new ODE integrators
+ rely on the SUNDIALS library and can be specified by setting the
+ `config["Solver"]["Transient"]["Type"]` option to `"CVODE"` or `"ARKODE"`.
+ - Added an option to use the complex-valued system matrix for the coarse level solve (sparse
+ direct solve) instead of the real-valued approximation. This can be specified with
+ `config["Solver"]["Linear"]["ComplexCoarseSolve"]`.
+ - Fix bug in London equation implementation where a curl-curl term was added to the
+ stiffness operator instead of a mass term.
+ - Added support for Floquet periodic boundary conditions with phase-delay constraints.
+ The Floquet wave vector can be specified along with periodic boundaries in the
+ `config["Boundaries"]["Periodic"]` configuration.
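+   For illustration, a hypothetical sketch of how the periodic boundary keywords named in
+   these notes might be combined (the attribute numbers, wave vector values, and exact
+   nesting are assumptions; consult the documentation for the authoritative schema):
+
+   ```json
+   {
+     "Boundaries":
+     {
+       "Periodic":
+       [
+         {
+           "DonorAttributes": [1],
+           "ReceiverAttributes": [2],
+           "FloquetWaveVector": [0.0, 0.0, 0.5]
+         }
+       ]
+     }
+   }
+   ```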
+ - Fixed bug where the default `config["Model"]["CrackDisplacementFactor"]` would cause
+ lumped port discovery to fail by reducing default from `1e-3` to `1e-12`.
+ - Fixed bug in Nastran mesh reader where carriage returns (`\r`) in the mesh file could
+ cause a failure to read the mesh.
+ - Changed post-processing so that appropriate measurements are always written to disk as CSV
+ files. This is a breaking change, since users are now required to specify a valid output
+ folder in `config["Problem"]["Output"]`. Previously an empty string `""` would suppress file
+ printing. ParaView printing is still controlled and suppressed by the `"Save"` or `"SaveStep"`
+ options in the `config["Solver"][...]`.
+ - Changed measurement and printing during post-processing substantially. This change is not
+ user-facing, but will enable future multi-excitation support. All measurements are now
+ performed as part of the `PostOperator` class, which is templated on solver type. The public
+ interface of the `PostOperator` has been simplified: measurements can no longer be called
+ individually, but are grouped in a `MeasurePrintAll` function. The actual printing of
+ measurements has moved out of individual solvers and is orchestrated by the `PostOperatorCSV`
+ class. Other helper classes include a small `Table` class (for data storage and formatting)
+ and a light wrapper `TableWithCSVFile` (with file interaction).
+ - Changed unit conversion interface: this was moved out of `IOData` into a separate `Units` class.
+ - Changed CSV output to print more digits.
+ - Changed boundary indexing to enforce that the value of `Index` is unique between `LumpedPort`,
+ `WavePort`, `SurfaceCurrent` and `Terminal`. This is a breaking change; previously, e.g., a lumped
+ port and a wave port could share an index.
+ - Added multi-excitation support. `Excitation` in `LumpedPort` and `WavePort` boundary conditions
+ can now be specified either as bools or excitation integers. Driven simulations can now have
+ multiple excitation indices in the same configuration file, which are simulated consecutively
+ during the same Palace run. The measurement output of multiple excitations is printed to the
+ same CSV files, with the distinguishing columns post-indexed by the excitation index.
+ - Changed `RomOperator` to split out rational interpolation component into the separate
+ `MinimalRationalInterpolation` class to support multiple excitations. The MRI is unique to each
+ excitation, but the PROM is shared between excitations.
+ - Added `PortExcitations` to manage the excitation pattern and print it to the JSON metadata.
+ - Update spack configuration to address `0.12` API breakage between libxsmm and libCEED.
+ - Support spack vended libCEED and GSLIB builds.
+ - Fix bug where `MakeSimplex` would fail for higher order meshes.
+ - Fix bug in STRUMPACK build where dependency SCALAPACK library install directory could
+ change based on build environment.
+ - Introduce `"Samples"` mechanism for driven simulation, which allows for specifying a
+ range of sample frequencies by combining `"Linear"`, `"Log"` and `"Point"`
+ specifications. These are joined together, effectively allowing for variable resolution
+ sampling (for example, a coarse sampling supported by a fine sampling in a subset
+ region). 
This is in addition to the prior existing `"MinFreq"`, `"MaxFreq"`, + `"FreqStep"` interface which is now implicitly converted to a `"Linear"` sample scheme + internally. + - Introduce `"Save"` keyword for `"Driven"` simulation type, which will save specific + frequency choices as opposed to the existing (and maintained `"SaveStep"` based on a + regular sampling). Only frequencies contained within the set of `"Samples"` are + supported, as no interpolation is performed. + - Fix bugs in post processing where `E_cap`, `E_ind`, and `mode_port_kappa` and other + dependent quantities were not dimensionalized correctly. + - Refactor `PostOperator` usage of `Measurement` to be entirely non-dimensional, until + `PostOperatorCSV` which dimensionalizes all measurements before writing to file. Reduces + the risk of mixed unit bugs throughout `PostOperator`. + - Fix bug in using `"MakeSimplex"` which would cause undefined behaviour for higher order + meshes. + - Fix bug when combining OpenMP and GPU builds in reduction operations over `Vector`. + - Fix race condition that would affect OpenMP parallelism with periodic boundaries (exact + and Floquet). + - Fix race condition in `mfem::DenseTensor::operator()` with OpenMP, due to class member + variable access. + - Fix race condition in `DofToQuad` methods within mfem. + - Normalize eigenmodes so their mean phase is a positive real number. + - Added `scnlib` as a dependency to Palace. + - Added parsing ability of existing csv files into the `Table` class. Fix `"Restart"` + behaviour with the multi-excitation feature of driven solver. + +## [0.13.0] - 2024-05-20 + + - Changed default value of `config["Solver"]["PartialAssemblyOrder"]` in order to activate + operator partial assembly by default for all operators in all simulation types. + - Changed the normalization of computed eigenmodes for consistency across different domain + decompositions. Eigenvectors are now normalized with respect to the mass matrix for unit + domain electric field energy. + - Added documentation for various timer categories and improved timing breakdown of + various sections of a simulation. + - Changed mesh files for the cavity and CPW examples, including prism, hexahedral, and + tetrahedral meshes for the cylindrical cavity and correcting the wave port dimensions + for the coplanar wave guide. + - Fixed a few bugs and issues in the implementation of numeric wave ports for driven + simulations. + - Added GPU support for *Palace* via its dependencies, and added the + `config["Solver"]["Device"]` and `config["Solver"]["Backend"]` options for runtime + configuration of the MFEM device (`"CPU"` or `"GPU"`) and libCEED backend, with suitable + defaults for users. + - Added a new section to the documentation on + [Parallelism and GPU support](https://awslabs.github.io/palace/dev/guide/parallelism/). + - Removed use of `mfem::SparseMatrix` and replaced with HYPRE's `hypre_CSRMatrix` when + needed for full assembly, wrapped as `palace::hypre::HypreCSRMatrix`. + - Added `"Active"` configuration file parameter for lumped and wave port boundaries to + disable the associated boundary condition and only use the surface for postprocessing. + - Changed the smooth flux space for the electrostatic error estimator to fix performance + on problems with material interfaces. + - Fixed error estimation bug affecting time-dependent simulation types (driven, transient, + eigenmode) where the recovery of the electric flux density also needs to be taken into + account in addition to the magnetic field. 
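+   As a purely illustrative sketch of the `config["Solver"]["Device"]` and
+   `config["Solver"]["Backend"]` options noted above (the `"Device"` values `"CPU"` and
+   `"GPU"` come from these notes; the backend string is an assumption, showing one
+   possible libCEED backend resource):
+
+   ```json
+   {
+     "Solver":
+     {
+       "Device": "GPU",
+       "Backend": "/gpu/cuda/magma"
+     }
+   }
+   ```
+
+   Since the defaults are described above as suitable for most users, a block like this
+   would only be needed to override them.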
+ - Fixed a bug related to mesh cleaning for unspecified domains and mesh partitioning. + - Change computation of domain energy postprocessing for electrostatic and magnetostatic + simulation types in order to improve performance. + - Fixed a bug when computing the energy associated with lumped elements with more than + one nonzero R, L, or C. This also affects the inductive EPR for lumped inductors with + and associated parallel capacitance. + - Fixed a bug for coaxial lumped ports which led to incorrect extraction of the geometric + parameters, especially when coarsely-meshed or non-axis-aligned. + - Added boundary postprocessing functionality for surface flux including electric, + magnetic, and power given by the Poynting vector. This results in some breaking changes + to the configuration file interface, see + `config["Boundaries"]["Postprocessing"]["SurfaceFlux"]` and + `config["Boundaries"]["Postprocessing"]["Dielectric"]`. In addition, related + configuration file keyword changes to for consistency were made to + `config["Domains"]["Postprocessing"]["Probe"]` and + `config["Model"]["Refinement"]["Boxes"]`. + - Fixed a bug in MFEM for nonconformal AMR meshes with internal boundaries affecting + non-homogeneous Dirichlet boundary conditions for electrostatic simulations (see + [#236](https://github.com/awslabs/palace/pull/236)). + +## [0.12.0] - 2023-12-21 + + - Added support for operator partial assembly for high-order finite element spaces based + on libCEED for mixed and non-tensor product element meshes. This option is disabled by + default, but can be activated using `config["Solver"]["PartialAssemblyOrder"]` set to + some number less than or equal to `"Order"`. + - Added flux-based error estimation, reported in `error-estimate.csv`. This computes the + difference between the numerical gradient (electrostatics) or curl (otherwise) of the + solution, and a smoother approximation obtained through a global mass matrix inversion. + The results are reported in `error-estimates.csv` within the `"Output"` folder. + - Added Adaptive Mesh Refinement (AMR), specified in the `config["Model"]["Refinement"]`, + for all problem types aside from transient. To enable AMR, a user must specify + `"MaxIts"`, while all other options have reasonable defaults. Nonconformal (all mesh + types) and conformal (simplex meshes) refinement are supported. + - Added support for non-axis-aligned lumped ports and current sources. Key words `"X"`, + `"Y"`, `"Z"` and `"R"`, with optional prefix `"+"` or `"-"` still work, but now + directions can be specified as vectors with 3 components. Users will be warned, and + ultimately errored, if the specified directions do not agree with axis directions + discovered from the geometry. + - Added output of lumped port voltage and current for eigenmode simulations. + - Added dimensionalized output for energies, voltages, currents, and field values based on + a choice of the characteristic magnetic field strength used for nondimensionalization. + - Added output of electric and magnetic field energies and participation ratios in regions + of the domain, specified with `config["Domains"]["Postprocessing"]["Energy"]` and + written to `domain-E.csv`. This replaces + `config["Domains"]["Postprocessing"]["Dielectric"]` and `domain-Q.csv`. + - Added improved `Timer` and `BlockTimer` classes with more timing categories for + reporting simulation runtime. 
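+   A hedged example of the AMR specification noted above (only `"MaxIts"` and its
+   location under `config["Model"]["Refinement"]` come from these notes; the value is
+   arbitrary and all other options are left at their defaults):
+
+   ```json
+   {
+     "Model":
+     {
+       "Refinement":
+       {
+         "MaxIts": 2
+       }
+     }
+   }
+   ```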
+ - Changed implementation of complex-valued linear algebra to use new `ComplexVector` and + `ComplexOperator` types, which are based on the underlying `mfem::Vector` and + `mfem::Operator` classes, instead of PETSc. PETSc is now fully optional and only + required when SLEPc eigenvalue solver support is requested. Krylov solvers for real- and + complex-valued linear systems are implemented via the built-in `IterativeSolver` + classes. + - Changed implementation of PROMs for adaptive fast frequency sweep to use the Eigen + library for sequential dense linear algebra. + - Changed implementation of numeric wave ports to use MFEM's `SubMesh` functionality. As + of [#3379](https://github.com/mfem/mfem/pull/3379) in MFEM, this has full ND and RT + basis support. For now, support for nonconforming mesh boundaries is limited. + - Added build dependencies on [libCEED](https://github.com/CEED/libCEED), including + [LIBXSMM](https://github.com/libxsmm/libxsmm) and [MAGMA](https://icl.utk.edu/magma/) + to support CPU- and GPU-based operator partial assembly. + - Added unit test framework for all integrators based on + [Catch2](https://github.com/catchorg/Catch2), which also includes some automated + benchmarking capabilities for operator assembly and application. + - Added improved OpenMP support in `palace` wrapper script and CI tests. + - Added Apptainer/Singularity container build definition for *Palace*. + - Fixed bugs related to thread-safety for OpenMP builds and parallel tetrahedral meshes in + the upstream MFEM library. + +## [0.11.2] - 2023-07-14 + + - Fixed a regression bug affecting meshes which have domain elements which are not + assigned material properties in the configuration file. + - Changed layout and names of `palace/` source directory for better organization. + - Added many updates to build system: Removed use of Git submodules to download + dependencies relying instead directly on CMake's ExternalProject, patch GSLIB dependency + for shared library builds, add CI tests with ARPACK-NG instead of SLEPc, update all + dependency versions including MFEM to incorporate bug fixes and improvements. This + affects the Spack package recipe, so a new recipe is distributed as part of *Palace* in + in `spack/` which will keep compatibility with `main` while changes are upstreamed to + the built-in Spack repository. + - Added a basic Makefile with some useful targets for development. + +## [0.11.1] - 2023-05-03 + + - Fixed a bug for interface dielectric loss postprocessing including when using + `--dry-run`. + - Fixed a regression bug affecting second-order absorbing boundary conditions. + - Fixed a bug when the number of processors was large enough that some subdomains own no + true dofs. + - Fixed memory bug in destruction of SuperLU solver. + - Fixed some typos in the documentation. + - Fixed a bug in the cavity convergence study example for the coarsest hexahedral mesh + case, as well as some other minor Julia fixes in the `examples/` and `scripts/` + directories. + - Added updates to superbuild including better ARPACK support, though still experimental + compared to SLEPc. + - Added updated submodules for superbuild. + +## [0.11.0] - 2023-01-26 + + - Initial public release on GitHub. + - Added support for anisotropic materials through the use of `"MaterialAxes"` under + `"Materials"`. These material axes allow for specifying symmetric material property + tensors through a sum of weighted outer products of normal vectors. 
This can be used in + conjunction with scalar material properties or multiple different anisotropic + properties, though all properties are required to use the same set of basis vectors. + Demonstration given for the coplanar waveguide example which now utilizes a sapphire + substrate. + - Added to default postprocessing outputs: Bulk and interface dielectric loss writes + energy-participation ratios in addition to quality factors associated with lossy + regions, IO coupling $\kappa$ in addition to quality factor for eigenmode + simulations, lumped element energy for transient simulations similar to eigenmode and + frequency domain driven simulations. + - Changed configuration file syntax to simplify support for multielement lumped ports, + where each index for a multielement port should now use the `"Elements"` object to + describe the attributes for each port element. + - Changed configuration file keywords to better reflect meaning: + `config["Boundaries"]["Current"]` => `config["Boundaries"]["SurfaceCurrent"]`, and + `"UseGMG"`, `"UsePCShifted"`, `"MGCycleIts"`, and `"MGSmoothIts"` under + `config["Solver"]["Linear"]`. + - Changed geometric multigrid implementation to generalize across problem types (added + for electrostatics) and use Jacobi-preconditioned Chebyshev smoothing with optional + auxiliary space smoothing at each multigrid level as well. The auxiliary space matrices + are constructed directly in H1 now to allow for matrix-free/partial-assembly support + eventually. Geometric multigrid is turned on by default. + - Added structured simulation metadata output in the form of a JSON file `palace.json` + written to the directory specified by `config["Problem"]["Output"]`. + - Added JSON Schema files for the configuration file format as well as a helper script + to check a provided configuration file against the schema. + - Added optional interface to GSLIB library which enables `"Probe"` functionality to + sample the computed electric and magnetic fields at points in space. + - Added preliminary support for builds using Spack. This includes an option in the build + system for user-supplied dependencies as an alternative to the superbuild + configuration. + - Added updated submodules for superbuild, fixing a bug in SuperLU_DIST solver causing + communication hangs for certain numbers of processors. + +## [0.10.0] - 2022-10-04 + + - Added interfaces to ARPACK and FEAST eigenvalue solvers. + - Added divergence-free projection for eliminating spurious DC modes for eigenmode + solves. + - Added option for visualizing fields on mesh boundaries, output alongside the full 3D + solution fields for ParaVeiew visualization. + - Added real and imaginary fields output for complex-valued phasors, and electric and + magnetic energy density fields. + - Added convergence study for the cavity example application, and example Julia code for + automated mesh generation, running of the solver, and postprocessing. + - Fixed bugs in mesh preprocessing related to precision of nodal coordinates when + distributing a serial mesh to each process. + - Added option for `"Driven"` and `"Transient"` simulations to accelerate postprocessing + by only considering port boundaries. + - Added `-dry-run`/`--dry-run` command line option to check configuration file for errors + and exit. + - Changed dependency build process to rely less on PETSc build system, no longer give + option to build BLAS/LAPACK libraries from source, added detection for OpenBLAS and AMD + Optimizing CPU Libraries (AOCL) blis/libflame. 
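+   For illustration, a hypothetical material entry showing the `"MaterialAxes"` support
+   described in the v0.11.0 notes above (the `"Domains"`/`"Materials"` nesting, attribute
+   number, and permittivity values are assumptions, loosely modeled on a sapphire-like
+   anisotropy):
+
+   ```json
+   {
+     "Domains":
+     {
+       "Materials":
+       [
+         {
+           "Attributes": [1],
+           "MaterialAxes": [[0.0, 0.0, 1.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]],
+           "Permittivity": [11.6, 9.4, 9.4]
+         }
+       ]
+     }
+   }
+   ```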
+ - Fixed issues when compiling with Intel or Clang compilers. + - Added CI test builds to test various build options and added easier default build + options in CMake configuration. + - Added `clang-format` and `JuliaFormatter` configurations for automated C++, Julia, and + Markdown code formatting. + +## [0.9.0] - 2022-07-11 + + - Added region-based mesh refinement with box or sphere refinement regions. + - Added new sparse direct solver interfaces to SuperLU_DIST and STRUMPACK libraries, to + complement the existing MUMPS interface. + - Changed configuration file keywords for linear solver and preconditioner parameters to + make more options available to the user and clarify them. + - Added check that all external boundary surface attributes must have an associated + boundary condition specified in the configuration file (adds new `"PMC"` and + `"ZeroCharge"` boundaries for natural boundary conditions). + - Added advanced configuration options for GMRES/FGMRES/SLEPc eigenvalue solver + orthogonalization, making CGS2 (classical GS with iterative refinement) the default. + +## [0.8.0] - 2022-06-06 + + - Added new comprehensive example applications and detailed tutorials in the + documentation. + - Added CI pipeline for build and regression testing. + - Fixed bugs for shared library builds, which are now the default for dependency builds. + - Added complex lumped port voltage and current output for computing impedance and + admittance parameters from frequency domain driven simulations. + - Fixed a bug in mesh parsers for COMSOL and Nastran formats for high-order hexahedra, + prism, and pyramid element types. + - Changed adaptive frequency sweep implementation to introduce optimizations which, in + particular, speed up driven simulations with wave ports. + - Fixed bug related to domain dielectric postprocessing and surface dielectric loss + postprocessing now automatically detects correct side for substrate-air interfaces + (always evaluates on vacuum side). + +## [0.7.0] - 2022-03-25 + + - Added adaptive fast frequency sweep implementation for frequency domain driven + simulations, activated with `config["Solver"]["Driven"]["AdaptiveTol"] > 0.0`. + - Changed ASCII file postprocessing to write out .csv files instead of generic whitespace + delimited .txt for easier parsing. + - Changed field output postprocessing for visualization to save the mesh in the original + mesh length units (rather than nondimensionalized ones). + - Added electric and magnetic field energy output to files as function of mode, frequency, + time, or solution step, in `domain-E.csv`. + - Added option to specify circuit R/L/C parameters for lumped port boundaries as an + alternative to the surface Rs/Ls/Cs ones. + - Added improved and expanded documentation. + - Added new MFEM features, including support for ND and RT spaces on wedge or prism + elements and bug fixes. + +## [0.6.0] - 2022-02-17 + + - Changed implementation of boundary conditions and excitations to do a better job + separating spatial discretization and algebraic operators. This is also in preparation + for future additions of boundary conditions which are not quadratic in the frequency + such as wave ports, as well as linear algebra improvements. + - Added a `config["Boundaries"]["Current"]` configuration file parameter to specify a + surface current excitation, and removed the lumped port `"Termination"` option as it is + no longer relevant. Magnetostatics simulations use this `"Current"` excitation rather + than `"Port"` now. 
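+   A minimal, hypothetical sketch of the `config["Boundaries"]["Current"]` surface
+   current excitation described above (the attribute number and the `"Direction"` key are
+   assumptions; the `"+Z"` keyword style follows the direction keywords named elsewhere
+   in these notes):
+
+   ```json
+   {
+     "Boundaries":
+     {
+       "Current":
+       [
+         {
+           "Attributes": [4],
+           "Direction": "+Z"
+         }
+       ]
+     }
+   }
+   ```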
+ - Changed lumped port and surface current excitations to now use a unit circuit voltage
+   and circuit current excitation, rather than a unit electric field or surface current
+   density.
+ - Added numeric wave port boundaries for frequency domain driven simulations. To
+   differentiate, lumped port boundaries are now specified using the keyword
+   `"LumpedPort"` instead of just `"Port"` in the configuration file.
+ - Fixed farfield boundaries to account for non-vacuum materials at the boundary and added
+   a second-order option for the absorbing boundary condition (ABC).
+ - Added finite conductivity surface impedance boundary condition, with optional thickness
+   correction.
+ - Added dielectric loss calculation specializations for metal-air, metal-substrate, and
+   substrate-air interfaces in accordance with
+   [this paper](https://aip.scitation.org/doi/10.1063/1.3637047).
+ - Changed implementation of frequency domain linear algebra to eliminate the use of PETSc
+   matrix types, relying instead on wrappers of the real and imaginary Hypre matrices
+   making up a complex operator.
+ - Added geometric multigrid AMS solvers (in two variants) using h- or p-coarsening, which
+   employ either a specialized smoother for Nedelec space coarsening or geometric
+   coarsening for the auxiliary space solves.
+ - Changed build process to include PETSc as a submodule and configure it as part of the
+   standard CMake build process.
+
+## [0.5.0] - 2021-11-08
+
+ - Changed the postprocessing implementation, which leads to some configuration file
+   changes, namely `config["Domains"]["Postprocessing"]` and
+   `config["Boundaries"]["Postprocessing"]`.
+ - Changed some filenames for postprocessed quantities to be more intuitive.
+ - Added dielectric (bulk and interface) loss calculations for all simulation types.
+   Two-sided internal surfaces are correctly handled using the `"Side"` specification.
+ - Added mutual capacitance matrix extraction to the electrostatic solver.
+ - Added built-in mesh support for COMSOL (`.mphbin` and `.mphtxt`) and Nastran (`.nas` and
+   `.bdf`) mesh formats.
+ - Removed Gmsh dependency for mesh format conversion.
+ - Added some improvements to the default build flags and dependency build options.
+
+## [0.4.0] - 2021-09-27
+
+ - Added `"Capacitance"` and `"Inductance"` options to extract surface charges and fluxes
+   in frequency and time domain simulation types.
+ - Changed `"Transient"` solver implementation to use real-valued scalars (the user must
+   use a domain electrical conductivity as opposed to a loss tangent for loss modeling),
+   for a roughly 2x performance improvement.
+ - Added new MFEM features, including upgrades to high-order Nedelec element spaces on
+   tetrahedra which no longer require global dof reordering.
+ - Changed system matrix assembly to use Hypre $P^T A P$ within MFEM and keep the full
+   dense element matrices when the coefficient is nonzero (`skip_zeros = 0` always). The
+   stiffness and mass matrices thus always share the same sparsity pattern now.
+
+## [0.3.0] - 2021-09-02
+
+ - Added electrostatic solver for capacitance matrix extraction.
+ - Added `"Termination"` option for ports to allow for open or short circuiting but still
+   enabling port postprocessing.
+ - Changed timer to separate out time spent for disk I/O during initialization and
+   postprocessing.
+ - Added options to the job wrappers for new instance types and a subnet ID option.
+ - Added changes from Hypre and MFEM dependencies to include improvements and bug fixes
+   from those libraries.
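The `-dry-run`/`--dry-run` option referenced in the 0.10.0 notes above can be illustrated
with a short sketch. The configuration file name and process count are placeholders, and
the exact launcher flags may differ depending on how *Palace* was installed:

```sh
# Check the configuration file for errors and exit without solving (file name is a placeholder).
palace --dry-run config.json

# If the check passes, launch the simulation; -np assumes the provided wrapper script is
# used to start the MPI run (adjust for your environment).
palace -np 4 config.json
```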
+ +## [0.2.0] - 2021-06-25 + + - Added a proper [changelog](./CHANGELOG.md) for the project. + - Added time domain solver with various excitation options to accompany existing frequency + domain driven and eigenmode simulation types. + - Added wrapper scripts in [`bin/`](./bin) for launching parallel simulations either + interactively or as batch jobs using PBS. + - Changed JSON configuration file keywords for frequency and time intervals. + - Added domain conductivity models for normal and superconducting metals. + - Fixed a bug for coaxial lumped port postprocessing. + - Added visualization of surface currents on domain boundaries to accompany existing + $\bm {E}$- and $\bm{B}$-field visualization. + - Changed default length scale for nondimensionalization. + - Changed default filenames for port-related postprocessed quantities. + - Fixed many smaller issues and added various other improvements. + +## [0.1.0] - 2020-11-03 + + - Initial development release. diff --git a/CMakeLists.txt b/CMakeLists.txt index 2eb0c3208d..914afbb306 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,167 +1,224 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -# -# CMake superbuild for Palace and its dependencies -# - -# CMake 3.21 was released in Jul. 2021 (required for HIP support) -cmake_minimum_required(VERSION 3.21) - -# Prohibit in-source builds -if(CMAKE_SOURCE_DIR STREQUAL CMAKE_BINARY_DIR) - message(FATAL_ERROR "In-source builds are prohibited") -endif() - -# Initialize the project -project(palace-superbuild LANGUAGES CXX C VERSION 0.11.2) - -# Define build settings and defaults -set(PALACE_WITH_64BIT_INT OFF CACHE BOOL "Use 64 bit integers") -set(PALACE_WITH_64BIT_BLAS_INT OFF CACHE BOOL "Use ILP64 BLAS/LAPACK interface instead of LP64 (experimental, not recommended)") -set(PALACE_WITH_OPENMP OFF CACHE BOOL "Use OpenMP for shared-memory parallelism") - -set(PALACE_BUILD_EXTERNAL_DEPS ON CACHE BOOL "Build external third-party dependency libraries") -set(PALACE_WITH_SUPERLU ON CACHE BOOL "Build with SuperLU_DIST sparse direct solver") -set(PALACE_WITH_STRUMPACK OFF CACHE BOOL "Build with STRUMPACK sparse direct solver") -set(PALACE_WITH_MUMPS OFF CACHE BOOL "Build with MUMPS sparse direct solver") -set(PALACE_WITH_SLEPC ON CACHE BOOL "Build with SLEPc eigenvalue solver") -set(PALACE_WITH_ARPACK OFF CACHE BOOL "Build with ARPACK eigenvalue solver") -set(PALACE_WITH_LIBXSMM ON CACHE BOOL "Build with LIBXSMM backend for libCEED") -set(PALACE_WITH_MAGMA ON CACHE BOOL "Build with MAGMA backend for libCEED") -set(PALACE_WITH_GSLIB ON CACHE BOOL "Build with GSLIB library for high-order field interpolation") - -set(ANALYZE_SOURCES_CLANG_TIDY OFF CACHE BOOL "Run static analysis checks using clang-tidy") -set(ANALYZE_SOURCES_CPPCHECK OFF CACHE BOOL "Run static analysis checks using cppcheck") - -# Enable Fortran if required -if(PALACE_WITH_STRUMPACK OR PALACE_WITH_MUMPS OR PALACE_WITH_ARPACK) - enable_language(Fortran) -endif() - -# Set a default build type if none was provided -if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) - message(STATUS "Setting CMAKE_BUILD_TYPE to 'Release' as none was specified") - set(CMAKE_BUILD_TYPE "Release" CACHE STRING - "Specifies the build type ('Debug' or 'Release', for example)" FORCE - ) -endif() - -# Set a default installation location if none was provided -if(NOT CMAKE_INSTALL_PREFIX OR CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) - message(STATUS "Setting CMAKE_INSTALL_PREFIX to 
'${CMAKE_BINARY_DIR}' as none was specified") - set(CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}" CACHE STRING - "Install directory used by install()" FORCE - ) -endif() - -# Set a default for dependency library builds if none was provided -if(NOT DEFINED BUILD_SHARED_LIBS) - message(STATUS "Setting BUILD_SHARED_LIBS to 'OFF' as it was not specified") - set(BUILD_SHARED_LIBS OFF CACHE BOOL - "Global flag to cause add_library() to create shared libraries if ON" - ) -endif() - -# Configure default RPATH for installed targets if not provided -if(NOT DEFINED CMAKE_INSTALL_RPATH) - message(STATUS "Setting CMAKE_INSTALL_RPATH to '\${CMAKE_INSTALL_PREFIX}/lib;\${CMAKE_INSTALL_PREFIX}/lib64' as it was not specified") - set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib$${CMAKE_INSTALL_PREFIX}/lib64" CACHE STRING - "Global RPATH to use for installed targets" - ) -endif() -if(NOT DEFINED CMAKE_INSTALL_RPATH_USE_LINK_PATH) - message(STATUS "Setting CMAKE_INSTALL_RPATH_USE_LINK_PATH to 'ON' as it was not specified") - set(CMAKE_INSTALL_RPATH_USE_LINK_PATH ON CACHE BOOL - "Global flag to append to the RPATH of installed binaries any directories which linked library files if ON" - ) -endif() - -# Add extra CMake modules -list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") - -# Intel classic compilers no longer supported (prior to CMake 3.20, the new compilers are -# detected as Clang instead of IntelLLVM) -if(CMAKE_C_COMPILER_ID STREQUAL "Intel" OR CMAKE_CXX_COMPILER_ID STREQUAL "Intel") - message(WARNING "The Intel classic compilers (icc/icpc) are no longer supported and \ -have been replaced by the newer Clang-based icx/icpx from Intel oneAPI") -endif() - -# MAGMA is only for GPU builds -if(PALACE_WITH_MAGMA AND NOT (PALACE_WITH_CUDA OR PALACE_WITH_HIP)) - message(STATUS "Disabling MAGMA due to lack of CUDA or HIP support") - set(PALACE_WITH_MAGMA OFF CACHE BOOL - "Build with MAGMA backend when libCEED is enabled" FORCE - ) -endif() - -# MPI is required for most dependency builds -message(STATUS "====================== Configuring MPI dependency ======================") -find_package(MPI REQUIRED) - -# Add BLAS/LAPACK libraries -message(STATUS "================= Configuring BLAS/LAPACK dependencies =================") -include(ExternalBLASLAPACK) - -# Default arguments for all external CMake builds (needs to happen after BLAS/LAPACK -# detection in case CMAKE_PREFIX_PATH is modified) -# See https://spack.readthedocs.io/en/latest/build_systems/cmakepackage.html#cmake-arguments-provided-by-spack -set(PALACE_SUPERBUILD_DEFAULT_ARGS - "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}" - "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}" - "-DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}" -) -if(NOT "${CMAKE_PREFIX_PATH}" STREQUAL "") - string(REPLACE ";" "$" PALACE_CMAKE_PREFIX_PATH "${CMAKE_PREFIX_PATH}") - list(APPEND PALACE_SUPERBUILD_DEFAULT_ARGS - "-DCMAKE_PREFIX_PATH=${PALACE_CMAKE_PREFIX_PATH}$${CMAKE_INSTALL_PREFIX}" - ) -else() - list(APPEND PALACE_SUPERBUILD_DEFAULT_ARGS - "-DCMAKE_PREFIX_PATH=${CMAKE_INSTALL_PREFIX}" - ) -endif() -if(NOT "${CMAKE_INSTALL_RPATH}" STREQUAL "") - string(REPLACE ";" "$" PALACE_CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}") - list(APPEND PALACE_SUPERBUILD_DEFAULT_ARGS - "-DCMAKE_INSTALL_RPATH=${PALACE_CMAKE_INSTALL_RPATH}" - ) -endif() -if(NOT "${CMAKE_INSTALL_RPATH_USE_LINK_PATH}" STREQUAL "") - list(APPEND PALACE_SUPERBUILD_DEFAULT_ARGS - "-DCMAKE_INSTALL_RPATH_USE_LINK_PATH=${CMAKE_INSTALL_RPATH_USE_LINK_PATH}" - ) -endif() - -include(ExternalGitTags) -# Avoid 
DOWNLOAD_EXTRACT_TIMESTAMP warning -if(POLICY CMP0135) - cmake_policy(SET CMP0135 NEW) -endif() - -# Add other third-party dependency builds -include(ExternalGitTags) -if(PALACE_BUILD_EXTERNAL_DEPS) - add_subdirectory("extern") -endif() - -# Add GSLIB (always built as part of Palace) -if(PALACE_WITH_GSLIB) - message(STATUS "===================== Configuring GSLIB dependency =====================") - include(ExternalGSLIB) -endif() - -# Add libCEED (always built as part of Palace) -message(STATUS "==================== Configuring libCEED dependency ====================") -include(ExternalLibCEED) - -# Add MFEM (always built as part of Palace) -message(STATUS "====================== Configuring MFEM dependency =====================") -include(ExternalMFEM) - -# Add the main Palace project -message(STATUS "========================== Configuring Palace ==========================") -include(ExternalPalace) - -# Finished with superbuild configuration -message(STATUS "======================= Configure stage complete =======================") +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + +# +# CMake superbuild for Palace and its dependencies +# + +# CMake 3.21 was released in Jul. 2021 (required for HIP support) +# Cmake 3.24 was released in Aug. 2022 (required for portable WHOLE_ARCHIVE linking of tests) +cmake_minimum_required(VERSION 3.24) + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# Prohibit in-source builds +if(CMAKE_SOURCE_DIR STREQUAL CMAKE_BINARY_DIR) + message(FATAL_ERROR "In-source builds are prohibited") +endif() + +# Initialize the project +project(palace-superbuild LANGUAGES CXX C VERSION 0.15.0) + +# Define build settings and defaults +set(PALACE_WITH_64BIT_INT OFF CACHE BOOL "Use 64 bit integers") +set(PALACE_WITH_64BIT_BLAS_INT OFF CACHE BOOL "Use ILP64 BLAS/LAPACK interface instead of LP64 (experimental, not recommended)") +set(PALACE_WITH_OPENMP OFF CACHE BOOL "Use OpenMP for shared-memory parallelism") +set(PALACE_WITH_CUDA OFF CACHE BOOL "Use CUDA for NVIDIA GPU support") +set(PALACE_WITH_HIP OFF CACHE BOOL "Use HIP for AMD or NVIDIA GPU support") +set(PALACE_WITH_GPU_AWARE_MPI OFF CACHE BOOL "Option to set if MPI distribution is GPU aware") + +set(PALACE_BUILD_EXTERNAL_DEPS ON CACHE BOOL "Build external third-party dependency libraries") +set(PALACE_WITH_SUPERLU ON CACHE BOOL "Build with SuperLU_DIST sparse direct solver") +set(PALACE_WITH_STRUMPACK OFF CACHE BOOL "Build with STRUMPACK sparse direct solver") +set(PALACE_WITH_MUMPS OFF CACHE BOOL "Build with MUMPS sparse direct solver") +set(PALACE_WITH_SLEPC ON CACHE BOOL "Build with SLEPc eigenvalue solver") +set(PALACE_WITH_ARPACK OFF CACHE BOOL "Build with ARPACK eigenvalue solver") +set(PALACE_WITH_LIBXSMM ON CACHE BOOL "Build with LIBXSMM backend for libCEED") +set(PALACE_WITH_MAGMA ON CACHE BOOL "Build with MAGMA backend for libCEED") +set(PALACE_WITH_GSLIB ON CACHE BOOL "Build with GSLIB library for high-order field interpolation") + +set(PALACE_WITH_STRUMPACK_BUTTERFLYPACK OFF CACHE BOOL "Build with ButterflyPACK support for STRUMPACK solver") +set(PALACE_WITH_STRUMPACK_ZFP OFF CACHE BOOL "Build with ZFP support for STRUMPACK solver") + +set(PALACE_WITH_SUNDIALS ON CACHE BOOL "Build with SUNDIALS differential/algebraic equations solver") + +set(ANALYZE_SOURCES_CLANG_TIDY OFF CACHE BOOL "Run static analysis checks using clang-tidy") +set(ANALYZE_SOURCES_CPPCHECK OFF CACHE BOOL "Run static analysis checks using cppcheck") + +set(PALACE_BUILD_WITH_COVERAGE 
OFF CACHE BOOL "Compile Palace with coverage flags (source-based for LLVM; gcov for GCC)")
+
+# Enable Fortran if required
+if(PALACE_WITH_STRUMPACK OR PALACE_WITH_MUMPS OR PALACE_WITH_ARPACK)
+  enable_language(Fortran)
+endif()
+
+# Enable CUDA/HIP if required
+if(PALACE_WITH_CUDA AND PALACE_WITH_HIP)
+  message(FATAL_ERROR "PALACE_WITH_CUDA is not compatible with PALACE_WITH_HIP")
+endif()
+if(PALACE_WITH_CUDA)
+  # Note: The new behavior of CMake policy CMP0104 will initialize CMAKE_CUDA_ARCHITECTURES
+  # to an (old) compatible value even when not set by the user.
+  if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES OR CMAKE_CUDA_ARCHITECTURES STREQUAL "52")
+    find_program(NVIDIA_SMI_CMD nvidia-smi)
+    if(NVIDIA_SMI_CMD)
+      execute_process(COMMAND ${NVIDIA_SMI_CMD} --query-gpu=compute_cap --format=csv,noheader
+        OUTPUT_VARIABLE CUDA_CAPS RESULT_VARIABLE SMI_RESULT ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+      if(SMI_RESULT EQUAL 0 AND CUDA_CAPS)
+        # Remove dots (8.6 -> 86) and convert newlines to semicolons
+        string(REGEX REPLACE "\\." "" CUDA_CAPS "${CUDA_CAPS}")
+        string(REGEX REPLACE "\n" ";" CUDA_CAPS "${CUDA_CAPS}")
+        list(REMOVE_DUPLICATES CUDA_CAPS)
+        string(REPLACE ";" "," CMAKE_CUDA_ARCHITECTURES "${CUDA_CAPS}")
+        message(STATUS "Detected CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
+      else()
+        set(CMAKE_CUDA_ARCHITECTURES "70")
+        message(STATUS "nvidia-smi failed, using default CUDA architecture: 70")
+      endif()
+    else()
+      set(CMAKE_CUDA_ARCHITECTURES "70")
+      message(STATUS "nvidia-smi not found, using default CUDA architecture: 70")
+    endif()
+    set(CMAKE_CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}" CACHE STRING "CUDA architectures" FORCE)
+  endif()
+  enable_language(CUDA)
+  find_package(CUDAToolkit REQUIRED)
+elseif(PALACE_WITH_HIP)
+  enable_language(HIP)
+  get_filename_component(HIPCC_DIR ${CMAKE_HIP_COMPILER} DIRECTORY)
+  get_filename_component(BIN_HIPCC_DIR ${HIPCC_DIR} DIRECTORY)
+  set(ROCM_DIR "${BIN_HIPCC_DIR}" CACHE STRING
+    "ROCm installation directory, typically /opt/rocm"
+  )
+  message(STATUS "Found HIP: ${ROCM_DIR}")
+  if(NOT DEFINED CMAKE_HIP_ARCHITECTURES)
+    message(STATUS "Setting CMAKE_HIP_ARCHITECTURES to 'gfx900' as none were specified")
+    set(CMAKE_HIP_ARCHITECTURES "gfx900" CACHE STRING
+      "Specifies the list of AMD GPU architectures to generate device code for"
+    )
+  endif()
+endif()
+
+# Set a default build type if none was provided
+if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
+  message(STATUS "Setting CMAKE_BUILD_TYPE to 'Release' as none was specified")
+  set(CMAKE_BUILD_TYPE "Release" CACHE STRING
+    "Specifies the build type ('Debug' or 'Release', for example)" FORCE
+  )
+endif()
+
+string(TOUPPER "${CMAKE_BUILD_TYPE}" BUILD_TYPE_UPPER)
+
+# Set a default installation location if none was provided
+if(NOT CMAKE_INSTALL_PREFIX OR CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
+  message(STATUS "Setting CMAKE_INSTALL_PREFIX to '${CMAKE_BINARY_DIR}' as none was specified")
+  set(CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}" CACHE STRING
+    "Install directory used by install()" FORCE
+  )
+endif()
+
+# Set a default for dependency library builds if none was provided
+if(NOT DEFINED BUILD_SHARED_LIBS)
+  message(STATUS "Setting BUILD_SHARED_LIBS to 'OFF' as it was not specified")
+  set(BUILD_SHARED_LIBS OFF CACHE BOOL
+    "Global flag to cause add_library() to create shared libraries if ON"
+  )
+endif()
+
+# Configure default RPATH for installed targets if not provided
+if(NOT DEFINED CMAKE_INSTALL_RPATH)
+  message(STATUS "Setting CMAKE_INSTALL_RPATH to
'\${CMAKE_INSTALL_PREFIX}/lib;\${CMAKE_INSTALL_PREFIX}/lib64' as it was not specified") + set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib$${CMAKE_INSTALL_PREFIX}/lib64" CACHE STRING + "Global RPATH to use for installed targets" + ) +endif() +if(NOT DEFINED CMAKE_INSTALL_RPATH_USE_LINK_PATH) + message(STATUS "Setting CMAKE_INSTALL_RPATH_USE_LINK_PATH to 'ON' as it was not specified") + set(CMAKE_INSTALL_RPATH_USE_LINK_PATH ON CACHE BOOL + "Global flag to append to the RPATH of installed binaries any directories which linked library files if ON" + ) +endif() + +# Add extra CMake modules +list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") + +# Intel classic compilers no longer supported (prior to CMake 3.20, the new compilers are +# detected as Clang instead of IntelLLVM) +if(CMAKE_C_COMPILER_ID STREQUAL "Intel" OR CMAKE_CXX_COMPILER_ID STREQUAL "Intel") + message(WARNING "The Intel classic compilers (icc/icpc) are no longer supported and \ +have been replaced by the newer Clang-based icx/icpx from Intel oneAPI") +endif() + +# MAGMA is only for GPU builds +if(PALACE_WITH_MAGMA AND NOT (PALACE_WITH_CUDA OR PALACE_WITH_HIP)) + message(STATUS "Disabling MAGMA due to lack of CUDA or HIP support") + set(PALACE_WITH_MAGMA OFF CACHE BOOL + "Build with MAGMA backend when libCEED is enabled" FORCE + ) +endif() + +# MPI is required for most dependency builds +message(STATUS "====================== Configuring MPI dependency ======================") +find_package(MPI REQUIRED) + +# Add BLAS/LAPACK libraries +message(STATUS "================= Configuring BLAS/LAPACK dependencies =================") +include(ExternalBLASLAPACK) + +# Default arguments for all external CMake builds (needs to happen after BLAS/LAPACK +# detection in case CMAKE_PREFIX_PATH is modified). Compilers and flags are added on a +# package by package basis. 
+# See Spack docs: http://tinyurl.com/43t2recp +set(PALACE_SUPERBUILD_DEFAULT_ARGS + "-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}" + "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}" + "-DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}" + "-DCMAKE_EXE_LINKER_FLAGS=${CMAKE_EXE_LINKER_FLAGS}" + "-DCMAKE_MODULE_LINKER_FLAGS=${CMAKE_MODULE_LINKER_FLAGS}" + "-DCMAKE_SHARED_LINKER_FLAGS=${CMAKE_SHARED_LINKER_FLAGS}" + "-DCMAKE_STATIC_LINKER_FLAGS=${CMAKE_STATIC_LINKER_FLAGS}" +) +if(NOT "${CMAKE_PREFIX_PATH}" STREQUAL "") + string(REPLACE ";" "$" PALACE_CMAKE_PREFIX_PATH "${CMAKE_PREFIX_PATH}") + list(APPEND PALACE_SUPERBUILD_DEFAULT_ARGS + "-DCMAKE_PREFIX_PATH=${PALACE_CMAKE_PREFIX_PATH}$${CMAKE_INSTALL_PREFIX}" + ) +else() + list(APPEND PALACE_SUPERBUILD_DEFAULT_ARGS + "-DCMAKE_PREFIX_PATH=${CMAKE_INSTALL_PREFIX}" + ) +endif() +if(NOT "${CMAKE_INSTALL_RPATH}" STREQUAL "") + string(REPLACE ";" "$" PALACE_CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}") + list(APPEND PALACE_SUPERBUILD_DEFAULT_ARGS + "-DCMAKE_INSTALL_RPATH=${PALACE_CMAKE_INSTALL_RPATH}" + ) +endif() +if(NOT "${CMAKE_INSTALL_RPATH_USE_LINK_PATH}" STREQUAL "") + list(APPEND PALACE_SUPERBUILD_DEFAULT_ARGS + "-DCMAKE_INSTALL_RPATH_USE_LINK_PATH=${CMAKE_INSTALL_RPATH_USE_LINK_PATH}" + ) +endif() + +include(ExternalGitTags) +# Avoid DOWNLOAD_EXTRACT_TIMESTAMP warning +if(POLICY CMP0135) + cmake_policy(SET CMP0135 NEW) +endif() + +# Add other third-party dependency builds +include(ExternalGitTags) +if(PALACE_BUILD_EXTERNAL_DEPS) + add_subdirectory("extern") +endif() + +# Add MFEM (always built as part of Palace) +message(STATUS "====================== Configuring MFEM dependency =====================") +include(ExternalMFEM) + +# Add the main Palace project +message(STATUS "========================== Configuring Palace ==========================") +include(ExternalPalace) + +# Finished with superbuild configuration +message(STATUS "======================= Configure stage complete =======================") diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index dfa5598a37..94c0db199f 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -1,12 +1,10 @@ - -# Code of Conduct - -This project has adopted the -[Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). -For more information see the -[Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact -([opensource-codeofconduct@amazon.com](mailto:opensource-codeofconduct@amazon.com)) with any -additional questions or comments. + + +# Code of Conduct + +This project has adopted the +[Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). +For more information see the +[Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact +([opensource-codeofconduct@amazon.com](mailto:opensource-codeofconduct@amazon.com)) with any +additional questions or comments. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f247796bd1..860ae7b30f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,75 +1,73 @@ - -# Contributing Guidelines - -Thank you for your interest in contributing to our project. Whether it's a bug report, new -feature, correction, or additional documentation, we greatly value feedback and -contributions from our community. - -Please read through this document before submitting any issues or pull requests to ensure we -have all the necessary information to effectively respond to your bug report or -contribution. 
- -## Feature requests and bug reporting - -We welcome you to use the GitHub issue tracker to report bugs or suggest features. - -When filing an issue, please check existing open, or recently closed, issues to make sure -somebody else hasn't already reported the issue. Please try to include as much information -as you can. Details like these are incredibly useful: - - - A reproducible test case or series of steps - - The version of our code being used - - Any modifications you've made relevant to the bug - - Anything unusual about your environment or deployment - -## Contributing via pull requests - -Contributions via pull requests are much appreciated. Before sending us a pull request, -please ensure that: - - 1. You are working against the latest source on the *main* branch. - 2. You check existing open, and recently merged, pull requests to make sure someone else - hasn't addressed the problem already. - 3. You open an issue to discuss any significant work - we would hate for your time to be - wasted. - -To send us a pull request, please: - - 1. Fork the repository. - 2. Modify the source; please focus on the specific change you are contributing. If you also - reformat all the code, it will be hard for us to focus on your change. - 3. Ensure local tests pass. - 4. Commit to your fork using clear commit messages. - 5. Send us a pull request, answering any default questions in the pull request interface. - 6. Pay attention to any automated CI failures reported in the pull request, and stay - involved in the conversation. - -GitHub provides additional document on -[forking a repository](https://help.github.com/articles/fork-a-repo/) and -[creating a pull request](https://help.github.com/articles/creating-a-pull-request/). - -## Finding contributions to work on - -Looking at the existing issues is a great way to find something to contribute on. As our -projects, by default, use the default GitHub issue labels -(enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help -wanted' issues is a great place to start. - -## Code of conduct - -See the [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md) file for our project's code of conduct. - -## Security issue notifications - -If you discover a potential security issue in this project we ask that you notify AWS/Amazon -Security via our -[vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). -Please do **not** create a public GitHub issue. - -## Licensing - -See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the -licensing of your contribution. + + +# Contributing Guidelines + +Thank you for your interest in contributing to our project. Whether it's a bug report, new +feature, correction, or additional documentation, we greatly value feedback and +contributions from our community. + +Please read through this document before submitting any issues or pull requests to ensure we +have all the necessary information to effectively respond to your bug report or +contribution. + +## Feature requests and bug reporting + +We welcome you to use the GitHub issue tracker to report bugs or suggest features. + +When filing an issue, please check existing open, or recently closed, issues to make sure +somebody else hasn't already reported the issue. Please try to include as much information +as you can. 
Details like these are incredibly useful: + + - A reproducible test case or series of steps + - The version of our code being used + - Any modifications you've made relevant to the bug + - Anything unusual about your environment or deployment + +## Contributing via pull requests + +Contributions via pull requests are much appreciated. Before sending us a pull request, +please ensure that: + + 1. You are working against the latest source on the *main* branch. + 2. You check existing open, and recently merged, pull requests to make sure someone else + hasn't addressed the problem already. + 3. You open an issue to discuss any significant work - we would hate for your time to be + wasted. + +To send us a pull request, please: + + 1. Fork the repository. + 2. Modify the source; please focus on the specific change you are contributing. If you also + reformat all the code, it will be hard for us to focus on your change. + 3. Ensure local tests pass. + 4. Commit to your fork using clear commit messages. + 5. Send us a pull request, answering any default questions in the pull request interface. + 6. Pay attention to any automated CI failures reported in the pull request, and stay + involved in the conversation. + +GitHub provides additional document on +[forking a repository](https://help.github.com/articles/fork-a-repo/) and +[creating a pull request](https://help.github.com/articles/creating-a-pull-request/). + +## Finding contributions to work on + +Looking at the existing issues is a great way to find something to contribute on. As our +projects, by default, use the default GitHub issue labels +(enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help +wanted' issues is a great place to start. + +## Code of conduct + +See the [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md) file for our project's code of conduct. + +## Security issue notifications + +If you discover a potential security issue in this project we ask that you notify AWS/Amazon +Security via our +[vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). +Please do **not** create a public GitHub issue. + +## Licensing + +See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the +licensing of your contribution. diff --git a/LICENSE b/LICENSE index 67db858821..b0fcd6d9b4 100644 --- a/LICENSE +++ b/LICENSE @@ -1,175 +1,175 @@ - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. 
- - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. 
If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. 
Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. diff --git a/Makefile b/Makefile index 9418b3890c..929a9ef72c 100644 --- a/Makefile +++ b/Makefile @@ -1,27 +1,27 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -CLANG_FORMAT ?= clang-format -JULIA ?= julia - -.PHONY: format format-cpp format-jl docs tests - -# Style/format -format: format-cpp format-jl - -format-cpp: - ./scripts/format-source -cpp --clang-format $(CLANG_FORMAT) - -format-jl: - ./scripts/format-source -jl --julia $(JULIA) - -# Documentation -docs: - $(RM) -r docs/build - $(JULIA) --project=docs -e 'using Pkg; Pkg.instantiate()' - $(JULIA) --project=docs --color=yes docs/make.jl - -# Tests -tests: - $(JULIA) --project=test/examples -e 'using Pkg; Pkg.instantiate()' - $(JULIA) --project=test/examples --color=yes test/examples/runtests.jl +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + +CLANG_FORMAT ?= clang-format +JULIA ?= julia + +.PHONY: format format-cpp format-jl docs tests + +# Style/format +format: format-cpp format-jl + +format-cpp: + ./scripts/format-source -cpp --clang-format $(CLANG_FORMAT) + +format-jl: + ./scripts/format-source -jl --julia $(JULIA) + +# Documentation +docs: + $(RM) -r docs/build + $(JULIA) --project=docs -e 'using Pkg; Pkg.instantiate()' + $(JULIA) --project=docs --color=yes docs/make.jl + +# Tests +tests: + $(JULIA) --project=test/examples -e 'using Pkg; Pkg.instantiate()' + $(JULIA) --project=test/examples --color=yes test/examples/runtests.jl diff --git a/NOTICE b/NOTICE index 616fc58894..82cfee9fea 100644 --- a/NOTICE +++ b/NOTICE @@ -1 +1 @@ -Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
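As a brief illustration of the Makefile targets shown above, the following sketch shows how
they might be invoked from the repository root; it assumes `clang-format` and `julia` are
available on the `PATH` (the variable overrides are optional and mirror the `?=` defaults
in the Makefile):

```sh
# Format C++ and Julia/Markdown sources, optionally overriding the formatter binaries.
make format CLANG_FORMAT=clang-format JULIA=julia

# Build the documentation into docs/build and run the example regression tests.
make docs
make tests
```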
diff --git a/README.md b/README.md index 210810eb40..5771cadbb4 100644 --- a/README.md +++ b/README.md @@ -1,98 +1,125 @@ - -# Palace: 3D Finite Element Solver for Computational Electromagnetics - -[![CI (Linux)](https://github.com/awslabs/palace/actions/workflows/build-and-test-linux.yml/badge.svg)](https://github.com/awslabs/palace/actions/workflows/build-and-test-linux.yml) -[![CI (macOS)](https://github.com/awslabs/palace/actions/workflows/build-and-test-macos.yml/badge.svg)](https://github.com/awslabs/palace/actions/workflows/build-and-test-macos.yml) -[![](https://img.shields.io/badge/docs-stable-blue.svg)](https://awslabs.github.io/palace/stable) -[![](https://img.shields.io/badge/docs-dev-blue.svg)](https://awslabs.github.io/palace/dev) - -*Palace*, for **PA**rallel **LA**rge-scale **C**omputational **E**lectromagnetics, is an -open-source, parallel finite element code for full-wave 3D electromagnetic simulations in -the frequency or time domain, using the -[MFEM finite element discretization library](http://mfem.org). - -## Key features - - - Eigenmode calculations with optional material or radiative loss including lumped - impedance boundaries. Automatic postprocessing of energy-participation ratios (EPRs) for - [circuit quantization](https://www.nature.com/articles/s41534-021-00461-8) and - interface or bulk participation ratios for predicting dielectric loss. - - Frequency domain driven simulations with surface current excitation and lumped or - numeric wave port boundaries. Wideband frequency response calculation using uniform - frequency space sampling or an adaptive fast frequency sweep algorithm. - - Explicit or fully-implicit time domain solver for transient electromagnetic analysis. - - Lumped capacitance and inductance matrix extraction via electrostatic and magnetostatic - problem formulations. - - Support for a wide range of mesh file formats for structured and unstructured meshes, - with built-in uniform or region-based parallel mesh refinement. - - Solution-based Adaptive Mesh Refinement (AMR) for all simulation types aside from - transient. Nonconformal refinement is supported for all mesh types, and conformal - refinement for simplex meshes. - - Arbitrary high-order finite element spaces and curvilinear mesh support thanks to the - [MFEM library](https://mfem.org/features/). - - Scalable algorithms for the solution of linear systems of equations, including geometric - multigrid (GMG), parallel sparse direct solvers, and algebraic multigrid - (AMG) preconditioners, for fast performance on platforms ranging from laptops to HPC - systems. - -## Getting started - -*Palace* can be installed using the [Spack HPC package manager](https://spack.io/), with the -command `spack install palace`. Run `spack info palace` to get more information about the -available configuration options and dependencies. - -Those wishing to work in a containerized environment may use the Singularity/Apptainer -recipe for *Palace* in [`singularity/`](./singularity) to build a container containing -*Palace* and all its dependencies. - -Finally, instructions for obtaining *Palace* and building from source can be found in the -[documentation](https://awslabs.github.io/palace/dev/install/). As part of the CMake build -process, most dependencies are downloaded and installed automatically and thus an internet -connection is required. 
- -System requirements: - - - CMake version 3.18.1 or later - - C++17 compatible C++ compiler - - C and (optionally) Fortran compilers for dependency builds - - MPI distribution - - BLAS, LAPACK libraries - -## Documentation - -[https://awslabs.github.io/palace/](https://awslabs.github.io/palace/) - -The documentation for *Palace* provides full instructions for building the solver and -running electromagnetic simulations. - -To build a local version of the documentation, run `julia make.jl` from within the -[`docs/`](./docs) directory. - -## Examples - -Some example applications including configuration files and meshes can be found in the -[`examples/`](./examples) directory. Complete tutorials for each example are available in -the [documentation](https://awslabs.github.io/palace/dev/examples/examples/). - -## Changelog - -Check out the [changelog](./CHANGELOG.md). - -## Contributing - -We welcome contributions to *Palace* including bug fixes, feature requests, etc. To get -started, check out our [contributing guidelines](CONTRIBUTING.md). - -## Contact - -*Palace* is developed by the Design and Simulation group in the AWS Center for Quantum -Computing (CQC). Please contact the development team at -[palace-maint@amazon.com](mailto:palace-maint@amazon.com) with any questions or comments, or -[open an issue](https://github.com/awslabs/palace/issues). - -## License - -This project is licensed under the [Apache-2.0 License](./LICENSE). + + +# Palace: 3D Finite Element Solver for Computational Electromagnetics + +[![CI (Linux)](https://github.com/awslabs/palace/actions/workflows/build-and-test-linux.yml/badge.svg)](https://github.com/awslabs/palace/actions/workflows/build-and-test-linux.yml) +[![CI (macOS)](https://github.com/awslabs/palace/actions/workflows/build-and-test-macos.yml/badge.svg)](https://github.com/awslabs/palace/actions/workflows/build-and-test-macos.yml) +[![](https://img.shields.io/badge/docs-stable-blue.svg)](https://awslabs.github.io/palace/stable) +[![](https://img.shields.io/badge/docs-dev-blue.svg)](https://awslabs.github.io/palace/dev) + +*Palace*, for **PA**rallel **LA**rge-scale **C**omputational **E**lectromagnetics, is an +open-source, parallel finite element code for full-wave 3D electromagnetic simulations in +the frequency or time domain, using the +[MFEM finite element discretization library](http://mfem.org) and +[libCEED library](https://github.com/CEED/libCEED) for efficient exascale discretizations. + +## Key features + + - Eigenmode calculations with optional material or radiative loss including lumped + impedance boundaries. Automatic postprocessing of energy-participation ratios (EPRs) for + [circuit quantization](https://www.nature.com/articles/s41534-021-00461-8) and + interface or bulk participation ratios for predicting dielectric loss. + - Frequency domain driven simulations with surface current excitation and lumped or + numeric wave port boundaries. Wideband frequency response calculation using uniform + frequency space sampling or an adaptive fast frequency sweep algorithm. + - Explicit or fully-implicit time domain solver for transient electromagnetic analysis. + - Lumped capacitance and inductance matrix extraction via electrostatic and magnetostatic + problem formulations. + - Support for a wide range of mesh file formats for structured and unstructured meshes, + with built-in uniform or region-based parallel mesh refinement. + - Solution-based Adaptive Mesh Refinement (AMR) for all simulation types aside from + transient. 
Nonconformal refinement is supported for all mesh types, and conformal
+   refinement for simplex meshes.
+ - Arbitrary high-order finite element spaces and curvilinear mesh support thanks to the
+   [MFEM library](https://mfem.org/features/).
+ - Scalable algorithms for the solution of linear systems of equations, including
+   matrix-free $p$-multigrid utilizing
+   [high-order operator partial assembly](https://mfem.org/performance/), parallel sparse
+   direct solvers, and algebraic multigrid (AMG) preconditioners, for fast performance on
+   platforms ranging from laptops to HPC systems.
+ - Support for hardware acceleration using NVIDIA or AMD GPUs, including multi-GPU
+   parallelism, using pure CUDA and HIP code as well as [MAGMA](https://icl.utk.edu/magma/)
+   and other libraries.
+
+## Getting started
+
+*Palace* can be installed using the [Spack HPC package manager](https://spack.io/), with the
+command `spack install palace`. Run `spack info palace` to get more information about the
+available configuration options and dependencies.
+
+Those wishing to work in a containerized environment may use the Singularity/Apptainer
+recipe for *Palace* in [`singularity/`](./singularity) to build a container containing
+*Palace* and all its dependencies.
+
+Finally, instructions for obtaining *Palace* and building from source can be found in the
+[documentation](https://awslabs.github.io/palace/dev/install/). As part of the CMake build
+process, most dependencies are downloaded and installed automatically and thus an internet
+connection is required.
+
+System requirements:
+
+ - CMake version 3.24 or later
+ - C++17 compatible C++ compiler
+ - C and Fortran (optional) compilers for dependency builds
+ - MPI distribution
+ - BLAS, LAPACK libraries
+ - CUDA Toolkit or ROCm installation (optional, for GPU support only)
+
+## Documentation
+
+[https://awslabs.github.io/palace/](https://awslabs.github.io/palace/)
+
+The documentation for *Palace* provides full instructions for building the solver and
+running electromagnetic simulations.
+
+### Building a local copy of the documentation
+
+[Julia](https://julialang.org) with
+[Documenter](https://documenter.juliadocs.org/) is required to build a local
+version of the documentation. Obtain Julia following the [official
+instructions](https://julialang.org/install/) and install Documenter by
+instantiating the `docs` environment:
+
+```sh
+julia --project=docs -e "using Pkg; Pkg.instantiate()"
+```
+
+Then, generate the documentation with `julia --project make.jl` from within the
+[`docs/`](./docs) directory. An HTTP server is needed to visualize the
+rendered documentation. The simplest way to start a server is with Python:
+
+```sh
+cd docs/build && python -m http.server 8000
+```
+
+Then, navigate to `localhost:8000` with your browser.
+
+## Examples
+
+Some example applications including configuration files and meshes can be found in the
+[`examples/`](./examples) directory. Complete tutorials for each example are available in
+the [documentation](https://awslabs.github.io/palace/dev/examples/examples/).
+
+## Changelog
+
+Check out the [changelog](./CHANGELOG.md).
+
+## Contributing
+
+We welcome contributions to *Palace* including bug fixes, feature requests, etc. To get
+started, check out our [contributing guidelines](CONTRIBUTING.md).
+
+## Contact
+
+*Palace* is developed by the Design and Simulation group in the AWS Center for Quantum
+Computing (CQC).
Please contact the development team at +[palace-maint@amazon.com](mailto:palace-maint@amazon.com) with any questions or comments, or +[open an issue](https://github.com/awslabs/palace/issues). + +## License + +This project is licensed under the [Apache-2.0 License](./LICENSE). + +See [THIRD-PARTY-LICENSES](./THIRD-PARTY-LICENSES) and +[THIRD-PARTY-NOTICES](./THIRD-PARTY-NOTICES) for licenses and notices of +third-party software in this repository. diff --git a/THIRD-PARTY-LICENSES b/THIRD-PARTY-LICENSES new file mode 100644 index 0000000000..08a66c1753 --- /dev/null +++ b/THIRD-PARTY-LICENSES @@ -0,0 +1,32 @@ +// GLVis-js + +// BSD 3-Clause License + +// Copyright (c) 2010-2024, Lawrence Livermore National Security, LLC +// All rights reserved. + +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: + +// * Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. + +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. + +// * Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. + +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/THIRD-PARTY-NOTICES b/THIRD-PARTY-NOTICES new file mode 100644 index 0000000000..456a3b381b --- /dev/null +++ b/THIRD-PARTY-NOTICES @@ -0,0 +1,22 @@ +// GLVis-js + +// This work was produced under the auspices of the U.S. Department of Energy by +// Lawrence Livermore National Laboratory under Contract DE-AC52-07NA27344. + +// This work was prepared as an account of work sponsored by an agency of +// the United States Government. Neither the United States Government nor +// Lawrence Livermore National Security, LLC, nor any of their employees +// makes any warranty, expressed or implied, or assumes any legal liability +// or responsibility for the accuracy, completeness, or usefulness of any +// information, apparatus, product, or process disclosed, or represents that +// its use would not infringe privately owned rights. + +// Reference herein to any specific commercial product, process, or service +// by trade name, trademark, manufacturer, or otherwise does not necessarily +// constitute or imply its endorsement, recommendation, or favoring by the +// United States Government or Lawrence Livermore National Security, LLC. 
+ +// The views and opinions of authors expressed herein do not necessarily +// state or reflect those of the United States Government or Lawrence +// Livermore National Security, LLC, and shall not be used for advertising +// or product endorsement purposes. diff --git a/cmake/ExternalARPACK.cmake b/cmake/ExternalARPACK.cmake index d39f0b7336..531761bb0b 100644 --- a/cmake/ExternalARPACK.cmake +++ b/cmake/ExternalARPACK.cmake @@ -1,68 +1,69 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -# -# Build ARPACK/PARPACK (from ARPACK-NG) -# - -# Force build order -set(ARPACK_DEPENDENCIES) - -# We always build the 32-bit integer ARPACK interface and link with LP64 BLAS/LAPACK -# For PARPACK, this strategy is only not feasible when matrix sizes PER MPI PROCESS exceed -# 2B nonzeros, which is very unlikely -if(PALACE_WITH_64BIT_BLAS_INT) - message(FATAL_ERROR "ARPACK has not been tested with INTERFACE64 and ILP64 BLAS/LAPACK") -endif() - -set(ARPACK_OPTIONS ${PALACE_SUPERBUILD_DEFAULT_ARGS}) -list(APPEND ARPACK_OPTIONS - "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" - "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" - "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" - "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" - "-DCMAKE_Fortran_COMPILER=${CMAKE_Fortran_COMPILER}" - "-DCMAKE_Fortran_FLAGS=${CMAKE_Fortran_FLAGS}" - "-DMPI=ON" - "-DICB=ON" - "-DINTERFACE64=OFF" - "-DTESTS=OFF" -) - -# Configure BLAS/LAPACK -if(NOT "${BLAS_LAPACK_LIBRARIES}" STREQUAL "") - list(APPEND ARPACK_OPTIONS - "-DLAPACK_LIBRARIES=${BLAS_LAPACK_LIBRARIES}" - "-DBLAS_LIBRARIES=${BLAS_LAPACK_LIBRARIES}" - ) -endif() - -string(REPLACE ";" "; " ARPACK_OPTIONS_PRINT "${ARPACK_OPTIONS}") -message(STATUS "ARPACK_OPTIONS: ${ARPACK_OPTIONS_PRINT}") - -# ARPACK-NG patches zdotc to a custom zzdotc, which unfortunately conflicts with a similar -# patch from the reference ScaLAPACK, so we patch the patch -set(ARPACK_PATCH_FILES - "${CMAKE_SOURCE_DIR}/extern/patch/arpack-ng/patch_build.diff" - "${CMAKE_SOURCE_DIR}/extern/patch/arpack-ng/patch_zdotc.diff" -) -if(CMAKE_Fortran_COMPILER_ID STREQUAL "GNU") - list(APPEND ARPACK_PATCH_FILES - "${CMAKE_SOURCE_DIR}/extern/patch/arpack-ng/patch_second.diff" - ) -endif() - -include(ExternalProject) -ExternalProject_Add(arpack-ng - DEPENDS ${ARPACK_DEPENDENCIES} - GIT_REPOSITORY ${EXTERN_ARPACK_URL} - GIT_TAG ${EXTERN_ARPACK_GIT_TAG} - SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/arpack-ng - BINARY_DIR ${CMAKE_BINARY_DIR}/extern/arpack-ng-build - INSTALL_DIR ${CMAKE_INSTALL_PREFIX} - PREFIX ${CMAKE_BINARY_DIR}/extern/arpack-ng-cmake - UPDATE_COMMAND "" - PATCH_COMMAND git apply "${ARPACK_PATCH_FILES}" - CONFIGURE_COMMAND ${CMAKE_COMMAND} "${ARPACK_OPTIONS}" - TEST_COMMAND "" -) +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +# +# Build ARPACK/PARPACK (from ARPACK-NG) +# + +# Force build order +set(ARPACK_DEPENDENCIES) + +# We always build the 32-bit integer ARPACK interface and link with LP64 BLAS/LAPACK +# For PARPACK, this strategy is only not feasible when matrix sizes PER MPI PROCESS exceed +# 2B nonzeros, which is very unlikely +if(PALACE_WITH_64BIT_BLAS_INT) + message(FATAL_ERROR "ARPACK has not been tested with INTERFACE64 and ILP64 BLAS/LAPACK") +endif() + +set(ARPACK_OPTIONS ${PALACE_SUPERBUILD_DEFAULT_ARGS}) +list(APPEND ARPACK_OPTIONS + "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" + "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" + "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" + "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" + "-DCMAKE_Fortran_COMPILER=${CMAKE_Fortran_COMPILER}" + "-DCMAKE_Fortran_FLAGS=${CMAKE_Fortran_FLAGS}" + "-DMPI=ON" + "-DICB=ON" + "-DINTERFACE64=OFF" + "-DTESTS=OFF" +) + +# Configure BLAS/LAPACK +if(NOT "${BLAS_LAPACK_LIBRARIES}" STREQUAL "") + list(APPEND ARPACK_OPTIONS + "-DLAPACK_LIBRARIES=${BLAS_LAPACK_LIBRARIES}" + "-DBLAS_LIBRARIES=${BLAS_LAPACK_LIBRARIES}" + ) +endif() + +string(REPLACE ";" "; " ARPACK_OPTIONS_PRINT "${ARPACK_OPTIONS}") +message(STATUS "ARPACK_OPTIONS: ${ARPACK_OPTIONS_PRINT}") + +# ARPACK-NG patches zdotc to a custom zzdotc, which unfortunately conflicts with a similar +# patch from the reference ScaLAPACK, so we patch the patch +set(ARPACK_PATCH_FILES + "${CMAKE_SOURCE_DIR}/extern/patch/arpack-ng/patch_build.diff" + "${CMAKE_SOURCE_DIR}/extern/patch/arpack-ng/patch_zdotc.diff" + "${CMAKE_SOURCE_DIR}/extern/patch/arpack-ng/patch_pzneupd.diff" +) +if(CMAKE_Fortran_COMPILER_ID STREQUAL "GNU") + list(APPEND ARPACK_PATCH_FILES + "${CMAKE_SOURCE_DIR}/extern/patch/arpack-ng/patch_second.diff" + ) +endif() + +include(ExternalProject) +ExternalProject_Add(arpack-ng + DEPENDS ${ARPACK_DEPENDENCIES} + GIT_REPOSITORY ${EXTERN_ARPACK_URL} + GIT_TAG ${EXTERN_ARPACK_GIT_TAG} + SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/arpack-ng + BINARY_DIR ${CMAKE_BINARY_DIR}/extern/arpack-ng-build + INSTALL_DIR ${CMAKE_INSTALL_PREFIX} + PREFIX ${CMAKE_BINARY_DIR}/extern/arpack-ng-cmake + UPDATE_COMMAND "" + PATCH_COMMAND git apply "${ARPACK_PATCH_FILES}" + CONFIGURE_COMMAND ${CMAKE_COMMAND} "${ARPACK_OPTIONS}" + TEST_COMMAND "" +) diff --git a/cmake/ExternalBLASLAPACK.cmake b/cmake/ExternalBLASLAPACK.cmake index 8ad5dc1840..b77f80ffdc 100644 --- a/cmake/ExternalBLASLAPACK.cmake +++ b/cmake/ExternalBLASLAPACK.cmake @@ -1,192 +1,203 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -# -# Configure BLAS/LAPACK libraries -# - -# Configuring 64-bit BLAS/LAPACK integer interface is not supported by older CMake. 
-if(NOT PALACE_WITH_64BIT_INT AND PALACE_WITH_64BIT_BLAS_INT) - message(FATAL_ERROR "ILP64 BLAS/LAPACK interface requires PALACE_WITH_64BIT_INT") -endif() -if(NOT ${CMAKE_VERSION} VERSION_LESS "3.22.0") - if(PALACE_WITH_64BIT_BLAS_INT) - set(BLA_SIZEOF_INTEGER 8) - else() - set(BLA_SIZEOF_INTEGER 4) - endif() -endif() - -# Defines cache variables BLAS_LAPACK_LIBRARIES/BLAS_LAPACK_INCLUDE_DIRS for building -# dependencies on top of BLAS and LAPACK -if(DEFINED ENV{ARMPL_DIR} OR DEFINED ENV{ARMPLROOT} OR DEFINED ENV{ARMPL_ROOT}) - # Arm Performance Libraries for arm64 builds when available - if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm") - message(WARNING "Arm PL math libraries are only intended for arm64 architecture builds") - endif() - if(DEFINED ENV{ARMPL_DIR}) - set(ARMPL_DIR $ENV{ARMPL_DIR}) - elseif(DEFINED ENV{ARMPLROOT}) - set(ARMPL_DIR $ENV{ARMPLROOT}) - elseif(DEFINED ENV{ARMPL_ROOT}) - set(ARMPL_DIR $ENV{ARMPL_ROOT}) - else() - set(ARMPL_DIR) - endif() - if(PALACE_WITH_64BIT_BLAS_INT) - if(PALACE_WITH_OPENMP) - set(ARMPL_LIB_SUFFIX "_ilp64_mp") - else() - set(ARMPL_LIB_SUFFIX "_ilp64") - endif() - else() - if(PALACE_WITH_OPENMP) - set(ARMPL_LIB_SUFFIX "_lp64_mp") - else() - set(ARMPL_LIB_SUFFIX "_lp64") - endif() - endif() - find_library(_BLAS_LAPACK_LIBRARIES - NAMES armpl${ARMPL_LIB_SUFFIX} armpl - PATHS ${ARMPL_DIR} - PATH_SUFFIXES lib lib64 - NO_DEFAULT_PATH - REQUIRED - ) - find_path(_BLAS_LAPACK_INCLUDE_DIRS - NAMES cblas.h - PATHS ${ARMPL_DIR} - PATH_SUFFIXES include${ARMPL_LIB_SUFFIX} include - NO_DEFAULT_PATH - REQUIRED - ) - message(STATUS "Using BLAS/LAPACK from Arm Performance Libraries (Arm PL)") -elseif(DEFINED ENV{AOCL_DIR} OR DEFINED ENV{AOCLROOT} OR DEFINED ENV{AOCL_ROOT}) - # AOCL for x86_64 builds when available - if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm") - message(WARNING "AOCL math libraries are not intended for arm64 architecture builds") - endif() - if(DEFINED ENV{AOCL_DIR}) - set(AOCL_DIR $ENV{AOCL_DIR}) - elseif(DEFINED ENV{AOCLROOT}) - set(AOCL_DIR $ENV{AOCLROOT}) - elseif(DEFINED ENV{AOCL_ROOT}) - set(AOCL_DIR $ENV{AOCL_ROOT}) - else() - set(AOCL_DIR) - endif() - if(PALACE_WITH_64BIT_BLAS_INT) - set(AOCL_DIR_SUFFIX "_ILP64") - else() - set(AOCL_DIR_SUFFIX "_LP64") - endif() - if(PALACE_WITH_OPENMP) - set(AOCL_LIB_SUFFIX "-mt") - else() - set(AOCL_LIB_SUFFIX "") - endif() - find_library(BLIS_LIBRARY - NAMES blis${AOCL_LIB_SUFFIX} blis - PATHS ${AOCL_DIR} - PATH_SUFFIXES lib${AOCL_DIR_SUFFIX} lib lib64 - NO_DEFAULT_PATH - REQUIRED - ) - find_library(FLAME_LIBRARY - NAMES flame FLAME - PATHS ${AOCL_DIR} - PATH_SUFFIXES lib${AOCL_DIR_SUFFIX} lib lib64 - NO_DEFAULT_PATH - REQUIRED - ) - set(_BLAS_LAPACK_LIBRARIES "${FLAME_LIBRARY}$${BLIS_LIBRARY}") - find_path(_BLAS_LAPACK_INCLUDE_DIRS - NAMES cblas.h - PATHS ${AOCL_DIR} - PATH_SUFFIXES include${AOCL_DIR_SUFFIX} include/blis include - NO_DEFAULT_PATH - REQUIRED - ) - message(STATUS "Using BLAS/LAPACK from AMD BLIS/libFLAME") -elseif(DEFINED ENV{MKL_DIR} OR DEFINED ENV{MKLROOT} OR DEFINED ENV{MKL_ROOT}) - # MKL for x86_64 builds when available - if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm") - message(WARNING "MKL math libraries are not intended for arm64 architecture builds") - endif() - if(DEFINED ENV{MKL_DIR}) - set(MKL_DIR $ENV{MKL_DIR}) - elseif(DEFINED ENV{MKLROOT}) - set(MKL_DIR $ENV{MKLROOT}) - elseif(DEFINED ENV{MKL_ROOT}) - set(MKL_DIR $ENV{MKL_ROOT}) - else() - set(MKL_DIR) - endif() - if(PALACE_WITH_64BIT_BLAS_INT) - if(PALACE_WITH_OPENMP) - set(MKL_LIB_SUFFIX "_64ilp") - else() - 
set(MKL_LIB_SUFFIX "_64ilp_seq") - endif() - else() - if(PALACE_WITH_OPENMP) - set(MKL_LIB_SUFFIX "_64lp") - else() - set(MKL_LIB_SUFFIX "_64lp_seq") - endif() - endif() - list(APPEND CMAKE_PREFIX_PATH ${MKL_DIR}) - set(BLA_VENDOR "Intel10${MKL_LIB_SUFFIX}") - find_package(BLAS REQUIRED) - find_package(LAPACK REQUIRED) - set(_BLAS_LAPACK_LIBRARIES ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) - list(REMOVE_DUPLICATES _BLAS_LAPACK_LIBRARIES) - string(REPLACE ";" "$" _BLAS_LAPACK_LIBRARIES "${_BLAS_LAPACK_LIBRARIES}") - find_path(_BLAS_LAPACK_INCLUDE_DIRS - NAMES mkl_cblas.h - PATHS ${MKL_DIR} - PATH_SUFFIXES include - NO_DEFAULT_PATH - REQUIRED - ) - message(STATUS "Using BLAS/LAPACK from Intel MKL") -else() - # Try to find OpenBLAS installation on the system - # Warning: This does NOT automatically configure for OpenMP support - if(DEFINED ENV{OPENBLAS_DIR}) - set(OPENBLAS_DIR $ENV{OPENBLAS_DIR}) - elseif(DEFINED ENV{OPENBLASROOT}) - set(OPENBLAS_DIR $ENV{OPENBLASROOT}) - elseif(DEFINED ENV{OPENBLAS_ROOT}) - set(OPENBLAS_DIR $ENV{OPENBLAS_ROOT}) - else() - set(OPENBLAS_DIR) - endif() - list(APPEND CMAKE_PREFIX_PATH ${OPENBLAS_DIR}) - find_package(BLAS REQUIRED) - find_package(LAPACK REQUIRED) - set(_BLAS_LAPACK_LIBRARIES ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) - list(REMOVE_DUPLICATES _BLAS_LAPACK_LIBRARIES) - string(REPLACE ";" "$" _BLAS_LAPACK_LIBRARIES "${_BLAS_LAPACK_LIBRARIES}") - set(_BLAS_LAPACK_DIRS) - foreach(LIB IN LISTS _BLAS_LAPACK_LIBRARIES) - get_filename_component(LIB_DIR ${LIB} DIRECTORY) - list(APPEND _BLAS_LAPACK_DIRS ${LIB_DIR}) - endforeach() - list(REMOVE_DUPLICATES _BLAS_LAPACK_DIRS) - find_path(_BLAS_LAPACK_INCLUDE_DIRS - NAMES cblas.h - HINTS ${_BLAS_LAPACK_DIRS} - PATH_SUFFIXES include include/openblas - REQUIRED - ) - message(STATUS "Using BLAS/LAPACK located by CMake") -endif() - -# Save variables to cache -set(BLAS_LAPACK_LIBRARIES ${_BLAS_LAPACK_LIBRARIES} CACHE STRING - "List of library files for BLAS/LAPACK" -) -set(BLAS_LAPACK_INCLUDE_DIRS ${_BLAS_LAPACK_INCLUDE_DIRS} CACHE STRING - "Path to BLAS/LAPACK include directories" -) +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + +# +# Configure BLAS/LAPACK libraries +# + +# Configuring 64-bit BLAS/LAPACK integer interface is not supported by older CMake. 
+if(NOT PALACE_WITH_64BIT_INT AND PALACE_WITH_64BIT_BLAS_INT) + message(FATAL_ERROR "ILP64 BLAS/LAPACK interface requires PALACE_WITH_64BIT_INT") +endif() +if(NOT ${CMAKE_VERSION} VERSION_LESS "3.22.0") + if(PALACE_WITH_64BIT_BLAS_INT) + set(BLA_SIZEOF_INTEGER 8) + else() + set(BLA_SIZEOF_INTEGER 4) + endif() +endif() + +# Defines cache variables BLAS_LAPACK_LIBRARIES/BLAS_LAPACK_INCLUDE_DIRS for building +# dependencies on top of BLAS and LAPACK +if(DEFINED ENV{ARMPL_DIR} OR DEFINED ENV{ARMPLROOT} OR DEFINED ENV{ARMPL_ROOT}) + # Arm Performance Libraries for arm64 builds when available + if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm") + message(WARNING "Arm PL math libraries are only intended for arm64 architecture builds") + endif() + if(DEFINED ENV{ARMPL_DIR}) + set(ARMPL_DIR $ENV{ARMPL_DIR}) + elseif(DEFINED ENV{ARMPLROOT}) + set(ARMPL_DIR $ENV{ARMPLROOT}) + elseif(DEFINED ENV{ARMPL_ROOT}) + set(ARMPL_DIR $ENV{ARMPL_ROOT}) + else() + set(ARMPL_DIR) + endif() + if(PALACE_WITH_64BIT_BLAS_INT) + if(PALACE_WITH_OPENMP) + set(ARMPL_LIB_SUFFIX "_ilp64_mp") + else() + set(ARMPL_LIB_SUFFIX "_ilp64") + endif() + else() + if(PALACE_WITH_OPENMP) + set(ARMPL_LIB_SUFFIX "_mp") + else() + set(ARMPL_LIB_SUFFIX "") + endif() + endif() + list(APPEND CMAKE_PREFIX_PATH ${ARMPL_DIR}) + set(BLA_VENDOR "Arm${ARMPL_LIB_SUFFIX}") + find_package(BLAS REQUIRED) + find_package(LAPACK REQUIRED) + + # Locate include directory + find_path(_BLAS_LAPACK_INCLUDE_DIRS + NAMES armpl.h + PATHS ${ARMPL_DIR} + PATH_SUFFIXES include + NO_DEFAULT_PATH + REQUIRED + ) + message(STATUS "Using BLAS/LAPACK from Arm Performance Libraries (Arm PL)") +elseif(DEFINED ENV{AOCL_DIR} OR DEFINED ENV{AOCLROOT} OR DEFINED ENV{AOCL_ROOT}) + # AOCL for x86_64 builds when available (part of CMake's FindBLAS/FindLAPACK as of v3.27 + # but unnecessarily adds -fopenmp flag) + if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm") + message(WARNING "AOCL math libraries are not intended for arm64 architecture builds") + endif() + if(DEFINED ENV{AOCL_DIR}) + set(AOCL_DIR $ENV{AOCL_DIR}) + elseif(DEFINED ENV{AOCLROOT}) + set(AOCL_DIR $ENV{AOCLROOT}) + elseif(DEFINED ENV{AOCL_ROOT}) + set(AOCL_DIR $ENV{AOCL_ROOT}) + else() + set(AOCL_DIR) + endif() + if(PALACE_WITH_64BIT_BLAS_INT) + set(AOCL_DIR_SUFFIX "_ILP64") + else() + set(AOCL_DIR_SUFFIX "_LP64") + endif() + if(PALACE_WITH_OPENMP) + set(AOCL_LIB_SUFFIX "-mt") + else() + set(AOCL_LIB_SUFFIX "") + endif() + find_library(BLAS_LIBRARIES + NAMES blis${AOCL_LIB_SUFFIX} blis + PATHS ${AOCL_DIR} + PATH_SUFFIXES lib${AOCL_DIR_SUFFIX} lib lib64 + NO_DEFAULT_PATH + REQUIRED + ) + find_library(LAPACK_LIBRARIES + NAMES flame FLAME + PATHS ${AOCL_DIR} + PATH_SUFFIXES lib${AOCL_DIR_SUFFIX} lib lib64 + NO_DEFAULT_PATH + REQUIRED + ) + + # Locate include directory + find_path(_BLAS_LAPACK_INCLUDE_DIRS + NAMES cblas.h + PATHS ${AOCL_DIR} + PATH_SUFFIXES include${AOCL_DIR_SUFFIX} include/blis include + NO_DEFAULT_PATH + REQUIRED + ) + message(STATUS "Using BLAS/LAPACK from AMD BLIS/libFLAME") +elseif(DEFINED ENV{MKL_DIR} OR DEFINED ENV{MKLROOT} OR DEFINED ENV{MKL_ROOT}) + # MKL for x86_64 builds when available + if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm") + message(WARNING "MKL math libraries are not intended for arm64 architecture builds") + endif() + if(DEFINED ENV{MKL_DIR}) + set(MKL_DIR $ENV{MKL_DIR}) + elseif(DEFINED ENV{MKLROOT}) + set(MKL_DIR $ENV{MKLROOT}) + elseif(DEFINED ENV{MKL_ROOT}) + set(MKL_DIR $ENV{MKL_ROOT}) + else() + set(MKL_DIR) + endif() + if(PALACE_WITH_64BIT_BLAS_INT) + 
if(PALACE_WITH_OPENMP) + set(MKL_LIB_SUFFIX "_64ilp") + else() + set(MKL_LIB_SUFFIX "_64ilp_seq") + endif() + else() + if(PALACE_WITH_OPENMP) + set(MKL_LIB_SUFFIX "_64lp") + else() + set(MKL_LIB_SUFFIX "_64lp_seq") + endif() + endif() + list(APPEND CMAKE_PREFIX_PATH ${MKL_DIR}) + set(BLA_VENDOR "Intel10${MKL_LIB_SUFFIX}") + find_package(BLAS REQUIRED) + find_package(LAPACK REQUIRED) + + # Locate include directories + find_path(_BLAS_LAPACK_INCLUDE_DIRS + NAMES mkl_cblas.h + PATHS ${MKL_DIR} + PATH_SUFFIXES include + NO_DEFAULT_PATH + REQUIRED + ) + message(STATUS "Using BLAS/LAPACK from Intel MKL") +else() + # Try to find OpenBLAS installation on the system + # Warning: This does NOT automatically configure for OpenMP support + if(DEFINED ENV{OPENBLAS_DIR}) + set(OPENBLAS_DIR $ENV{OPENBLAS_DIR}) + elseif(DEFINED ENV{OPENBLASROOT}) + set(OPENBLAS_DIR $ENV{OPENBLASROOT}) + elseif(DEFINED ENV{OPENBLAS_ROOT}) + set(OPENBLAS_DIR $ENV{OPENBLAS_ROOT}) + else() + set(OPENBLAS_DIR) + message(STATUS "Using BLAS/LAPACK located by CMake") + endif() + + if(NOT OPENBLAS_DIR STREQUAL "") + # If OpenBLAS was found set the vendor to avoid conflict with Accelerate on Darwin + set(BLA_VENDOR "OpenBLAS") + message(STATUS "Using BLAS/LAPACK from OpenBLAS") + endif() + + list(APPEND CMAKE_PREFIX_PATH ${OPENBLAS_DIR}) + find_package(BLAS REQUIRED) + find_package(LAPACK REQUIRED) + + # Locate include directory + set(_BLAS_LAPACK_DIRS) + foreach(LIB IN LISTS LAPACK_LIBRARIES BLAS_LIBRARIES) + cmake_path(GET LIB PARENT_PATH LIB_DIR) + cmake_path(GET LIB_DIR PARENT_PATH LIB_DIR) + list(APPEND _BLAS_LAPACK_DIRS ${LIB_DIR}) + endforeach() + list(REMOVE_DUPLICATES _BLAS_LAPACK_DIRS) + find_path(_BLAS_LAPACK_INCLUDE_DIRS + NAMES cblas.h + HINTS ${_BLAS_LAPACK_DIRS} + PATH_SUFFIXES include include/openblas include/blis + REQUIRED + ) +endif() +set(LAPACK_LIBRARIES "${LAPACK_LIBRARIES};-lm") + +# Save variables to cache +set(_BLAS_LAPACK_LIBRARIES ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) +list(REMOVE_DUPLICATES _BLAS_LAPACK_LIBRARIES) +string(REPLACE ";" "$" _BLAS_LAPACK_LIBRARIES "${_BLAS_LAPACK_LIBRARIES}") +set(BLAS_LAPACK_LIBRARIES ${_BLAS_LAPACK_LIBRARIES} CACHE STRING + "List of library files for BLAS/LAPACK" +) +set(BLAS_LAPACK_INCLUDE_DIRS ${_BLAS_LAPACK_INCLUDE_DIRS} CACHE STRING + "Path to BLAS/LAPACK include directories" +) \ No newline at end of file diff --git a/cmake/ExternalEigen.cmake b/cmake/ExternalEigen.cmake index ed63a36f48..df5141312f 100644 --- a/cmake/ExternalEigen.cmake +++ b/cmake/ExternalEigen.cmake @@ -1,29 +1,30 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -# -# Configure Eigen library (header-only) -# - -set(EIGEN_OPTIONS ${PALACE_SUPERBUILD_DEFAULT_ARGS}) -list(APPEND EIGEN_OPTIONS - "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" - "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" - "-DEIGEN_BUILD_DOC=ON" - "-DBUILD_TESTING=OFF" -) - -string(REPLACE ";" "; " EIGEN_OPTIONS_PRINT "${EIGEN_OPTIONS}") -message(STATUS "EIGEN_OPTIONS: ${EIGEN_OPTIONS_PRINT}") - -include(ExternalProject) -ExternalProject_Add(eigen - URL ${EXTERN_EIGEN_URL} - SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/eigen - BINARY_DIR ${CMAKE_BINARY_DIR}/extern/eigen-build - INSTALL_DIR ${CMAKE_INSTALL_PREFIX} - PREFIX ${CMAKE_BINARY_DIR}/extern/eigen-cmake - UPDATE_COMMAND "" - CONFIGURE_COMMAND ${CMAKE_COMMAND} "${EIGEN_OPTIONS}" - TEST_COMMAND "" -) +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +# +# Configure Eigen library (header-only) +# + +set(EIGEN_OPTIONS ${PALACE_SUPERBUILD_DEFAULT_ARGS}) +list(APPEND EIGEN_OPTIONS + "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" + "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" + "-DEIGEN_BUILD_DOC=OFF" + "-DBUILD_TESTING=OFF" + "-DCMAKE_Fortran_COMPILER=" # Unneeded +) + +string(REPLACE ";" "; " EIGEN_OPTIONS_PRINT "${EIGEN_OPTIONS}") +message(STATUS "EIGEN_OPTIONS: ${EIGEN_OPTIONS_PRINT}") + +include(ExternalProject) +ExternalProject_Add(eigen + URL ${EXTERN_EIGEN_URL} + SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/eigen + BINARY_DIR ${CMAKE_BINARY_DIR}/extern/eigen-build + INSTALL_DIR ${CMAKE_INSTALL_PREFIX} + PREFIX ${CMAKE_BINARY_DIR}/extern/eigen-cmake + UPDATE_COMMAND "" + CONFIGURE_COMMAND ${CMAKE_COMMAND} "${EIGEN_OPTIONS}" + TEST_COMMAND "" +) diff --git a/cmake/ExternalFmt.cmake b/cmake/ExternalFmt.cmake index fd1c637e70..9dd96de044 100644 --- a/cmake/ExternalFmt.cmake +++ b/cmake/ExternalFmt.cmake @@ -1,30 +1,30 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -# -# Configure fmt library -# - -set(FMT_OPTIONS ${PALACE_SUPERBUILD_DEFAULT_ARGS}) -list(APPEND FMT_OPTIONS - "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" - "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" - "-DFMT_INSTALL=ON" - "-DFMT_DOC=OFF" - "-DFMT_TEST=OFF" -) - -string(REPLACE ";" "; " FMT_OPTIONS_PRINT "${FMT_OPTIONS}") -message(STATUS "FMT_OPTIONS: ${FMT_OPTIONS_PRINT}") - -include(ExternalProject) -ExternalProject_Add(fmt - URL ${EXTERN_FMT_URL} - SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/fmt - BINARY_DIR ${CMAKE_BINARY_DIR}/extern/fmt-build - INSTALL_DIR ${CMAKE_INSTALL_PREFIX} - PREFIX ${CMAKE_BINARY_DIR}/extern/fmt-cmake - UPDATE_COMMAND "" - CONFIGURE_COMMAND ${CMAKE_COMMAND} "${FMT_OPTIONS}" - TEST_COMMAND "" -) +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + +# +# Configure fmt library +# + +set(FMT_OPTIONS ${PALACE_SUPERBUILD_DEFAULT_ARGS}) +list(APPEND FMT_OPTIONS + "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" + "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" + "-DFMT_INSTALL=ON" + "-DFMT_DOC=OFF" + "-DFMT_TEST=OFF" +) + +string(REPLACE ";" "; " FMT_OPTIONS_PRINT "${FMT_OPTIONS}") +message(STATUS "FMT_OPTIONS: ${FMT_OPTIONS_PRINT}") + +include(ExternalProject) +ExternalProject_Add(fmt + URL ${EXTERN_FMT_URL} + SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/fmt + BINARY_DIR ${CMAKE_BINARY_DIR}/extern/fmt-build + INSTALL_DIR ${CMAKE_INSTALL_PREFIX} + PREFIX ${CMAKE_BINARY_DIR}/extern/fmt-cmake + UPDATE_COMMAND "" + CONFIGURE_COMMAND ${CMAKE_COMMAND} "${FMT_OPTIONS}" + TEST_COMMAND "" +) diff --git a/cmake/ExternalGSLIB.cmake b/cmake/ExternalGSLIB.cmake index c647642391..ea10b572da 100644 --- a/cmake/ExternalGSLIB.cmake +++ b/cmake/ExternalGSLIB.cmake @@ -1,102 +1,96 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -# -# Build GSLIB -# - -# Force build order -set(GSLIB_DEPENDENCIES) - -set(GSLIB_OPTIONS - "INSTALL_ROOT=${CMAKE_INSTALL_PREFIX}" - "CC=${CMAKE_C_COMPILER}" - "MPI=1" -) - -set(GSLIB_CFLAGS ${CMAKE_C_FLAGS}) -set(GSLIB_LDFLAGS) - -# GSLIB will add -fPIC as necessary -if(BUILD_SHARED_LIBS) - list(APPEND GSLIB_OPTIONS - "STATIC=0" - "SHARED=1" - ) -else() - list(APPEND GSLIB_OPTIONS - "STATIC=1" - "SHARED=0" - ) -endif() - -# User might specify the MPI compiler wrappers directly, otherwise we need to supply MPI -# as found from the CMake module -if(NOT MPI_FOUND) - message(FATAL_ERROR "MPI is not found when trying to build GSLIB") -endif() -if(NOT CMAKE_C_COMPILER STREQUAL MPI_C_COMPILER) - foreach(INCLUDE_DIR IN LISTS MPI_C_INCLUDE_DIRS) - set(GSLIB_CFLAGS "${GSLIB_CFLAGS} -I${INCLUDE_DIR}") - endforeach() - string(REPLACE ";" " " GSLIB_MPI_LIBRARIES "${MPI_C_LIBRARIES}") - set(GSLIB_LDFLAGS "${GSLIB_LDFLAGS} ${GSLIB_MPI_LIBRARIES}") -endif() - -# Don't build GSLIB with external BLAS (default option) -list(APPEND GSLIB_OPTIONS - "BLAS=0" -) - -# Configure BLAS dependency -# if(NOT "${BLAS_LAPACK_LIBRARIES}" STREQUAL "") -# foreach(INCLUDE_DIR IN LISTS BLAS_LAPACK_INCLUDE_DIRS) -# set(GSLIB_CFLAGS "${GSLIB_CFLAGS} -I${INCLUDE_DIR}") -# endforeach() -# string(REPLACE "$" " " GSLIB_BLAS_LAPACK_LIBRARIES "${BLAS_LAPACK_LIBRARIES}") -# set(GSLIB_LDFLAGS "${GSLIB_LDFLAGS} ${GSLIB_BLAS_LAPACK_LIBRARIES}") -# if(BLA_VENDOR MATCHES "Intel") -# list(APPEND GSLIB_OPTIONS -# "BLAS=1" -# "MKL=1" -# ) -# else() -# list(APPEND GSLIB_OPTIONS -# "BLAS=1" -# ) -# endif() -# else() -# list(APPEND GSLIB_OPTIONS -# "BLAS=0" -# ) -# endif() - -list(APPEND GSLIB_OPTIONS - "CFLAGS=${GSLIB_CFLAGS}" - "LDFLAGS=${GSLIB_LDFLAGS}" -) - -string(REPLACE ";" "; " GSLIB_OPTIONS_PRINT "${GSLIB_OPTIONS}") -message(STATUS "GSLIB_OPTIONS: ${GSLIB_OPTIONS_PRINT}") - -# Fix build -set(GSLIB_PATCH_FILES - "${CMAKE_SOURCE_DIR}/extern/patch/gslib/patch_build.diff" -) - -include(ExternalProject) -ExternalProject_Add(gslib - DEPENDS ${GSLIB_DEPENDENCIES} - GIT_REPOSITORY ${EXTERN_GSLIB_URL} - GIT_TAG ${EXTERN_GSLIB_GIT_TAG} - SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/gslib - INSTALL_DIR ${CMAKE_INSTALL_PREFIX} - PREFIX ${CMAKE_BINARY_DIR}/extern/gslib-cmake - BUILD_IN_SOURCE TRUE - UPDATE_COMMAND "" - PATCH_COMMAND git apply "${GSLIB_PATCH_FILES}" - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} ${GSLIB_OPTIONS} install - TEST_COMMAND "" -) +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +# +# Build GSLIB +# + +# Force build order +set(GSLIB_DEPENDENCIES) + +set(GSLIB_OPTIONS + "INSTALL_ROOT=${CMAKE_INSTALL_PREFIX}" + "CC=${CMAKE_C_COMPILER}" + "MPI=1" +) + +set(GSLIB_CFLAGS "${CMAKE_C_FLAGS} ${CMAKE_C_FLAGS_${BUILD_TYPE_UPPER}}") +set(GSLIB_LDFLAGS ${CMAKE_EXE_LINKER_FLAGS}) + +# GSLIB will add -fPIC as necessary +if(BUILD_SHARED_LIBS) + list(APPEND GSLIB_OPTIONS + "STATIC=0" + "SHARED=1" + ) +else() + list(APPEND GSLIB_OPTIONS + "STATIC=1" + "SHARED=0" + ) +endif() + +# User might specify the MPI compiler wrappers directly, otherwise we need to supply MPI +# as found from the CMake module +if(NOT MPI_FOUND) + message(FATAL_ERROR "MPI is not found when trying to build GSLIB") +endif() +if(NOT CMAKE_C_COMPILER STREQUAL MPI_C_COMPILER) + foreach(INCLUDE_DIR IN LISTS MPI_C_INCLUDE_DIRS) + set(GSLIB_CFLAGS "${GSLIB_CFLAGS} -I${INCLUDE_DIR}") + endforeach() + string(REPLACE ";" " " GSLIB_MPI_LIBRARIES "${MPI_C_LIBRARIES}") + set(GSLIB_LDFLAGS "${GSLIB_LDFLAGS} ${GSLIB_MPI_LIBRARIES}") +endif() + +# Don't build GSLIB with external BLAS (default option) +list(APPEND GSLIB_OPTIONS + "BLAS=0" +) + +# # Configure BLAS dependency +# if(NOT "${BLAS_LAPACK_LIBRARIES}" STREQUAL "") +# foreach(INCLUDE_DIR IN LISTS BLAS_LAPACK_INCLUDE_DIRS) +# set(GSLIB_CFLAGS "${GSLIB_CFLAGS} -I${INCLUDE_DIR}") +# endforeach() +# string(REPLACE "$" " " GSLIB_BLAS_LAPACK_LIBRARIES "${BLAS_LAPACK_LIBRARIES}") +# set(GSLIB_LDFLAGS "${GSLIB_LDFLAGS} ${GSLIB_BLAS_LAPACK_LIBRARIES}") +# if(BLA_VENDOR MATCHES "Intel") +# list(APPEND GSLIB_OPTIONS +# "BLAS=1" +# "MKL=1" +# ) +# else() +# list(APPEND GSLIB_OPTIONS +# "BLAS=1" +# ) +# endif() +# else() +# list(APPEND GSLIB_OPTIONS +# "BLAS=0" +# ) +# endif() + +list(APPEND GSLIB_OPTIONS + "CFLAGS=${GSLIB_CFLAGS}" + "LDFLAGS=${GSLIB_LDFLAGS}" +) + +string(REPLACE ";" "; " GSLIB_OPTIONS_PRINT "${GSLIB_OPTIONS}") +message(STATUS "GSLIB_OPTIONS: ${GSLIB_OPTIONS_PRINT}") + +include(ExternalProject) +ExternalProject_Add(gslib + DEPENDS ${GSLIB_DEPENDENCIES} + GIT_REPOSITORY ${EXTERN_GSLIB_URL} + GIT_TAG ${EXTERN_GSLIB_GIT_TAG} + SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/gslib + INSTALL_DIR ${CMAKE_INSTALL_PREFIX} + PREFIX ${CMAKE_BINARY_DIR}/extern/gslib-cmake + BUILD_IN_SOURCE TRUE + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} ${GSLIB_OPTIONS} install + TEST_COMMAND "" +) diff --git a/cmake/ExternalGitTags.cmake b/cmake/ExternalGitTags.cmake index a9bfa9f49e..21cf14a70a 100644 --- a/cmake/ExternalGitTags.cmake +++ b/cmake/ExternalGitTags.cmake @@ -1,295 +1,287 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -# -# Repository URLs and tags for external third-party dependencies -# - -if(__extern_git_tags) - return() -endif() -set(__extern_git_tags YES) - -# ARPACK-NG -set(EXTERN_ARPACK_URL - "https://github.com/opencollab/arpack-ng.git" CACHE STRING - "URL for external ARPACK-NG build" -) -set(EXTERN_ARPACK_GIT_BRANCH - "master" CACHE STRING - "Git branch for external ARPACK-NG build" -) -set(EXTERN_ARPACK_GIT_TAG - "569a3859c1bf79b303d830ada7b745de0f18512c" CACHE STRING # 10/15/2023 - "Git tag for external ARPACK-NG build" -) - -# ButterflyPACK (for STRUMPACK) -set(EXTERN_BUTTERFLYPACK_URL - "https://github.com/liuyangzhuan/ButterflyPACK.git" CACHE STRING - "URL for external ButterflyPACK build" -) -set(EXTERN_BUTTERFLYPACK_GIT_BRANCH - "master" CACHE STRING - "Git branch for external ButterflyPACK build" -) -set(EXTERN_BUTTERFLYPACK_GIT_TAG - "cb336d97dce3ddd371e87c21f622fdba3ca3ed9b" CACHE STRING # 10/31/2023 - "Git tag for external ButterflyPACK build" -) - -# GKlib (for METIS and ParMETIS) -set(EXTERN_GKLIB_URL - "https://github.com/KarypisLab/GKlib.git" CACHE STRING - "URL for external GKlib build" -) -set(EXTERN_GKLIB_GIT_BRANCH - "master" CACHE STRING - "Git branch for external GKlib build" -) -set(EXTERN_GKLIB_GIT_TAG - "8bd6bad750b2b0d90800c632cf18e8ee93ad72d7" CACHE STRING # 03/26/2023 - "Git tag for external GKlib build" -) - -# GSLIB -set(EXTERN_GSLIB_URL - "https://github.com/Nek5000/gslib.git" CACHE STRING - "URL for external GSLIB build" -) -set(EXTERN_GSLIB_GIT_BRANCH - "master" CACHE STRING - "Git branch for external GSLIB build" -) -set(EXTERN_GSLIB_GIT_TAG - "39d1baae8f4bfebe3ebca6a234dcc8ba1ee5edc7" CACHE STRING # 11/09/2022 - "Git tag for external GSLIB build" -) - -# HYPRE (for MFEM) -set(EXTERN_HYPRE_URL - "https://github.com/hypre-space/hypre.git" CACHE STRING - "URL for external HYPRE build" -) -set(EXTERN_HYPRE_GIT_BRANCH - "master" CACHE STRING - "Git branch for external HYPRE build" -) -set(EXTERN_HYPRE_GIT_TAG - "cf43b1653008bf69a433365f39162831ec75a863" CACHE STRING # 11/10/2023 - "Git tag for external HYPRE build" -) - -# libCEED -set(EXTERN_LIBCEED_URL - "https://github.com/CEED/libCEED.git" CACHE STRING - "URL for external libCEED build" -) -set(EXTERN_LIBCEED_GIT_BRANCH - "main" CACHE STRING - "Git branch for external libCEED build" -) -set(EXTERN_LIBCEED_GIT_TAG - "6dbc39456a768b0ea4055927c94c474873ea8bf6" CACHE STRING # main @ 11/12/2023 - "Git tag for external libCEED build" -) - -# LIBXSMM (for libCEED) -set(EXTERN_LIBXSMM_URL - "https://github.com/hfp/libxsmm.git" CACHE STRING - "URL for external LIBXSMM build" -) -set(EXTERN_LIBXSMM_GIT_BRANCH - "main" CACHE STRING - "Git branch for external LIBXSMM build" -) -set(EXTERN_LIBXSMM_GIT_TAG - "c62dae286096c3f3e057cff08f60cb9f9c588423" CACHE STRING # 11/13/2023 - "Git tag for external LIBXSMM build" -) - -# MAGMA -set(EXTERN_MAGMA_URL - "https://bitbucket.org/icl/magma.git" CACHE STRING - "URL for external MAGMA build" -) -set(EXTERN_MAGMA_GIT_BRANCH - "master" CACHE STRING - "Git branch for external MAGMA build" -) -set(EXTERN_MAGMA_GIT_TAG - "fcfe5aa61c1a4c664b36a73ebabbdbab82765e9f" CACHE STRING # 11/09/2023 - "Git tag for external MAGMA build" -) - -# METIS -set(EXTERN_METIS_URL - "https://github.com/KarypisLab/METIS.git" CACHE STRING - "URL for external METIS build" -) -set(EXTERN_METIS_GIT_BRANCH - "master" CACHE STRING - "Git branch for external METIS build" -) -set(EXTERN_METIS_GIT_TAG - "e0f1b88b8efcb24ffa0ec55eabb78fbe61e58ae7" CACHE STRING # 04/02/2023 - "Git 
tag for external METIS build" -) - -# MFEM -set(EXTERN_MFEM_URL - "https://github.com/mfem/mfem.git" CACHE STRING - "URL for external MFEM build" -) -set(EXTERN_MFEM_GIT_BRANCH - "master" CACHE STRING - "Git branch for external MFEM build" -) -set(EXTERN_MFEM_GIT_TAG - "1c58d6d3d1f30d822d3a8b1ebefe07888b348e58" CACHE STRING # master @ 11/12/2023 - "Git tag for external MFEM build" -) - -# MUMPS -set(EXTERN_MUMPS_URL - "https://github.com/scivision/mumps.git" CACHE STRING - "URL for external MUMPS build" -) -set(EXTERN_MUMPS_GIT_BRANCH - "main" CACHE STRING - "Git branch for external MUMPS build" -) -set(EXTERN_MUMPS_GIT_TAG - "1ea85fa01fd79cca8d06fefb52e0bfc2e996132f" CACHE STRING # 11/09/2023 - "Git tag for external MUMPS build" -) - -# ParMETIS -set(EXTERN_PARMETIS_URL - "https://github.com/KarypisLab/ParMETIS.git" CACHE STRING - "URL for external ParMETIS build" -) -set(EXTERN_PARMETIS_GIT_BRANCH - "main" CACHE STRING - "Git branch for external ParMETIS build" -) -set(EXTERN_PARMETIS_GIT_TAG - "8ee6a372ca703836f593e3c450ca903f04be14df" CACHE STRING # 03/26/2023 - "Git tag for external ParMETIS build" -) - -# PETSc (for SLEPc) -set(EXTERN_PETSC_URL - "https://gitlab.com/petsc/petsc.git" CACHE STRING - "URL for external PETSc build" -) -set(EXTERN_PETSC_GIT_BRANCH - "main" CACHE STRING - "Git branch for external PETSc build" -) -set(EXTERN_PETSC_GIT_TAG - "7b506345644a939af5723216e40ffcdd7780697d" CACHE STRING # 11/14/2023 - "Git tag for external PETSc build" -) - -# ScaLAPACK (for STRUMPACK and MUMPS) -set(EXTERN_SCALAPACK_URL - "https://github.com/scivision/scalapack.git" CACHE STRING - "URL for external ScaLAPACK build" -) -set(EXTERN_SCALAPACK_GIT_BRANCH - "main" CACHE STRING - "Git branch for external ScaLAPACK build" -) -set(EXTERN_SCALAPACK_GIT_TAG - "acf286b783d53f73a42ec219ead74357e0b34501" CACHE STRING # 11/09/2023 - "Git tag for external ScaLAPACK build" -) - -# SLATE (for STRUMPACK) -set(EXTERN_SLATE_URL - "https://github.com/icl-utk-edu/slate.git" CACHE STRING - "URL for external SLATE build" -) -set(EXTERN_SLATE_GIT_BRANCH - "master" CACHE STRING - "Git branch for external SLATE build" -) -set(EXTERN_SLATE_GIT_TAG - "4323f4b430198d64c6ba90ca8e8a4f9272f59e77" CACHE STRING # 11/08/2022 - "Git tag for external SLATE build" -) - -# SLEPc -set(EXTERN_SLEPC_URL - "https://gitlab.com/slepc/slepc.git" CACHE STRING - "URL for external SLEPc build" -) -set(EXTERN_SLEPC_GIT_BRANCH - "main" CACHE STRING - "Git branch for external SLEPc build" -) -set(EXTERN_SLEPC_GIT_TAG - "228ef2b053da4df23e47def33189f7d6381cd660" CACHE STRING # 11/14/2023 - "Git tag for external SLEPc build" -) - -# STRUMPACK -set(EXTERN_STRUMPACK_URL - "https://github.com/pghysels/STRUMPACK.git" CACHE STRING - "URL for external STRUMPACK build" -) -set(EXTERN_STRUMPACK_GIT_BRANCH - "master" CACHE STRING - "Git branch for external STRUMPACK build" -) -set(EXTERN_STRUMPACK_GIT_TAG - "8f716f1f819c4fcf864ccaaa2046d50420dcee36" CACHE STRING # 10/26/2023 - "Git tag for external STRUMPACK build" -) - -# SuperLU_DIST -set(EXTERN_SUPERLU_URL - "https://github.com/xiaoyeli/superlu_dist.git" CACHE STRING - "URL for external SuperLU_DIST build" -) -set(EXTERN_SUPERLU_GIT_BRANCH - "master" CACHE STRING - "Git branch for external SuperLU_DIST build" -) -set(EXTERN_SUPERLU_GIT_TAG - "ad6411ff95bbaf0180a5001ff269047622bc1ae6" CACHE STRING # 11/12/2023 - "Git tag for external SuperLU_DIST build" -) - -# ZFP (for STRUMPACK) -set(EXTERN_ZFP_URL - "https://github.com/LLNL/zfp.git" CACHE STRING - "URL for external ZFP build" -) 
-set(EXTERN_ZFP_GIT_BRANCH - "develop" CACHE STRING - "Git branch for external ZFP build" -) -set(EXTERN_ZFP_GIT_TAG - "bcc5a254823224c5010b51dc28d5c6c47b20ef39" CACHE STRING # 10/17/2023 - "Git tag for external ZFP build" -) - -# nlohmann/json -set(EXTERN_JSON_URL - "https://github.com/nlohmann/json/releases/download/v3.11.2/json.tar.xz" CACHE STRING - "URL for external nlohmann/json build" -) - -# fmt -set(EXTERN_FMT_URL - "https://github.com/fmtlib/fmt/releases/download/10.1.1/fmt-10.1.1.zip" CACHE STRING - "URL for external fmt build" -) - -# Eigen -set(EXTERN_EIGEN_URL - "https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.tar.gz" CACHE STRING - "URL for external Eigen build" -) +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + +# +# Repository URLs and tags for external third-party dependencies +# + +if(__extern_git_tags) + return() +endif() +set(__extern_git_tags YES) + +# ARPACK-NG +set(EXTERN_ARPACK_URL + "https://github.com/opencollab/arpack-ng.git" CACHE STRING + "URL for external ARPACK-NG build" +) +set(EXTERN_ARPACK_GIT_BRANCH + "master" CACHE STRING + "Git branch for external ARPACK-NG build" +) +set(EXTERN_ARPACK_GIT_TAG + "804fa3149a0f773064198a8e883bd021832157ca" CACHE STRING + "Git tag for external ARPACK-NG build" +) + +# ButterflyPACK (for STRUMPACK) +set(EXTERN_BUTTERFLYPACK_URL + "https://github.com/liuyangzhuan/ButterflyPACK.git" CACHE STRING + "URL for external ButterflyPACK build" +) +set(EXTERN_BUTTERFLYPACK_GIT_BRANCH + "master" CACHE STRING + "Git branch for external ButterflyPACK build" +) +set(EXTERN_BUTTERFLYPACK_GIT_TAG + "b7130fd234a23232620dfabc52fa1c7df6065842" CACHE STRING + "Git tag for external ButterflyPACK build" +) + +# GSLIB +set(EXTERN_GSLIB_URL + "https://github.com/Nek5000/gslib.git" CACHE STRING + "URL for external GSLIB build" +) +set(EXTERN_GSLIB_GIT_BRANCH + "master" CACHE STRING + "Git branch for external GSLIB build" +) +set(EXTERN_GSLIB_GIT_TAG + "95acf5b42301d6cb48fda88d662f1d784b863089" CACHE STRING + "Git tag for external GSLIB build" +) + +# HYPRE (for MFEM) +set(EXTERN_HYPRE_URL + "https://github.com/hypre-space/hypre.git" CACHE STRING + "URL for external HYPRE build" +) +set(EXTERN_HYPRE_GIT_BRANCH + "master" CACHE STRING + "Git branch for external HYPRE build" +) +set(EXTERN_HYPRE_GIT_TAG + "4dd96d0e83088890879612c58364f6c10756ed90" CACHE STRING + "Git tag for external HYPRE build" +) + +# libCEED +set(EXTERN_LIBCEED_URL + "https://github.com/CEED/libCEED.git" CACHE STRING + "URL for external libCEED build" +) +set(EXTERN_LIBCEED_GIT_BRANCH + "main" CACHE STRING + "Git branch for external libCEED build" +) +set(EXTERN_LIBCEED_GIT_TAG + "95bd1e908b16e04a70015e3a9a7fddec5e9c3fc8" CACHE STRING + "Git tag for external libCEED build" +) + +# LIBXSMM (for libCEED) +set(EXTERN_LIBXSMM_URL + "https://github.com/hfp/libxsmm.git" CACHE STRING + "URL for external LIBXSMM build" +) +set(EXTERN_LIBXSMM_GIT_BRANCH + "main" CACHE STRING + "Git branch for external LIBXSMM build" +) +set(EXTERN_LIBXSMM_GIT_TAG + "3469aa806f4acef8f30c1241d5c2705713811b4c" CACHE STRING + "Git tag for external LIBXSMM build" +) + +# MAGMA +set(EXTERN_MAGMA_URL + "https://github.com/icl-utk-edu/magma.git" CACHE STRING + "URL for external MAGMA build" +) +set(EXTERN_MAGMA_GIT_BRANCH + "master" CACHE STRING + "Git branch for external MAGMA build" +) +set(EXTERN_MAGMA_GIT_TAG + "07b2b05635f0510ea4538f7ab68e50dcf0c0c815" CACHE STRING + "Git tag for external MAGMA build" +) + +# METIS 
+set(EXTERN_METIS_URL + "https://bitbucket.org/petsc/pkg-metis.git" CACHE STRING + "URL for external METIS build" +) +set(EXTERN_METIS_GIT_BRANCH + "master" CACHE STRING + "Git branch for external METIS build" +) +set(EXTERN_METIS_GIT_TAG + "08c3082720ff9114b8e3cbaa4484a26739cd7d2d" CACHE STRING + "Git tag for external METIS build" +) + +# MFEM +set(EXTERN_MFEM_URL + "https://github.com/mfem/mfem.git" CACHE STRING + "URL for external MFEM build" +) +set(EXTERN_MFEM_GIT_BRANCH + "master" CACHE STRING + "Git branch for external MFEM build" +) +set(EXTERN_MFEM_GIT_TAG + "0c4c006ef86dc2b2cf415e5bc4ed9118c9768652" CACHE STRING + "Git tag for external MFEM build" +) + +# MUMPS +set(EXTERN_MUMPS_URL + "https://github.com/scivision/mumps.git" CACHE STRING + "URL for external MUMPS build" +) +set(EXTERN_MUMPS_GIT_BRANCH + "main" CACHE STRING + "Git branch for external MUMPS build" +) +set(EXTERN_MUMPS_GIT_TAG + "1cfd19699702f9a64ff5d45827d6025ff5c3873a" CACHE STRING + "Git tag for external MUMPS build" +) + +# ParMETIS +set(EXTERN_PARMETIS_URL + "https://bitbucket.org/petsc/pkg-parmetis.git" CACHE STRING + "URL for external ParMETIS build" +) +set(EXTERN_PARMETIS_GIT_BRANCH + "master" CACHE STRING + "Git branch for external ParMETIS build" +) +set(EXTERN_PARMETIS_GIT_TAG + "53c9341b6c1ba876c97567cb52ddfc87c159dc36" CACHE STRING + "Git tag for external ParMETIS build" +) + +# PETSc (for SLEPc) +set(EXTERN_PETSC_URL + "https://gitlab.com/petsc/petsc.git" CACHE STRING + "URL for external PETSc build" +) +set(EXTERN_PETSC_GIT_BRANCH + "main" CACHE STRING + "Git branch for external PETSc build" +) +set(EXTERN_PETSC_GIT_TAG + "0311516ef26856f9037490a5104151a9a9d292aa" CACHE STRING + "Git tag for external PETSc build" +) + +# ScaLAPACK (for STRUMPACK and MUMPS) +set(EXTERN_SCALAPACK_URL + "https://github.com/Reference-ScaLAPACK/scalapack.git" CACHE STRING + "URL for external ScaLAPACK build" +) +set(EXTERN_SCALAPACK_GIT_BRANCH + "master" CACHE STRING + "Git branch for external ScaLAPACK build" +) +set(EXTERN_SCALAPACK_GIT_TAG + "6423f17933eb9a2522814b78ab3c0d6da25ee85a" CACHE STRING + "Git tag for external ScaLAPACK build" +) + +# SLEPc +set(EXTERN_SLEPC_URL + "https://gitlab.com/slepc/slepc.git" CACHE STRING + "URL for external SLEPc build" +) +set(EXTERN_SLEPC_GIT_BRANCH + "main" CACHE STRING + "Git branch for external SLEPc build" +) +set(EXTERN_SLEPC_GIT_TAG + "61182f12f55a2c1e9f5f1ddbc66df7e264d4bfe8" CACHE STRING + "Git tag for external SLEPc build" +) + +# STRUMPACK +set(EXTERN_STRUMPACK_URL + "https://github.com/pghysels/STRUMPACK.git" CACHE STRING + "URL for external STRUMPACK build" +) +set(EXTERN_STRUMPACK_GIT_BRANCH + "master" CACHE STRING + "Git branch for external STRUMPACK build" +) +set(EXTERN_STRUMPACK_GIT_TAG + "5a5643c174eb64845bf397859d11a14d4f1d11a1" CACHE STRING + "Git tag for external STRUMPACK build" +) + +# SuperLU_DIST +set(EXTERN_SUPERLU_URL + "https://github.com/xiaoyeli/superlu_dist.git" CACHE STRING + "URL for external SuperLU_DIST build" +) +set(EXTERN_SUPERLU_GIT_BRANCH + "master" CACHE STRING + "Git branch for external SuperLU_DIST build" +) +set(EXTERN_SUPERLU_GIT_TAG + "e621c471cf23329e568df71c6b724329bc04b0f8" CACHE STRING + "Git tag for external SuperLU_DIST build" +) + +# ZFP (for STRUMPACK) +set(EXTERN_ZFP_URL + "https://github.com/LLNL/zfp.git" CACHE STRING + "URL for external ZFP build" +) +set(EXTERN_ZFP_GIT_BRANCH + "develop" CACHE STRING + "Git branch for external ZFP build" +) +set(EXTERN_ZFP_GIT_TAG + "c2dd2966f6bab18e352b0fa29c3c5f27836580f8" CACHE STRING + 
"Git tag for external ZFP build" +) + +# nlohmann/json +set(EXTERN_JSON_URL + "https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz" CACHE STRING + "URL for external nlohmann/json build" +) + +# fmt +set(EXTERN_FMT_URL + "https://github.com/fmtlib/fmt/releases/download/10.2.1/fmt-10.2.1.zip" CACHE STRING + "URL for external fmt build" +) + +# scn +set(EXTERN_SCN_URL + "https://github.com/eliaskosunen/scnlib/archive/refs/tags/v4.0.1.zip" CACHE STRING + "URL for external scn build" +) + +# Eigen +set(EXTERN_EIGEN_URL + "https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.tar.gz" CACHE STRING + "URL for external Eigen build" +) + +# SUNDIALS +set(EXTERN_SUNDIALS_URL + "https://github.com/LLNL/sundials.git" CACHE STRING + "URL for external SUNDIALS build" +) +set(EXTERN_SUNDIALS_GIT_BRANCH + "main" CACHE STRING + "Git branch for external SUNDIALS build" +) +set(EXTERN_SUNDIALS_GIT_TAG + "8e17876d3b4d682b4098684b07a85b005a122f81" CACHE STRING + "Git tag for external SUNDIALS build" +) \ No newline at end of file diff --git a/cmake/ExternalHYPRE.cmake b/cmake/ExternalHYPRE.cmake index 7af157628d..0649d2d5ce 100644 --- a/cmake/ExternalHYPRE.cmake +++ b/cmake/ExternalHYPRE.cmake @@ -1,181 +1,185 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -# -# Build HYPRE -# - -# Force build order -set(HYPRE_DEPENDENCIES) - -# Hypre does not add OpenMP flags -set(HYPRE_CFLAGS "${CMAKE_C_FLAGS}") -set(HYPRE_CXXFLAGS "${CMAKE_CXX_FLAGS}") -set(HYPRE_LDFLAGS "${CMAKE_EXE_LINKER_FLAGS}") -if(PALACE_WITH_OPENMP) - find_package(OpenMP REQUIRED) - set(HYPRE_CFLAGS "${OpenMP_C_FLAGS} ${HYPRE_CFLAGS}") - set(HYPRE_CXXFLAGS "${OpenMP_CXX_FLAGS} ${HYPRE_CXXFLAGS}") - string(REPLACE ";" " " HYPRE_OPENMP_LIBRARIES "${OpenMP_C_LIBRARIES}") - set(HYPRE_LDFLAGS "${HYPRE_OPENMP_LIBRARIES} ${HYPRE_LDFLAGS}") -endif() - -# Silence some CUDA/HIP include file warnings -if(HYPRE_CFLAGS MATCHES "-pedantic" AND (PALACE_WITH_CUDA OR PALACE_WITH_HIP)) - string(REGEX REPLACE "-pedantic" "" HYPRE_CFLAGS ${HYPRE_CFLAGS}) - string(REGEX REPLACE "-pedantic" "" HYPRE_CXXFLAGS ${HYPRE_CXXFLAGS}) -endif() -if(PALACE_WITH_CUDA) - set(HYPRE_CFLAGS "${HYPRE_CFLAGS} -isystem ${CUDA_DIR}/include") - set(HYPRE_CXXFLAGS "${HYPRE_CXXFLAGS} -isystem ${CUDA_DIR}/include") -endif() -if(PALACE_WITH_HIP) - set(HYPRE_CFLAGS "${HYPRE_CFLAGS} -isystem ${ROCM_DIR}/include") - set(HYPRE_CXXFLAGS "${HYPRE_CXXFLAGS} -isystem ${ROCM_DIR}/include") -endif() - -# Need to manually specificy MPI flags for test program compilation/linkage during configure -if(NOT MPI_FOUND) - message(FATAL_ERROR "MPI is not found when trying to build HYPRE") -endif() -if(NOT CMAKE_C_COMPILER STREQUAL MPI_C_COMPILER) - foreach(INCLUDE_DIR IN LISTS MPI_C_INCLUDE_DIRS) - set(HYPRE_CFLAGS "${HYPRE_CFLAGS} -I${INCLUDE_DIR}") - set(HYPRE_CXXFLAGS "${HYPRE_CXXFLAGS} -I${INCLUDE_DIR}") - endforeach() - string(REPLACE ";" " " HYPRE_MPI_LIBRARIES "${MPI_C_LIBRARIES}") - set(HYPRE_LDFLAGS "${HYPRE_LDFLAGS} ${HYPRE_MPI_LIBRARIES}") -endif() - -# Use Autotools build instead of CMake for HIP support -set(HYPRE_OPTIONS - "CC=${CMAKE_C_COMPILER}" - "CFLAGS=${HYPRE_CFLAGS}" - "CXX=${CMAKE_CXX_COMPILER}" - "CXXFLAGS=${HYPRE_CXXFLAGS}" - "FC=" - "LDFLAGS=${HYPRE_LDFLAGS}" - "--prefix=${CMAKE_INSTALL_PREFIX}" - "--disable-fortran" - "--with-MPI" -) -if(CMAKE_BUILD_TYPE MATCHES "Debug|debug|DEBUG") - list(APPEND HYPRE_OPTIONS "--enable-debug") -else() - list(APPEND HYPRE_OPTIONS "--disable-debug") -endif() 
-if(BUILD_SHARED_LIBS) - list(APPEND HYPRE_OPTIONS "--enable-shared") -else() - list(APPEND HYPRE_OPTIONS "--disable-shared") -endif() -if(PALACE_WITH_64BIT_INT) - list(APPEND HYPRE_OPTIONS - "--enable-mixedint" - "--disable-bigint" - ) -else() - list(APPEND HYPRE_OPTIONS - "--disable-mixedint" - "--disable-bigint" - ) -endif() -if(PALACE_WITH_OPENMP) - list(APPEND HYPRE_OPTIONS "--with-openmp") -endif() - -# User might specify the MPI compiler wrappers directly, otherwise we need to supply MPI -# as found from the CMake module -if(NOT CMAKE_C_COMPILER STREQUAL MPI_C_COMPILER) - set(HYPRE_MPI_LIBRARIES) - set(HYPRE_MPI_LIBRARY_DIRS) - foreach(LIB IN LISTS MPI_C_LIBRARIES) - get_filename_component(LIB_NAME ${LIB} NAME_WE) - get_filename_component(LIB_DIR ${LIB} DIRECTORY) - string(REGEX REPLACE "^lib" "" LIB_NAME ${LIB_NAME}) - list(APPEND HYPRE_MPI_LIBRARIES ${LIB_NAME}) - list(APPEND HYPRE_MPI_LIBRARY_DIRS ${LIB_DIR}) - endforeach() - list(REMOVE_DUPLICATES HYPRE_MPI_LIBRARIES) - string(REPLACE ";" " " HYPRE_MPI_LIBRARIES "${HYPRE_MPI_LIBRARIES}") - list(REMOVE_DUPLICATES HYPRE_MPI_LIBRARY_DIRS) - string(REPLACE ";" " " HYPRE_MPI_LIBRARY_DIRS "${HYPRE_MPI_LIBRARY_DIRS}") - string(REPLACE ";" " " HYPRE_MPI_INCLUDE_DIRS "${MPI_C_INCLUDE_DIRS}") - list(APPEND HYPRE_OPTIONS - "--with-MPI-libs=${HYPRE_MPI_LIBRARIES}" - "--with-MPI-lib-dirs=${HYPRE_MPI_LIBRARY_DIRS}" - "--with-MPI-include=${HYPRE_MPI_INCLUDE_DIRS}" - ) -endif() - -# Configure BLAS/LAPACK -if(NOT "${BLAS_LAPACK_LIBRARIES}" STREQUAL "") - string(REPLACE "$" " " HYPRE_BLAS_LAPACK_LIBRARIES "${BLAS_LAPACK_LIBRARIES}") - list(APPEND HYPRE_OPTIONS - "--with-blas-lib=${HYPRE_BLAS_LAPACK_LIBRARIES}" - "--with-lapack-lib=${HYPRE_BLAS_LAPACK_LIBRARIES}" - ) -endif() - -# Configure GPU support -if(PALACE_WITH_CUDA OR PALACE_WITH_HIP) - list(APPEND HYPRE_OPTIONS - "--disable-unified-memory" - ) - if(PALACE_WITH_GPU_AWARE_MPI) - list(APPEND HYPRE_OPTIONS - "--enable-gpu-aware-mpi" - ) - else() - list(APPEND HYPRE_OPTIONS - "--disable-gpu-aware-mpi" - ) - endif() -endif() -if(PALACE_WITH_CUDA) - list(APPEND HYPRE_OPTIONS - "--with-cuda" - "--with-cuda-home=${CUDA_DIR}" - "--enable-curand" - "--enable-cusparse" - "--enable-device-memory-pool" - ) - if(NOT "${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "") - list(GET CMAKE_CUDA_ARCHITECTURES 0 HYPRE_CUDA_ARCH) - list(APPEND HYPRE_OPTIONS - "--with-gpu-arch=${HYPRE_CUDA_ARCH}" - ) - endif() -endif() -if(PALACE_WITH_HIP) - list(APPEND HYPRE_OPTIONS - "--with-hip" - "ROCM_PATH=${ROCM_DIR}" - "--enable-rocrand" - "--enable-rocsparse" - ) - if(NOT "${CMAKE_HIP_ARCHITECTURES}" STREQUAL "") - list(GET CMAKE_HIP_ARCHITECTURES 0 HYPRE_HIP_ARCH) - list(APPEND HYPRE_OPTIONS - "--with-gpu-arch=${HYPRE_HIP_ARCH}" - ) - endif() -endif() - -string(REPLACE ";" "; " HYPRE_OPTIONS_PRINT "${HYPRE_OPTIONS}") -message(STATUS "HYPRE_OPTIONS: ${HYPRE_OPTIONS_PRINT}") - -include(ExternalProject) -ExternalProject_Add(hypre - DEPENDS ${HYPRE_DEPENDENCIES} - GIT_REPOSITORY ${EXTERN_HYPRE_URL} - GIT_TAG ${EXTERN_HYPRE_GIT_TAG} - SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/hypre - INSTALL_DIR ${CMAKE_INSTALL_PREFIX} - PREFIX ${CMAKE_BINARY_DIR}/extern/hypre-cmake - BUILD_IN_SOURCE TRUE - SOURCE_SUBDIR src - UPDATE_COMMAND "" - CONFIGURE_COMMAND ./configure ${HYPRE_OPTIONS} - TEST_COMMAND "" -) +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +# +# Build HYPRE +# + +# Force build order +set(HYPRE_DEPENDENCIES) + +# Hypre does not add OpenMP flags +set(HYPRE_CFLAGS "${CMAKE_C_FLAGS} ${CMAKE_C_FLAGS_${BUILD_TYPE_UPPER}}") +set(HYPRE_CXXFLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${BUILD_TYPE_UPPER}}") +set(HYPRE_LDFLAGS "${CMAKE_EXE_LINKER_FLAGS}") +if(PALACE_WITH_OPENMP) + find_package(OpenMP REQUIRED) + set(HYPRE_CFLAGS "${OpenMP_C_FLAGS} ${HYPRE_CFLAGS}") + set(HYPRE_CXXFLAGS "${OpenMP_CXX_FLAGS} ${HYPRE_CXXFLAGS}") + string(REPLACE ";" " " HYPRE_OPENMP_LIBRARIES "${OpenMP_C_LIBRARIES}") + set(HYPRE_LDFLAGS "${HYPRE_OPENMP_LIBRARIES} ${HYPRE_LDFLAGS}") +endif() + +# Silence some CUDA/HIP include file warnings +if(HYPRE_CFLAGS MATCHES "-pedantic" AND (PALACE_WITH_CUDA OR PALACE_WITH_HIP)) + string(REGEX REPLACE "-pedantic" "" HYPRE_CFLAGS ${HYPRE_CFLAGS}) + string(REGEX REPLACE "-pedantic" "" HYPRE_CXXFLAGS ${HYPRE_CXXFLAGS}) +endif() +if(PALACE_WITH_CUDA) + set(HYPRE_CFLAGS "${HYPRE_CFLAGS} -isystem ${CUDAToolkit_INCLUDE_DIRS}") + set(HYPRE_CXXFLAGS "${HYPRE_CXXFLAGS} -isystem ${CUDAToolkit_INCLUDE_DIRS}") +endif() +if(PALACE_WITH_HIP) + set(HYPRE_CFLAGS "${HYPRE_CFLAGS} -isystem ${ROCM_DIR}/include") + set(HYPRE_CXXFLAGS "${HYPRE_CXXFLAGS} -isystem ${ROCM_DIR}/include") +endif() + +# Need to manually specify MPI flags for test program compilation/linkage during configure +if(NOT MPI_FOUND) + message(FATAL_ERROR "MPI is not found when trying to build HYPRE") +endif() +if(NOT CMAKE_C_COMPILER STREQUAL MPI_C_COMPILER) + foreach(INCLUDE_DIR IN LISTS MPI_C_INCLUDE_DIRS) + set(HYPRE_CFLAGS "${HYPRE_CFLAGS} -I${INCLUDE_DIR}") + set(HYPRE_CXXFLAGS "${HYPRE_CXXFLAGS} -I${INCLUDE_DIR}") + endforeach() + string(REPLACE ";" " " HYPRE_MPI_LIBRARIES "${MPI_C_LIBRARIES}") + set(HYPRE_LDFLAGS "${HYPRE_LDFLAGS} ${HYPRE_MPI_LIBRARIES} -lm") +endif() + +# Use Autotools build instead of CMake for HIP support +set(HYPRE_OPTIONS + "CC=${CMAKE_C_COMPILER}" + "CFLAGS=${HYPRE_CFLAGS}" + "CXX=${CMAKE_CXX_COMPILER}" + "CXXFLAGS=${HYPRE_CXXFLAGS}" + "FC=" + "LDFLAGS=${HYPRE_LDFLAGS}" + "--prefix=${CMAKE_INSTALL_PREFIX}" + "--disable-fortran" + "--with-MPI" + "--with-cxxstandard=17" +) +if(CMAKE_BUILD_TYPE MATCHES "Debug|debug|DEBUG") + list(APPEND HYPRE_OPTIONS "--enable-debug") +else() + list(APPEND HYPRE_OPTIONS "--disable-debug") +endif() +if(BUILD_SHARED_LIBS) + list(APPEND HYPRE_OPTIONS "--enable-shared") +else() + list(APPEND HYPRE_OPTIONS "--disable-shared") +endif() +if(PALACE_WITH_64BIT_INT) + list(APPEND HYPRE_OPTIONS + "--enable-mixedint" + "--disable-bigint" + ) +else() + list(APPEND HYPRE_OPTIONS + "--disable-mixedint" + "--disable-bigint" + ) +endif() +if(PALACE_WITH_OPENMP) + list(APPEND HYPRE_OPTIONS "--with-openmp") +endif() + +# User might specify the MPI compiler wrappers directly, otherwise we need to supply MPI +# as found from the CMake module +if(NOT MPI_FOUND) + message(FATAL_ERROR "MPI is not found when trying to build HYPRE") +endif() +if(NOT CMAKE_C_COMPILER STREQUAL MPI_C_COMPILER) + set(HYPRE_MPI_LIBRARIES) + set(HYPRE_MPI_LIBRARY_DIRS) + foreach(LIB IN LISTS MPI_C_LIBRARIES) + get_filename_component(LIB_NAME ${LIB} NAME_WE) + get_filename_component(LIB_DIR ${LIB} DIRECTORY) + string(REGEX REPLACE "^lib" "" LIB_NAME ${LIB_NAME}) + list(APPEND HYPRE_MPI_LIBRARIES ${LIB_NAME}) + list(APPEND HYPRE_MPI_LIBRARY_DIRS ${LIB_DIR}) + endforeach() + list(REMOVE_DUPLICATES HYPRE_MPI_LIBRARIES) + string(REPLACE ";" " " HYPRE_MPI_LIBRARIES "${HYPRE_MPI_LIBRARIES}") + list(REMOVE_DUPLICATES 
HYPRE_MPI_LIBRARY_DIRS) + string(REPLACE ";" " " HYPRE_MPI_LIBRARY_DIRS "${HYPRE_MPI_LIBRARY_DIRS}") + string(REPLACE ";" " " HYPRE_MPI_INCLUDE_DIRS "${MPI_C_INCLUDE_DIRS}") + list(APPEND HYPRE_OPTIONS + "--with-MPI-libs=${HYPRE_MPI_LIBRARIES}" + "--with-MPI-lib-dirs=${HYPRE_MPI_LIBRARY_DIRS}" + "--with-MPI-include=${HYPRE_MPI_INCLUDE_DIRS}" + ) +endif() + +# Configure BLAS/LAPACK +if(NOT "${BLAS_LAPACK_LIBRARIES}" STREQUAL "") + string(REPLACE "$" " " HYPRE_BLAS_LAPACK_LIBRARIES "${BLAS_LAPACK_LIBRARIES}") + list(APPEND HYPRE_OPTIONS + "--with-blas-lib=${HYPRE_BLAS_LAPACK_LIBRARIES}" + "--with-lapack-lib=${HYPRE_BLAS_LAPACK_LIBRARIES}" + ) +endif() + +# Configure GPU support +if(PALACE_WITH_CUDA OR PALACE_WITH_HIP) + list(APPEND HYPRE_OPTIONS + "--disable-unified-memory" + ) + if(PALACE_WITH_GPU_AWARE_MPI) + list(APPEND HYPRE_OPTIONS + "--enable-gpu-aware-mpi" + ) + else() + list(APPEND HYPRE_OPTIONS + "--disable-gpu-aware-mpi" + ) + endif() +endif() +if(PALACE_WITH_CUDA) + list(APPEND HYPRE_OPTIONS + "--with-cuda" + "--with-cuda-home=${CUDAToolkit_LIBRARY_ROOT}" + "--enable-curand" + "--enable-cusparse" + "--enable-device-memory-pool" + ) + if(NOT "${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "") + list(GET CMAKE_CUDA_ARCHITECTURES 0 HYPRE_CUDA_ARCH) + list(APPEND HYPRE_OPTIONS + "--with-gpu-arch=${HYPRE_CUDA_ARCH}" + ) + endif() +endif() +if(PALACE_WITH_HIP) + list(APPEND HYPRE_OPTIONS + "--with-hip" + "ROCM_PATH=${ROCM_DIR}" + "--enable-rocrand" + "--enable-rocsparse" + ) + if(NOT "${CMAKE_HIP_ARCHITECTURES}" STREQUAL "") + list(GET CMAKE_HIP_ARCHITECTURES 0 HYPRE_HIP_ARCH) + list(APPEND HYPRE_OPTIONS + "--with-gpu-arch=${HYPRE_HIP_ARCH}" + ) + endif() +endif() + +string(REPLACE ";" "; " HYPRE_OPTIONS_PRINT "${HYPRE_OPTIONS}") +message(STATUS "HYPRE_OPTIONS: ${HYPRE_OPTIONS_PRINT}") + +include(ExternalProject) +ExternalProject_Add(hypre + DEPENDS ${HYPRE_DEPENDENCIES} + GIT_REPOSITORY ${EXTERN_HYPRE_URL} + GIT_TAG ${EXTERN_HYPRE_GIT_TAG} + SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/hypre + INSTALL_DIR ${CMAKE_INSTALL_PREFIX} + PREFIX ${CMAKE_BINARY_DIR}/extern/hypre-cmake + BUILD_IN_SOURCE TRUE + SOURCE_SUBDIR src + UPDATE_COMMAND "" + CONFIGURE_COMMAND ./configure ${HYPRE_OPTIONS} + TEST_COMMAND "" +) diff --git a/cmake/ExternalJSON.cmake b/cmake/ExternalJSON.cmake index 4a7e50f1a7..2f69f33912 100644 --- a/cmake/ExternalJSON.cmake +++ b/cmake/ExternalJSON.cmake @@ -1,29 +1,29 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -# -# Configure JSON library from nlohamnn/json (header-only) -# - -set(JSON_OPTIONS ${PALACE_SUPERBUILD_DEFAULT_ARGS}) -list(APPEND JSON_OPTIONS - "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" - "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" - "-DJSON_Install=ON" - "-DJSON_BuildTests=OFF" -) - -string(REPLACE ";" "; " JSON_OPTIONS_PRINT "${JSON_OPTIONS}") -message(STATUS "JSON_OPTIONS: ${JSON_OPTIONS_PRINT}") - -include(ExternalProject) -ExternalProject_Add(json - URL ${EXTERN_JSON_URL} - SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/json - BINARY_DIR ${CMAKE_BINARY_DIR}/extern/json-build - INSTALL_DIR ${CMAKE_INSTALL_PREFIX} - PREFIX ${CMAKE_BINARY_DIR}/extern/json-cmake - UPDATE_COMMAND "" - CONFIGURE_COMMAND ${CMAKE_COMMAND} "${JSON_OPTIONS}" - TEST_COMMAND "" -) +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +# +# Configure JSON library from nlohamnn/json (header-only) +# + +set(JSON_OPTIONS ${PALACE_SUPERBUILD_DEFAULT_ARGS}) +list(APPEND JSON_OPTIONS + "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" + "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" + "-DJSON_Install=ON" + "-DJSON_BuildTests=OFF" +) + +string(REPLACE ";" "; " JSON_OPTIONS_PRINT "${JSON_OPTIONS}") +message(STATUS "JSON_OPTIONS: ${JSON_OPTIONS_PRINT}") + +include(ExternalProject) +ExternalProject_Add(json + URL ${EXTERN_JSON_URL} + SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/json + BINARY_DIR ${CMAKE_BINARY_DIR}/extern/json-build + INSTALL_DIR ${CMAKE_INSTALL_PREFIX} + PREFIX ${CMAKE_BINARY_DIR}/extern/json-cmake + UPDATE_COMMAND "" + CONFIGURE_COMMAND ${CMAKE_COMMAND} "${JSON_OPTIONS}" + TEST_COMMAND "" +) diff --git a/cmake/ExternalLIBXSMM.cmake b/cmake/ExternalLIBXSMM.cmake index 1e37954fa5..b351a48a66 100644 --- a/cmake/ExternalLIBXSMM.cmake +++ b/cmake/ExternalLIBXSMM.cmake @@ -1,61 +1,100 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -# -# Build LIBXSMM (for libCEED) -# - -# Force build order -set(LIBXSMM_DEPENDENCIES) - -set(LIBXSMM_OPTIONS - "PREFIX=${CMAKE_INSTALL_PREFIX}" - "CC=${CMAKE_C_COMPILER}" - "CXX=${CMAKE_CXX_COMPILER}" - "FC=0" - "FORTRAN=0" - "BLAS=0" # For now, no BLAS linkage (like PyFR) - "SYM=1" # Always build with symbols - "VERBOSE=1" - "PPKGDIR=lib/pkgconfig" - "PMODDIR=lib/pkgconfig" -) - -# Always build LIBXSMM as a shared library -list(APPEND LIBXSMM_OPTIONS - "STATIC=0" -) - -# Configure debugging -if(CMAKE_BUILD_TYPE MATCHES "Debug|debug|DEBUG") - list(APPEND LIBXSMM_OPTIONS - "DBG=1" - "TRACE=1" - ) -endif() - -# Fix libxsmmext library linkage on macOS -if(CMAKE_SYSTEM_NAME MATCHES "Darwin") - list(APPEND LIBXSMM_OPTIONS - "LDFLAGS=-undefined dynamic_lookup" - ) -endif() - -string(REPLACE ";" "; " LIBXSMM_OPTIONS_PRINT "${LIBXSMM_OPTIONS}") -message(STATUS "LIBXSMM_OPTIONS: ${LIBXSMM_OPTIONS_PRINT}") - -include(ExternalProject) -ExternalProject_Add(libxsmm - DEPENDS ${LIBXSMM_DEPENDENCIES} - GIT_REPOSITORY ${EXTERN_LIBXSMM_URL} - GIT_TAG ${EXTERN_LIBXSMM_GIT_TAG} - SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/libxsmm - INSTALL_DIR ${CMAKE_INSTALL_PREFIX} - PREFIX ${CMAKE_BINARY_DIR}/extern/libxsmm-cmake - BUILD_IN_SOURCE TRUE - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} ${LIBXSMM_OPTIONS} install-minimal - TEST_COMMAND "" -) +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + +# +# Build LIBXSMM (for libCEED) +# + +# Force build order +set(LIBXSMM_DEPENDENCIES) + +set(LIBXSMM_OPTIONS + # "PREFIX=${CMAKE_INSTALL_PREFIX}" # Don't use install step, see comment below + "OUTDIR=${CMAKE_INSTALL_PREFIX}/lib" + "DIRSTATE=." 
+ "CC=${CMAKE_C_COMPILER}" + "CXX=${CMAKE_CXX_COMPILER}" + "FC=" + "FORTRAN=0" + "BLAS=0" # For now, no BLAS linkage (like PyFR) + "SYM=1" # Always build with symbols + "VERBOSE=1" +) + +# Always build LIBXSMM as a shared library +list(APPEND LIBXSMM_OPTIONS + "STATIC=0" +) + +# Configure debugging +if(CMAKE_BUILD_TYPE MATCHES "Debug|debug|DEBUG") + list(APPEND LIBXSMM_OPTIONS + "DBG=1" + "TRACE=1" + ) +endif() + +# Fix libxsmmext library linkage on macOS +if(CMAKE_SYSTEM_NAME MATCHES "Darwin") + list(APPEND LIBXSMM_OPTIONS + "LDFLAGS=-undefined dynamic_lookup" + ) +endif() + +string(REPLACE ";" "; " LIBXSMM_OPTIONS_PRINT "${LIBXSMM_OPTIONS}") +message(STATUS "LIBXSMM_OPTIONS: ${LIBXSMM_OPTIONS_PRINT}") + +# Don't use LIBXSMM install step, since it just copies shared libraries and doesn't modify +# the dependency locations directly (doesn't use RPATH). Just build directly into the +# installation directory instead. See https://github.com/libxsmm/libxsmm/issues/883. +set(LIBXSMM_INSTALL_HEADERS + libxsmm.h + libxsmm_config.h + libxsmm_version.h + libxsmm_cpuid.h + libxsmm_fsspmdm.h + libxsmm_generator.h + libxsmm_intrinsics_x86.h + libxsmm_macros.h + libxsmm_math.h + libxsmm_malloc.h + libxsmm_memory.h + libxsmm_sync.h + libxsmm_typedefs.h +) +list(TRANSFORM LIBXSMM_INSTALL_HEADERS PREPEND /include/) +set(LIBXSMM_INSTALL_PKGCONFIG + libxsmm.pc + libxsmmext.pc + libxsmmnoblas.pc + libxsmm-shared.pc + libxsmmext-shared.pc + libxsmmnoblas-shared.pc + libxsmm.env +) +list(TRANSFORM LIBXSMM_INSTALL_PKGCONFIG PREPEND ${CMAKE_INSTALL_PREFIX}/lib/) + +include(ExternalProject) +ExternalProject_Add(libxsmm + DEPENDS ${LIBXSMM_DEPENDENCIES} + GIT_REPOSITORY ${EXTERN_LIBXSMM_URL} + GIT_TAG ${EXTERN_LIBXSMM_GIT_TAG} + SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/libxsmm + INSTALL_DIR ${CMAKE_INSTALL_PREFIX} + PREFIX ${CMAKE_BINARY_DIR}/extern/libxsmm-cmake + BUILD_IN_SOURCE TRUE + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} ${LIBXSMM_OPTIONS} + INSTALL_COMMAND + ${CMAKE_COMMAND} -E echo "LIBXSMM installing interface..." && + ${CMAKE_COMMAND} -E make_directory ${CMAKE_INSTALL_PREFIX}/include && + ${CMAKE_COMMAND} -E copy ${LIBXSMM_INSTALL_HEADERS} ${CMAKE_INSTALL_PREFIX}/include && + ${CMAKE_COMMAND} -E echo "LIBXSMM installing pkg-config and module files..." && + ${CMAKE_COMMAND} -E make_directory ${CMAKE_INSTALL_PREFIX}/lib/pkgconfig && + ${CMAKE_COMMAND} -E copy ${LIBXSMM_INSTALL_PKGCONFIG} ${CMAKE_INSTALL_PREFIX}/lib/pkgconfig || + ${CMAKE_COMMAND} -E true && # No error if files don't exist + ${CMAKE_COMMAND} -E rm -f ${LIBXSMM_INSTALL_PKGCONFIG} && + ${CMAKE_COMMAND} -E rm -f ${CMAKE_INSTALL_PREFIX}/lib/.make + TEST_COMMAND "" +) diff --git a/cmake/ExternalLibCEED.cmake b/cmake/ExternalLibCEED.cmake index f52e233a92..a45da7fb4c 100644 --- a/cmake/ExternalLibCEED.cmake +++ b/cmake/ExternalLibCEED.cmake @@ -1,142 +1,124 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -# -# Build libCEED -# - -# Force build order -set(LIBCEED_DEPENDENCIES) -if(PALACE_BUILD_EXTERNAL_DEPS) - if(PALACE_WITH_LIBXSMM) - list(APPEND LIBCEED_DEPENDENCIES libxsmm) - endif() - if(PALACE_WITH_MAGMA) - list(APPEND LIBCEED_DEPENDENCIES magma) - endif() -endif() - -# Note on recommended flags for libCEED (from Makefile, Spack): -# OPT: -O3 -g -march=native -ffp-contract=fast [-fopenmp-simd/-qopenmp-simd] -include(CheckCCompilerFlag) -set(LIBCEED_OPT_FLAGS "${CMAKE_C_FLAGS}") -if(CMAKE_C_COMPILER_ID MATCHES "Intel|IntelLLVM") - set(OMP_SIMD_FLAG -qopenmp-simd) -else() - set(OMP_SIMD_FLAG -fopenmp-simd) -endif() -check_c_compiler_flag(${OMP_SIMD_FLAG} SUPPORTS_OMP_SIMD) -if(SUPPORTS_OMP_SIMD) - set(LIBCEED_OPT_FLAGS "${LIBCEED_OPT_FLAGS} ${OMP_SIMD_FLAG}") -endif() - -# Silence some CUDA/HIP include file warnings -if(PALACE_WITH_CUDA) - set(LIBCEED_OPT_FLAGS "${LIBCEED_OPT_FLAGS} -isystem ${CUDA_DIR}/include") -endif() -if(PALACE_WITH_HIP) - set(LIBCEED_OPT_FLAGS "${LIBCEED_OPT_FLAGS} -isystem ${ROCM_DIR}/include") -endif() - -# Configure -pedantic flag if specified (don't want to enable for GPU code) -if(LIBCEED_OPT_FLAGS MATCHES "-pedantic") - string(REGEX REPLACE "-pedantic" "" LIBCEED_OPT_FLAGS ${LIBCEED_OPT_FLAGS}) - set(LIBCEED_PEDANTIC "1") -else() - set(LIBCEED_PEDANTIC "") -endif() - -# Build libCEED (always as a shared library) -set(LIBCEED_OPTIONS - "prefix=${CMAKE_INSTALL_PREFIX}" - "CC=${CMAKE_C_COMPILER}" - "OPT=${LIBCEED_OPT_FLAGS}" - "STATIC=" - "PEDANTIC=${LIBCEED_PEDANTIC}" -) - -# Configure OpenMP -if(PALACE_WITH_OPENMP) - list(APPEND LIBCEED_OPTIONS - "OPENMP=1" - ) -endif() - -# Configure libCEED backends (nvcc, hipcc flags are configured by libCEED) -if(PALACE_WITH_LIBXSMM) - if(PALACE_BUILD_EXTERNAL_DEPS) - list(APPEND LIBCEED_OPTIONS - "XSMM_DIR=${CMAKE_INSTALL_PREFIX}" - ) - else() - list(APPEND LIBCEED_OPTIONS - "XSMM_DIR=${LIBXSMM_DIR}" - ) - endif() - # LIBXSMM can require linkage with BLAS for fallback - if(NOT "${BLAS_LAPACK_LIBRARIES}" STREQUAL "") - string(REPLACE "$" " " LIBCEED_BLAS_LAPACK_LIBRARIES "${BLAS_LAPACK_LIBRARIES}") - list(APPEND LIBCEED_OPTIONS - "BLAS_LIB=${LIBCEED_BLAS_LAPACK_LIBRARIES}" - ) - endif() -endif() -if(PALACE_WITH_CUDA) - list(APPEND LIBCEED_OPTIONS - "CUDA_DIR=${CUDA_DIR}" - ) - if(NOT "${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "") - list(GET CMAKE_CUDA_ARCHITECTURES 0 LIBCEED_CUDA_ARCH) - list(APPEND LIBCEED_OPTIONS - "CUDA_ARCH=sm_${LIBCEED_CUDA_ARCH}" - ) - endif() -endif() -if(PALACE_WITH_HIP) - list(APPEND LIBCEED_OPTIONS - "ROCM_DIR=${ROCM_DIR}" - ) - if(NOT "${CMAKE_HIP_ARCHITECTURES}" STREQUAL "") - list(GET CMAKE_HIP_ARCHITECTURES 0 LIBCEED_HIP_ARCH) - list(APPEND LIBCEED_OPTIONS - "HIP_ARCH=${LIBCEED_HIP_ARCH}" - ) - endif() -endif() -if(PALACE_WITH_MAGMA) - if(PALACE_BUILD_EXTERNAL_DEPS) - list(APPEND LIBCEED_OPTIONS - "MAGMA_DIR=${CMAKE_INSTALL_PREFIX}" - ) - else() - list(APPEND LIBCEED_OPTIONS - "MAGMA_DIR=${MAGMA_DIR}" - ) - endif() -endif() - -string(REPLACE ";" "; " LIBCEED_OPTIONS_PRINT "${LIBCEED_OPTIONS}") -message(STATUS "LIBCEED_OPTIONS: ${LIBCEED_OPTIONS_PRINT}") - -# Add OpenMP support to libCEED -set(LIBCEED_PATCH_FILES - "${CMAKE_SOURCE_DIR}/extern/patch/libCEED/patch_gpu_restriction_dev.diff" - "${CMAKE_SOURCE_DIR}/extern/patch/libCEED/patch_hcurl_hdiv_basis_cuda_hip.diff" -) - -include(ExternalProject) -ExternalProject_Add(libCEED - DEPENDS ${LIBCEED_DEPENDENCIES} - GIT_REPOSITORY ${EXTERN_LIBCEED_URL} - GIT_TAG ${EXTERN_LIBCEED_GIT_TAG} - SOURCE_DIR 
${CMAKE_BINARY_DIR}/extern/libCEED - INSTALL_DIR ${CMAKE_INSTALL_PREFIX} - PREFIX ${CMAKE_BINARY_DIR}/extern/libCEED-cmake - BUILD_IN_SOURCE TRUE - UPDATE_COMMAND "" - PATCH_COMMAND git apply "${LIBCEED_PATCH_FILES}" - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} ${LIBCEED_OPTIONS} install - TEST_COMMAND "" -) +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + +# +# Build libCEED +# + +# Force build order +set(LIBCEED_DEPENDENCIES) +if(PALACE_WITH_LIBXSMM) + list(APPEND LIBCEED_DEPENDENCIES libxsmm) +endif() +if(PALACE_WITH_MAGMA) + list(APPEND LIBCEED_DEPENDENCIES magma) +endif() + +# Note on recommended flags for libCEED (from Makefile, Spack): +# OPT: -O3 -g -march=native -ffp-contract=fast [-fopenmp-simd/-qopenmp-simd] +include(CheckCCompilerFlag) +set(LIBCEED_OPT_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_C_FLAGS_${BUILD_TYPE_UPPER}}") +if(CMAKE_C_COMPILER_ID MATCHES "Intel|IntelLLVM") + set(OMP_SIMD_FLAG -qopenmp-simd) +else() + set(OMP_SIMD_FLAG -fopenmp-simd) +endif() +check_c_compiler_flag(${OMP_SIMD_FLAG} SUPPORTS_OMP_SIMD) +if(SUPPORTS_OMP_SIMD) + set(LIBCEED_OPT_FLAGS "${LIBCEED_OPT_FLAGS} ${OMP_SIMD_FLAG}") +endif() + +# Silence some CUDA/HIP include file warnings +if(PALACE_WITH_CUDA) + set(LIBCEED_OPT_FLAGS "${LIBCEED_OPT_FLAGS} -isystem ${CUDAToolkit_INCLUDE_DIRS}") +endif() +if(PALACE_WITH_HIP) + set(LIBCEED_OPT_FLAGS "${LIBCEED_OPT_FLAGS} -isystem ${ROCM_DIR}/include") +endif() + +# Configure -pedantic flag if specified (don't want to enable for GPU code) +if(LIBCEED_OPT_FLAGS MATCHES "-pedantic") + string(REGEX REPLACE "-pedantic" "" LIBCEED_OPT_FLAGS ${LIBCEED_OPT_FLAGS}) + set(LIBCEED_PEDANTIC "1") +else() + set(LIBCEED_PEDANTIC "") +endif() + +# Build libCEED (always as a shared library) +set(LIBCEED_OPTIONS + "prefix=${CMAKE_INSTALL_PREFIX}" + "LDFLAGS=${CMAKE_EXE_LINKER_FLAGS}" + "CC=${CMAKE_C_COMPILER}" + "CXX=${CMAKE_CXX_COMPILER}" + "FC=" + "OPT=${LIBCEED_OPT_FLAGS}" + "STATIC=" + "PEDANTIC=${LIBCEED_PEDANTIC}" +) + +# Configure OpenMP +if(PALACE_WITH_OPENMP) + list(APPEND LIBCEED_OPTIONS + "OPENMP=1" + ) +endif() + +# Configure libCEED backends (nvcc, hipcc flags are configured by libCEED) +if(PALACE_WITH_LIBXSMM) + list(APPEND LIBCEED_OPTIONS + "XSMM_DIR=${CMAKE_INSTALL_PREFIX}" + ) + # LIBXSMM can require linkage with BLAS for fallback + if(NOT "${BLAS_LAPACK_LIBRARIES}" STREQUAL "") + string(REPLACE "$" " " LIBCEED_BLAS_LAPACK_LIBRARIES "${BLAS_LAPACK_LIBRARIES}") + list(APPEND LIBCEED_OPTIONS + "BLAS_LIB=${LIBCEED_BLAS_LAPACK_LIBRARIES}" + ) + endif() +endif() +if(PALACE_WITH_CUDA) + list(APPEND LIBCEED_OPTIONS + "CUDA_DIR=${CUDAToolkit_LIBRARY_ROOT}" + ) + if(NOT "${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "") + list(GET CMAKE_CUDA_ARCHITECTURES 0 LIBCEED_CUDA_ARCH) + list(APPEND LIBCEED_OPTIONS + "CUDA_ARCH=sm_${LIBCEED_CUDA_ARCH}" + ) + endif() +endif() +if(PALACE_WITH_HIP) + list(APPEND LIBCEED_OPTIONS + "ROCM_DIR=${ROCM_DIR}" + ) + if(NOT "${CMAKE_HIP_ARCHITECTURES}" STREQUAL "") + list(GET CMAKE_HIP_ARCHITECTURES 0 LIBCEED_HIP_ARCH) + list(APPEND LIBCEED_OPTIONS + "HIP_ARCH=${LIBCEED_HIP_ARCH}" + ) + endif() +endif() +if(PALACE_WITH_MAGMA) + list(APPEND LIBCEED_OPTIONS + "MAGMA_DIR=${CMAKE_INSTALL_PREFIX}" + ) +endif() + +string(REPLACE ";" "; " LIBCEED_OPTIONS_PRINT "${LIBCEED_OPTIONS}") +message(STATUS "LIBCEED_OPTIONS: ${LIBCEED_OPTIONS_PRINT}") + +include(ExternalProject) +ExternalProject_Add(libCEED + DEPENDS ${LIBCEED_DEPENDENCIES} + GIT_REPOSITORY 
${EXTERN_LIBCEED_URL} + GIT_TAG ${EXTERN_LIBCEED_GIT_TAG} + SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/libCEED + INSTALL_DIR ${CMAKE_INSTALL_PREFIX} + PREFIX ${CMAKE_BINARY_DIR}/extern/libCEED-cmake + BUILD_IN_SOURCE TRUE + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} ${LIBCEED_OPTIONS} install + TEST_COMMAND "" +) diff --git a/cmake/ExternalMETIS.cmake b/cmake/ExternalMETIS.cmake index fb1021eeea..73c9105ec6 100644 --- a/cmake/ExternalMETIS.cmake +++ b/cmake/ExternalMETIS.cmake @@ -1,150 +1,133 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -# -# Build METIS and ParMETIS -# - -# Force build order -set(GKLIB_DEPENDENCIES) -set(METIS_DEPENDENCIES gklib) -set(PARMETIS_DEPENDENCIES gklib metis) - -# Build GKlib -set(GKLIB_OPTIONS ${PALACE_SUPERBUILD_DEFAULT_ARGS}) -list(APPEND GKLIB_OPTIONS - "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" - "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" -) -if(CMAKE_BUILD_TYPE MATCHES "Debug|debug|DEBUG") - list(APPEND GKLIB_OPTIONS - "-DDEBUG=ON" - "-DASSERT=ON" - "-DASSERT2=ON" - ) -endif() -if(PALACE_WITH_OPENMP) - list(APPEND GKLIB_OPTIONS - "-DOPENMP=ON" - ) -endif() - -string(REPLACE ";" "; " GKLIB_OPTIONS_PRINT "${GKLIB_OPTIONS}") -message(STATUS "GKLIB_OPTIONS: ${GKLIB_OPTIONS_PRINT}") - -# Some build fixes -set(GKLIB_PATCH_FILES - "${CMAKE_SOURCE_DIR}/extern/patch/GKlib/patch_build.diff" - "${CMAKE_SOURCE_DIR}/extern/patch/GKlib/patch_install.diff" -) - -include(ExternalProject) -ExternalProject_Add(gklib - DEPENDS ${GKLIB_DEPENDENCIES} - GIT_REPOSITORY ${EXTERN_GKLIB_URL} - GIT_TAG ${EXTERN_GKLIB_GIT_TAG} - SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/GKlib - BINARY_DIR ${CMAKE_BINARY_DIR}/extern/GKlib-build - INSTALL_DIR ${CMAKE_INSTALL_PREFIX} - PREFIX ${CMAKE_BINARY_DIR}/extern/GKlib-cmake - UPDATE_COMMAND "" - PATCH_COMMAND git apply "${GKLIB_PATCH_FILES}" - CONFIGURE_COMMAND ${CMAKE_COMMAND} "${GKLIB_OPTIONS}" - TEST_COMMAND "" -) - -# Build METIS (build settings are passed from GKlib) -set(METIS_OPTIONS ${PALACE_SUPERBUILD_DEFAULT_ARGS}) -list(APPEND METIS_OPTIONS - "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" - "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" - "-DGKlib_ROOT=${CMAKE_INSTALL_PREFIX}" -) - -string(REPLACE ";" "; " METIS_OPTIONS_PRINT "${METIS_OPTIONS}") -message(STATUS "METIS_OPTIONS: ${METIS_OPTIONS_PRINT}") - -# Some build fixes -set(METIS_PATCH_FILES - "${CMAKE_SOURCE_DIR}/extern/patch/METIS/patch_build.diff" - "${CMAKE_SOURCE_DIR}/extern/patch/METIS/patch_install.diff" -) - -# Configure width of real and integer values -list(APPEND METIS_PATCH_FILES - "${CMAKE_SOURCE_DIR}/extern/patch/METIS/patch_real32.diff" -) -if(PALACE_WITH_64BIT_INT) - list(APPEND METIS_PATCH_FILES - "${CMAKE_SOURCE_DIR}/extern/patch/METIS/patch_idx64.diff" - ) -else() - list(APPEND METIS_PATCH_FILES - "${CMAKE_SOURCE_DIR}/extern/patch/METIS/patch_idx32.diff" - ) -endif() - -ExternalProject_Add(metis - DEPENDS ${METIS_DEPENDENCIES} - GIT_REPOSITORY ${EXTERN_METIS_URL} - GIT_TAG ${EXTERN_METIS_GIT_TAG} - SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/METIS - BINARY_DIR ${CMAKE_BINARY_DIR}/extern/METIS-build - INSTALL_DIR ${CMAKE_INSTALL_PREFIX} - PREFIX ${CMAKE_BINARY_DIR}/extern/METIS-cmake - UPDATE_COMMAND "" - PATCH_COMMAND git apply "${METIS_PATCH_FILES}" - CONFIGURE_COMMAND ${CMAKE_COMMAND} ${METIS_OPTIONS} - TEST_COMMAND "" -) - -# Build ParMETIS (as needed) -if(PALACE_WITH_SUPERLU OR PALACE_WITH_STRUMPACK) - set(PARMETIS_OPTIONS ${METIS_OPTIONS}) - list(APPEND PARMETIS_OPTIONS - 
"-Dmetis_ROOT=${CMAKE_INSTALL_PREFIX}" - ) - - string(REPLACE ";" "; " PARMETIS_OPTIONS_PRINT "${PARMETIS_OPTIONS}") - message(STATUS "PARMETIS_OPTIONS: ${PARMETIS_OPTIONS_PRINT}") - - # Apply some fixes for build and from Spack build - # (https://github.com/spack/spack/tree/develop/var/spack/repos/builtin/packages/parmetis) - set(PARMETIS_PATCH_FILES - "${CMAKE_SOURCE_DIR}/extern/patch/ParMETIS/patch_build.diff" - "${CMAKE_SOURCE_DIR}/extern/patch/ParMETIS/patch_install.diff" - "${CMAKE_SOURCE_DIR}/extern/patch/ParMETIS/patch_spack.diff" - ) - - ExternalProject_Add(parmetis - DEPENDS ${PARMETIS_DEPENDENCIES} - GIT_REPOSITORY ${EXTERN_PARMETIS_URL} - GIT_TAG ${EXTERN_PARMETIS_GIT_TAG} - SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/ParMETIS - BINARY_DIR ${CMAKE_BINARY_DIR}/extern/ParMETIS-build - INSTALL_DIR ${CMAKE_INSTALL_PREFIX} - PREFIX ${CMAKE_BINARY_DIR}/extern/ParMETIS-cmake - UPDATE_COMMAND "" - PATCH_COMMAND git apply "${PARMETIS_PATCH_FILES}" - CONFIGURE_COMMAND ${CMAKE_COMMAND} "${PARMETIS_OPTIONS}" - TEST_COMMAND "" - ) -endif() - -# Save variables to cache -include(GNUInstallDirs) -if(BUILD_SHARED_LIBS) - set(_METIS_LIB_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX}) -else() - set(_METIS_LIB_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX}) -endif() -set(_METIS_LIBRARIES ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/libGKlib${_METIS_LIB_SUFFIX}) -set(_METIS_LIBRARIES ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/libmetis${_METIS_LIB_SUFFIX}$${_METIS_LIBRARIES}) -set(METIS_LIBRARIES ${_METIS_LIBRARIES} CACHE STRING - "List of library files for METIS" -) -if(PALACE_WITH_SUPERLU OR PALACE_WITH_STRUMPACK) - set(PARMETIS_LIBRARIES ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/libparmetis${_METIS_LIB_SUFFIX} CACHE STRING - "List of library files for ParMETIS" - ) -endif() +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +# +# Build METIS and ParMETIS (from PETSc forks) +# + +# Force build order +set(METIS_DEPENDENCIES) +set(PARMETIS_DEPENDENCIES metis) + +# METIS does not add OpenMP flags +set(METIS_CFLAGS "${CMAKE_C_FLAGS}") +set(METIS_MATH_LIB "m") +if(PALACE_WITH_OPENMP) + find_package(OpenMP REQUIRED) + set(METIS_CFLAGS "${OpenMP_C_FLAGS} ${HYPRE_CFLAGS}") + string(REPLACE ";" "$" METIS_OPENMP_LIBRARIES "${OpenMP_C_LIBRARIES}") + set(METIS_MATH_LIB "${METIS_OPENMP_LIBRARIES}$${METIS_MATH_LIB}") +endif() + +# Build METIS +set(METIS_OPTIONS ${PALACE_SUPERBUILD_DEFAULT_ARGS}) +list(APPEND METIS_OPTIONS + "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" + "-DCMAKE_C_FLAGS=${METIS_CFLAGS}" + "-DGKLIB_PATH=${CMAKE_BINARY_DIR}/extern/metis/GKlib" + "-DGKRAND=1" + "-DMATH_LIB=${METIS_MATH_LIB}" +) +if(CMAKE_BUILD_TYPE MATCHES "Debug|debug|DEBUG") + list(APPEND METIS_OPTIONS "-DDEBUG=1") +else() + list(APPEND METIS_OPTIONS "-DDEBUG=0") +endif() +if(BUILD_SHARED_LIBS) + list(APPEND METIS_OPTIONS "-DSHARED=1") +else() + list(APPEND METIS_OPTIONS "-DSHARED=0") +endif() +if(CMAKE_C_COMPILER_ID STREQUAL "MSVC") + list(APPEND METIS_OPTIONS "-DMSVC=1") +else() + list(APPEND METIS_OPTIONS "-DMSVC=0") +endif() +if(PALACE_WITH_64BIT_INT) + list(APPEND METIS_OPTIONS "-DMETIS_USE_LONGINDEX=1") +else() + list(APPEND METIS_OPTIONS "-DMETIS_USE_LONGINDEX=0") +endif() +# list(APPEND METIS_OPTIONS "-DMETIS_USE_DOUBLEPRECISION=1") + +string(REPLACE ";" "; " METIS_OPTIONS_PRINT "${METIS_OPTIONS}") +message(STATUS "METIS_OPTIONS: ${METIS_OPTIONS_PRINT}") + +include(ExternalProject) +ExternalProject_Add(metis + DEPENDS ${METIS_DEPENDENCIES} + GIT_REPOSITORY ${EXTERN_METIS_URL} + GIT_TAG ${EXTERN_METIS_GIT_TAG} + SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/metis + BINARY_DIR ${CMAKE_BINARY_DIR}/extern/metis-build + INSTALL_DIR ${CMAKE_INSTALL_PREFIX} + PREFIX ${CMAKE_BINARY_DIR}/extern/metis-cmake + UPDATE_COMMAND "" + CONFIGURE_COMMAND ${CMAKE_COMMAND} ${METIS_OPTIONS} + TEST_COMMAND "" +) + +# Save variables to cache +if(BUILD_SHARED_LIBS) + set(_METIS_LIB_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX}) +else() + set(_METIS_LIB_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX}) +endif() +set(METIS_LIBRARIES ${CMAKE_INSTALL_PREFIX}/lib/libmetis${_METIS_LIB_SUFFIX} CACHE STRING + "List of library files for METIS" +) + +# Build ParMETIS (as needed) +if(PALACE_WITH_SUPERLU OR PALACE_WITH_STRUMPACK OR PALACE_WITH_MUMPS) + set(PARMETIS_OPTIONS ${PALACE_SUPERBUILD_DEFAULT_ARGS}) + list(APPEND PARMETIS_OPTIONS + "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" + "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" + "-DGKLIB_PATH=${CMAKE_BINARY_DIR}/extern/metis/GKlib" + "-DMETIS_PATH=${CMAKE_INSTALL_PREFIX}" + ) + if(BUILD_SHARED_LIBS) + list(APPEND PARMETIS_OPTIONS "-DSHARED=1") + else() + list(APPEND PARMETIS_OPTIONS "-DSHARED=0") + endif() + if(CMAKE_C_COMPILER_ID STREQUAL "MSVC") + list(APPEND PARMETIS_OPTIONS "-DMSVC=1") + else() + list(APPEND PARMETIS_OPTIONS "-DMSVC=0") + endif() + + # User might specify the MPI compiler wrappers directly, otherwise we need to supply MPI + # as found from the CMake module + if(NOT MPI_FOUND) + message(FATAL_ERROR "MPI is not found when trying to build ParMETIS") + endif() + if(NOT CMAKE_C_COMPILER STREQUAL MPI_C_COMPILER) + string(REPLACE ";" "$" PARMETIS_MPI_LIBRARIES "${MPI_C_LIBRARIES}") + string(REPLACE ";" "$" PARMETIS_MPI_INCLUDE_DIRS "${MPI_C_INCLUDE_DIRS}") + list(APPEND PARMETIS_OPTIONS + "-DMPI_LIBRARIES=${PARMETIS_MPI_LIBRARIES}" + "-DMPI_INCLUDE_PATH=${PARMETIS_MPI_INCLUDE_DIRS}" + ) + endif() + + 
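# Note on the "$" placeholders above: multi-element values are forwarded to the
# external project with "$" standing in for CMake's ";" list separator so that the
# lists survive argument splitting. A minimal sketch of the idiom, assuming the
# superbuild's ExternalProject_Add() calls declare LIST_SEPARATOR "$" (that option is
# not visible in this hunk); the "example" project and EXAMPLE_* names are
# illustrative only:
#
#   include(ExternalProject)
#   set(EXAMPLE_LIBS "liba;libb;libc")
#   string(REPLACE ";" "$" EXAMPLE_LIBS_ESC "${EXAMPLE_LIBS}")
#   ExternalProject_Add(example
#     SOURCE_DIR        ${CMAKE_BINARY_DIR}/extern/example
#     DOWNLOAD_COMMAND  ""
#     LIST_SEPARATOR    "$"
#     CONFIGURE_COMMAND ${CMAKE_COMMAND} "-DEXAMPLE_LIBRARIES=${EXAMPLE_LIBS_ESC}" <SOURCE_DIR>
#     BUILD_COMMAND     ""
#     INSTALL_COMMAND   ""
#   )
#
# ExternalProject then substitutes ";" back in place of "$" when it runs the
# configure step, so the nested CMake project receives a proper list.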
string(REPLACE ";" "; " PARMETIS_OPTIONS_PRINT "${PARMETIS_OPTIONS}") + message(STATUS "PARMETIS_OPTIONS: ${PARMETIS_OPTIONS_PRINT}") + + ExternalProject_Add(parmetis + DEPENDS ${PARMETIS_DEPENDENCIES} + GIT_REPOSITORY ${EXTERN_PARMETIS_URL} + GIT_TAG ${EXTERN_PARMETIS_GIT_TAG} + SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/parmetis + BINARY_DIR ${CMAKE_BINARY_DIR}/extern/parmetis-build + INSTALL_DIR ${CMAKE_INSTALL_PREFIX} + PREFIX ${CMAKE_BINARY_DIR}/extern/parmetis-cmake + UPDATE_COMMAND "" + CONFIGURE_COMMAND ${CMAKE_COMMAND} "${PARMETIS_OPTIONS}" + TEST_COMMAND "" + ) + set(PARMETIS_LIBRARIES ${CMAKE_INSTALL_PREFIX}/lib/libparmetis${_METIS_LIB_SUFFIX} CACHE STRING + "List of library files for ParMETIS" +) +endif() + diff --git a/cmake/ExternalMFEM.cmake b/cmake/ExternalMFEM.cmake index 748b3e8810..351ee89317 100644 --- a/cmake/ExternalMFEM.cmake +++ b/cmake/ExternalMFEM.cmake @@ -1,388 +1,414 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -# -# Build MFEM -# - -# Force build order -if(PALACE_BUILD_EXTERNAL_DEPS) - set(MFEM_DEPENDENCIES hypre metis) - if(PALACE_WITH_MUMPS) - list(APPEND MFEM_DEPENDENCIES mumps) - endif() - if(PALACE_WITH_STRUMPACK) - list(APPEND MFEM_DEPENDENCIES strumpack) - endif() - if(PALACE_WITH_SUPERLU) - list(APPEND MFEM_DEPENDENCIES superlu_dist) - endif() -else() - set(MFEM_DEPENDENCIES) -endif() -if(PALACE_WITH_GSLIB) - list(APPEND MFEM_DEPENDENCIES gslib) -endif() - -# Silence #pragma omp warnings when not building with OpenMP -set(MFEM_CXX_FLAGS "${CMAKE_CXX_FLAGS}") -if(PALACE_WITH_STRUMPACK AND NOT PALACE_WITH_OPENMP) - include(CheckCXXCompilerFlag) - check_cxx_compiler_flag(-Wno-unknown-pragmas SUPPORTS_NOPRAGMA_WARNING) - if(SUPPORTS_NOPRAGMA_WARNING) - set(MFEM_CXX_FLAGS "${MFEM_CXX_FLAGS} -Wno-unknown-pragmas") - endif() -endif() - -# Silence some CUDA/HIP include file warnings -if(PALACE_WITH_CUDA OR PALACE_WITH_HIP) - set(MFEM_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}") - set(MFEM_HIP_FLAGS "${CMAKE_HIP_FLAGS}") - if(MFEM_CXX_FLAGS MATCHES "-pedantic") - string(REGEX REPLACE "-pedantic" "" MFEM_CXX_FLAGS ${MFEM_CXX_FLAGS}) - endif() - if(MFEM_CUDA_FLAGS MATCHES "-pedantic") - string(REGEX REPLACE "-pedantic" "" MFEM_CUDA_FLAGS ${MFEM_CUDA_FLAGS}) - endif() - if(MFEM_HIP_FLAGS MATCHES "-pedantic") - string(REGEX REPLACE "-pedantic" "" MFEM_HIP_FLAGS ${MFEM_HIP_FLAGS}) - endif() - if(MFEM_CUDA_FLAGS MATCHES "-ccbin") - # MFEM adds this via CMAKE_CUDA_HOST_COMPILER - string(REGEX REPLACE "-ccbin ([^ ]+)" "" MFEM_CUDA_FLAGS ${MFEM_CUDA_FLAGS}) - endif() -endif() - -# Find optional MFEM dependencies with CMake because once passed to MFEM, they are -# required -set(PALACE_MFEM_WITH_ZLIB NO) -set(PALACE_MFEM_WITH_LIBUNWIND NO) -find_package(ZLIB) -if(ZLIB_FOUND) - message(STATUS "Building MFEM with zlib support for binary output compression") - set(PALACE_MFEM_WITH_ZLIB YES) -endif() -if(CMAKE_BUILD_TYPE MATCHES "Debug|debug|DEBUG") - find_path(LIBUNWIND_INCLUDE_DIR - NAMES libunwind.h - HINTS /usr - ) - if(LIBUNWIND_INCLUDE_DIR) - message(STATUS "Building MFEM with libunwind support") - set(PALACE_MFEM_WITH_LIBUNWIND YES) - endif() -endif() - -set(MFEM_OPTIONS ${PALACE_SUPERBUILD_DEFAULT_ARGS}) -list(APPEND MFEM_OPTIONS - "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" - "-DCMAKE_CXX_FLAGS=${MFEM_CXX_FLAGS}" - "-DMFEM_USE_MPI=YES" - "-DMFEM_USE_OPENMP=${PALACE_WITH_OPENMP}" - "-DMFEM_THREAD_SAFE=${PALACE_WITH_OPENMP}" - "-DMFEM_USE_SUPERLU=${PALACE_WITH_SUPERLU}" - 
"-DMFEM_USE_STRUMPACK=${PALACE_WITH_STRUMPACK}" - "-DMFEM_USE_MUMPS=${PALACE_WITH_MUMPS}" - "-DMFEM_USE_ZLIB=${PALACE_MFEM_WITH_ZLIB}" - "-DMFEM_USE_LIBUNWIND=${PALACE_MFEM_WITH_LIBUNWIND}" - "-DMFEM_USE_METIS_5=YES" - "-DMFEM_USE_CEED=NO" -) -if(PALACE_WITH_STRUMPACK OR PALACE_WITH_MUMPS) - list(APPEND MFEM_OPTIONS - "-DCMAKE_Fortran_COMPILER=${CMAKE_Fortran_COMPILER}" - "-DCMAKE_Fortran_FLAGS=${CMAKE_Fortran_FLAGS}" - ) -endif() - -# Configure BLAS/LAPACK for dependencies -if(NOT "${BLAS_LAPACK_LIBRARIES}" STREQUAL "") - list(APPEND MFEM_OPTIONS - "-DBLAS_LIBRARIES=${BLAS_LAPACK_LIBRARIES}" - "-DLAPACK_LIBRARIES=${BLAS_LAPACK_LIBRARIES}" - ) -endif() - -# Configure GPU support -if(PALACE_WITH_CUDA) - list(APPEND MFEM_OPTIONS - "-DMFEM_USE_CUDA=YES" - "-DCMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}" - "-DCMAKE_CUDA_FLAGS=${MFEM_CUDA_FLAGS}" - ) - if(NOT "${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "") - list(APPEND MFEM_OPTIONS - "-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}" - "-DCUDA_ARCH=${CMAKE_CUDA_ARCHITECTURES}" - ) - endif() -else() - list(APPEND MFEM_OPTIONS - "-DMFEM_USE_CUDA=NO" - ) -endif() -if(PALACE_WITH_HIP) - list(APPEND MFEM_OPTIONS - "-DMFEM_USE_HIP=YES" - "-DROCM_PATH=${ROCM_DIR}" - "-DCMAKE_HIP_COMPILER=${CMAKE_HIP_COMPILER}" - "-DCMAKE_HIP_FLAGS=${MFEM_HIP_FLAGS}" - ) - if(NOT "${CMAKE_HIP_ARCHITECTURES}" STREQUAL "") - list(APPEND MFEM_OPTIONS - "-DCMAKE_HIP_ARCHITECTURES=${CMAKE_HIP_ARCHITECTURES}" - "-DHIP_ARCH=${CMAKE_HIP_ARCHITECTURES}" - ) - endif() -else() - list(APPEND MFEM_OPTIONS - "-DMFEM_USE_HIP=NO" - ) -endif() - -# MFEM with GSLIB is always built internally -if(PALACE_WITH_GSLIB) - list(APPEND MFEM_OPTIONS - "-DMFEM_USE_GSLIB=YES" - "-DGSLIB_DIR=${CMAKE_INSTALL_PREFIX}" - ) -endif() - -# Configure the rest of MFEM's dependencies -if(PALACE_BUILD_EXTERNAL_DEPS) - list(APPEND MFEM_OPTIONS - "-DMETIS_LIBRARIES=${METIS_LIBRARIES}" - "-DMETIS_INCLUDE_DIRS=${CMAKE_INSTALL_PREFIX}/include" - "-DHYPRE_DIR=${CMAKE_INSTALL_PREFIX}" - "-DHYPRE_REQUIRED_PACKAGES=LAPACK$BLAS" - ) - if(PALACE_WITH_SUPERLU OR PALACE_WITH_STRUMPACK) - list(APPEND MFEM_OPTIONS - "-DParMETIS_LIBRARIES=${PARMETIS_LIBRARIES}$${METIS_LIBRARIES}" - "-DParMETIS_INCLUDE_DIRS=${CMAKE_INSTALL_PREFIX}/include" - ) - endif() - - # HYPRE is built with cusparse, curand (or HIP counterparts). 
- if(PALACE_WITH_CUDA) - find_package(CUDAToolkit REQUIRED) - get_target_property(HYPRE_CURAND_LIBRARY CUDA::curand LOCATION) - get_target_property(HYPRE_CUSPARSE_LIBRARY CUDA::cusparse LOCATION) - list(APPEND MFEM_OPTIONS - "-DHYPRE_REQUIRED_LIBRARIES=${HYPRE_CURAND_LIBRARY}$${HYPRE_CUSPARSE_LIBRARY}" - ) - endif() - if(PALACE_WITH_HIP) - find_package(rocrand REQUIRED) - find_package(rocsparse REQUIRED) - get_target_property(HYPRE_ROCRAND_LIBRARY roc::rocrand LOCATION) - get_target_property(HYPRE_ROCSPARSE_LIBRARY roc::rocsparse LOCATION) - list(APPEND MFEM_OPTIONS - "-DHYPRE_REQUIRED_LIBRARIES=${HYPRE_ROCRAND_LIBRARY}$${HYPRE_ROCSPARSE_LIBRARY}" - ) - endif() - - # Need to pass gfortran (or similar) dependency to C++ linker for MFEM link line - if(PALACE_WITH_STRUMPACK OR PALACE_WITH_MUMPS) - if(CMAKE_Fortran_COMPILER_ID STREQUAL "GNU") - if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - set(STRUMPACK_MUMPS_GFORTRAN_LIBRARY gfortran) - else() - find_library(STRUMPACK_MUMPS_GFORTRAN_LIBRARY - NAMES gfortran - PATHS ${CMAKE_Fortran_IMPLICIT_LINK_DIRECTORIES} - NO_DEFAULT_PATH - REQUIRED - ) - endif() - elseif(CMAKE_Fortran_COMPILER_ID MATCHES "Intel|IntelLLVM") - if(NOT CMAKE_CXX_COMPILER_ID MATCHES "Intel|IntelLLVM") - message(FATAL_ERROR "Intel Fortran compiler detected but not compatible without \ -Intel C++ compiler for MUMPS and STRUMPACK dependencies") - endif() - set(STRUMPACK_MUMPS_GFORTRAN_LIBRARY ifport$ifcore) - endif() - endif() - - # Find cuBLAS and cuSOLVER (or ROCm counterparts) - if(PALACE_WITH_SUPERLU OR PALACE_WITH_STRUMPACK) - if(PALACE_WITH_CUDA) - find_package(CUDAToolkit REQUIRED) - get_target_property(SUPERLU_STRUMPACK_CUBLAS_LIBRARY CUDA::cublas LOCATION) - get_target_property(SUPERLU_STRUMPACK_CUBLASLT_LIBRARY CUDA::cublasLt LOCATION) - get_target_property(SUPERLU_STRUMPACK_CUSOLVER_LIBRARY CUDA::cusolver LOCATION) - set(SUPERLU_STRUMPACK_CUDA_LIBRARIES - ${SUPERLU_STRUMPACK_CUBLAS_LIBRARY} - ${SUPERLU_STRUMPACK_CUBLASLT_LIBRARY} - ${SUPERLU_STRUMPACK_CUSOLVER_LIBRARY} - ) - endif() - if(PALACE_WITH_HIP) - find_package(hipblas REQUIRED) - find_package(rocblas REQUIRED) - find_package(rocsolver REQUIRED) - get_target_property(SUPERLU_STRUMPACK_HIPBLAS_LIBRARY roc::hipblas LOCATION) - get_target_property(SUPERLU_STRUMPACK_ROCBLAS_LIBRARY roc::rocblas LOCATION) - get_target_property(SUPERLU_STRUMPACK_ROCSOLVER_LIBRARY roc::rocsolver LOCATION) - set(SUPERLU_STRUMPACK_ROCM_LIBRARIES - ${SUPERLU_STRUMPACK_ROCBLAS_LIBRARY} - ${SUPERLU_STRUMPACK_HIPBLAS_LIBRARY} - ${SUPERLU_STRUMPACK_ROCSOLVER_LIBRARY} - ) - endif() - endif() - - # Configure SuperLU_DIST - if(PALACE_WITH_SUPERLU) - set(SUPERLU_REQUIRED_PACKAGES "ParMETIS" "METIS" "LAPACK" "BLAS" "MPI") - set(SUPERLU_REQUIRED_LIBRARIES) - if(PALACE_WITH_OPENMP) - list(APPEND SUPERLU_REQUIRED_PACKAGES "OpenMP") - endif() - if(PALACE_WITH_CUDA) - list(APPEND SUPERLU_REQUIRED_PACKAGES "CUDA") - list(APPEND SUPERLU_REQUIRED_LIBRARIES ${SUPERLU_STRUMPACK_CUDA_LIBRARIES}) - endif() - if(PALACE_WITH_HIP) - list(APPEND SUPERLU_REQUIRED_PACKAGES "HIP") - list(APPEND SUPERLU_REQUIRED_LIBRARIES ${SUPERLU_STRUMPACK_ROCM_LIBRARIES}) - endif() - string(REPLACE ";" "$" SUPERLU_REQUIRED_PACKAGES "${SUPERLU_REQUIRED_PACKAGES}") - string(REPLACE ";" "$" SUPERLU_REQUIRED_LIBRARIES "${SUPERLU_REQUIRED_LIBRARIES}") - list(APPEND MFEM_OPTIONS - "-DSuperLUDist_DIR=${CMAKE_INSTALL_PREFIX}" - "-DSuperLUDist_REQUIRED_PACKAGES=${SUPERLU_REQUIRED_PACKAGES}" - ) - if(NOT "${SUPERLU_REQUIRED_LIBRARIES}" STREQUAL "") - list(APPEND MFEM_OPTIONS - 
"-DSuperLUDist_REQUIRED_LIBRARIES=${SUPERLU_REQUIRED_LIBRARIES}" - ) - endif() - endif() - - # Configure STRUMPACK - if(PALACE_WITH_STRUMPACK) - set(STRUMPACK_REQUIRED_PACKAGES "ParMETIS" "METIS" "LAPACK" "BLAS" "MPI" "MPI_Fortran") - set(STRUMPACK_REQUIRED_LIBRARIES ${SCALAPACK_LIBRARIES} ${STRUMPACK_MUMPS_GFORTRAN_LIBRARY}) - if(NOT "${STRUMPACK_EXTRA_LIBRARIES}" STREQUAL "") - list(PREPEND STRUMPACK_REQUIRED_LIBRARIES ${STRUMPACK_EXTRA_LIBRARIES}) - endif() - if(PALACE_WITH_OPENMP) - list(APPEND STRUMPACK_REQUIRED_PACKAGES "OpenMP") - endif() - if(PALACE_WITH_CUDA) - list(APPEND STRUMPACK_REQUIRED_PACKAGES "CUDA") - list(APPEND STRUMPACK_REQUIRED_LIBRARIES ${SUPERLU_STRUMPACK_CUDA_LIBRARIES}) - endif() - if(PALACE_WITH_HIP) - list(APPEND STRUMPACK_REQUIRED_PACKAGES "HIP") - list(APPEND STRUMPACK_REQUIRED_LIBRARIES ${SUPERLU_STRUMPACK_ROCM_LIBRARIES}) - endif() - string(REPLACE ";" "$" STRUMPACK_REQUIRED_PACKAGES "${STRUMPACK_REQUIRED_PACKAGES}") - string(REPLACE ";" "$" STRUMPACK_REQUIRED_LIBRARIES "${STRUMPACK_REQUIRED_LIBRARIES}") - list(APPEND MFEM_OPTIONS - "-DSTRUMPACK_DIR=${CMAKE_INSTALL_PREFIX}" - "-DSTRUMPACK_REQUIRED_PACKAGES=${STRUMPACK_REQUIRED_PACKAGES}" - "-DSTRUMPACK_REQUIRED_LIBRARIES=${STRUMPACK_REQUIRED_LIBRARIES}" - ) - endif() - - # Configure MUMPS - if(PALACE_WITH_MUMPS) - set(MUMPS_REQUIRED_PACKAGES "METIS" "LAPACK" "BLAS" "MPI" "MPI_Fortran" "Threads") - if(PALACE_WITH_OPENMP) - list(APPEND MUMPS_REQUIRED_PACKAGES "OpenMP") - endif() - string(REPLACE ";" "$" MUMPS_REQUIRED_PACKAGES "${MUMPS_REQUIRED_PACKAGES}") - list(APPEND MFEM_OPTIONS - "-DMUMPS_DIR=${CMAKE_INSTALL_PREFIX}" - "-DMUMPS_REQUIRED_PACKAGES=${MUMPS_REQUIRED_PACKAGES}" - "-DMUMPS_REQUIRED_LIBRARIES=${SCALAPACK_LIBRARIES}$${STRUMPACK_MUMPS_GFORTRAN_LIBRARY}" - ) - endif() -else() - # Help find dependencies for the internal MFEM build - # If we trust MFEM's Find.cmake module, we can just set _DIR and, if - # needed, _REQUIRED_PACKAGES. The extra _REQUIRED_LIBRARIES can be used - # to add any additional dependency libraries. 
- set(PALACE_MFEM_DEPS - "METIS" - "ParMETIS" - "HYPRE" - "SuperLUDist" - "STRUMPACK" - "MUMPS" - ) - foreach(DEP IN LISTS PALACE_MFEM_DEPS) - set(${DEP}_DIR "" CACHE STRING "Path to ${DEP} build or installation directory") - set(${DEP}_REQUIRED_PACKAGES "" CACHE STRING "List of additional required packages for ${DEP}") - set(${DEP}_REQUIRED_LIBRARIES "" CACHE STRING "List of additional required libraries for ${DEP}") - # set(${DEP}_LIBRARIES "" CACHE STRING "List of library files for ${DEP}") - # set(${DEP}_INCLUDE_DIRS "" CACHE STRING "Path to ${DEP} include directories") - if(NOT "${${DEP}_DIR}" STREQUAL "") - string(REPLACE ";" "$" DEP_DIR "${${DEP}_DIR}") - list(APPEND MFEM_OPTIONS - "-D${DEP}_DIR=${DEP_DIR}" - ) - endif() - if(NOT "${${DEP}_REQUIRED_PACKAGES}" STREQUAL "") - string(REPLACE ";" "$" DEP_REQUIRED_PACKAGES "${${DEP}_REQUIRED_PACKAGES}") - list(APPEND MFEM_OPTIONS - "-D${DEP}_REQUIRED_PACKAGES=${DEP_REQUIRED_PACKAGES}" - ) - endif() - if(NOT "${${DEP}_REQUIRED_LIBRARIES}" STREQUAL "") - string(REPLACE ";" "$" DEP_REQUIRED_LIBRARIES "${${DEP}_REQUIRED_LIBRARIES}") - list(APPEND MFEM_OPTIONS - "-D${DEP}_REQUIRED_LIBRARIES=${DEP_REQUIRED_LIBRARIES}" - ) - endif() - # if(NOT "${${DEP}_LIBRARIES}" STREQUAL "") - # string(REPLACE ";" "$" DEP_LIBRARIES "${${DEP}_LIBRARIES}") - # list(APPEND MFEM_OPTIONS - # "-D${DEP}_LIBRARIES=${DEP_LIBRARIES}" - # ) - # endif() - # if(NOT "${${DEP}_INCLUDE_DIRS}" STREQUAL "") - # string(REPLACE ";" "$" DEP_INCLUDE_DIRS "${${DEP}_INCLUDE_DIRS}") - # list(APPEND MFEM_OPTIONS - # "-D${DEP}_INCLUDE_DIRS=${DEP_INCLUDE_DIRS}" - # ) - # endif() - endforeach() -endif() - -string(REPLACE ";" "; " MFEM_OPTIONS_PRINT "${MFEM_OPTIONS}") -message(STATUS "MFEM_OPTIONS: ${MFEM_OPTIONS_PRINT}") - -# A number of patches to MFEM for our use cases -set(MFEM_PATCH_FILES - "${CMAKE_SOURCE_DIR}/extern/patch/mfem/patch_mfem_device_fixes.diff" - "${CMAKE_SOURCE_DIR}/extern/patch/mfem/patch_cmake_cuda_fix.diff" - "${CMAKE_SOURCE_DIR}/extern/patch/mfem/patch_strumpack_solver_dev.diff" - "${CMAKE_SOURCE_DIR}/extern/patch/mfem/patch_mesh_vis_dev.diff" - "${CMAKE_SOURCE_DIR}/extern/patch/mfem/patch_pfespace_constructor_fix.diff" - "${CMAKE_SOURCE_DIR}/extern/patch/mfem/patch_par_tet_mesh_fix.diff" - "${CMAKE_SOURCE_DIR}/extern/patch/mfem/patch_global_variables_threadsafe.diff" - "${CMAKE_SOURCE_DIR}/extern/patch/mfem/patch_stateless_doftrans_threadsafe.diff" - "${CMAKE_SOURCE_DIR}/extern/patch/mfem/patch_mesh_partitioner_dev.diff" - "${CMAKE_SOURCE_DIR}/extern/patch/mfem/patch_ncmesh_interior_boundary_dev.diff" -) - -include(ExternalProject) -ExternalProject_Add(mfem - DEPENDS ${MFEM_DEPENDENCIES} - GIT_REPOSITORY ${EXTERN_MFEM_URL} - GIT_TAG ${EXTERN_MFEM_GIT_TAG} - SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/mfem - BINARY_DIR ${CMAKE_BINARY_DIR}/extern/mfem-build - INSTALL_DIR ${CMAKE_INSTALL_PREFIX} - PREFIX ${CMAKE_BINARY_DIR}/extern/mfem-cmake - UPDATE_COMMAND "" - PATCH_COMMAND - git reset --hard && - git clean -fd && - git apply "${MFEM_PATCH_FILES}" - CONFIGURE_COMMAND ${CMAKE_COMMAND} "${MFEM_OPTIONS}" - TEST_COMMAND ${CMAKE_MAKE_PROGRAM} ex1 ex1p -) +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +# +# Build MFEM +# + +# Force build order +if(PALACE_BUILD_EXTERNAL_DEPS) + set(MFEM_DEPENDENCIES hypre metis) + if(PALACE_WITH_MUMPS) + list(APPEND MFEM_DEPENDENCIES mumps) + endif() + if(PALACE_WITH_STRUMPACK) + list(APPEND MFEM_DEPENDENCIES strumpack) + endif() + if(PALACE_WITH_SUPERLU) + list(APPEND MFEM_DEPENDENCIES superlu_dist) + endif() + if(PALACE_WITH_SUNDIALS) + list(APPEND MFEM_DEPENDENCIES sundials) + endif() + if(PALACE_WITH_GSLIB) + list(APPEND MFEM_DEPENDENCIES gslib) + endif() +else() + set(MFEM_DEPENDENCIES) +endif() + +# Silence #pragma omp warnings when not building with OpenMP +set(MFEM_CXX_FLAGS "${CMAKE_CXX_FLAGS}") +if(PALACE_WITH_STRUMPACK AND NOT PALACE_WITH_OPENMP) + include(CheckCXXCompilerFlag) + check_cxx_compiler_flag(-Wno-unknown-pragmas SUPPORTS_NOPRAGMA_WARNING) + if(SUPPORTS_NOPRAGMA_WARNING) + set(MFEM_CXX_FLAGS "${MFEM_CXX_FLAGS} -Wno-unknown-pragmas") + endif() +endif() + +# Silence some CUDA/HIP include file warnings +if(PALACE_WITH_CUDA OR PALACE_WITH_HIP) + set(MFEM_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}") + set(MFEM_HIP_FLAGS "${CMAKE_HIP_FLAGS}") + if(MFEM_CXX_FLAGS MATCHES "-pedantic") + string(REGEX REPLACE "-pedantic" "" MFEM_CXX_FLAGS ${MFEM_CXX_FLAGS}) + endif() + if(MFEM_CUDA_FLAGS MATCHES "-pedantic") + string(REGEX REPLACE "-pedantic" "" MFEM_CUDA_FLAGS ${MFEM_CUDA_FLAGS}) + endif() + if(MFEM_HIP_FLAGS MATCHES "-pedantic") + string(REGEX REPLACE "-pedantic" "" MFEM_HIP_FLAGS ${MFEM_HIP_FLAGS}) + endif() + if(MFEM_CUDA_FLAGS MATCHES "-ccbin") + # MFEM adds this via CMAKE_CUDA_HOST_COMPILER + string(REGEX REPLACE "-ccbin ([^ ]+)" "" MFEM_CUDA_FLAGS ${MFEM_CUDA_FLAGS}) + endif() +endif() + +# Find optional MFEM dependencies with CMake because once passed to MFEM, they are +# required +set(PALACE_MFEM_WITH_ZLIB NO) +set(PALACE_MFEM_WITH_LIBUNWIND NO) +find_package(ZLIB) +if(ZLIB_FOUND) + message(STATUS "Building MFEM with zlib support for binary output compression") + set(PALACE_MFEM_WITH_ZLIB YES) +endif() +if(CMAKE_BUILD_TYPE MATCHES "Debug|debug|DEBUG") + find_path(LIBUNWIND_INCLUDE_DIR + NAMES libunwind.h + HINTS /usr + ) + if(LIBUNWIND_INCLUDE_DIR) + message(STATUS "Building MFEM with libunwind support") + set(PALACE_MFEM_WITH_LIBUNWIND YES) + endif() +endif() + +# Replace mfem abort calls with exceptions for testing, default off +set(PALACE_MFEM_USE_EXCEPTIONS NO CACHE BOOL "MFEM throw exceptions instead of abort calls") + +set(MFEM_OPTIONS ${PALACE_SUPERBUILD_DEFAULT_ARGS}) +list(APPEND MFEM_OPTIONS + "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" + "-DCMAKE_CXX_FLAGS=${MFEM_CXX_FLAGS}" + "-DMFEM_USE_MPI=YES" + "-DMFEM_USE_OPENMP=${PALACE_WITH_OPENMP}" + "-DMFEM_THREAD_SAFE=${PALACE_WITH_OPENMP}" + "-DMFEM_USE_SUPERLU=${PALACE_WITH_SUPERLU}" + "-DMFEM_USE_STRUMPACK=${PALACE_WITH_STRUMPACK}" + "-DMFEM_USE_MUMPS=${PALACE_WITH_MUMPS}" + "-DMFEM_USE_ZLIB=${PALACE_MFEM_WITH_ZLIB}" + "-DMFEM_USE_LIBUNWIND=${PALACE_MFEM_WITH_LIBUNWIND}" + "-DMFEM_USE_METIS_5=YES" + "-DMFEM_USE_CEED=NO" + "-DMFEM_USE_SUNDIALS=${PALACE_WITH_SUNDIALS}" + "-DMFEM_USE_EXCEPTIONS=${PALACE_MFEM_USE_EXCEPTIONS}" +) +if(PALACE_WITH_STRUMPACK OR PALACE_WITH_MUMPS) + list(APPEND MFEM_OPTIONS + "-DCMAKE_Fortran_COMPILER=${CMAKE_Fortran_COMPILER}" + "-DCMAKE_Fortran_FLAGS=${CMAKE_Fortran_FLAGS}" + ) +endif() + +# Configure BLAS/LAPACK for dependencies +if(NOT "${BLAS_LAPACK_LIBRARIES}" STREQUAL "") + list(APPEND MFEM_OPTIONS + # "-DMFEM_USE_LAPACK=YES" + "-DBLAS_LIBRARIES=${BLAS_LAPACK_LIBRARIES}" + 
"-DLAPACK_LIBRARIES=${BLAS_LAPACK_LIBRARIES}" + ) +endif() + +# Configure GPU support +if(PALACE_WITH_CUDA) + list(APPEND MFEM_OPTIONS + "-DMFEM_USE_CUDA=YES" + "-DCMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}" + "-DCMAKE_CUDA_FLAGS=${MFEM_CUDA_FLAGS}" + ) + if(NOT "${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "") + list(APPEND MFEM_OPTIONS + "-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}" + "-DCUDA_ARCH=${CMAKE_CUDA_ARCHITECTURES}" + ) + endif() +else() + list(APPEND MFEM_OPTIONS + "-DMFEM_USE_CUDA=NO" + ) +endif() +if(PALACE_WITH_HIP) + list(APPEND MFEM_OPTIONS + "-DMFEM_USE_HIP=YES" + "-DROCM_PATH=${ROCM_DIR}" + "-DCMAKE_HIP_COMPILER=${CMAKE_HIP_COMPILER}" + "-DCMAKE_HIP_FLAGS=${MFEM_HIP_FLAGS}" + ) + if(NOT "${CMAKE_HIP_ARCHITECTURES}" STREQUAL "") + list(APPEND MFEM_OPTIONS + "-DCMAKE_HIP_ARCHITECTURES=${CMAKE_HIP_ARCHITECTURES}" + "-DHIP_ARCH=${CMAKE_HIP_ARCHITECTURES}" + ) + endif() +else() + list(APPEND MFEM_OPTIONS + "-DMFEM_USE_HIP=NO" + ) +endif() + +# MFEM with GSLIB is always built internally +if(PALACE_WITH_GSLIB) + list(APPEND MFEM_OPTIONS "-DMFEM_USE_GSLIB=YES") + if(PALACE_BUILD_EXTERNAL_DEPS) + list(APPEND MFEM_OPTIONS "-DGSLIB_DIR=${CMAKE_INSTALL_PREFIX}") + else() + list(APPEND MFEM_OPTIONS "-DGSLIB_DIR=${GSLIB_DIR}") + endif() +endif() + +# Configure the rest of MFEM's dependencies +if(PALACE_BUILD_EXTERNAL_DEPS) + list(APPEND MFEM_OPTIONS + "-DMETIS_LIBRARIES=${METIS_LIBRARIES}" + "-DMETIS_INCLUDE_DIRS=${CMAKE_INSTALL_PREFIX}/include" + "-DHYPRE_DIR=${CMAKE_INSTALL_PREFIX}" + "-DHYPRE_REQUIRED_PACKAGES=LAPACK$BLAS" + ) + if(PALACE_WITH_SUPERLU OR PALACE_WITH_STRUMPACK OR PALACE_WITH_MUMPS) + list(APPEND MFEM_OPTIONS + "-DParMETIS_LIBRARIES=${PARMETIS_LIBRARIES}$${METIS_LIBRARIES}" + "-DParMETIS_INCLUDE_DIRS=${CMAKE_INSTALL_PREFIX}/include" + ) + endif() + + # HYPRE is built with cusparse, curand (or HIP counterparts), and these are added to + # HYPRE_LIBRARIES by the MFEM CMake build. However, this ignores the include directories + # (for #include , for example), which we can add this way via CMake defining + # CUDAToolkit_INCLUDE_DIRS (and the HIP counterpart). 
+ if(PALACE_WITH_CUDA) + list(APPEND MFEM_OPTIONS + "-DHYPRE_REQUIRED_PACKAGES=CUDAToolkit" + ) + endif() + if(PALACE_WITH_HIP) + list(APPEND MFEM_OPTIONS + "-DHYPRE_REQUIRED_PACKAGES=rocsparse" + ) + endif() + + # Need to pass gfortran (or similar) dependency to C++ linker for MFEM link line + if(PALACE_WITH_STRUMPACK OR PALACE_WITH_MUMPS) + if(CMAKE_Fortran_COMPILER_ID STREQUAL "GNU") + if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + set(STRUMPACK_MUMPS_GFORTRAN_LIBRARY gfortran) + else() + find_library(STRUMPACK_MUMPS_GFORTRAN_LIBRARY + NAMES gfortran + PATHS ${CMAKE_Fortran_IMPLICIT_LINK_DIRECTORIES} + NO_DEFAULT_PATH + REQUIRED + ) + endif() + elseif(CMAKE_Fortran_COMPILER_ID MATCHES "Intel|IntelLLVM") + if(NOT CMAKE_CXX_COMPILER_ID MATCHES "Intel|IntelLLVM") + message(FATAL_ERROR "Intel Fortran compiler detected but not compatible without \ +Intel C++ compiler for MUMPS and STRUMPACK dependencies") + endif() + set(STRUMPACK_MUMPS_GFORTRAN_LIBRARY ifport$ifcore) + endif() + endif() + + # Find cuBLAS and cuSOLVER (or ROCm counterparts) + if(PALACE_WITH_SUPERLU OR PALACE_WITH_STRUMPACK) + if(PALACE_WITH_CUDA) + find_package(CUDAToolkit REQUIRED) + get_target_property(SUPERLU_STRUMPACK_CUBLAS_LIBRARY CUDA::cublas LOCATION) + get_target_property(SUPERLU_STRUMPACK_CUBLASLT_LIBRARY CUDA::cublasLt LOCATION) + get_target_property(SUPERLU_STRUMPACK_CUSOLVER_LIBRARY CUDA::cusolver LOCATION) + set(SUPERLU_STRUMPACK_CUDA_LIBRARIES + ${SUPERLU_STRUMPACK_CUBLAS_LIBRARY} + ${SUPERLU_STRUMPACK_CUBLASLT_LIBRARY} + ${SUPERLU_STRUMPACK_CUSOLVER_LIBRARY} + ) + endif() + if(PALACE_WITH_HIP) + find_package(hipblas REQUIRED) + find_package(rocblas REQUIRED) + find_package(rocsolver REQUIRED) + get_target_property(SUPERLU_STRUMPACK_HIPBLAS_LIBRARY roc::hipblas LOCATION) + get_target_property(SUPERLU_STRUMPACK_ROCBLAS_LIBRARY roc::rocblas LOCATION) + get_target_property(SUPERLU_STRUMPACK_ROCSOLVER_LIBRARY roc::rocsolver LOCATION) + set(SUPERLU_STRUMPACK_ROCM_LIBRARIES + ${SUPERLU_STRUMPACK_ROCBLAS_LIBRARY} + ${SUPERLU_STRUMPACK_HIPBLAS_LIBRARY} + ${SUPERLU_STRUMPACK_ROCSOLVER_LIBRARY} + ) + endif() + endif() + + # Configure SuperLU_DIST + if(PALACE_WITH_SUPERLU) + set(SUPERLU_REQUIRED_PACKAGES "ParMETIS" "METIS" "LAPACK" "BLAS" "MPI") + set(SUPERLU_REQUIRED_LIBRARIES) + if(PALACE_WITH_OPENMP) + list(APPEND SUPERLU_REQUIRED_PACKAGES "OpenMP") + endif() + if(PALACE_WITH_CUDA) + list(APPEND SUPERLU_REQUIRED_PACKAGES "CUDAToolkit") + list(APPEND SUPERLU_REQUIRED_LIBRARIES ${SUPERLU_STRUMPACK_CUDA_LIBRARIES}) + endif() + if(PALACE_WITH_HIP) + list(APPEND SUPERLU_REQUIRED_PACKAGES "hipblas$rocblas") + list(APPEND SUPERLU_REQUIRED_LIBRARIES ${SUPERLU_STRUMPACK_ROCM_LIBRARIES}) + endif() + string(REPLACE ";" "$" SUPERLU_REQUIRED_PACKAGES "${SUPERLU_REQUIRED_PACKAGES}") + string(REPLACE ";" "$" SUPERLU_REQUIRED_LIBRARIES "${SUPERLU_REQUIRED_LIBRARIES}") + list(APPEND MFEM_OPTIONS + "-DSuperLUDist_DIR=${CMAKE_INSTALL_PREFIX}" + "-DSuperLUDist_REQUIRED_PACKAGES=${SUPERLU_REQUIRED_PACKAGES}" + ) + if(NOT "${SUPERLU_REQUIRED_LIBRARIES}" STREQUAL "") + list(APPEND MFEM_OPTIONS + "-DSuperLUDist_REQUIRED_LIBRARIES=${SUPERLU_REQUIRED_LIBRARIES}" + ) + endif() + endif() + + # Configure STRUMPACK + if(PALACE_WITH_STRUMPACK) + set(STRUMPACK_REQUIRED_PACKAGES "ParMETIS" "METIS" "LAPACK" "BLAS" "MPI" "MPI_Fortran") + set(STRUMPACK_REQUIRED_LIBRARIES ${SCALAPACK_LIBRARIES} ${STRUMPACK_MUMPS_GFORTRAN_LIBRARY}) + if(NOT "${STRUMPACK_EXTRA_LIBRARIES}" STREQUAL "") + list(PREPEND STRUMPACK_REQUIRED_LIBRARIES ${STRUMPACK_EXTRA_LIBRARIES}) + 
endif() + if(PALACE_WITH_OPENMP) + list(APPEND STRUMPACK_REQUIRED_PACKAGES "OpenMP") + endif() + if(PALACE_WITH_CUDA) + list(APPEND STRUMPACK_REQUIRED_PACKAGES "CUDAToolkit") + list(APPEND STRUMPACK_REQUIRED_LIBRARIES ${SUPERLU_STRUMPACK_CUDA_LIBRARIES}) + endif() + if(PALACE_WITH_HIP) + list(APPEND STRUMPACK_REQUIRED_PACKAGES "hipblas$rocblas") + list(APPEND STRUMPACK_REQUIRED_LIBRARIES ${SUPERLU_STRUMPACK_ROCM_LIBRARIES}) + endif() + string(REPLACE ";" "$" STRUMPACK_REQUIRED_PACKAGES "${STRUMPACK_REQUIRED_PACKAGES}") + string(REPLACE ";" "$" STRUMPACK_REQUIRED_LIBRARIES "${STRUMPACK_REQUIRED_LIBRARIES}") + list(APPEND MFEM_OPTIONS + "-DSTRUMPACK_DIR=${CMAKE_INSTALL_PREFIX}" + "-DSTRUMPACK_REQUIRED_PACKAGES=${STRUMPACK_REQUIRED_PACKAGES}" + "-DSTRUMPACK_REQUIRED_LIBRARIES=${STRUMPACK_REQUIRED_LIBRARIES}" + ) + endif() + + # Configure MUMPS + if(PALACE_WITH_MUMPS) + set(MUMPS_REQUIRED_PACKAGES "ParMETIS" "METIS" "LAPACK" "BLAS" "MPI" "MPI_Fortran" "Threads") + if(PALACE_WITH_OPENMP) + list(APPEND MUMPS_REQUIRED_PACKAGES "OpenMP") + endif() + string(REPLACE ";" "$" MUMPS_REQUIRED_PACKAGES "${MUMPS_REQUIRED_PACKAGES}") + list(APPEND MFEM_OPTIONS + "-DMUMPS_DIR=${CMAKE_INSTALL_PREFIX}" + "-DMUMPS_REQUIRED_PACKAGES=${MUMPS_REQUIRED_PACKAGES}" + "-DMUMPS_REQUIRED_LIBRARIES=${SCALAPACK_LIBRARIES}$${STRUMPACK_MUMPS_GFORTRAN_LIBRARY}" + ) + endif() + + # Configure SUNDIALS + if(PALACE_WITH_SUNDIALS) + set(SUNDIALS_REQUIRED_PACKAGES "LAPACK" "BLAS" "MPI") + if(PALACE_WITH_OPENMP) + list(APPEND SUNDIALS_REQUIRED_PACKAGES "OpenMP") + endif() + if(PALACE_WITH_CUDA) + list(APPEND SUNDIALS_REQUIRED_PACKAGES "CUDAToolkit") + list(APPEND SUNDIALS_REQUIRED_LIBRARIES ${SUPERLU_STRUMPACK_CUDA_LIBRARIES}) + endif() + string(REPLACE ";" "$" SUNDIALS_REQUIRED_PACKAGES "${SUNDIALS_REQUIRED_PACKAGES}") + string(REPLACE ";" "$" SUNDIALS_REQUIRED_LIBRARIES "${SUNDIALS_REQUIRED_LIBRARIES}") + list(APPEND MFEM_OPTIONS + "-DSUNDIALS_DIR=${CMAKE_INSTALL_PREFIX}" + "-DSUNDIALS_REQUIRED_PACKAGES=${SUNDIALS_REQUIRED_PACKAGES}" + ) + if(NOT "${SUNDIALS_REQUIRED_LIBRARIES}" STREQUAL "") + list(APPEND MFEM_OPTIONS + "-DSUNDIALS_REQUIRED_LIBRARIES=${SUNDIALS_REQUIRED_LIBRARIES}" + ) + endif() + endif() + +else() + # Help find dependencies for the internal MFEM build + # If we trust MFEM's Find.cmake module, we can just set _DIR and, if + # needed, _REQUIRED_PACKAGES. The extra _REQUIRED_LIBRARIES can be used + # to add any additional dependency libraries. 
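# For example (a sketch with hypothetical installation paths, assuming
# PALACE_BUILD_EXTERNAL_DEPS is the user-facing switch that disables the superbuild
# dependencies), the cache variables declared in the foreach() below could be seeded
# at configure time as:
#
#   cmake .. -DPALACE_BUILD_EXTERNAL_DEPS=OFF \
#     -DHYPRE_DIR=/opt/hypre \
#     -DSuperLUDist_DIR=/opt/superlu_dist \
#     -DSuperLUDist_REQUIRED_PACKAGES="ParMETIS;METIS"
#
# Any ";"-separated values are then rewritten with the "$" placeholder before being
# passed on to MFEM's configure step, as done in the string(REPLACE ...) calls below.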
+ set(PALACE_MFEM_DEPS + "METIS" + "ParMETIS" + "HYPRE" + "SuperLUDist" + "STRUMPACK" + "MUMPS" + "SUNDIALS" + ) + foreach(DEP IN LISTS PALACE_MFEM_DEPS) + set(${DEP}_DIR "" CACHE STRING "Path to ${DEP} build or installation directory") + set(${DEP}_REQUIRED_PACKAGES "" CACHE STRING "List of additional required packages for ${DEP}") + set(${DEP}_REQUIRED_LIBRARIES "" CACHE STRING "List of additional required libraries for ${DEP}") + # set(${DEP}_LIBRARIES "" CACHE STRING "List of library files for ${DEP}") + # set(${DEP}_INCLUDE_DIRS "" CACHE STRING "Path to ${DEP} include directories") + if(NOT "${${DEP}_DIR}" STREQUAL "") + string(REPLACE ";" "$" DEP_DIR "${${DEP}_DIR}") + list(APPEND MFEM_OPTIONS + "-D${DEP}_DIR=${DEP_DIR}" + ) + endif() + if(NOT "${${DEP}_REQUIRED_PACKAGES}" STREQUAL "") + string(REPLACE ";" "$" DEP_REQUIRED_PACKAGES "${${DEP}_REQUIRED_PACKAGES}") + list(APPEND MFEM_OPTIONS + "-D${DEP}_REQUIRED_PACKAGES=${DEP_REQUIRED_PACKAGES}" + ) + endif() + if(NOT "${${DEP}_REQUIRED_LIBRARIES}" STREQUAL "") + string(REPLACE ";" "$" DEP_REQUIRED_LIBRARIES "${${DEP}_REQUIRED_LIBRARIES}") + list(APPEND MFEM_OPTIONS + "-D${DEP}_REQUIRED_LIBRARIES=${DEP_REQUIRED_LIBRARIES}" + ) + endif() + # if(NOT "${${DEP}_LIBRARIES}" STREQUAL "") + # string(REPLACE ";" "$" DEP_LIBRARIES "${${DEP}_LIBRARIES}") + # list(APPEND MFEM_OPTIONS + # "-D${DEP}_LIBRARIES=${DEP_LIBRARIES}" + # ) + # endif() + # if(NOT "${${DEP}_INCLUDE_DIRS}" STREQUAL "") + # string(REPLACE ";" "$" DEP_INCLUDE_DIRS "${${DEP}_INCLUDE_DIRS}") + # list(APPEND MFEM_OPTIONS + # "-D${DEP}_INCLUDE_DIRS=${DEP_INCLUDE_DIRS}" + # ) + # endif() + endforeach() +endif() + +string(REPLACE ";" "; " MFEM_OPTIONS_PRINT "${MFEM_OPTIONS}") +message(STATUS "MFEM_OPTIONS: ${MFEM_OPTIONS_PRINT}") + +# A number of patches to MFEM for our use cases +set(MFEM_PATCH_FILES + "${CMAKE_SOURCE_DIR}/extern/patch/mfem/patch_mesh_vis_dev.diff" + "${CMAKE_SOURCE_DIR}/extern/patch/mfem/patch_par_tet_mesh_fix_dev.diff" + "${CMAKE_SOURCE_DIR}/extern/patch/mfem/patch_gmsh_parser_performance.diff" + "${CMAKE_SOURCE_DIR}/extern/patch/mfem/patch_race_condition_fix.diff" +) + +include(ExternalProject) +ExternalProject_Add(mfem + DEPENDS ${MFEM_DEPENDENCIES} + GIT_REPOSITORY ${EXTERN_MFEM_URL} + GIT_TAG ${EXTERN_MFEM_GIT_TAG} + SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/mfem + BINARY_DIR ${CMAKE_BINARY_DIR}/extern/mfem-build + INSTALL_DIR ${CMAKE_INSTALL_PREFIX} + PREFIX ${CMAKE_BINARY_DIR}/extern/mfem-cmake + UPDATE_COMMAND "" + PATCH_COMMAND + git reset --hard && + git clean -fd && + git apply "${MFEM_PATCH_FILES}" + CONFIGURE_COMMAND ${CMAKE_COMMAND} "${MFEM_OPTIONS}" + TEST_COMMAND ${CMAKE_MAKE_PROGRAM} ex1 ex1p +) diff --git a/cmake/ExternalMUMPS.cmake b/cmake/ExternalMUMPS.cmake index d2ce6ff7cf..66278ead94 100644 --- a/cmake/ExternalMUMPS.cmake +++ b/cmake/ExternalMUMPS.cmake @@ -1,63 +1,66 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -# -# Build MUMPS (from scivision, with CMake) -# - -# Force build order -set(MUMPS_DEPENDENCIES scalapack parmetis) - -set(MUMPS_OPTIONS ${PALACE_SUPERBUILD_DEFAULT_ARGS}) -list(APPEND MUMPS_OPTIONS - "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" - "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" - "-DCMAKE_Fortran_COMPILER=${CMAKE_Fortran_COMPILER}" - "-DCMAKE_Fortran_FLAGS=${CMAKE_Fortran_FLAGS}" - "-Dparallel=ON" - "-Dopenmp=${PALACE_WITH_OPENMP}" - "-Dintsize64=OFF" - "-DBUILD_SINGLE=OFF" - "-DBUILD_DOUBLE=ON" - "-DBUILD_COMPLEX=OFF" - "-DBUILD_COMPLEX16=OFF" - "-Dmetis=ON" - "-Dparmetis=ON" - "-Dscotch=OFF" - "-DPARMETIS_LIBRARY=${PARMETIS_LIBRARIES}" - "-DMETIS_LIBRARY=${METIS_LIBRARIES}" - "-DMETIS_INCLUDE_DIR=${CMAKE_INSTALL_PREFIX}/include" - "-DSCALAPACK_LIBRARIES=${SCALAPACK_LIBRARIES}" - "-DSCALAPACK_INCLUDE_DIRS=${CMAKE_INSTALL_PREFIX}/include" -) - -# Configure LAPACK dependency -if(NOT "${BLAS_LAPACK_LIBRARIES}" STREQUAL "") - list(APPEND MUMPS_OPTIONS - "-DLAPACK_LIBRARIES=${BLAS_LAPACK_LIBRARIES}" - "-DLAPACK_INCLUDE_DIRS=${BLAS_LAPACK_INCLUDE_DIRS}" - ) -endif() - -string(REPLACE ";" "; " MUMPS_OPTIONS_PRINT "${MUMPS_OPTIONS}") -message(STATUS "MUMPS_OPTIONS: ${MUMPS_OPTIONS_PRINT}") - -# Fix FindLAPACK and FindScaLAPACK in configuration -set(MUMPS_PATCH_FILES - "${CMAKE_SOURCE_DIR}/extern/patch/mumps/patch_build.diff" -) - -include(ExternalProject) -ExternalProject_Add(mumps - DEPENDS ${MUMPS_DEPENDENCIES} - GIT_REPOSITORY ${EXTERN_MUMPS_URL} - GIT_TAG ${EXTERN_MUMPS_GIT_TAG} - SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/mumps - BINARY_DIR ${CMAKE_BINARY_DIR}/extern/mumps-build - INSTALL_DIR ${CMAKE_INSTALL_PREFIX} - PREFIX ${CMAKE_BINARY_DIR}/extern/mumps-cmake - UPDATE_COMMAND "" - PATCH_COMMAND git apply "${MUMPS_PATCH_FILES}" - CONFIGURE_COMMAND ${CMAKE_COMMAND} "${MUMPS_OPTIONS}" - TEST_COMMAND "" -) +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +# +# Build MUMPS (from scivision, with CMake) +# + +# Force build order +set(MUMPS_DEPENDENCIES scalapack parmetis metis) + +set(MUMPS_OPTIONS ${PALACE_SUPERBUILD_DEFAULT_ARGS}) +list(APPEND MUMPS_OPTIONS + "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" + "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" + "-DCMAKE_Fortran_COMPILER=${CMAKE_Fortran_COMPILER}" + "-DCMAKE_Fortran_FLAGS=${CMAKE_Fortran_FLAGS}" + "-DMUMPS_parallel=ON" + "-DMUMPS_openmp=${PALACE_WITH_OPENMP}" + "-Dintsize64=OFF" + "-DBUILD_SINGLE=OFF" + "-DBUILD_DOUBLE=ON" + "-DBUILD_COMPLEX=OFF" + "-DBUILD_COMPLEX16=OFF" + "-DMUMPS_BUILD_TESTING=OFF" + "-Dmetis=ON" + "-Dparmetis=ON" + "-Dscotch=OFF" + "-DMUMPS_scalapack=ON" + "-DPARMETIS_LIBRARY=${PARMETIS_LIBRARIES}" + "-DMETIS_LIBRARY=${METIS_LIBRARIES}" + "-DMETIS_INCLUDE_DIR=${CMAKE_INSTALL_PREFIX}/include" + "-DSCALAPACK_LIBRARIES=${SCALAPACK_LIBRARIES}" + "-DSCALAPACK_INCLUDE_DIRS=${CMAKE_INSTALL_PREFIX}/include" +) + +# Configure LAPACK dependency +if(NOT "${BLAS_LAPACK_LIBRARIES}" STREQUAL "") + list(APPEND MUMPS_OPTIONS + "-DLAPACK_LIBRARIES=${BLAS_LAPACK_LIBRARIES}" + "-DLAPACK_INCLUDE_DIRS=${BLAS_LAPACK_INCLUDE_DIRS}" + ) +endif() + +string(REPLACE ";" "; " MUMPS_OPTIONS_PRINT "${MUMPS_OPTIONS}") +message(STATUS "MUMPS_OPTIONS: ${MUMPS_OPTIONS_PRINT}") + +# Fix FindLAPACK and FindScaLAPACK in configuration +set(MUMPS_PATCH_FILES + "${CMAKE_SOURCE_DIR}/extern/patch/mumps/patch_build.diff" +) + +include(ExternalProject) +ExternalProject_Add(mumps + DEPENDS ${MUMPS_DEPENDENCIES} + GIT_REPOSITORY ${EXTERN_MUMPS_URL} + GIT_TAG ${EXTERN_MUMPS_GIT_TAG} + GIT_SUBMODULES "" # prevent downloading any submodules + SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/mumps + BINARY_DIR ${CMAKE_BINARY_DIR}/extern/mumps-build + INSTALL_DIR ${CMAKE_INSTALL_PREFIX} + PREFIX ${CMAKE_BINARY_DIR}/extern/mumps-cmake + UPDATE_COMMAND "" + PATCH_COMMAND git apply "${MUMPS_PATCH_FILES}" + CONFIGURE_COMMAND ${CMAKE_COMMAND} "${MUMPS_OPTIONS}" + TEST_COMMAND "" +) diff --git a/cmake/ExternalPalace.cmake b/cmake/ExternalPalace.cmake index 991875692d..6033c4e7ef 100644 --- a/cmake/ExternalPalace.cmake +++ b/cmake/ExternalPalace.cmake @@ -1,99 +1,110 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -# -# Build Palace -# - -# Force build order -set(PALACE_DEPENDENCIES mfem libCEED) -if(PALACE_BUILD_EXTERNAL_DEPS) - list(APPEND PALACE_DEPENDENCIES json fmt eigen) - if(PALACE_WITH_SLEPC) - list(APPEND PALACE_DEPENDENCIES slepc) - endif() - if(PALACE_WITH_ARPACK) - list(APPEND PALACE_DEPENDENCIES arpack-ng) - endif() -endif() - -set(PALACE_OPTIONS ${PALACE_SUPERBUILD_DEFAULT_ARGS}) -list(APPEND PALACE_OPTIONS - "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" - "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" - "-DPALACE_WITH_OPENMP=${PALACE_WITH_OPENMP}" - "-DPALACE_WITH_SLEPC=${PALACE_WITH_SLEPC}" - "-DPALACE_WITH_ARPACK=${PALACE_WITH_ARPACK}" - "-DANALYZE_SOURCES_CLANG_TIDY=${ANALYZE_SOURCES_CLANG_TIDY}" - "-DANALYZE_SOURCES_CPPCHECK=${ANALYZE_SOURCES_CPPCHECK}" -) -if(PALACE_WITH_ARPACK) - list(APPEND PALACE_OPTIONS - "-DCMAKE_Fortran_COMPILER=${CMAKE_Fortran_COMPILER}" - "-DCMAKE_Fortran_FLAGS=${CMAKE_Fortran_FLAGS}" - ) -endif() - -# Configure GPU support -if(PALACE_WITH_CUDA) - list(APPEND PALACE_OPTIONS - "-DPALACE_WITH_CUDA=ON" - "-DCMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}" - "-DCMAKE_CUDA_FLAGS=${CMAKE_CUDA_FLAGS}" - ) - if(NOT "${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "") - list(APPEND PALACE_OPTIONS - "-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}" - ) - endif() -else() - list(APPEND PALACE_OPTIONS - "-DPALACE_WITH_CUDA=OFF" - ) -endif() -if(PALACE_WITH_HIP) - list(APPEND PALACE_OPTIONS - "-DPALACE_WITH_HIP=ON" - "-DCMAKE_HIP_COMPILER=${CMAKE_HIP_COMPILER}" - "-DCMAKE_HIP_FLAGS=${CMAKE_HIP_FLAGS}" - ) - if(NOT "${CMAKE_HIP_ARCHITECTURES}" STREQUAL "") - list(APPEND PALACE_OPTIONS - "-DCMAKE_HIP_ARCHITECTURES=${CMAKE_HIP_ARCHITECTURES}" - ) - endif() -else() - list(APPEND PALACE_OPTIONS - "-DPALACE_WITH_HIP=OFF" - ) -endif() - -string(REPLACE ";" "; " PALACE_OPTIONS_PRINT "${PALACE_OPTIONS}") -message(STATUS "PALACE_OPTIONS: ${PALACE_OPTIONS_PRINT}") - -include(ExternalProject) -if(POLICY CMP0114) - cmake_policy(SET CMP0114 NEW) -endif() -ExternalProject_Add(palace - DEPENDS ${PALACE_DEPENDENCIES} - SOURCE_DIR ${CMAKE_SOURCE_DIR}/palace - BINARY_DIR ${CMAKE_BINARY_DIR}/palace-build - INSTALL_DIR ${CMAKE_INSTALL_PREFIX} - PREFIX ${CMAKE_BINARY_DIR}/palace-cmake - BUILD_ALWAYS TRUE - DOWNLOAD_COMMAND "" - CONFIGURE_COMMAND ${CMAKE_COMMAND} "${PALACE_OPTIONS}" - TEST_COMMAND "" -) - -# Add target for Palace unit tests -ExternalProject_Add_Step(palace tests - COMMAND ${CMAKE_MAKE_PROGRAM} unit-tests - DEPENDEES install - DEPENDERS "" - COMMENT "Building unit tests for 'palace'" - WORKING_DIRECTORY - EXCLUDE_FROM_MAIN TRUE -) -ExternalProject_Add_StepTargets(palace tests) +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +# +# Build Palace +# + +# Force build order +set(PALACE_DEPENDENCIES mfem) +if(PALACE_BUILD_EXTERNAL_DEPS) + list(APPEND PALACE_DEPENDENCIES libCEED json fmt eigen scn) + if(PALACE_WITH_SLEPC) + list(APPEND PALACE_DEPENDENCIES slepc) + endif() + if(PALACE_WITH_ARPACK) + list(APPEND PALACE_DEPENDENCIES arpack-ng) + endif() +endif() + +set(PALACE_OPTIONS ${PALACE_SUPERBUILD_DEFAULT_ARGS}) +list(APPEND PALACE_OPTIONS + "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" + "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" + "-DPALACE_WITH_OPENMP=${PALACE_WITH_OPENMP}" + "-DPALACE_WITH_SLEPC=${PALACE_WITH_SLEPC}" + "-DPALACE_WITH_ARPACK=${PALACE_WITH_ARPACK}" + "-DANALYZE_SOURCES_CLANG_TIDY=${ANALYZE_SOURCES_CLANG_TIDY}" + "-DANALYZE_SOURCES_CPPCHECK=${ANALYZE_SOURCES_CPPCHECK}" + "-DMFEM_DATA_PATH=${CMAKE_BINARY_DIR}/extern/mfem/data" # Path to meshes for testing + "-DPALACE_BUILD_EXTERNAL_DEPS=${PALACE_BUILD_EXTERNAL_DEPS}" # For Catch2 + "-DPALACE_BUILD_WITH_COVERAGE=${PALACE_BUILD_WITH_COVERAGE}" +) +if(PALACE_WITH_ARPACK) + list(APPEND PALACE_OPTIONS + "-DCMAKE_Fortran_COMPILER=${CMAKE_Fortran_COMPILER}" + "-DCMAKE_Fortran_FLAGS=${CMAKE_Fortran_FLAGS}" + ) +endif() + +# Configure LAPACK dependency +if(NOT "${BLAS_LAPACK_LIBRARIES}" STREQUAL "") + list(APPEND PALACE_OPTIONS + "-DBLAS_LIBRARIES=${BLAS_LAPACK_LIBRARIES}" + "-DLAPACK_LIBRARIES=${BLAS_LAPACK_LIBRARIES}" + ) +endif() + +# Configure GPU support +if(PALACE_WITH_CUDA) + list(APPEND PALACE_OPTIONS + "-DPALACE_WITH_CUDA=ON" + "-DPALACE_WITH_GPU_AWARE_MPI=${PALACE_WITH_GPU_AWARE_MPI}" + "-DCMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}" + "-DCMAKE_CUDA_FLAGS=${CMAKE_CUDA_FLAGS}" + ) + if(NOT "${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "") + list(APPEND PALACE_OPTIONS + "-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}" + ) + endif() +else() + list(APPEND PALACE_OPTIONS + "-DPALACE_WITH_CUDA=OFF" + ) +endif() +if(PALACE_WITH_HIP) + list(APPEND PALACE_OPTIONS + "-DPALACE_WITH_HIP=ON" + "-DPALACE_WITH_GPU_AWARE_MPI=${PALACE_WITH_GPU_AWARE_MPI}" + "-DCMAKE_HIP_COMPILER=${CMAKE_HIP_COMPILER}" + "-DCMAKE_HIP_FLAGS=${CMAKE_HIP_FLAGS}" + ) + if(NOT "${CMAKE_HIP_ARCHITECTURES}" STREQUAL "") + list(APPEND PALACE_OPTIONS + "-DCMAKE_HIP_ARCHITECTURES=${CMAKE_HIP_ARCHITECTURES}" + ) + endif() +else() + list(APPEND PALACE_OPTIONS + "-DPALACE_WITH_HIP=OFF" + ) +endif() + +string(REPLACE ";" "; " PALACE_OPTIONS_PRINT "${PALACE_OPTIONS}") +message(STATUS "PALACE_OPTIONS: ${PALACE_OPTIONS_PRINT}") + +include(ExternalProject) +ExternalProject_Add(palace + DEPENDS ${PALACE_DEPENDENCIES} + SOURCE_DIR ${CMAKE_SOURCE_DIR}/palace + BINARY_DIR ${CMAKE_BINARY_DIR}/palace-build + INSTALL_DIR ${CMAKE_INSTALL_PREFIX} + PREFIX ${CMAKE_BINARY_DIR}/palace-cmake + BUILD_ALWAYS TRUE + DOWNLOAD_COMMAND "" + CONFIGURE_COMMAND ${CMAKE_COMMAND} "${PALACE_OPTIONS}" + TEST_COMMAND "" +) + +# Add target for Palace unit tests +ExternalProject_Add_Step(palace tests + COMMAND ${CMAKE_MAKE_PROGRAM} unit-tests + COMMAND ${CMAKE_COMMAND} --install /test/unit --prefix ${CMAKE_INSTALL_PREFIX} + DEPENDEES install + DEPENDERS "" + COMMENT "Building and installing unit tests for 'palace'" + WORKING_DIRECTORY + EXCLUDE_FROM_MAIN TRUE +) +ExternalProject_Add_StepTargets(palace tests) diff --git a/cmake/ExternalSLEPc.cmake b/cmake/ExternalSLEPc.cmake index 03957f1779..fc88fbb1d3 100644 --- a/cmake/ExternalSLEPc.cmake +++ b/cmake/ExternalSLEPc.cmake @@ -1,145 +1,141 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -# -# Build PETSc and SLEPc -# - -# Force build order -set(PETSC_DEPENDENCIES) -set(SLEPC_DEPENDENCIES petsc) - -# First build PETSc -set(PETSC_OPTIONS - "COPTFLAGS=${CMAKE_C_FLAGS}" - "CXXOPTFLAGS=${CMAKE_CXX_FLAGS}" - "--prefix=${CMAKE_INSTALL_PREFIX}" - "--with-cc=${CMAKE_C_COMPILER}" - "--with-cxx=${CMAKE_CXX_COMPILER}" - "--with-fc=0" - "--with-scalar-type=complex" - "--with-precision=double" - "--with-clanguage=c" - "--with-x=0" - # "--with-petsc4py=1" -) -if(CMAKE_BUILD_TYPE MATCHES "Debug|debug|DEBUG") - list(APPEND PETSC_OPTIONS "--with-debugging=1") -else() - list(APPEND PETSC_OPTIONS "--with-debugging=0") -endif() -if(BUILD_SHARED_LIBS) - list(APPEND PETSC_OPTIONS "--with-shared-libraries=1") -else() - list(APPEND PETSC_OPTIONS "--with-shared-libraries=0") -endif() -if(PALACE_WITH_64BIT_INT) - list(APPEND PETSC_OPTIONS "--with-64-bit-indices") -endif() -if(PALACE_WITH_64BIT_BLAS_INT) - list(APPEND PETSC_OPTIONS "--known-64-bit-blas-indices=1") - list(APPEND PETSC_OPTIONS "--with-64-bit-blas-indices") -else() - list(APPEND PETSC_OPTIONS "--known-64-bit-blas-indices=0") -endif() -if(PALACE_WITH_OPENMP) - list(APPEND PETSC_OPTIONS "--with-openmp") -endif() - -# User might specify the MPI compiler wrappers directly, otherwise we need to supply MPI -# as found from the CMake module -if(NOT MPI_FOUND) - message(FATAL_ERROR "MPI is not found when trying to build PETSc") -endif() -if(NOT CMAKE_CXX_COMPILER STREQUAL MPI_CXX_COMPILER) - # For OpenMPI at least, when given a C++ compiler, PETSc needs the C++ MPI libraries for - # its CxxMPICheck - string(REPLACE ";" "," PETSC_MPI_LIBRARIES "${MPI_CXX_LIBRARIES}") - string(REPLACE ";" "," PETSC_MPI_INCLUDE_DIRS "${MPI_CXX_INCLUDE_DIRS}") - list(APPEND PETSC_OPTIONS - "--with-mpi-lib=[${PETSC_MPI_LIBRARIES}]" - "--with-mpi-include=[${PETSC_MPI_INCLUDE_DIRS}]" - ) -endif() - -# Configure BLAS/LAPACK -if(NOT "${BLAS_LAPACK_LIBRARIES}" STREQUAL "") - string(REPLACE "$" "," PETSC_BLAS_LAPACK_LIBRARIES "${BLAS_LAPACK_LIBRARIES}") - string(REPLACE "$" "," PETSC_BLAS_LAPACK_INCLUDE_DIRS "${BLAS_LAPACK_INCLUDE_DIRS}") - list(APPEND PETSC_OPTIONS - "--with-blaslapack-lib=[${PETSC_BLAS_LAPACK_LIBRARIES}]" - "--with-blaslapack-include=[${BLAS_LAPACK_INCLUDE_DIRS}]" - ) -endif() - -# Configure GPU support -if(PALACE_WITH_CUDA) - list(APPEND PETSC_OPTIONS "--with-cuda") - if(NOT "${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "") - list(GET CMAKE_CUDA_ARCHITECTURES 0 PETSC_CUDA_ARCH) - list(APPEND PETSC_OPTIONS - "--with-cuda-arch=${PETSC_CUDA_ARCH}" - ) - endif() -endif() -if(PALACE_WITH_HIP) - list(APPEND PETSC_OPTIONS "--with-hip") - if(NOT "${CMAKE_HIP_ARCHITECTURES}" STREQUAL "") - list(GET CMAKE_HIP_ARCHITECTURES 0 PETSC_HIP_ARCH) - list(APPEND PETSC_OPTIONS - "--with-hip-arch=${PETSC_HIP_ARCH}" - ) - endif() -endif() - -string(REPLACE ";" "; " PETSC_OPTIONS_PRINT "${PETSC_OPTIONS}") -message(STATUS "PETSC_OPTIONS: ${PETSC_OPTIONS_PRINT}") - -# Fix build -set(PETSC_PATCH_FILES - "${CMAKE_SOURCE_DIR}/extern/patch/petsc/patch_build.diff" -) - -include(ExternalProject) -ExternalProject_Add(petsc - DEPENDS ${PETSC_DEPENDENCIES} - GIT_REPOSITORY ${EXTERN_PETSC_URL} - GIT_TAG ${EXTERN_PETSC_GIT_TAG} - SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/petsc - INSTALL_DIR ${CMAKE_INSTALL_PREFIX} - PREFIX ${CMAKE_BINARY_DIR}/extern/petsc-cmake - BUILD_IN_SOURCE TRUE - UPDATE_COMMAND "" - PATCH_COMMAND git apply "${PETSC_PATCH_FILES}" - CONFIGURE_COMMAND ./configure ${PETSC_OPTIONS} - TEST_COMMAND ${CMAKE_MAKE_PROGRAM} check # Use 
auto-detected PETSC_DIR/PETSC_ARCH - TEST_BEFORE_INSTALL TRUE -) - -# Configure SLEPc eigenvalue solver (most options come from PETSc) -set(SLEPC_OPTIONS - "--prefix=${CMAKE_INSTALL_PREFIX}" - "--with-feast=0" - "--with-arpack=0" - # "--with-slepc4py=1 -) - -string(REPLACE ";" "; " SLEPC_OPTIONS_PRINT "${SLEPC_OPTIONS}") -message(STATUS "SLEPC_OPTIONS: ${SLEPC_OPTIONS_PRINT}") - -include(ExternalProject) -ExternalProject_Add(slepc - DEPENDS ${SLEPC_DEPENDENCIES} - GIT_REPOSITORY ${EXTERN_SLEPC_URL} - GIT_TAG ${EXTERN_SLEPC_GIT_TAG} - SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/slepc - INSTALL_DIR ${CMAKE_INSTALL_PREFIX} - PREFIX ${CMAKE_BINARY_DIR}/extern/slepc-cmake - BUILD_IN_SOURCE TRUE - UPDATE_COMMAND "" - CONFIGURE_COMMAND SLEPC_DIR= PETSC_DIR=${CMAKE_INSTALL_PREFIX} PETSC_ARCH= ./configure ${SLEPC_OPTIONS} - BUILD_COMMAND SLEPC_DIR= PETSC_DIR=${CMAKE_INSTALL_PREFIX} PETSC_ARCH= ${CMAKE_MAKE_PROGRAM} - INSTALL_COMMAND SLEPC_DIR= PETSC_DIR=${CMAKE_INSTALL_PREFIX} PETSC_ARCH= ${CMAKE_MAKE_PROGRAM} install - TEST_COMMAND SLEPC_DIR= PETSC_DIR=${CMAKE_INSTALL_PREFIX} PETSC_ARCH= ${CMAKE_MAKE_PROGRAM} check - TEST_BEFORE_INSTALL TRUE -) +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + +# +# Build PETSc and SLEPc +# + +# Force build order +set(PETSC_DEPENDENCIES) +set(SLEPC_DEPENDENCIES petsc) + +# First build PETSc +set(PETSC_OPTIONS + "COPTFLAGS=${CMAKE_C_FLAGS} ${CMAKE_C_FLAGS_${BUILD_TYPE_UPPER}}" + "CXXOPTFLAGS=${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${BUILD_TYPE_UPPER}}" + "--prefix=${CMAKE_INSTALL_PREFIX}" + "--with-cc=${CMAKE_C_COMPILER}" + "--with-cxx=${CMAKE_CXX_COMPILER}" + "--with-fc=0" + "--with-scalar-type=complex" + "--with-precision=double" + "--with-clanguage=c" + "--with-x=0" + # "--with-petsc4py=1" +) +if(CMAKE_BUILD_TYPE MATCHES "Debug|debug|DEBUG") + list(APPEND PETSC_OPTIONS "--with-debugging=1") +else() + list(APPEND PETSC_OPTIONS "--with-debugging=0") +endif() +if(BUILD_SHARED_LIBS) + list(APPEND PETSC_OPTIONS "--with-shared-libraries=1") +else() + list(APPEND PETSC_OPTIONS "--with-shared-libraries=0") +endif() +if(PALACE_WITH_64BIT_INT) + list(APPEND PETSC_OPTIONS "--with-64-bit-indices") +endif() +if(PALACE_WITH_64BIT_BLAS_INT) + list(APPEND PETSC_OPTIONS "--known-64-bit-blas-indices=1") + list(APPEND PETSC_OPTIONS "--with-64-bit-blas-indices") +else() + list(APPEND PETSC_OPTIONS "--known-64-bit-blas-indices=0") +endif() +if(PALACE_WITH_OPENMP) + list(APPEND PETSC_OPTIONS "--with-openmp") +endif() + +# User might specify the MPI compiler wrappers directly, otherwise we need to supply MPI +# as found from the CMake module +if(NOT MPI_FOUND) + message(FATAL_ERROR "MPI is not found when trying to build PETSc") +endif() +if(NOT CMAKE_CXX_COMPILER STREQUAL MPI_CXX_COMPILER) + # For OpenMPI at least, when given a C++ compiler, PETSc needs the C++ MPI libraries for + # its CxxMPICheck + string(REPLACE ";" "," PETSC_MPI_LIBRARIES "${MPI_CXX_LIBRARIES}") + string(REPLACE ";" "," PETSC_MPI_INCLUDE_DIRS "${MPI_CXX_INCLUDE_DIRS}") + list(APPEND PETSC_OPTIONS + "--with-mpi-lib=[${PETSC_MPI_LIBRARIES}]" + "--with-mpi-include=[${PETSC_MPI_INCLUDE_DIRS}]" + ) +endif() + +# Configure BLAS/LAPACK +if(NOT "${BLAS_LAPACK_LIBRARIES}" STREQUAL "") + string(REPLACE "$" "," PETSC_BLAS_LAPACK_LIBRARIES "${BLAS_LAPACK_LIBRARIES}") + string(REPLACE "$" "," PETSC_BLAS_LAPACK_INCLUDE_DIRS "${BLAS_LAPACK_INCLUDE_DIRS}") + list(APPEND PETSC_OPTIONS + "--with-blaslapack-lib=[${PETSC_BLAS_LAPACK_LIBRARIES}]" + 
"--with-blaslapack-include=[${BLAS_LAPACK_INCLUDE_DIRS}]" + ) +endif() + +# Configure GPU support +if(PALACE_WITH_CUDA) + list(APPEND PETSC_OPTIONS "--with-cuda") + if(NOT "${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "") + list(GET CMAKE_CUDA_ARCHITECTURES 0 PETSC_CUDA_ARCH) + list(APPEND PETSC_OPTIONS + "--with-cuda-arch=${PETSC_CUDA_ARCH}" + ) + endif() +endif() +if(PALACE_WITH_HIP) + list(APPEND PETSC_OPTIONS "--with-hip") + if(NOT "${CMAKE_HIP_ARCHITECTURES}" STREQUAL "") + list(GET CMAKE_HIP_ARCHITECTURES 0 PETSC_HIP_ARCH) + list(APPEND PETSC_OPTIONS + "--with-hip-arch=${PETSC_HIP_ARCH}" + ) + endif() +endif() + +string(REPLACE ";" "; " PETSC_OPTIONS_PRINT "${PETSC_OPTIONS}") +message(STATUS "PETSC_OPTIONS: ${PETSC_OPTIONS_PRINT}") + +include(ExternalProject) +ExternalProject_Add(petsc + DEPENDS ${PETSC_DEPENDENCIES} + GIT_REPOSITORY ${EXTERN_PETSC_URL} + GIT_TAG ${EXTERN_PETSC_GIT_TAG} + SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/petsc + INSTALL_DIR ${CMAKE_INSTALL_PREFIX} + PREFIX ${CMAKE_BINARY_DIR}/extern/petsc-cmake + BUILD_IN_SOURCE TRUE + UPDATE_COMMAND "" + CONFIGURE_COMMAND ./configure ${PETSC_OPTIONS} + # TEST_COMMAND ${CMAKE_MAKE_PROGRAM} check # Use auto-detected PETSC_DIR/PETSC_ARCH + TEST_COMMAND "" + TEST_BEFORE_INSTALL TRUE +) + +# Configure SLEPc eigenvalue solver (most options come from PETSc) +set(SLEPC_OPTIONS + "--prefix=${CMAKE_INSTALL_PREFIX}" + "--with-feast=0" + "--with-arpack=0" + # "--with-slepc4py=1 +) + +string(REPLACE ";" "; " SLEPC_OPTIONS_PRINT "${SLEPC_OPTIONS}") +message(STATUS "SLEPC_OPTIONS: ${SLEPC_OPTIONS_PRINT}") + +include(ExternalProject) +ExternalProject_Add(slepc + DEPENDS ${SLEPC_DEPENDENCIES} + GIT_REPOSITORY ${EXTERN_SLEPC_URL} + GIT_TAG ${EXTERN_SLEPC_GIT_TAG} + SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/slepc + INSTALL_DIR ${CMAKE_INSTALL_PREFIX} + PREFIX ${CMAKE_BINARY_DIR}/extern/slepc-cmake + BUILD_IN_SOURCE TRUE + UPDATE_COMMAND "" + CONFIGURE_COMMAND SLEPC_DIR= PETSC_DIR=${CMAKE_INSTALL_PREFIX} PETSC_ARCH= ./configure ${SLEPC_OPTIONS} + BUILD_COMMAND SLEPC_DIR= PETSC_DIR=${CMAKE_INSTALL_PREFIX} PETSC_ARCH= ${CMAKE_MAKE_PROGRAM} + INSTALL_COMMAND SLEPC_DIR= PETSC_DIR=${CMAKE_INSTALL_PREFIX} PETSC_ARCH= ${CMAKE_MAKE_PROGRAM} install + # TEST_COMMAND SLEPC_DIR= PETSC_DIR=${CMAKE_INSTALL_PREFIX} PETSC_ARCH= ${CMAKE_MAKE_PROGRAM} check + TEST_COMMAND "" + TEST_BEFORE_INSTALL TRUE +) diff --git a/cmake/ExternalSTRUMPACK.cmake b/cmake/ExternalSTRUMPACK.cmake index 66f22f5706..0469a53e59 100644 --- a/cmake/ExternalSTRUMPACK.cmake +++ b/cmake/ExternalSTRUMPACK.cmake @@ -1,300 +1,313 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -# -# Build STRUMPACK and dependencies -# - -# Force build order -set(STRUMPACK_DEPENDENCIES scalapack parmetis) -if(PALACE_WITH_CUDA OR PALACE_WITH_HIP) - list(APPEND STRUMPACK_DEPENDENCIES slate) -endif() -if(PALACE_WITH_MAGMA) - list(APPEND STRUMPACK_DEPENDENCIES magma) -endif() - -# Build ZFP dependency for lossy compression -set(PALACE_STRUMPACK_WITH_ZFP ON) -if(PALACE_STRUMPACK_WITH_ZFP) - set(ZFP_DEPENDENCIES) - - set(ZFP_OPTIONS ${PALACE_SUPERBUILD_DEFAULT_ARGS}) - list(APPEND ZFP_OPTIONS - "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" - "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" - "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" - "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" - "-DBUILD_ZFORP=OFF" - "-DBUILD_UTILITIES=OFF" - "-DBUILD_TESTING=OFF" - "-DBUILD_EXAMPLES=OFF" - "-DZFP_WITH_OPENMP=${PALACE_WITH_OPENMP}" - ) - - string(REPLACE ";" "; " ZFP_OPTIONS_PRINT "${ZFP_OPTIONS}") - message(STATUS "ZFP_OPTIONS: ${ZFP_OPTIONS_PRINT}") - - include(ExternalProject) - ExternalProject_Add(zfp - DEPENDS ${ZFP_DEPENDENCIES} - GIT_REPOSITORY ${EXTERN_ZFP_URL} - GIT_TAG ${EXTERN_ZFP_GIT_TAG} - SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/zfp - BINARY_DIR ${CMAKE_BINARY_DIR}/extern/zfp-build - INSTALL_DIR ${CMAKE_INSTALL_PREFIX} - PREFIX ${CMAKE_BINARY_DIR}/extern/zfp-cmake - UPDATE_COMMAND "" - CONFIGURE_COMMAND ${CMAKE_COMMAND} "${ZFP_OPTIONS}" - TEST_COMMAND "" - ) - list(APPEND STRUMPACK_DEPENDENCIES zfp) - - include(GNUInstallDirs) - if(BUILD_SHARED_LIBS) - set(_ZFP_LIB_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX}) - else() - set(_ZFP_LIB_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX}) - endif() - set(_ZFP_LIBRARIES ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/libzfp${_ZFP_LIB_SUFFIX}) - # list(APPEND _ZFP_LIBRARIES ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/zFORp${_ZFP_LIB_SUFFIX}) -endif() - -# Build ButterflyPACK dependency for HODLR/HODBF compression -set(PALACE_STRUMPACK_WITH_BUTTERFLYPACK ON) -if(PALACE_STRUMPACK_WITH_BUTTERFLYPACK) - set(BUTTERFLYPACK_DEPENDENCIES scalapack) - - set(BUTTERFLYPACK_OPTIONS ${PALACE_SUPERBUILD_DEFAULT_ARGS}) - list(APPEND BUTTERFLYPACK_OPTIONS - "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" - "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" - "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" - "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" - "-DCMAKE_Fortran_COMPILER=${CMAKE_Fortran_COMPILER}" - "-DCMAKE_Fortran_FLAGS=${CMAKE_Fortran_FLAGS}" - "-Denable_doc=OFF" - "-Denable_openmp=${PALACE_WITH_OPENMP}" - "-DTPL_SCALAPACK_LIBRARIES=${SCALAPACK_LIBRARIES}" - ) - # if(PALACE_STRUMPACK_WITH_ZFP) - # string(REPLACE ";" "$" _ZFP_LIBRARIES "${_ZFP_LIBRARIES}") - # list(APPEND BUTTERFLYPACK_OPTIONS - # "-DTPL_ZFP_LIBRARIES=${_ZFP_LIBRARIES}" - # "-DTPL_ZFP_INCLUDE=${CMAKE_INSTALL_PREFIX}/include" - # ) - # endif() - - # Nested Fortran functions in ButterflyPACK cause a static linkage problem Clang on MacOS - if(NOT BUILD_SHARED_LIBS AND - (${CMAKE_SYSTEM_NAME} MATCHES "Darwin" AND CMAKE_C_COMPILER_ID MATCHES "Clang")) - list(TRANSFORM BUTTERFLYPACK_OPTIONS REPLACE - ".*BUILD_SHARED_LIBS.*" "-DBUILD_SHARED_LIBS=ON" - ) - endif() - - # Configure BLAS/LAPACK - if(NOT "${BLAS_LAPACK_LIBRARIES}" STREQUAL "") - list(APPEND BUTTERFLYPACK_OPTIONS - "-DBLAS_LIBRARIES=${BLAS_LAPACK_LIBRARIES}" - "-DLAPACK_LIBRARIES=${BLAS_LAPACK_LIBRARIES}" - ) - endif() - - string(REPLACE ";" "; " BUTTERFLYPACK_OPTIONS_PRINT "${BUTTERFLYPACK_OPTIONS}") - message(STATUS "BUTTERFLYPACK_OPTIONS: ${BUTTERFLYPACK_OPTIONS_PRINT}") - - # Fix build - set(BUTTERFLYPACK_PATCH_FILES - 
"${CMAKE_SOURCE_DIR}/extern/patch/ButterflyPACK/patch_build.diff" - ) - - include(ExternalProject) - ExternalProject_Add(butterflypack - DEPENDS ${BUTTERFLYPACK_DEPENDENCIES} - GIT_REPOSITORY ${EXTERN_BUTTERFLYPACK_URL} - GIT_TAG ${EXTERN_BUTTERFLYPACK_GIT_TAG} - SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/ButterflyPACK - BINARY_DIR ${CMAKE_BINARY_DIR}/extern/ButterflyPACK-build - INSTALL_DIR ${CMAKE_INSTALL_PREFIX} - PREFIX ${CMAKE_BINARY_DIR}/extern/ButterflyPACK-cmake - UPDATE_COMMAND "" - PATCH_COMMAND git apply "${BUTTERFLYPACK_PATCH_FILES}" - CONFIGURE_COMMAND ${CMAKE_COMMAND} "${BUTTERFLYPACK_OPTIONS}" - TEST_COMMAND "" - ) - list(APPEND STRUMPACK_DEPENDENCIES butterflypack) - - include(GNUInstallDirs) - if(BUILD_SHARED_LIBS OR - (${CMAKE_SYSTEM_NAME} MATCHES "Darwin" AND CMAKE_C_COMPILER_ID MATCHES "Clang")) - set(_BUTTERFLYPACK_LIB_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX}) - else() - set(_BUTTERFLYPACK_LIB_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX}) - endif() - set(_BUTTERFLYPACK_LIBRARIES ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/libdbutterflypack${_BUTTERFLYPACK_LIB_SUFFIX}) - list(APPEND _BUTTERFLYPACK_LIBRARIES ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/libsbutterflypack${_BUTTERFLYPACK_LIB_SUFFIX}) - list(APPEND _BUTTERFLYPACK_LIBRARIES ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/libzbutterflypack${_BUTTERFLYPACK_LIB_SUFFIX}) - list(APPEND _BUTTERFLYPACK_LIBRARIES ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/libcbutterflypack${_BUTTERFLYPACK_LIB_SUFFIX}) -endif() - -set(STRUMPACK_OPTIONS ${PALACE_SUPERBUILD_DEFAULT_ARGS}) -list(APPEND STRUMPACK_OPTIONS - "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" - "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" - "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" - "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" - "-DCMAKE_Fortran_COMPILER=${CMAKE_Fortran_COMPILER}" - "-DCMAKE_Fortran_FLAGS=${CMAKE_Fortran_FLAGS}" - "-DSTRUMPACK_USE_MPI=ON" - "-DSTRUMPACK_USE_OPENMP=${PALACE_WITH_OPENMP}" - "-DTPL_ENABLE_PARMETIS=ON" - "-DTPL_METIS_LIBRARIES=${METIS_LIBRARIES}" - "-DTPL_METIS_INCLUDE_DIRS=${CMAKE_INSTALL_PREFIX}/include" - "-DTPL_PARMETIS_LIBRARIES=${PARMETIS_LIBRARIES}$${METIS_LIBRARIES}" - "-DTPL_PARMETIS_INCLUDE_DIRS=${CMAKE_INSTALL_PREFIX}/include" - "-DTPL_ENABLE_SCOTCH=OFF" - "-DTPL_ENABLE_PTSCOTCH=OFF" - "-DTPL_ENABLE_COMBBLAS=OFF" - "-DTPL_ENABLE_PAPI=OFF" - "-DTPL_SCALAPACK_LIBRARIES=${SCALAPACK_LIBRARIES}" -) -if(PALACE_STRUMPACK_WITH_BUTTERFLYPACK) - list(APPEND STRUMPACK_OPTIONS - "-DTPL_ENABLE_BPACK=ON" - "-DTPL_BUTTERFLYPACK_PREFIX=${CMAKE_INSTALL_PREFIX}" - ) -else() - list(APPEND STRUMPACK_OPTIONS - "-DTPL_ENABLE_BPACK=OFF" - ) -endif() -if(PALACE_STRUMPACK_WITH_ZFP) - list(APPEND STRUMPACK_OPTIONS - "-DTPL_ENABLE_ZFP=ON" - "-DTPL_ZFP_PREFIX=${CMAKE_INSTALL_PREFIX}" - ) -else() - list(APPEND STRUMPACK_OPTIONS - "-DTPL_ENABLE_ZFP=OFF" - ) -endif() - -# Configure BLAS/LAPACK -if(NOT "${BLAS_LAPACK_LIBRARIES}" STREQUAL "") - list(APPEND STRUMPACK_OPTIONS - "-DLAPACK_LIBRARIES=${BLAS_LAPACK_LIBRARIES}" - "-DBLAS_LIBRARIES=${BLAS_LAPACK_LIBRARIES}" - ) -endif() - -# Configure GPU support -if(PALACE_WITH_CUDA OR PALACE_WITH_HIP) - list(APPEND STRUMPACK_OPTIONS - "-DTPL_ENABLE_SLATE=ON" - "-DTPL_SLATE_PREFIX=${CMAKE_INSTALL_PREFIX}" - ) -else() - list(APPEND STRUMPACK_OPTIONS - "-DTPL_ENABLE_SLATE=OFF" - ) -endif() -if(PALACE_WITH_CUDA) - list(APPEND STRUMPACK_OPTIONS - "-DSTRUMPACK_USE_CUDA=ON" - "-DCMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}" - "-DCMAKE_CUDA_FLAGS=${CMAKE_CUDA_FLAGS}" - ) - if(NOT "${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "") - list(APPEND 
STRUMPACK_OPTIONS - "-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}" - ) - endif() -else() - list(APPEND STRUMPACK_OPTIONS - "-DSTRUMPACK_USE_CUDA=OFF" - ) -endif() -if(PALACE_WITH_HIP) - list(APPEND STRUMPACK_OPTIONS - "-DSTRUMPACK_USE_HIP=ON" - "-DHIP_ROOT_DIR=${ROCM_DIR}" - "-DCMAKE_HIP_COMPILER=${CMAKE_HIP_COMPILER}" - "-DCMAKE_HIP_FLAGS=${CMAKE_HIP_FLAGS}" - ) - if(NOT "${CMAKE_HIP_ARCHITECTURES}" STREQUAL "") - list(APPEND STRUMPACK_OPTIONS - "-DCMAKE_HIP_ARCHITECTURES=${CMAKE_HIP_ARCHITECTURES}" - ) - endif() -else() - list(APPEND STRUMPACK_OPTIONS - "-DSTRUMPACK_USE_HIP=OFF" - ) -endif() -if(PALACE_WITH_MAGMA) - list(APPEND STRUMPACK_OPTIONS - "-DTPL_ENABLE_MAGMA=ON" - "-DTPL_MAGMA_PREFIX=${CMAKE_INSTALL_PREFIX}" - ) -else() - list(APPEND STRUMPACK_OPTIONS - "-DTPL_ENABLE_MAGMA=OFF" - ) -endif() - -string(REPLACE ";" "; " STRUMPACK_OPTIONS_PRINT "${STRUMPACK_OPTIONS}") -message(STATUS "STRUMPACK_OPTIONS: ${STRUMPACK_OPTIONS_PRINT}") - -# Fix build -set(STRUMPACK_PATCH_FILES - "${CMAKE_SOURCE_DIR}/extern/patch/STRUMPACK/patch_build.diff" - "${CMAKE_SOURCE_DIR}/extern/patch/STRUMPACK/patch_gpu_init.diff" -) - -include(ExternalProject) -ExternalProject_Add(strumpack - DEPENDS ${STRUMPACK_DEPENDENCIES} - GIT_REPOSITORY ${EXTERN_STRUMPACK_URL} - GIT_TAG ${EXTERN_STRUMPACK_GIT_TAG} - SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/STRUMPACK - BINARY_DIR ${CMAKE_BINARY_DIR}/extern/STRUMPACK-build - INSTALL_DIR ${CMAKE_INSTALL_PREFIX} - PREFIX ${CMAKE_BINARY_DIR}/extern/STRUMPACK-cmake - UPDATE_COMMAND "" - PATCH_COMMAND git apply "${STRUMPACK_PATCH_FILES}" - CONFIGURE_COMMAND ${CMAKE_COMMAND} "${STRUMPACK_OPTIONS}" - TEST_COMMAND "" -) - -# Save variables to cache -set(_STRUMPACK_EXTRA_LIBRARIES) -if(PALACE_STRUMPACK_WITH_BUTTERFLYPACK) - list(APPEND _STRUMPACK_EXTRA_LIBRARIES ${_BUTTERFLYPACK_LIBRARIES}) -endif() -if(PALACE_STRUMPACK_WITH_ZFP) - list(APPEND _STRUMPACK_EXTRA_LIBRARIES ${_ZFP_LIBRARIES}) -endif() -if(PALACE_WITH_CUDA OR PALACE_WITH_HIP) - include(GNUInstallDirs) - if(BUILD_SHARED_LIBS) - set(_SLATE_LIB_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX}) - else() - set(_SLATE_LIB_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX}) - endif() - set(_SLATE_LIBRARIES ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/libslate${_SLATE_LIB_SUFFIX}) - list(APPEND _SLATE_LIBRARIES ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/liblapackpp${_SLATE_LIB_SUFFIX}) - list(APPEND _SLATE_LIBRARIES ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/libblaspp${_SLATE_LIB_SUFFIX}) - list(APPEND _STRUMPACK_EXTRA_LIBRARIES ${_SLATE_LIBRARIES}) -endif() -if(PALACE_WITH_MAGMA) - set(_MAGMA_LIBRARIES ${CMAKE_INSTALL_PREFIX}/lib/libmagma_sparse${CMAKE_SHARED_LIBRARY_SUFFIX}) - list(APPEND _MAGMA_LIBRARIES ${CMAKE_INSTALL_PREFIX}/lib/libmagma${CMAKE_SHARED_LIBRARY_SUFFIX}) - list(APPEND _STRUMPACK_EXTRA_LIBRARIES ${_MAGMA_LIBRARIES}) -endif() -if(NOT "${_STRUMPACK_EXTRA_LIBRARIES}" STREQUAL "") - string(REPLACE ";" "$" _STRUMPACK_EXTRA_LIBRARIES "${_STRUMPACK_EXTRA_LIBRARIES}") - set(STRUMPACK_EXTRA_LIBRARIES ${_STRUMPACK_EXTRA_LIBRARIES} CACHE STRING - "List of extra library files for STRUMPACK" - ) -endif() +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +# +# Build STRUMPACK and dependencies +# + +# Force build order +set(STRUMPACK_DEPENDENCIES scalapack parmetis) +# if(PALACE_WITH_CUDA OR PALACE_WITH_HIP) +# list(APPEND STRUMPACK_DEPENDENCIES slate) +# endif() +# if(PALACE_WITH_MAGMA) +# list(APPEND STRUMPACK_DEPENDENCIES magma) +# endif() + +# Build ZFP dependency for lossy compression +if(PALACE_WITH_STRUMPACK_ZFP) + set(ZFP_DEPENDENCIES) + + set(ZFP_OPTIONS ${PALACE_SUPERBUILD_DEFAULT_ARGS}) + list(APPEND ZFP_OPTIONS + "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" + "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" + "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" + "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" + "-DBUILD_ZFORP=OFF" + "-DBUILD_UTILITIES=OFF" + "-DBUILD_TESTING=OFF" + "-DBUILD_EXAMPLES=OFF" + "-DZFP_WITH_OPENMP=${PALACE_WITH_OPENMP}" + ) + + string(REPLACE ";" "; " ZFP_OPTIONS_PRINT "${ZFP_OPTIONS}") + message(STATUS "ZFP_OPTIONS: ${ZFP_OPTIONS_PRINT}") + + include(ExternalProject) + ExternalProject_Add(zfp + DEPENDS ${ZFP_DEPENDENCIES} + GIT_REPOSITORY ${EXTERN_ZFP_URL} + GIT_TAG ${EXTERN_ZFP_GIT_TAG} + SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/zfp + BINARY_DIR ${CMAKE_BINARY_DIR}/extern/zfp-build + INSTALL_DIR ${CMAKE_INSTALL_PREFIX} + PREFIX ${CMAKE_BINARY_DIR}/extern/zfp-cmake + UPDATE_COMMAND "" + CONFIGURE_COMMAND ${CMAKE_COMMAND} "${ZFP_OPTIONS}" + TEST_COMMAND "" + ) + list(APPEND STRUMPACK_DEPENDENCIES zfp) + + include(GNUInstallDirs) + if(BUILD_SHARED_LIBS) + set(_ZFP_LIB_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX}) + else() + set(_ZFP_LIB_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX}) + endif() + set(_ZFP_LIBRARIES ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/libzfp${_ZFP_LIB_SUFFIX}) + # list(APPEND _ZFP_LIBRARIES ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/zFORp${_ZFP_LIB_SUFFIX}) +endif() + +# Build ButterflyPACK dependency for HODLR/HODBF compression +if(PALACE_WITH_STRUMPACK_BUTTERFLYPACK) + set(BUTTERFLYPACK_DEPENDENCIES scalapack) + + set(BUTTERFLYPACK_OPTIONS ${PALACE_SUPERBUILD_DEFAULT_ARGS}) + list(APPEND BUTTERFLYPACK_OPTIONS + "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" + "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" + "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" + "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" + "-DCMAKE_Fortran_COMPILER=${CMAKE_Fortran_COMPILER}" + "-DCMAKE_Fortran_FLAGS=${CMAKE_Fortran_FLAGS}" + "-Denable_doc=OFF" + "-Denable_openmp=${PALACE_WITH_OPENMP}" + "-DTPL_SCALAPACK_LIBRARIES=${SCALAPACK_LIBRARIES}" + ) + # if(PALACE_WITH_STRUMPACK_ZFP) + # string(REPLACE ";" "$" _ZFP_LIBRARIES "${_ZFP_LIBRARIES}") + # list(APPEND BUTTERFLYPACK_OPTIONS + # "-DTPL_ZFP_LIBRARIES=${_ZFP_LIBRARIES}" + # "-DTPL_ZFP_INCLUDE=${CMAKE_INSTALL_PREFIX}/include" + # ) + # endif() + + # Nested Fortran functions in ButterflyPACK cause a static linkage problem Clang on MacOS + if(NOT BUILD_SHARED_LIBS AND + (CMAKE_SYSTEM_NAME MATCHES "Darwin" AND CMAKE_C_COMPILER_ID MATCHES "Clang")) + list(TRANSFORM BUTTERFLYPACK_OPTIONS REPLACE + ".*BUILD_SHARED_LIBS.*" "-DBUILD_SHARED_LIBS=ON" + ) + endif() + + # Configure BLAS/LAPACK + if(NOT "${BLAS_LAPACK_LIBRARIES}" STREQUAL "") + list(APPEND BUTTERFLYPACK_OPTIONS + "-DBLAS_LIBRARIES=${BLAS_LAPACK_LIBRARIES}" + "-DLAPACK_LIBRARIES=${BLAS_LAPACK_LIBRARIES}" + ) + endif() + + string(REPLACE ";" "; " BUTTERFLYPACK_OPTIONS_PRINT "${BUTTERFLYPACK_OPTIONS}") + message(STATUS "BUTTERFLYPACK_OPTIONS: ${BUTTERFLYPACK_OPTIONS_PRINT}") + + # Fix build + set(BUTTERFLYPACK_PATCH_FILES + "${CMAKE_SOURCE_DIR}/extern/patch/ButterflyPACK/patch_build.diff" + ) + + include(ExternalProject) + 
ExternalProject_Add(butterflypack + DEPENDS ${BUTTERFLYPACK_DEPENDENCIES} + GIT_REPOSITORY ${EXTERN_BUTTERFLYPACK_URL} + GIT_TAG ${EXTERN_BUTTERFLYPACK_GIT_TAG} + SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/ButterflyPACK + BINARY_DIR ${CMAKE_BINARY_DIR}/extern/ButterflyPACK-build + INSTALL_DIR ${CMAKE_INSTALL_PREFIX} + PREFIX ${CMAKE_BINARY_DIR}/extern/ButterflyPACK-cmake + UPDATE_COMMAND "" + PATCH_COMMAND git apply "${BUTTERFLYPACK_PATCH_FILES}" + CONFIGURE_COMMAND ${CMAKE_COMMAND} "${BUTTERFLYPACK_OPTIONS}" + TEST_COMMAND "" + ) + list(APPEND STRUMPACK_DEPENDENCIES butterflypack) + + include(GNUInstallDirs) + if(BUILD_SHARED_LIBS OR + (CMAKE_SYSTEM_NAME MATCHES "Darwin" AND CMAKE_C_COMPILER_ID MATCHES "Clang")) + set(_BUTTERFLYPACK_LIB_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX}) + else() + set(_BUTTERFLYPACK_LIB_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX}) + endif() + set(_BUTTERFLYPACK_LIBRARIES ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/libdbutterflypack${_BUTTERFLYPACK_LIB_SUFFIX}) + list(APPEND _BUTTERFLYPACK_LIBRARIES ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/libsbutterflypack${_BUTTERFLYPACK_LIB_SUFFIX}) + list(APPEND _BUTTERFLYPACK_LIBRARIES ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/libzbutterflypack${_BUTTERFLYPACK_LIB_SUFFIX}) + list(APPEND _BUTTERFLYPACK_LIBRARIES ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/libcbutterflypack${_BUTTERFLYPACK_LIB_SUFFIX}) +endif() + +set(STRUMPACK_OPTIONS ${PALACE_SUPERBUILD_DEFAULT_ARGS}) +list(APPEND STRUMPACK_OPTIONS + "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" + "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" + "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" + "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" + "-DCMAKE_Fortran_COMPILER=${CMAKE_Fortran_COMPILER}" + "-DCMAKE_Fortran_FLAGS=${CMAKE_Fortran_FLAGS}" + "-DSTRUMPACK_USE_MPI=ON" + "-DTPL_ENABLE_PARMETIS=ON" + "-DTPL_METIS_LIBRARIES=${METIS_LIBRARIES}" + "-DTPL_METIS_INCLUDE_DIRS=${CMAKE_INSTALL_PREFIX}/include" + "-DTPL_PARMETIS_LIBRARIES=${PARMETIS_LIBRARIES}$${METIS_LIBRARIES}" + "-DTPL_PARMETIS_INCLUDE_DIRS=${CMAKE_INSTALL_PREFIX}/include" + "-DTPL_ENABLE_SCOTCH=OFF" + "-DTPL_ENABLE_PTSCOTCH=OFF" + "-DTPL_ENABLE_COMBBLAS=OFF" + "-DTPL_ENABLE_PAPI=OFF" + "-DTPL_SCALAPACK_LIBRARIES=${SCALAPACK_LIBRARIES}" +) +if(PALACE_WITH_STRUMPACK_BUTTERFLYPACK) + list(APPEND STRUMPACK_OPTIONS + "-DTPL_ENABLE_BPACK=ON" + "-DTPL_BUTTERFLYPACK_PREFIX=${CMAKE_INSTALL_PREFIX}" + ) +else() + list(APPEND STRUMPACK_OPTIONS + "-DTPL_ENABLE_BPACK=OFF" + ) +endif() +if(PALACE_WITH_STRUMPACK_ZFP) + list(APPEND STRUMPACK_OPTIONS + "-DTPL_ENABLE_ZFP=ON" + "-DTPL_ZFP_PREFIX=${CMAKE_INSTALL_PREFIX}" + ) +else() + list(APPEND STRUMPACK_OPTIONS + "-DTPL_ENABLE_ZFP=OFF" + ) +endif() + +# Always disable OpenMP (seems slower in all cases, just link to threaded BLAS/LAPACK) +# if(PALACE_WITH_OPENMP) +# list(APPEND STRUMPACK_OPTIONS +# "-DSTRUMPACK_USE_OPENMP=ON" +# ) +# else() + list(APPEND STRUMPACK_OPTIONS + "-DSTRUMPACK_USE_OPENMP=OFF" + ) +# endif() + +# Configure BLAS/LAPACK +if(NOT "${BLAS_LAPACK_LIBRARIES}" STREQUAL "") + list(APPEND STRUMPACK_OPTIONS + "-DLAPACK_LIBRARIES=${BLAS_LAPACK_LIBRARIES}" + "-DBLAS_LIBRARIES=${BLAS_LAPACK_LIBRARIES}" + ) +endif() + +# Configure GPU support (for now, disable since faster on CPU) +list(APPEND STRUMPACK_OPTIONS + "-DTPL_ENABLE_SLATE=OFF" + "-DSTRUMPACK_USE_CUDA=OFF" + "-DSTRUMPACK_USE_HIP=OFF" +) +# if(PALACE_WITH_CUDA OR PALACE_WITH_HIP) +# list(APPEND STRUMPACK_OPTIONS +# "-DTPL_ENABLE_SLATE=ON" +# "-DTPL_SLATE_PREFIX=${CMAKE_INSTALL_PREFIX}" +# ) +# else() +# list(APPEND STRUMPACK_OPTIONS 
+# "-DTPL_ENABLE_SLATE=OFF" +# ) +# endif() +# if(PALACE_WITH_CUDA) +# list(APPEND STRUMPACK_OPTIONS +# "-DSTRUMPACK_USE_CUDA=ON" +# "-DCMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}" +# "-DCMAKE_CUDA_FLAGS=${CMAKE_CUDA_FLAGS}" +# ) +# if(NOT "${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "") +# list(APPEND STRUMPACK_OPTIONS +# "-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}" +# ) +# endif() +# else() +# list(APPEND STRUMPACK_OPTIONS +# "-DSTRUMPACK_USE_CUDA=OFF" +# ) +# endif() +# if(PALACE_WITH_HIP) +# list(APPEND STRUMPACK_OPTIONS +# "-DSTRUMPACK_USE_HIP=ON" +# "-DHIP_ROOT_DIR=${ROCM_DIR}" +# "-DCMAKE_HIP_COMPILER=${CMAKE_HIP_COMPILER}" +# "-DCMAKE_HIP_FLAGS=${CMAKE_HIP_FLAGS}" +# ) +# if(NOT "${CMAKE_HIP_ARCHITECTURES}" STREQUAL "") +# list(APPEND STRUMPACK_OPTIONS +# "-DCMAKE_HIP_ARCHITECTURES=${CMAKE_HIP_ARCHITECTURES}" +# ) +# endif() +# else() +# list(APPEND STRUMPACK_OPTIONS +# "-DSTRUMPACK_USE_HIP=OFF" +# ) +# endif() +# if(PALACE_WITH_MAGMA) +# list(APPEND STRUMPACK_OPTIONS +# "-DTPL_ENABLE_MAGMA=ON" +# "-DTPL_MAGMA_PREFIX=${CMAKE_INSTALL_PREFIX}" +# ) +# else() +# list(APPEND STRUMPACK_OPTIONS +# "-DTPL_ENABLE_MAGMA=OFF" +# ) +# endif() + +string(REPLACE ";" "; " STRUMPACK_OPTIONS_PRINT "${STRUMPACK_OPTIONS}") +message(STATUS "STRUMPACK_OPTIONS: ${STRUMPACK_OPTIONS_PRINT}") + +# Fix build +set(STRUMPACK_PATCH_FILES + "${CMAKE_SOURCE_DIR}/extern/patch/STRUMPACK/patch_build.diff" + "${CMAKE_SOURCE_DIR}/extern/patch/STRUMPACK/patch_gpu_init.diff" +) + +include(ExternalProject) +ExternalProject_Add(strumpack + DEPENDS ${STRUMPACK_DEPENDENCIES} + GIT_REPOSITORY ${EXTERN_STRUMPACK_URL} + GIT_TAG ${EXTERN_STRUMPACK_GIT_TAG} + SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/STRUMPACK + BINARY_DIR ${CMAKE_BINARY_DIR}/extern/STRUMPACK-build + INSTALL_DIR ${CMAKE_INSTALL_PREFIX} + PREFIX ${CMAKE_BINARY_DIR}/extern/STRUMPACK-cmake + UPDATE_COMMAND "" + PATCH_COMMAND git apply "${STRUMPACK_PATCH_FILES}" + CONFIGURE_COMMAND ${CMAKE_COMMAND} "${STRUMPACK_OPTIONS}" + TEST_COMMAND "" +) + +# Save variables to cache +set(_STRUMPACK_EXTRA_LIBRARIES) +if(PALACE_WITH_STRUMPACK_BUTTERFLYPACK) + list(APPEND _STRUMPACK_EXTRA_LIBRARIES ${_BUTTERFLYPACK_LIBRARIES}) +endif() +if(PALACE_WITH_STRUMPACK_ZFP) + list(APPEND _STRUMPACK_EXTRA_LIBRARIES ${_ZFP_LIBRARIES}) +endif() +# if(PALACE_WITH_CUDA OR PALACE_WITH_HIP) +# include(GNUInstallDirs) +# if(BUILD_SHARED_LIBS) +# set(_SLATE_LIB_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX}) +# else() +# set(_SLATE_LIB_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX}) +# endif() +# set(_SLATE_LIBRARIES ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/libslate${_SLATE_LIB_SUFFIX}) +# list(APPEND _SLATE_LIBRARIES ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/liblapackpp${_SLATE_LIB_SUFFIX}) +# list(APPEND _SLATE_LIBRARIES ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/libblaspp${_SLATE_LIB_SUFFIX}) +# list(APPEND _STRUMPACK_EXTRA_LIBRARIES ${_SLATE_LIBRARIES}) +# endif() +# if(PALACE_WITH_MAGMA) +# set(_MAGMA_LIBRARIES ${CMAKE_INSTALL_PREFIX}/lib/libmagma_sparse${CMAKE_SHARED_LIBRARY_SUFFIX}) +# list(APPEND _MAGMA_LIBRARIES ${CMAKE_INSTALL_PREFIX}/lib/libmagma${CMAKE_SHARED_LIBRARY_SUFFIX}) +# list(APPEND _STRUMPACK_EXTRA_LIBRARIES ${_MAGMA_LIBRARIES}) +# endif() +if(NOT "${_STRUMPACK_EXTRA_LIBRARIES}" STREQUAL "") + string(REPLACE ";" "$" _STRUMPACK_EXTRA_LIBRARIES "${_STRUMPACK_EXTRA_LIBRARIES}") + set(STRUMPACK_EXTRA_LIBRARIES ${_STRUMPACK_EXTRA_LIBRARIES} CACHE STRING + "List of extra library files for STRUMPACK" + ) +endif() diff --git a/cmake/ExternalScaLAPACK.cmake 
b/cmake/ExternalScaLAPACK.cmake index 3e729b4e51..0b5e1be8cf 100644 --- a/cmake/ExternalScaLAPACK.cmake +++ b/cmake/ExternalScaLAPACK.cmake @@ -1,66 +1,62 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -# -# Build ScaLAPACK (from scivision, with CMake) -# - -# Force build order -set(SCALAPACK_DEPENDENCIES) - -set(SCALAPACK_OPTIONS ${PALACE_SUPERBUILD_DEFAULT_ARGS}) -list(APPEND SCALAPACK_OPTIONS - "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" - "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" - "-DCMAKE_Fortran_COMPILER=${CMAKE_Fortran_COMPILER}" - "-DCMAKE_Fortran_FLAGS=${CMAKE_Fortran_FLAGS}" - "-DBUILD_SINGLE=ON" - "-DBUILD_DOUBLE=ON" - "-DBUILD_COMPLEX=ON" - "-DBUILD_COMPLEX16=ON" - "-DBUILD_TESTING=OFF" -) - -# Configure LAPACK dependency -if(NOT "${BLAS_LAPACK_LIBRARIES}" STREQUAL "") - list(APPEND SCALAPACK_OPTIONS - "-DLAPACK_LIBRARIES=${BLAS_LAPACK_LIBRARIES}" - "-DBLAS_LIBRARIES=${BLAS_LAPACK_LIBRARIES}" - ) -endif() - -string(REPLACE ";" "; " SCALAPACK_OPTIONS_PRINT "${SCALAPACK_OPTIONS}") -message(STATUS "SCALAPACK_OPTIONS: ${SCALAPACK_OPTIONS_PRINT}") - -# Fix build -set(SCALAPACK_PATCH_FILES - "${CMAKE_SOURCE_DIR}/extern/patch/scalapack/patch_build.diff" -) - -include(ExternalProject) -ExternalProject_Add(scalapack - DEPENDS ${SCALAPACK_DEPENDENCIES} - GIT_REPOSITORY ${EXTERN_SCALAPACK_URL} - GIT_TAG ${EXTERN_SCALAPACK_GIT_TAG} - SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/scalapack - BINARY_DIR ${CMAKE_BINARY_DIR}/extern/scalapack-build - INSTALL_DIR ${CMAKE_INSTALL_PREFIX} - PREFIX ${CMAKE_BINARY_DIR}/extern/scalapack-cmake - UPDATE_COMMAND "" - PATCH_COMMAND git apply "${SCALAPACK_PATCH_FILES}" && cd scalapack && git checkout master - CONFIGURE_COMMAND ${CMAKE_COMMAND} "${SCALAPACK_OPTIONS}" - TEST_COMMAND "" -) - -# Save variables to cache -include(GNUInstallDirs) -if(BUILD_SHARED_LIBS) - set(_SCALAPACK_LIB_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX}) -else() - set(_SCALAPACK_LIB_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX}) -endif() -set(_SCALAPACK_LIBRARIES ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/libblacs${_SCALAPACK_LIB_SUFFIX}) -set(_SCALAPACK_LIBRARIES ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/libscalapack${_SCALAPACK_LIB_SUFFIX}$${_SCALAPACK_LIBRARIES}) -set(SCALAPACK_LIBRARIES ${_SCALAPACK_LIBRARIES} - CACHE STRING "List of library files for ScaLAPACK" -) +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +# +# Build ScaLAPACK +# + +# Force build order +set(SCALAPACK_DEPENDENCIES) + +# Silence compiler error +include(CheckCCompilerFlag) +set(SCALAPACK_CFLAGS "${CMAKE_C_FLAGS}") +check_c_compiler_flag(-Wno-implicit-function-declaration SUPPORTS_NOIMPLICITFUNC_WARNING) +if(SUPPORTS_NOIMPLICITFUNC_WARNING) + set(SCALAPACK_CFLAGS "${SCALAPACK_CFLAGS} -Wno-implicit-function-declaration") +endif() + +set(SCALAPACK_OPTIONS ${PALACE_SUPERBUILD_DEFAULT_ARGS}) +list(APPEND SCALAPACK_OPTIONS + "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" + "-DCMAKE_C_FLAGS=${SCALAPACK_CFLAGS}" + "-DCMAKE_Fortran_COMPILER=${CMAKE_Fortran_COMPILER}" + "-DCMAKE_Fortran_FLAGS=${CMAKE_Fortran_FLAGS}" + "-DSCALAPACK_BUILD_TESTS=OFF" +) + +# Configure LAPACK dependency +if(NOT "${BLAS_LAPACK_LIBRARIES}" STREQUAL "") + list(APPEND SCALAPACK_OPTIONS + "-DLAPACK_LIBRARIES=${BLAS_LAPACK_LIBRARIES}" + "-DBLAS_LIBRARIES=${BLAS_LAPACK_LIBRARIES}" + ) +endif() + +string(REPLACE ";" "; " SCALAPACK_OPTIONS_PRINT "${SCALAPACK_OPTIONS}") +message(STATUS "SCALAPACK_OPTIONS: ${SCALAPACK_OPTIONS_PRINT}") + +include(ExternalProject) +ExternalProject_Add(scalapack + DEPENDS ${SCALAPACK_DEPENDENCIES} + GIT_REPOSITORY ${EXTERN_SCALAPACK_URL} + GIT_TAG ${EXTERN_SCALAPACK_GIT_TAG} + SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/scalapack + BINARY_DIR ${CMAKE_BINARY_DIR}/extern/scalapack-build + INSTALL_DIR ${CMAKE_INSTALL_PREFIX} + PREFIX ${CMAKE_BINARY_DIR}/extern/scalapack-cmake + UPDATE_COMMAND "" + CONFIGURE_COMMAND ${CMAKE_COMMAND} "${SCALAPACK_OPTIONS}" + TEST_COMMAND "" +) + +include(GNUInstallDirs) +# Save variables to cache +if(BUILD_SHARED_LIBS) + set(_SCALAPACK_LIB_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX}) +else() + set(_SCALAPACK_LIB_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX}) +endif() +set(SCALAPACK_LIBRARIES ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/libscalapack${_SCALAPACK_LIB_SUFFIX} + CACHE STRING "List of library files for ScaLAPACK" +) diff --git a/cmake/ExternalSuperLU_DIST.cmake b/cmake/ExternalSuperLU_DIST.cmake index 1c157d2129..9bce31e8a4 100644 --- a/cmake/ExternalSuperLU_DIST.cmake +++ b/cmake/ExternalSuperLU_DIST.cmake @@ -1,111 +1,125 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -# -# Build SuperLU_DIST -# - -# Force build order -set(SUPERLU_DEPENDENCIES parmetis) - -set(SUPERLU_OPTIONS ${PALACE_SUPERBUILD_DEFAULT_ARGS}) -list(APPEND SUPERLU_OPTIONS - "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" - "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" - "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" - "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" - "-DXSDK_ENABLE_Fortran=OFF" - "-Denable_tests=OFF" - "-Denable_examples=OFF" - "-Denable_double=ON" - "-Denable_single=ON" - "-Denable_complex16=ON" - "-Denable_openmp=${PALACE_WITH_OPENMP}" - "-DTPL_ENABLE_PARMETISLIB=ON" - "-DTPL_PARMETIS_LIBRARIES=${PARMETIS_LIBRARIES}$${METIS_LIBRARIES}" - "-DTPL_PARMETIS_INCLUDE_DIRS=${CMAKE_INSTALL_PREFIX}/include" - "-DTPL_ENABLE_COMBBLASLIB=OFF" -) - -# SuperLU_DIST has a BUILD_STATIC_LIBS option which defaults to ON -if(BUILD_SHARED_LIBS) - list(APPEND SUPERLU_OPTIONS - "-DBUILD_STATIC_LIBS=OFF" - ) -endif() - -# Configure 64-bit indices -if(PALACE_WITH_64BIT_INT) - list(APPEND SUPERLU_OPTIONS - "-DXSDK_INDEX_SIZE=64" - ) -endif() - -# Configure LAPACK dependency -if(NOT "${BLAS_LAPACK_LIBRARIES}" STREQUAL "") - list(APPEND SUPERLU_OPTIONS - "-DTPL_ENABLE_LAPACKLIB=ON" - "-DTPL_ENABLE_INTERNAL_BLASLIB=OFF" - "-DLAPACK_LIBRARIES=${BLAS_LAPACK_LIBRARIES}" - "-DBLAS_LIBRARIES=${BLAS_LAPACK_LIBRARIES}" - ) -endif() - -# Configure GPU support -if(PALACE_WITH_CUDA) - list(APPEND SUPERLU_OPTIONS - "-DTPL_ENABLE_CUDALIB=ON" - "-DCMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}" - "-DCMAKE_CUDA_FLAGS=${CMAKE_CUDA_FLAGS}" - ) - if(NOT "${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "") - list(APPEND SUPERLU_OPTIONS - "-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}" - ) - endif() -else() - list(APPEND SUPERLU_OPTIONS - "-DTPL_ENABLE_CUDALIB=OFF" - ) -endif() -if(PALACE_WITH_HIP) - list(APPEND SUPERLU_OPTIONS - "-DTPL_ENABLE_HIPLIB=ON" - "-DHIP_ROOT_DIR=${ROCM_DIR}" - "-DCMAKE_HIP_COMPILER=${CMAKE_HIP_COMPILER}" - "-DCMAKE_HIP_FLAGS=${CMAKE_HIP_FLAGS}" - ) - if(NOT "${CMAKE_HIP_ARCHITECTURES}" STREQUAL "") - list(APPEND SUPERLU_OPTIONS - "-DCMAKE_HIP_ARCHITECTURES=${CMAKE_HIP_ARCHITECTURES}" - ) - endif() -else() - list(APPEND SUPERLU_OPTIONS - "-DTPL_ENABLE_HIPLIB=OFF" - ) -endif() - -string(REPLACE ";" "; " SUPERLU_OPTIONS_PRINT "${SUPERLU_OPTIONS}") -message(STATUS "SUPERLU_OPTIONS: ${SUPERLU_OPTIONS_PRINT}") - -# Fix column permutations -set(SUPERLU_PATCH_FILES - "${CMAKE_SOURCE_DIR}/extern/patch/superlu_dist/patch_metis.diff" - "${CMAKE_SOURCE_DIR}/extern/patch/superlu_dist/patch_parmetis.diff" -) - -include(ExternalProject) -ExternalProject_Add(superlu_dist - DEPENDS ${SUPERLU_DEPENDENCIES} - GIT_REPOSITORY ${EXTERN_SUPERLU_URL} - GIT_TAG ${EXTERN_SUPERLU_GIT_TAG} - SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/superlu_dist - BINARY_DIR ${CMAKE_BINARY_DIR}/extern/superlu_dist-build - INSTALL_DIR ${CMAKE_INSTALL_PREFIX} - PREFIX ${CMAKE_BINARY_DIR}/extern/superlu_dist-cmake - UPDATE_COMMAND "" - PATCH_COMMAND git apply "${SUPERLU_PATCH_FILES}" - CONFIGURE_COMMAND ${CMAKE_COMMAND} "${SUPERLU_OPTIONS}" - TEST_COMMAND "" -) +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +# +# Build SuperLU_DIST +# + +# Force build order +set(SUPERLU_DEPENDENCIES parmetis) + +set(SUPERLU_OPTIONS ${PALACE_SUPERBUILD_DEFAULT_ARGS}) +list(APPEND SUPERLU_OPTIONS + "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" + "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" + "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" + "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" + "-DXSDK_ENABLE_Fortran=OFF" + "-Denable_tests=OFF" + "-Denable_examples=OFF" + "-Denable_double=ON" + "-Denable_single=ON" + "-Denable_complex16=ON" + "-Denable_python=OFF" + "-DTPL_ENABLE_PARMETISLIB=ON" + "-DTPL_PARMETIS_LIBRARIES=${PARMETIS_LIBRARIES}$${METIS_LIBRARIES}" + "-DTPL_PARMETIS_INCLUDE_DIRS=${CMAKE_INSTALL_PREFIX}/include" + "-DTPL_ENABLE_COMBBLASLIB=OFF" +) + +# Always disable OpenMP (seems slower in all cases, just link to threaded BLAS/LAPACK) +# if(PALACE_WITH_OPENMP) +# list(APPEND SUPERLU_OPTIONS +# "-Denable_openmp=ON" +# ) +# else() + list(APPEND SUPERLU_OPTIONS + "-Denable_openmp=OFF" + ) +# endif() + +# SuperLU_DIST has a BUILD_STATIC_LIBS option which defaults to ON +if(BUILD_SHARED_LIBS) + list(APPEND SUPERLU_OPTIONS + "-DBUILD_STATIC_LIBS=OFF" + ) +endif() + +# Configure 64-bit indices +if(PALACE_WITH_64BIT_INT) + list(APPEND SUPERLU_OPTIONS + "-DXSDK_INDEX_SIZE=64" + ) +endif() + +# Configure LAPACK dependency +if(NOT "${BLAS_LAPACK_LIBRARIES}" STREQUAL "") + list(APPEND SUPERLU_OPTIONS + "-DTPL_ENABLE_LAPACKLIB=ON" + "-DTPL_ENABLE_INTERNAL_BLASLIB=OFF" + "-DLAPACK_LIBRARIES=${BLAS_LAPACK_LIBRARIES}" + "-DBLAS_LIBRARIES=${BLAS_LAPACK_LIBRARIES}" + ) +endif() + +# Configure GPU support (for now, disable since faster on CPU) +list(APPEND SUPERLU_OPTIONS + "-DTPL_ENABLE_CUDALIB=OFF" + "-DTPL_ENABLE_HIPLIB=OFF" +) +# if(PALACE_WITH_CUDA) +# list(APPEND SUPERLU_OPTIONS +# "-DTPL_ENABLE_CUDALIB=ON" +# "-DCMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}" +# "-DCMAKE_CUDA_FLAGS=${CMAKE_CUDA_FLAGS}" +# ) +# if(NOT "${CMAKE_CUDA_ARCHITECTURES}" STREQUAL "") +# list(APPEND SUPERLU_OPTIONS +# "-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}" +# ) +# endif() +# else() +# list(APPEND SUPERLU_OPTIONS +# "-DTPL_ENABLE_CUDALIB=OFF" +# ) +# endif() +# if(PALACE_WITH_HIP) +# list(APPEND SUPERLU_OPTIONS +# "-DTPL_ENABLE_HIPLIB=ON" +# "-DHIP_ROOT_DIR=${ROCM_DIR}" +# "-DCMAKE_HIP_COMPILER=${CMAKE_HIP_COMPILER}" +# "-DCMAKE_HIP_FLAGS=${CMAKE_HIP_FLAGS}" +# ) +# if(NOT "${CMAKE_HIP_ARCHITECTURES}" STREQUAL "") +# list(APPEND SUPERLU_OPTIONS +# "-DCMAKE_HIP_ARCHITECTURES=${CMAKE_HIP_ARCHITECTURES}" +# ) +# endif() +# else() +# list(APPEND SUPERLU_OPTIONS +# "-DTPL_ENABLE_HIPLIB=OFF" +# ) +# endif() + +string(REPLACE ";" "; " SUPERLU_OPTIONS_PRINT "${SUPERLU_OPTIONS}") +message(STATUS "SUPERLU_OPTIONS: ${SUPERLU_OPTIONS_PRINT}") + +# Fix column permutations +set(SUPERLU_PATCH_FILES + "${CMAKE_SOURCE_DIR}/extern/patch/superlu_dist/patch_parmetis.diff" +) + +include(ExternalProject) +ExternalProject_Add(superlu_dist + DEPENDS ${SUPERLU_DEPENDENCIES} + GIT_REPOSITORY ${EXTERN_SUPERLU_URL} + GIT_TAG ${EXTERN_SUPERLU_GIT_TAG} + SOURCE_DIR ${CMAKE_BINARY_DIR}/extern/superlu_dist + BINARY_DIR ${CMAKE_BINARY_DIR}/extern/superlu_dist-build + INSTALL_DIR ${CMAKE_INSTALL_PREFIX} + PREFIX ${CMAKE_BINARY_DIR}/extern/superlu_dist-cmake + UPDATE_COMMAND "" + PATCH_COMMAND git apply "${SUPERLU_PATCH_FILES}" + CONFIGURE_COMMAND ${CMAKE_COMMAND} "${SUPERLU_OPTIONS}" + TEST_COMMAND "" +) diff --git a/docs/Project.toml b/docs/Project.toml index dfa65cd107..2844e2aa5e 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,2 
+1,4 @@ -[deps] -Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +[deps] +Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" diff --git a/docs/make.jl b/docs/make.jl index 7db1581112..0c5a85362d 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,54 +1,67 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -using Documenter - -makedocs( - format=Documenter.HTML( - # Always use clean URLs so that raw HTML works, view local builds using a local - # HTTP server with `python3 -m http.server`, for example - prettyurls=true, - sidebar_sitename=false, - collapselevel=2, - assets=["assets/favicon.ico"] - ), - sitename="Palace", - authors="Sebastian Grimberg, sjg@amazon.com", - pages=[ - "Home" => "index.md", - "install.md", - "run.md", - "User Guide" => Any[ - "guide/guide.md", - "guide/problem.md", - "guide/model.md", - "guide/boundaries.md", - "guide/postprocessing.md" - ], - "Configuration File" => Any[ - "config/config.md", - "config/problem.md", - "config/model.md", - "config/domains.md", - "config/boundaries.md", - "config/solver.md" - ], - "Examples" => Any[ - "examples/examples.md", - "examples/spheres.md", - "examples/rings.md", - "examples/cavity.md", - "examples/coaxial.md", - "examples/cpw.md" - ], - "developer.md", - "reference.md" - ] -) - -deploydocs( - repo="github.com/awslabs/palace.git", - devbranch="main", - push_preview=true, - forcepush=true -) +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Run with `julia --project make.jl` from within the `docs` folder. +# Output HTML is saved to the `build` folder. 
+ +using Documenter + +makedocs( + format=Documenter.HTML( + # Always use clean URLs so that raw HTML works, view local builds using a local + # HTTP server with `python3 -m http.server`, for example + prettyurls=true, + sidebar_sitename=false, + collapselevel=2, + assets=["assets/favicon.ico"] + ), + sitename="Palace", + authors="The Palace Developers and Maintainers, palace-maint@amazon.com", + pages=[ + "Home" => "index.md", + "Quick Start" => "quick.md", + "install.md", + "run.md", + "User Guide" => Any[ + "guide/guide.md", + "guide/problem.md", + "guide/model.md", + "guide/boundaries.md", + "guide/postprocessing.md", + "guide/parallelism.md" + ], + "Configuration File" => Any[ + "config/config.md", + "config/problem.md", + "config/model.md", + "config/domains.md", + "config/boundaries.md", + "config/solver.md" + ], + "Features" => Any["features/farfield.md",], + "Examples" => Any[ + "examples/examples.md", + "examples/spheres.md", + "examples/rings.md", + "examples/antenna.md", + "examples/cylinder.md", + "examples/coaxial.md", + "examples/cpw.md" + ], + "faq.md", + "For Developers" => Any[ + "developer/notes.md", + "developer/testing.md", + "developer/tutorial_add_new_unit_test.md", + "developer/tutorial_gpu_profiling.md" + ], + "reference.md" + ] +) + +deploydocs( + repo="github.com/awslabs/palace.git", + devbranch="main", + push_preview=true, + forcepush=true +) diff --git a/docs/src/assets/examples/cavity-1.png b/docs/src/assets/examples/cavity-1.png index 526fa71608..0d7061beda 100644 Binary files a/docs/src/assets/examples/cavity-1.png and b/docs/src/assets/examples/cavity-1.png differ diff --git a/docs/src/assets/examples/cavity-2a.png b/docs/src/assets/examples/cavity-2a.png index f1b2919f65..1a4a08674b 100644 Binary files a/docs/src/assets/examples/cavity-2a.png and b/docs/src/assets/examples/cavity-2a.png differ diff --git a/docs/src/assets/examples/cavity-2b.png b/docs/src/assets/examples/cavity-2b.png index 7a54b35e81..b719a1c037 100644 Binary files a/docs/src/assets/examples/cavity-2b.png and b/docs/src/assets/examples/cavity-2b.png differ diff --git a/docs/src/assets/examples/cpw-1.png b/docs/src/assets/examples/cpw-1.png index 36ddba5b9d..348bce224b 100644 Binary files a/docs/src/assets/examples/cpw-1.png and b/docs/src/assets/examples/cpw-1.png differ diff --git a/docs/src/assets/examples/cpw-2.png b/docs/src/assets/examples/cpw-2.png index d12f50893c..cf2d315ad9 100644 Binary files a/docs/src/assets/examples/cpw-2.png and b/docs/src/assets/examples/cpw-2.png differ diff --git a/docs/src/assets/examples/cpw-3.png b/docs/src/assets/examples/cpw-3.png index 47b59be805..541acf4812 100644 Binary files a/docs/src/assets/examples/cpw-3.png and b/docs/src/assets/examples/cpw-3.png differ diff --git a/docs/src/config/boundaries.md b/docs/src/config/boundaries.md index 0a4419718a..2640270d0e 100644 --- a/docs/src/config/boundaries.md +++ b/docs/src/config/boundaries.md @@ -1,631 +1,690 @@ -```@raw html - - -``` - -# `config["Boundaries"]` - -```json -"Boundaries": -{ - "PEC": - { - ... - }, - "PMC": - { - ... - }, - "Impedance": - [ - ... - ], - "Absorbing": - { - ... - }, - "Conductivity": - [ - ... - ], - "LumpedPort": - [ - ... - ], - "WavePort": - [ - ... - ], - "WavePortPEC": - { - ... - }, - "SurfaceCurrent": - [ - ... - ], - "Ground": - { - ... - }, - "ZeroCharge": - { - ... - }, - "Terminal": - [ - ... - ], - "Postprocessing": - { - "Capacitance": - [ - ... - ], - "Inductance": - [ - ... - ], - "Dielectric": - [ - ... 
- ] - } -} -``` - -with - -`"PEC"` : Top-level object for configuring perfect electric conductor (PEC) boundary -conditions (zero tangential electric field). - -`"PMC"` : Top-level object for configuring perfect magnetic conductor (PMC) boundary -conditions (zero tangential magnetic field). Also imposes symmetry of the electric field -across the boundary surface. - -`"Impedance"` : Array of objects for configuring surface impedance boundary conditions. A -surface impedance boundary relates the tangential electric and magnetic fields on the -boundary using a user specified surface impedance. - -`"Absorbing"` : Top-level object for configuring absorbing boundary conditions. These are -artificial scattering boundary conditions at farfield boundaries. - -`"Conductivity"` : Array of objects for configuring finite conductivity surface impedance -boundary conditions. Finite conductivity boundaries are only available for the frequency -domain driven simulation type. - -`"LumpedPort"` : Array of objects for configuring lumped port boundary conditions. Lumped -ports can be specified on boundaries which are internal to the computational domain. - -`"WavePort"` : Array of objects for configuring numeric wave port boundary conditions. Wave -ports can only be specified on boundaries which are on the true boundary of the -computational domain. Additionally, wave port boundaries are only available for the -frequency domain driven simulation type. - -`"WavePortPEC"` : Top-level object for configuring PEC boundary conditions for boundary -mode analysis performed on the wave port boundaries. Thus, this object is only relevant -when wave port boundaries are specified under [`config["Boundaries"]["WavePort"]`] -(#boundaries[%5B%22WavePort%22%5D]). - -`"SurfaceCurrent"` : Array of objects for configuring surface current boundary conditions. -This boundary prescribes a unit source surface current excitation on the given boundary in -order to excite a frequency or time domain driven simulation or magnetostatic simulation. -For the magnetostatic simulation type, entries of the inductance matrix are extracted -corresponding to each surface current boundary. - -`"Ground"` : Top-level object for specifying ground, or zero voltage, boundary conditions -for for electrostatic simulations. - -`"ZeroCharge"` : Top-level object for specifying zero charge boundary conditions for for -electrostatic simulations. Also imposes symmetry of the electric field across the boundary -surface. - -`"Terminal"` : Array of objects for configuring terminal boundary conditions for -electrostatic simulations. Entries of the capacitance matrix are extracted corresponding to -each terminal boundary. - -`"Postprocessing"` : Top-level object for configuring boundary postprocessing. - -`"Capacitance"` : Array of objects for postprocessing surface capacitance by the ratio of -the integral of the induced surface charge on the boundary and the excitation voltage. - -`"Inductance"` : Array of objects for postprocessing surface inductance by the ratio of the -integral of the magnetic flux through the boundary and the excitation current. - -`"Dielectric"` : Array of objects for postprocessing surface interface dielectric loss. - -## `boundaries["PEC"]` - -```json -"PEC": -{ - "Attributes": [] -} -``` - -with - -`"Attributes" [None]` : Integer array of mesh boundary attributes at which to apply the PEC -boundary condition. 
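
To make the `"Attributes"` convention concrete, here is a minimal sketch of a `"Boundaries"` object that applies the PEC condition to three mesh boundary attributes; the attribute numbers are placeholders chosen only for illustration and must correspond to boundary attributes defined in the user's mesh:

```json
"Boundaries":
{
  "PEC":
  {
    "Attributes": [1, 2, 3]
  }
}
```
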
- -## `boundaries["PMC"]` - -```json -"PMC": -{ - "Attributes": [] -} -``` - -with - -`"Attributes" [None]` : Integer array of mesh boundary attributes at which to apply the -PMC boundary condition. - -## `boundaries["Impedance"]` - -```json -"Impedance": -[ - { - "Attributes": [], - "Rs": , - "Ls": , - "Cs": - }, - ... -] -``` - -with - -`"Attributes" [None]` : Integer array of mesh boundary attributes for this surface -impedance boundary. - -`"Rs" [0.0]` : Surface resistance used for computing this surface impedance boundary's -impedance per square, ``\Omega``/sq. - -`"Ls" [0.0]` : Surface inductance used for computing this surface impedance boundary's -impedance per square, H/sq. - -`"Cs" [0.0]` : Surface capacitance used computing this surface impedance boundary's -impedance per square, F/sq. - -## `boundaries["Absorbing"]` - -```json -"Absorbing": -{ - "Attributes": [], - "Order": -} -``` - -with - -`"Attributes" [None]` : Integer array of mesh boundary attributes at which to apply -farfield absorbing boundary conditions. - -`"Order" [1]` : Specify a first- or second-order approximation for the farfield absorbing -boundary condition. Second-order absorbing boundary conditions are only available for the -frequency domain driven simulation type. - -## `boundaries["Conductivity"]` - -```json -"Conductivity": -[ - { - "Attributes": [], - "Conductivity": , - "Permeability": , - "Thickness": - }, - ... -] -``` - -with - -`"Attributes" [None]` : Integer array of mesh boundary attributes for this finite -conductivity boundary. - -`"Conductivity" [None]` : Electrical conductivity for this finite conductivity boundary, -S/m. - -`"Permeability" [1.0]` : Relative permeability for this finite conductivity boundary. - -`"Thickness" [None]` : Optional conductor thickness for this finite conductivity boundary -specified in mesh length units. Activates a finite conductivity boundary condition which -accounts for nonzero metal thickness. - -## `boundaries["LumpedPort"]` - -```json -"LumpedPort": -[ - { - "Index": , - "Attributes": [], - "Direction": or [], - "CoordinateSystem": , - "Excitation": , - "R": , - "L": , - "C": , - "Rs": , - "Ls": , - "Cs": , - "Elements": - [ - { - "Attributes": or [], - "Direction": , - "CoordinateSystem": - }, - ... - ] - }, - ... -] -``` - -with - -`"Index" [None]` : Index of this lumped port, used in postprocessing output files. - -`"Attributes" [None]` : Integer array of mesh boundary attributes for this lumped port -boundary. If this port is to be a multielement lumped port with more than a single lumped -element, use the `"Elements"` array described below. - -`"Direction" [None]` : Direction to define the polarization direction of the port field -mode on this lumped port boundary. Axis aligned lumped ports can be specified using -keywords: `"+X"`, `"-X"`, `"+Y"`, `"-Y"`, `"+Z"`, `"-Z"`, while coaxial lumped ports can be -specified using `"+R"`, `"-R"`. The direction can alternatively be specified as a -normalized array of three values, for example `[0, 1, 0]`. If a vector direction is -specified, the `"CoordinateSystem"` value specifies the coordinate system it is expressed -in. If this port is to be a multielement lumped port with more than a single lumped -element, use the `"Elements"` array described below. - -`"CoordinateSystem" ["Cartesian"]` : Coordinate system used to express the `"Direction"` -vector, the options are `"Cartesian"` and `"Cylindrical"`. 
If a keyword argument is used -for `"Direction"` this value is ignored, and the appropriate coordinate system is used -instead. - -`"Excitation" [false]` : Turns on or off port excitation for this lumped port boundary for -driven or transient simulation types. - -`"R" [0.0]` : Circuit resistance used for computing this lumped port boundary's impedance, -``\Omega``. This option should only be used along with the corresponding `"L"` and `"C"` -parameters, and not with any of the surface parameters `"Rs"`, `"Ls"`, or `"Cs"`. - -`"L" [0.0]` : Circuit inductance used for computing this lumped port boundary's impedance, -H. This option should only be used along with the corresponding `"R"` and `"C"` parameters, -and not with any of the surface parameters `"Rs"`, `"Ls"`, or `"Cs"`. - -`"C" [0.0]` : Circuit capacitance used for computing this lumped port boundary's impedance, -F. This option should only be used along with the corresponding `"R"` and `"L"` parameters, -and not with any of the surface parameters `"Rs"`, `"Ls"`, or `"Cs"`. - -`"Rs" [0.0]` : Surface resistance used for computing this lumped port boundary's impedance, -``\Omega``/sq. This option should only be used along with the corresponding `"Ls"` and -`"Cs"` parameters, and not with any of the circuit parameters `"R"`, `"L"`, or `"C"`. - -`"Ls" [0.0]` : Surface inductance used for computing this lumped port boundary's impedance, -H/sq. This option should only be used along with the corresponding `"Rs"` and `"Cs"` -parameters, and not with any of the circuit parameters `"R"`, `"L"`, or `"C"`. - -`"Cs" [0.0]` : Surface capacitance used for computing this lumped port boundary's -impedance, F/sq. This option should only be used along with the corresponding `"Rs"` and -`"Ls"` parameters, and not with any of the circuit parameters `"R"`, `"L"`, or `"C"`. - -`"Elements"[]["Attributes"] [None]` : This option is for multielement lumped ports and -should not be combined with the `"Attributes"` field described above. Each element of a -multielement lumped port can be described by its own unique integer array of mesh boundary -attributes, which are specified here. The elements of a multielement port add in parallel. - -`"Elements"[]["Direction"] [None]` : This option is for multielement lumped ports and -should not be combined with the `"Direction"` field described above. Each element of a -multielement lumped port can be described by its own unique direction, which is specified -here. The elements of a multielement port add in parallel. - -`"Elements"[]["CoordinateSystem"] ["Cartesian"]` : This option is for multielement lumped -ports and should not be combined with the `"CoordinateSystem"` field described above. Each -element of a multielement lumped port can be described by its own unique direction, and -corresponding coordinate system. - -## `boundaries["WavePort"]` - -```json -"WavePort": -[ - { - "Index": , - "Attributes": [], - "Excitation": , - "Mode": , - "Offset": - }, - ... -] -``` - -with - -`"Index" [None]` : Index of this wave port boundary, used in postprocessing output files. - -`"Attributes" [None]` : Integer array of mesh boundary attributes for this wave port -boundary. - -`"Excitation" [false]` : Turns on or off port excitation for this wave port boundary for -driven simulation types. - -`"Mode" [1]` : Mode index (1-based) for the characteristic port mode of this wave port -boundary. Ranked in order of decreasing wave number. 
- -`"Offset" [0.0]` : Offset distance used for scattering parameter de-embedding for this wave -port boundary, specified in mesh length units. - -## `boundaries["WavePortPEC"]` - -```json -"WavePortPEC": -{ - "Attributes": [] -} -``` - -with - -`"Attributes" [None]` : Integer array of mesh boundary attributes to consider along with -those specified under [`config["Boundaries"]["PEC"]["Attributes"]`] -(#boundaries%5B%22PEC%22%5D) as PEC when performing wave port boundary mode analysis. - -## `boundaries["SurfaceCurrent"]` - -```json -"SurfaceCurrent": -[ - { - "Index": , - "Attributes": [], - "Direction": or [], - "CoordinateSystem": , - "Elements": - [ - { - "Attributes": [], - "Direction": or [], - "CoordinateSystem": , - }, - ... - ] - }, - ... -] -``` - -with - -`"Index" [None]` : Index of this surface current boundary, used in postprocessing output -files. - -`"Attributes" [None]` : Integer array of mesh boundary attributes for this surface current -boundary. If this source is to be a multielement source which distributes the source -across more than a single lumped element, use the `"Elements"` array described below. - -`"Direction" [None]` : Defines the source current direction for this surface current -boundary. The available options are the same as under -[`config["Boundaries"]["LumpedPort"]["Direction"]`](#boundaries%5B%22LumpedPort%22%5D). If -this source is to be a multielement source which distributes the source across more than a -single lumped element, use the `"Elements"` array described below. - -`"CoordinateSystem" ["Cartesian"]` : Defines the coordinate system for the source current -direction for this surface current boundary. The available options are the same as under -[`config["Boundaries"]["LumpedPort"]["CoordinateSystem"]`](#boundaries%5B%22LumpedPort%22%5D). -If this source is to be a multielement source which distributes the source across more than -a single lumped element, use the `"Elements"` array described below. - -`"Elements"[]["Attributes"] [None]` : This option is for multielement surface current -boundaries should not be combined with the `"Attributes"` field described above. Each -element of a multielement current source can be described by its own unique integer array of -mesh boundary attributes, which are specified here. The elements of a multielement source -add in parallel to give the same total current as a single-element source. - -`"Elements"[]["Direction"] [None]` : This option is for multielement surface current -boundaries and should not be combined with the `"Direction"` field described above. Each -element of a multielement current source can be described by its own unique direction, -which is specified here. The elements of a multielement source add in parallel to give the -same total current as a single-element source. - -`"Elements"[]["CoordinateSystem"] ["Cartesian"]` : This option is for multielement surface current -boundaries and should not be combined with the `"CoordinateSystem"` field described above. Each -element of a multielement current source can be described by its own unique -direction, and corresponding coordinate system. - -## `boundaries["Ground"]` - -```json -"Ground": -{ - "Attributes": [] -} -``` - -with - -`"Attributes" [None]` : Integer array of mesh boundary attributes at which to apply the -ground boundary condition. 
- -## `boundaries["ZeroCharge"]` - -```json -"ZeroCharge": -{ - "Attributes": [] -} -``` - -with - -`"Attributes" [None]` : Integer array of mesh boundary attributes at which to apply the -zero-charge boundary condition. - -## `boundaries["Terminal"]` - -```json -"Terminal": -[ - { - "Index": - "Attributes": [], - }, - ... -] -``` - -with - -`"Index" [None]` : Index of this terminal boundary, used in postprocessing output files and -to index the computed capacitance matrix. - -`"Attributes" [None]` : Integer array of mesh boundary attributes for this terminal -boundary. - -## `boundaries["Postprocessing"]["Capacitance"]` - -```json -"Postprocessing": -{ - "Capacitance": - [ - { - "Index": - "Attributes": [], - }, - ... - ] -} -``` - -with - -`"Index" [None]` : Index of this capacitance postprocessing boundary, used in -postprocessing output files. - -`"Attributes" [None]` : Integer array of mesh boundary attributes for this capacitance -postprocessing boundary. - -## `boundaries["Postprocessing"]["Inductance"]` - -```json -"Postprocessing": -{ - "Inductance": - [ - { - "Index": , - "Attributes": [], - "Direction": - }, - ... - ] -} -``` - -with - -`"Index" [None]` : Index of this inductance postprocessing boundary, used in postprocessing -output files. - -`"Attributes" [None]` : Integer array of mesh boundary attributes for this inductance -postprocessing boundary. - -`"Direction" [None]` : Defines the global direction with which to orient the surface -normals with computing the magnetic flux for this inductance postprocessing boundary. The -available options are: `"+X"`, `"-X"`, `"+Y"`, `"-Y"`, `"+Z"`, `"-Z"`. The direction can -alternatively be specified as a normalized array of three values, for example `[0, 1, 0]`. -The true surface normal is used in the calculation, `"Direction"` is only used to ensure -the correct choice of orientation of the normal. - -## `boundaries["Postprocessing"]["Dielectric"]` - -```json -"Postprocessing": -{ - "Dielectric": - [ - { - "Index": , - "Attributes": [], - "Side": or [], - "Thickness": , - "Permittivity": , - "PermittivityMA": , - "PermittivityMS": , - "PermittivitySA": , - "LossTan": , - "Elements": - [ - { - "Attributes": [], - "Side": or [] - }, - ... - ] - }, - ... - ] -} -``` - -with - -`"Index" [None]` : Index of this lossy dielectric interface, used in postprocessing output -files. - -`"Attributes" [None]` : Integer array of mesh boundary attributes for this lossy dielectric -interface. If the interface consists of multiple elements with different `"Side"` values, -use the `"Elements"` array described below. - -`"Side" [None]` : Defines the postprocessing side when this dielectric interface is an -internal boundary surface (and thus the electric field on the boundary is in general -double-valued). The available options are: `"+X"`, `"-X"`, `"+Y"`, `"-Y"`, `"+Z"`, `"-Z"`. -The direction can alternatively be specified as a normalized array of three values, for -example `[0, 1, 0]`. If the boundary is not axis-aligned, the field value is taken from the -side which is oriented along the specified direction. If no `"Side"` is specified, the -field solution is taken from the neighboring element with the smaller electrical -permittivity, which is an attempt to get the field in the domain corresponding to vacuum. -If the interface consists of multiple elements with different `"Side"` values, use the -`"Elements"` array described below. - -`"Thickness" [None]` : Thickness of this dielectric interface, specified in mesh length -units. 
- -`"Permittivity" [None]` : Relative permittivity for this dielectric interface. Leads to the -general quality factor calculation without assuming the interface is a specific metal-air -(MA), metal-substrate (MS), or substrate-air (SA) interface. None of `"PermittivityMA"`, -`"PermittivityMS"`, or `"PermittivitySA"` should be specified when this value is given. - -`"PermittivityMA" [None]` : Relative permittivity for this dielectric interface assuming it -is a metal-air (MA) interface. None of `"PermittivityMS"`, `"PermittivitySA"`, or the -general `"Permittivity"` should be specified when this value is given. - -`"PermittivityMS" [None]` : Relative permittivity for this dielectric interface assuming it -is a metal-substrate (MS) interface. None of `"PermittivityMA"`, `"PermittivitySA"`, or the -general `"Permittivity"` should be specified when this value is given. - -`"PermittivitySA" [None]` : Relative permittivity for this dielectric interface assuming it -is a substrate-air (SA) interface. None of `"PermittivityMA"`, `"PermittivityMS"`, or the -general `"Permittivity"` should be specified when this value is given. - -`"LossTan" [0.0]` : Loss tangent for this lossy dielectric interface. - -`"Elements"[]."Attributes" [None]` : This option should not be combined with the -`"Attributes"` field described above. In the case where a single dielectric interface is -made up of contributions with their own unique integer arrays of mesh boundary attributes, -they can be specified here. - -`"Elements"[]."Side" [None]` : This option should not be combined with the `"Side"` field -described above. In the case where a single dielectric interface is made up of contributions -with their own entry for side, they can be specified here. +```@raw html + + +``` + +# `config["Boundaries"]` + +```json +"Boundaries": +{ + "PEC": + { + ... + }, + "PMC": + { + ... + }, + "Impedance": + [ + ... + ], + "Absorbing": + { + ... + }, + "Conductivity": + [ + ... + ], + "LumpedPort": + [ + ... + ], + "WavePort": + [ + ... + ], + "WavePortPEC": + { + ... + }, + "SurfaceCurrent": + [ + ... + ], + "Ground": + { + ... + }, + "ZeroCharge": + { + ... + }, + "Terminal": + [ + ... + ], + "Periodic": + { + ... + }, + "Postprocessing": + { + "SurfaceFlux": + [ + ... + ], + "Dielectric": + [ + ... + ] + "FarField": + { + ... + } + } +} +``` + +with + +`"PEC"` : Top-level object for configuring perfect electric conductor (PEC) boundary +conditions (zero tangential electric field). + +`"PMC"` : Top-level object for configuring perfect magnetic conductor (PMC) boundary +conditions (zero tangential magnetic field). Also imposes symmetry of the electric field +across the boundary surface. + +`"Impedance"` : Array of objects for configuring surface impedance boundary conditions. A +surface impedance boundary relates the tangential electric and magnetic fields on the +boundary using a user specified surface impedance. + +`"Absorbing"` : Top-level object for configuring absorbing boundary conditions. These are +artificial scattering boundary conditions at farfield boundaries. + +`"Conductivity"` : Array of objects for configuring finite conductivity surface impedance +boundary conditions. Finite conductivity boundaries are only available for frequency +domain driven and eigenmode simulation types. + +`"LumpedPort"` : Array of objects for configuring lumped port boundary conditions. Lumped +ports can be specified on boundaries which are internal to the computational domain. 
+
+`"WavePort"` : Array of objects for configuring numeric wave port boundary conditions. Wave
+ports can only be specified on boundaries which are on the true boundary of the
+computational domain. Additionally, wave port boundaries are only available for
+frequency domain driven and eigenmode simulation types.
+
+`"WavePortPEC"` : Top-level object for configuring additional PEC boundary conditions for boundary
+mode analysis performed on the wave port boundaries. Thus, this object is only relevant
+when wave port boundaries are specified under
+[`config["Boundaries"]["WavePort"]`](#boundaries%5B%22WavePort%22%5D).
+
+`"SurfaceCurrent"` : Array of objects for configuring surface current boundary conditions.
+This boundary prescribes a unit source surface current excitation on the given boundary in
+order to excite a frequency or time domain driven simulation or magnetostatic simulation.
+For the magnetostatic simulation type, entries of the inductance matrix are extracted
+corresponding to each surface current boundary.
+
+`"Ground"` : Top-level object for specifying ground, or zero voltage, boundary conditions
+for electrostatic simulations.
+
+`"ZeroCharge"` : Top-level object for specifying zero charge boundary conditions for
+electrostatic simulations. Also imposes symmetry of the electric field across the boundary
+surface.
+
+`"Terminal"` : Array of objects for configuring terminal boundary conditions for
+electrostatic simulations. Entries of the capacitance matrix are extracted corresponding to
+each terminal boundary.
+
+`"Periodic"` : Top-level object for configuring periodic boundary conditions for surfaces
+with meshes that are identical after translation and/or rotation.
+
+`"Postprocessing"` : Top-level object for configuring boundary postprocessing.
+
+`"SurfaceFlux"` : Array of objects for postprocessing surface flux.
+
+`"Dielectric"` : Array of objects for postprocessing surface interface dielectric loss.
+
+`"FarField"` : Top-level object for extracting electric fields in the far-field region.
+
+## `boundaries["PEC"]`
+
+```json
+"PEC":
+{
+  "Attributes": []
+}
+```
+
+with
+
+`"Attributes" [None]` : Integer array of mesh boundary attributes at which to apply the PEC
+boundary condition.
+
+## `boundaries["PMC"]`
+
+```json
+"PMC":
+{
+  "Attributes": []
+}
+```
+
+with
+
+`"Attributes" [None]` : Integer array of mesh boundary attributes at which to apply the
+PMC boundary condition.
+
+## `boundaries["Impedance"]`
+
+```json
+"Impedance":
+[
+  {
+    "Attributes": [],
+    "Rs": ,
+    "Ls": ,
+    "Cs": 
+  },
+  ...
+]
+```
+
+with
+
+`"Attributes" [None]` : Integer array of mesh boundary attributes for this surface
+impedance boundary.
+
+`"Rs" [0.0]` : Surface resistance used for computing this surface impedance boundary's
+impedance per square, ``\Omega``/sq.
+
+`"Ls" [0.0]` : Surface inductance used for computing this surface impedance boundary's
+impedance per square, H/sq.
+
+`"Cs" [0.0]` : Surface capacitance used for computing this surface impedance boundary's
+impedance per square, F/sq.
+
+## `boundaries["Absorbing"]`
+
+```json
+"Absorbing":
+{
+  "Attributes": [],
+  "Order": 
+}
+```
+
+with
+
+`"Attributes" [None]` : Integer array of mesh boundary attributes at which to apply
+farfield absorbing boundary conditions.
+
+`"Order" [1]` : Specify a first- or second-order approximation for the farfield absorbing
+boundary condition. Second-order absorbing boundary conditions are only available for the
+frequency domain driven simulation type.
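+
+As an illustrative sketch only, a first-order absorbing boundary on a farfield surface with
+the hypothetical mesh attribute `3` could be specified as:
+
+```json
+"Absorbing":
+{
+  "Attributes": [3],  // placeholder farfield boundary attribute
+  "Order": 1
+}
+```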
+ +## `boundaries["Conductivity"]` + +```json +"Conductivity": +[ + { + "Attributes": [], + "Conductivity": , + "Permeability": , + "Thickness": + }, + ... +] +``` + +with + +`"Attributes" [None]` : Integer array of mesh boundary attributes for this finite +conductivity boundary. + +`"Conductivity" [None]` : Electrical conductivity for this finite conductivity boundary, +S/m. + +`"Permeability" [1.0]` : Relative permeability for this finite conductivity boundary. + +`"Thickness" [None]` : Optional conductor thickness for this finite conductivity boundary +specified in mesh length units. Activates a finite conductivity boundary condition which +accounts for nonzero metal thickness. + +## `boundaries["LumpedPort"]` + +```json +"LumpedPort": +[ + { + "Index": , + "Attributes": [], + "Direction": or [], + "CoordinateSystem": , + "Excitation": , + "Active": , + "R": , + "L": , + "C": , + "Rs": , + "Ls": , + "Cs": , + "Elements": + [ + { + "Attributes": , + "Direction": or [], + "CoordinateSystem": + }, + ... + ] + }, + ... +] +``` + +with + +`"Index" [None]` : Index of this lumped port, used in postprocessing output files. + +`"Attributes" [None]` : Integer array of mesh boundary attributes for this lumped port +boundary. If this port is to be a multielement lumped port with more than a single lumped +element, use the `"Elements"` array described below. + +`"Direction" [None]` : Direction to define the polarization direction of the port field +mode on this lumped port boundary. Axis aligned lumped ports can be specified using +keywords: `"+X"`, `"-X"`, `"+Y"`, `"-Y"`, `"+Z"`, `"-Z"`, while coaxial lumped ports can be +specified using `"+R"`, `"-R"`. The direction can alternatively be specified as a +normalized array of three values, for example `[0.0, 1.0, 0.0]`. If a vector direction is +specified, the `"CoordinateSystem"` value specifies the coordinate system it is expressed +in. If this port is to be a multielement lumped port with more than a single lumped +element, use the `"Elements"` array described below. + +`"CoordinateSystem" ["Cartesian"]` : Coordinate system used to express the `"Direction"` +vector, the options are `"Cartesian"` and `"Cylindrical"`. If a keyword argument is used +for `"Direction"` this value is ignored, and the appropriate coordinate system is used +instead. + +`"Excitation" [false/0]` : Turns on or off port excitation for this lumped port boundary for driven +or transient simulation types. Can be specified either as a bool or as a non-negative integer — see +[Boundary Conditions](../guide/boundaries.md#Lumped-and-wave-port-excitation). + +`"Active" [true]` : Turns on or off damping boundary condition for this lumped port +boundary for driven or transient simulation types. + +`"R" [0.0]` : Circuit resistance used for computing this lumped port boundary's impedance, +``\Omega``. This option should only be used along with the corresponding `"L"` and `"C"` +parameters, and not with any of the surface parameters `"Rs"`, `"Ls"`, or `"Cs"`. + +`"L" [0.0]` : Circuit inductance used for computing this lumped port boundary's impedance, +H. This option should only be used along with the corresponding `"R"` and `"C"` parameters, +and not with any of the surface parameters `"Rs"`, `"Ls"`, or `"Cs"`. + +`"C" [0.0]` : Circuit capacitance used for computing this lumped port boundary's impedance, +F. This option should only be used along with the corresponding `"R"` and `"L"` parameters, +and not with any of the surface parameters `"Rs"`, `"Ls"`, or `"Cs"`. 
+ +`"Rs" [0.0]` : Surface resistance used for computing this lumped port boundary's impedance, +``\Omega``/sq. This option should only be used along with the corresponding `"Ls"` and +`"Cs"` parameters, and not with any of the circuit parameters `"R"`, `"L"`, or `"C"`. + +`"Ls" [0.0]` : Surface inductance used for computing this lumped port boundary's impedance, +H/sq. This option should only be used along with the corresponding `"Rs"` and `"Cs"` +parameters, and not with any of the circuit parameters `"R"`, `"L"`, or `"C"`. + +`"Cs" [0.0]` : Surface capacitance used for computing this lumped port boundary's +impedance, F/sq. This option should only be used along with the corresponding `"Rs"` and +`"Ls"` parameters, and not with any of the circuit parameters `"R"`, `"L"`, or `"C"`. + +`"Elements"[]["Attributes"] [None]` : This option is for multielement lumped ports and +should not be combined with the `"Attributes"` field described above. Each element of a +multielement lumped port can be described by its own unique integer array of mesh boundary +attributes, which are specified here. The elements of a multielement port add in parallel. + +`"Elements"[]["Direction"] [None]` : This option is for multielement lumped ports and +should not be combined with the `"Direction"` field described above. Each element of a +multielement lumped port can be described by its own unique direction, which is specified +here. The elements of a multielement port add in parallel. + +`"Elements"[]["CoordinateSystem"] ["Cartesian"]` : This option is for multielement lumped +ports and should not be combined with the `"CoordinateSystem"` field described above. Each +element of a multielement lumped port can be described by its own unique direction, and +corresponding coordinate system. + +## `boundaries["WavePort"]` + +```json +"WavePort": +[ + { + "Index": , + "Attributes": [], + "Excitation": , + "Active": , + "Mode": , + "Offset": , + "SolverType": , + "MaxIts": , + "KSPTol": , + "EigenTol": , + "Verbose": + }, + ... +] +``` + +with + +`"Index" [None]` : Index of this wave port boundary, used in postprocessing output files. + +`"Attributes" [None]` : Integer array of mesh boundary attributes for this wave port +boundary. + +`"Excitation" [false/0]` : Turns on or off port excitation for this wave port boundary for driven +simulation types. Can be specified either as a bool or as a non-negative integer — see [Boundary +Conditions](../guide/boundaries.md#Lumped-and-wave-port-excitation). + +`"Active" [true]` : Turns on or off damping boundary condition for this wave port boundary +for driven simulation types. + +`"Mode" [1]` : Mode index (1-based) for the characteristic port mode of this wave port +boundary. Ranked in order of decreasing wave number. + +`"Offset" [0.0]` : Offset distance used for scattering parameter de-embedding for this wave +port boundary, specified in mesh length units. + +`"SolverType" ["Default"]` : Specifies the eigenvalue solver to be used in computing +the boundary mode for this wave port. See +[`config["Solver"]["Eigenmode"]["Type"]`](solver.md#solver%5B%22Eigenmode%22%5D). + +`"MaxIts" [30]` : Specifies the maximum number of iterations to be used in the GMRES +solver. + +`"KSPTol" [1e-8]` : Specifies the tolerance to be used in the linear solver. + +`"EigenTol" [1e-6]` : Specifies the tolerance to be used in the eigenvalue solver. + +`"Verbose" [0]` : Specifies the verbosity level to be used in the linear and eigensolver +for the wave port problem. 
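+
+As a sketch, a single excited wave port on a hypothetical boundary attribute `5`, solved
+for its fundamental mode with the default solver settings, might look like:
+
+```json
+"WavePort":
+[
+  {
+    "Index": 1,
+    "Attributes": [5],  // placeholder port surface attribute
+    "Excitation": true,
+    "Mode": 1,
+    "Offset": 0.0
+  }
+]
+```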
+ +## `boundaries["WavePortPEC"]` + +```json +"WavePortPEC": +{ + "Attributes": [] +} +``` + +with + +`"Attributes" [None]` : Integer array of mesh boundary attributes to consider as PEC when solving the +2D eigenproblem for the wave port boundary mode analysis, along with those specified under +[`config["Boundaries"]["PEC"]["Attributes"]`](#boundaries%5B%22PEC%22%5D) and +[`config["Boundaries"]["Conductivity"]["Attributes"]`](#boundaries%5B%22Conductivity%22%5D). + +## `boundaries["SurfaceCurrent"]` + +```json +"SurfaceCurrent": +[ + { + "Index": , + "Attributes": [], + "Direction": or [], + "CoordinateSystem": , + "Elements": + [ + { + "Attributes": [], + "Direction": or [], + "CoordinateSystem": , + }, + ... + ] + }, + ... +] +``` + +with + +`"Index" [None]` : Index of this surface current boundary, used in postprocessing output +files. + +`"Attributes" [None]` : Integer array of mesh boundary attributes for this surface current +boundary. If this source is to be a multielement source which distributes the source +across more than a single lumped element, use the `"Elements"` array described below. + +`"Direction" [None]` : Defines the source current direction for this surface current +boundary. The available options are the same as under +[`config["Boundaries"]["LumpedPort"]["Direction"]`](#boundaries%5B%22LumpedPort%22%5D). If +this source is to be a multielement source which distributes the source across more than a +single lumped element, use the `"Elements"` array described below. + +`"CoordinateSystem" ["Cartesian"]` : Defines the coordinate system for the source current +direction for this surface current boundary. The available options are the same as under +[`config["Boundaries"]["LumpedPort"]["CoordinateSystem"]`](#boundaries%5B%22LumpedPort%22%5D). +If this source is to be a multielement source which distributes the source across more than +a single lumped element, use the `"Elements"` array described below. + +`"Elements"[]["Attributes"] [None]` : This option is for multielement surface current +boundaries should not be combined with the `"Attributes"` field described above. Each +element of a multielement current source can be described by its own unique integer array of +mesh boundary attributes, which are specified here. The elements of a multielement source +add in parallel to give the same total current as a single-element source. + +`"Elements"[]["Direction"] [None]` : This option is for multielement surface current +boundaries and should not be combined with the `"Direction"` field described above. Each +element of a multielement current source can be described by its own unique direction, +which is specified here. The elements of a multielement source add in parallel to give the +same total current as a single-element source. + +`"Elements"[]["CoordinateSystem"] ["Cartesian"]` : This option is for multielement surface +current boundaries and should not be combined with the `"CoordinateSystem"` field described +above. Each element of a multielement current source can be described by its own unique +direction, and corresponding coordinate system. + +## `boundaries["Ground"]` + +```json +"Ground": +{ + "Attributes": [] +} +``` + +with + +`"Attributes" [None]` : Integer array of mesh boundary attributes at which to apply the +ground boundary condition. + +## `boundaries["ZeroCharge"]` + +```json +"ZeroCharge": +{ + "Attributes": [] +} +``` + +with + +`"Attributes" [None]` : Integer array of mesh boundary attributes at which to apply the +zero-charge boundary condition. 
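+
+For illustration, an electrostatic model might combine both conditions inside
+`config["Boundaries"]`, grounding one set of surfaces and applying the zero charge
+(symmetry) condition on another; the attribute numbers below are placeholders:
+
+```json
+"Ground":
+{
+  "Attributes": [1]  // placeholder grounded surfaces
+},
+"ZeroCharge":
+{
+  "Attributes": [2]  // placeholder symmetry plane
+}
+```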
+ +## `boundaries["Terminal"]` + +```json +"Terminal": +[ + { + "Index": , + "Attributes": [], + }, + ... +] +``` + +with + +`"Index" [None]` : Index of this terminal boundary, used in postprocessing output files and +to index the computed capacitance matrix. + +`"Attributes" [None]` : Integer array of mesh boundary attributes for this terminal +boundary. + +## `boundaries["Periodic"]` + +```json +"Periodic": +{ + "FloquetWaveVector": [], + "BoundaryPairs": + [ + { + "DonorAttributes": [], + "ReceiverAttributes": [], + "Translation": [], + "AffineTransformation": [], + }, + ... + ] +} +``` + +with + +`"DonorAttributes" [None]` : Integer array of the donor attributes of the mesh boundary +attributes for this periodic boundary. + +`"ReceiverAttributes" [None]` : Integer array of the receiver attributes of the mesh boundary +attributes for this periodic boundary. + +`"Translation" [None]` : Optional floating point array defining the distance from the donor +attribute to the receiver attribute in mesh units. If neither `"Translation"` nor +`"AffineTransformation"` are specified, the transformation between donor and receiver boundaries +is automatically detected. + +`"AffineTransformation" [None]` : Optional floating point array of size 16 defining the +three-dimensional (4 x 4) affine transformation matrix (in row major format) from the donor attribute +to the receiver attribute in mesh units. If neither `"Translation"` or `"AffineTransformation"` are +specified, the transformation between donor and receiver boundaries is automatically detected. + +`"FloquetWaveVector" [None]` : Optional floating point array defining the phase delay between the +periodic boundaries in the X/Y/Z directions in radians per mesh unit. + +## `boundaries["Postprocessing"]["SurfaceFlux"]` + +```json +"Postprocessing": +{ + "SurfaceFlux": + [ + { + "Index": , + "Attributes": [], + "Type": , + "TwoSided": , + "Center": [] + }, + ... + ] +} +``` + +with + +`"Index" [None]` : Index of this surface flux postprocessing boundary, used in +postprocessing output files. + +`"Attributes" [None]` : Integer array of mesh boundary attributes for this surface flux +postprocessing boundary. + +`"Type" [None]` : Specifies the type of surface flux to calculate for this postprocessing +boundary. The available options are: + + - `"Electric"` : Integrate the electric flux density over the boundary surface. + - `"Magnetic"` : Integrate the magnetic flux density over the boundary surface. + - `"Power"` : Integrate the energy flux density, given by the Poynting vector, over the + boundary surface. + +`"TwoSided" [false]` : Specifies how to account for internal boundary surfaces with a +possible discontinuous field on either side. When set to `false`, the flux on either side of +an internal boundary surface is averaged. When `true`, it is summed with an opposite normal +direction. + +`"Center" [None]` : Floating point array of length equal to the model spatial dimension +specifying the coordinates of a central point used to compute the outward flux. The true +surface normal is used in the calculation, and this point is only used to ensure the correct +orientation of the normal. Specified in mesh length units, and only relevant when +`"TwoSided"` is `false`. If not specified, the point will be computed as the centroid of the +axis-aligned bounding box for all elements making up the postprocessing boundary. 
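+
+As an illustrative sketch, the integrated power flux through a surface carrying the
+hypothetical boundary attribute `10` could be requested as:
+
+```json
+"Postprocessing":
+{
+  "SurfaceFlux":
+  [
+    {
+      "Index": 1,
+      "Attributes": [10],  // placeholder integration surface
+      "Type": "Power",
+      "TwoSided": false,
+      "Center": [0.0, 0.0, 0.0]  // placeholder point used only to orient the normal
+    }
+  ]
+}
+```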
+ +## `boundaries["Postprocessing"]["Dielectric"]` + +```json +"Postprocessing": +{ + "Dielectric": + [ + { + "Index": , + "Attributes": [], + "Type": , + "Thickness": , + "Permittivity": , + "LossTan": + }, + ... + ] +} +``` + +with + +`"Index" [None]` : Index of this dielectric interface, used in postprocessing output files. + +`"Attributes" [None]` : Integer array of mesh boundary attributes for this dielectric +interface. + +`"Type" [None]` : Specifies the type of dielectric interface for this postprocessing +boundary. See also [this page](../reference.md#Bulk-and-interface-dielectric-loss). +Available options are: + + - `"Default"` : Use the full electric field evaluated at the boundary to compute the + energy participation ratio (EPR) of this dielectric interface and estimate loss. + - `"MA"` : Use the boundary conditions assuming a metal-air interface to compute the EPR + of this dielectric interface. + - `"MS"` : Use the boundary conditions assuming a metal-substrate interface to compute + the EPR of this dielectric interface. + - `"SA"` : Use the boundary conditions assuming a substrate-air interface to compute the + EPR of this dielectric interface. + +`"Thickness" [None]` : Thickness of this dielectric interface, specified in mesh length +units. + +`"Permittivity" [None]` : Relative permittivity for this dielectric interface. This should +be the interface layer permittivity for the specific `"Type"` of interface specified. + +`"LossTan" [0.0]` : Loss tangent for this lossy dielectric interface. + +## `boundaries["Postprocessing"]["FarField"]` + +```json +"Postprocessing": +{ + "FarField": + { + "Attributes": [], + "NSample": , + "ThetaPhis": [] + } +} +``` + +with + +`"Attributes" [None]` : Integer array of mesh boundary attributes to be used to +compute the far fields. It has to be an external boundary and enclose the +system. + +`"NSample" [0]` : Number of uniformly-spaced points to use to discretize the +far-field sphere. + +`"ThetaPhi" [None]` : Evaluate the far-field electric field at these specific +angles too (in degrees). $\theta \in [0, 180°]$ is the polar angle and $\phi \in +[0, 360°]$ is the azimuthal angle. diff --git a/docs/src/config/config.md b/docs/src/config/config.md index 384199f620..cda7072d97 100644 --- a/docs/src/config/config.md +++ b/docs/src/config/config.md @@ -1,59 +1,59 @@ -```@raw html - - -``` - -# Overview - -A configuration file written in the [JSON format](https://en.wikipedia.org/wiki/JSON) is -used specify the runtime options for a *Palace* simulation. The following sections give a -detailed overview of the file format and available settings. - -Parameters are specified in the form of keyword/value pairs where the key is a string and -the value may be a string, boolean, integer or floating point number, or array. Parameters -are grouped into a hierarchy of objects. We support relaxed JSON formatting with C++-style -comments (`//`, `/* */`). Integer arrays can be specified as comma-separated lists of -integers or integer ranges, for example `[1,3-5,6]` is parsed as `[1,3,4,5,6]`. - -In the following sections, default values for the parameters are specified alongside the -description of each keyword in square brackets. Keywords for which there is no default -value listed (`[None]`) are required in general if specifying values for other keywords -under the same top-level object. - -The top-level JSON object of the configuration file has the following structure: - -```json -{ - "Problem": - { - ... - }, - "Model": - { - ... 
- }, - "Domains": - { - ... - }, - "Boundaries": - { - ... - }, - "Solver": - { - ... - } -} -``` - -Each property of the top-level `config` JSON object is detailed in its corresponding -section of the documentation. - -## Contents - - - [`config["Problem"]`](problem.md) - - [`config["Model"]`](model.md) - - [`config["Domains"]`](domains.md) - - [`config["Boundaries"]`](boundaries.md) - - [`config["Solver"]`](solver.md) +```@raw html + + +``` + +# Overview + +A configuration file written in the [JSON format](https://en.wikipedia.org/wiki/JSON) is +used specify the runtime options for a *Palace* simulation. The following sections give a +detailed overview of the file format and available settings. + +Parameters are specified in the form of keyword/value pairs where the key is a string and +the value may be a string, boolean, integer or floating point number, or array. Parameters +are grouped into a hierarchy of objects. We support relaxed JSON formatting with C++-style +comments (`//`, `/* */`). Integer arrays can be specified as comma-separated lists of +integers or integer ranges, for example `[1,3-5,6]` is parsed as `[1,3,4,5,6]`. + +In the following sections, default values for the parameters are specified alongside the +description of each keyword in square brackets. Keywords for which there is no default +value listed (`[None]`) are required in general if specifying values for other keywords +under the same top-level object. + +The top-level JSON object of the configuration file has the following structure: + +```json +{ + "Problem": + { + ... + }, + "Model": + { + ... + }, + "Domains": + { + ... + }, + "Boundaries": + { + ... + }, + "Solver": + { + ... + } +} +``` + +Each property of the top-level `config` JSON object is detailed in its corresponding +section of the documentation. + +## Contents + + - [`config["Problem"]`](problem.md) + - [`config["Model"]`](model.md) + - [`config["Domains"]`](domains.md) + - [`config["Boundaries"]`](boundaries.md) + - [`config["Solver"]`](solver.md) diff --git a/docs/src/config/domains.md b/docs/src/config/domains.md index ed5eab46a4..5cc103d172 100644 --- a/docs/src/config/domains.md +++ b/docs/src/config/domains.md @@ -1,133 +1,129 @@ -```@raw html - - -``` - -# `config["Domains"]` - -```json -"Domains": -{ - "Materials": - [ - ... - ], - "Postprocessing": - { - "Dielectric": - [ - ... - ], - "Probe": - [ - ... - ] - } -} -``` - -with - -`"Materials"` : Array of material properties objects. - -`"Postprocessing"` : Top-level object for configuring domain postprocessing. - -`"Dielectric"` : Array of objects for postprocessing bulk dielectric loss. - -`"Probe"` : Array of objects for postprocessing solution field values evaluated at a probe -location in space. - -## `domains["Materials"]` - -```json -"Materials": -[ - // Material 1 - { - "Attributes": [], - "Permeability": , - "Permittivity": , - "LossTan": , - "Conductivity": , - "LondonDepth": , - "MaterialAxes": - }, - // Material 2, 3, ... - ... -] -``` - -with - -`"Attributes" [None]` : Integer array of mesh domain attributes for this material. - -`"Permeability" [1.0]` : Relative permeability for this material. Scalar or -vector of 3 coefficients corresponding to each of `"MaterialAxes"`. - -`"Permittivity" [1.0]` : Relative permittivity for this material. Scalar or -vector of 3 coefficients corresponding to each of `"MaterialAxes"`. - -`"LossTan" [0.0]` : Loss tangent for this material. Scalar or -vector of 3 coefficients corresponding to each of `"MaterialAxes"`. 
- -`"Conductivity" [0.0]` : Electrical conductivity for this material, S/m. Activates Ohmic -loss model in the material domain. Scalar or -vector of 3 coefficients corresponding to each of `"MaterialAxes"`. - -`"LondonDepth" [0.0]` : London penetration depth for this material, specified in mesh -length units. Activates London equations-based model relating superconducting current and -electromagnetic fields in the material domain. - -`"MaterialAxes" [[1.0,0.0,0.0], [0.0,1.0,0.0], [0.0,0.0,1.0]]` : Axes directions -for specification of anisotropic material properties. Required to be unit length -and orthogonal. - -## `domains["Postprocessing"]["Dielectric"]` - -```json -"Postprocessing": -{ - "Dielectric": - [ - { - "Index": , - "Attributes": [] - }, - ... - ] -} -``` - -with - -`"Index" [None]` : Index of this lossy domain, used in postprocessing output files. - -`"Attributes" [None]` : Integer array of mesh domain attributes for this lossy domain. - -## `domains["Postprocessing"]["Probe"]` - -```json -"Postprocessing": -{ - "Probe": - [ - { - "Index": , - "X": , - "Y": , - "Z": - }, - ... - ] -} -``` - -with - -`"Index" [None]` : Index of this probe, used in postprocessing output files. - -`"X" [None]` : ``x``-coordinate of this probe, specified in mesh length units. - -`"Y" [None]` : ``y``-coordinate of this probe, specified in mesh length units. - -`"Z" [None]` : ``z``-coordinate of this probe, specified in mesh length units. +```@raw html + + +``` + +# `config["Domains"]` + +```json +"Domains": +{ + "Materials": + [ + ... + ], + "Postprocessing": + { + "Energy": + [ + ... + ], + "Probe": + [ + ... + ] + } +} +``` + +with + +`"Materials"` : Array of material properties objects. + +`"Postprocessing"` : Top-level object for configuring domain postprocessing. + +`"Energy"` : Array of objects for postprocessing domain energies. + +`"Probe"` : Array of objects for postprocessing solution field values evaluated at a probe +location in space. + +## `domains["Materials"]` + +```json +"Materials": +[ + // Material 1 + { + "Attributes": [], + "Permeability": or [], + "Permittivity": or [], + "LossTan": or [], + "Conductivity": or [], + "LondonDepth": , + "MaterialAxes": [[]] + }, + // Material 2, 3, ... + ... +] +``` + +with + +`"Attributes" [None]` : Integer array of mesh domain attributes for this material. + +`"Permeability" [1.0]` : Relative permeability for this material. Scalar or vector of 3 +coefficients corresponding to each of `"MaterialAxes"`. + +`"Permittivity" [1.0]` : Relative permittivity for this material. Scalar or vector of 3 +coefficients corresponding to each of `"MaterialAxes"`. + +`"LossTan" [0.0]` : Loss tangent for this material. Scalar or vector of 3 coefficients +corresponding to each of `"MaterialAxes"`. + +`"Conductivity" [0.0]` : Electrical conductivity for this material, S/m. Activates Ohmic +loss model in the material domain. Scalar or vector of 3 coefficients corresponding to each +of `"MaterialAxes"`. + +`"LondonDepth" [0.0]` : London penetration depth for this material, specified in mesh +length units. Activates London equations-based model relating superconducting current and +electromagnetic fields in the material domain. + +`"MaterialAxes" [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]` : Axes directions for +specification of anisotropic material properties. Required to be unit length and orthogonal. + +## `domains["Postprocessing"]["Energy"]` + +```json +"Postprocessing": +{ + "Energy": + [ + { + "Index": , + "Attributes": [] + }, + ... 
+ ] +} +``` + +with + +`"Index" [None]` : Index of this energy postprocessing domain, used in postprocessing +output files. + +`"Attributes" [None]` : Integer array of mesh domain attributes for this energy +postprocessing domain. + +## `domains["Postprocessing"]["Probe"]` + +```json +"Postprocessing": +{ + "Probe": + [ + { + "Index": , + "Center": [] + }, + ... + ] +} +``` + +with + +`"Index" [None]` : Index of this probe, used in postprocessing output files. + +`"Center" [None]` : Floating point array of length equal to the model spatial dimension +specifying the coordinates of this probe in mesh length units. diff --git a/docs/src/config/model.md b/docs/src/config/model.md index 71bec86f46..5f084f04ed 100644 --- a/docs/src/config/model.md +++ b/docs/src/config/model.md @@ -1,123 +1,136 @@ -```@raw html - - -``` - -# `config["Model"]` - -```json -"Model": -{ - "Mesh": - "L0": , - "Lc": , - "Refinement": - { - ... - } -} -``` - -with - -`"Mesh" [None]` : Input mesh file path, an absolute path is recommended. - -`"L0" [1.0e-6]` : Mesh vertex coordinate length unit, m. - -`"Lc" [0.0]` : Characteristic length scale used for nondimensionalization, specified in -mesh length units. A value less than or equal to zero uses an internally calculated length -scale based on the bounding box of the computational domain. - -`"Refinement"` : Top-level object for configuring mesh refinement. - -## `model["Refinement"]` - -```json -"Refinement": -{ - "Tol": , - "MaxIts": , - "MaxSize": , - "Nonconformal": , - "UpdateFraction": , - "UniformLevels": , - "Boxes": - [ - { - "Levels": , - "XLimits": [], - "YLimits": [], - "ZLimits": [] - }, - ... - ], - "Spheres": - [ - { - "Levels": , - "Center": [], - "Radius": float - }, - ... - ] -} -``` - -with - -`"Tol" [1e-2]` : Relative error convergence tolerance for adaptive mesh refinement (AMR). - -`"MaxIts" [0]` : Maximum number of iterations of AMR to perform. - -`"MaxSize" [0]` : The maximum allowable number of degrees of freedom for AMR. If an adapted -mesh exceeds this value no further adaptation will occur. A value less than 1 means that no -maximum size constraint will be imposed. - -`"Nonconformal" [true]` : Chose whether the adaptation should use nonconformal refinement. -Nonconformal refinement is required for non-simplex meshes. - -`"UpdateFraction" [0.7]` : Dörfler marking fraction used to specify which elements to -refine. This marking strategy will mark the smallest number of elements that make up -`"UpdateFraction"` of the total error in the mesh. A larger value will refine more elements -per iteration, at the cost of the final mesh being less efficient. - -`"UniformLevels" [0]` : Levels of uniform parallel mesh refinement to be performed on the -input mesh. If not performing AMR, these may be used as levels within a geometric multigrid -scheme. - -`"Boxes"` : Array of box region refinement objects. All elements with a node inside the box -region will be marked for refinement. - -`"Spheres"` : Array of sphere region refinement objects. All elements with a node inside -the sphere region will be marked for refinement. - -`"Levels" [0]` : Levels of parallel mesh refinement inside the specified refinement region. - -`"XLimits" [None]` : Floating point array of length 2 specifying the limits in the -``x``-direction of the axis-aligned bounding box for this box refinement region. Specified -in mesh length units. 
-
-`"YLimits" [None]` : Floating point array of length 2 specifying the limits in the
-``y``-direction of the axis-aligned bounding box for this box refinement region. Specified
-in mesh length units.
-
-`"ZLimits" [None]` : Floating point array of length 2 specifying the limits in the
-``z``-direction of the axis-aligned bounding box for this box refinement region. Specified
-in mesh length units.
-
-`"Center" [None]` : Floating point array of length equal to the model spatial dimension
-specfiying the center coordinates of the sphere for this sphere refinement region.
-Specified in mesh length units.
-
-`"Radius" [None]` : The radius of the sphere for this sphere refinement region, specified in
-mesh length units.
-
-### Advanced model options
-
- - `"Partition" [""]`
- - `"ReorientTetMesh" [false]`
- - `"RemoveCurvature" [false]`
- - `"MaxNCLevels" [1]`
- - `"MaximumImbalance" [1.1]`
- - `"SaveAdaptIterations" [true]`
- - `"SaveAdaptMesh" [false]`
+```@raw html
+
+
+```
+
+# `config["Model"]`
+
+```json
+"Model":
+{
+  "Mesh": ,
+  "L0": ,
+  "Lc": ,
+  "Refinement":
+  {
+    ...
+  }
+}
+```
+
+with
+
+`"Mesh" [None]` : Input mesh file path; an absolute path is recommended. If the provided
+mesh is nonconformal, it is assumed to come from a previous *Palace* solve using AMR, and
+all mesh preprocessing checks and modifications (for example
+[`model["Refinement"]["CrackInternalBoundaryElements"]`](#model%5B%22Refinement%22%5D)) are
+skipped.
+
+`"L0" [1.0e-6]` : Unit, relative to m, for mesh vertex coordinates. For example, a value
+of `1.0e-6` implies the mesh coordinates are in μm.
+
+`"Lc" [0.0]` : Characteristic length scale used for nondimensionalization, specified in
+mesh length units. This keyword should typically not be specified by the user. A value less
+than or equal to zero uses an internally calculated length scale based on the bounding box
+of the computational domain. A value of 1.0 will disable nondimensionalization of lengths
+in the model, and all computations will take place in the same units as the mesh.
+
+`"Refinement"` : Top-level object for configuring mesh refinement.
+
+## `model["Refinement"]`
+
+```json
+"Refinement":
+{
+  "Tol": ,
+  "MaxIts": ,
+  "MaxSize": ,
+  "Nonconformal": ,
+  "UpdateFraction": ,
+  "UniformLevels": ,
+  "Boxes":
+  [
+    {
+      "Levels": ,
+      "BoundingBoxMin": [],
+      "BoundingBoxMax": []
+    },
+    ...
+  ],
+  "Spheres":
+  [
+    {
+      "Levels": ,
+      "Center": [],
+      "Radius": 
+    },
+    ...
+  ]
+}
+```
+
+with
+
+`"Tol" [1.0e-2]` : Relative error convergence tolerance for adaptive mesh refinement (AMR).
+
+`"MaxIts" [0]` : Maximum number of iterations of AMR to perform.
+
+`"MaxSize" [0]` : The maximum allowable number of degrees of freedom for AMR. If an adapted
+mesh exceeds this value no further adaptation will occur. A value less than 1 means that no
+maximum size constraint will be imposed.
+
+`"Nonconformal" [true]` : Choose whether the adaptation should use nonconformal refinement.
+Nonconformal refinement is required for non-simplex meshes.
+
+`"UpdateFraction" [0.7]` : Dörfler marking fraction used to specify which elements to
+refine. This marking strategy will mark the smallest number of elements that make up
+`"UpdateFraction"` of the total error in the mesh. A larger value will refine more elements
+per iteration, at the cost of the final mesh being less efficient.
+
+`"UniformLevels" [0]` : Levels of uniform parallel mesh refinement to be performed on the
+input mesh. 
If not performing AMR, these may be used as levels within a geometric multigrid +scheme. If performing AMR the most refined mesh is used as the initial mesh and the coarser +meshes cannot be used in a geometric multigrid scheme. + +`"Boxes"` : Array of box region refinement objects. All elements with a node inside the box +region will be marked for refinement. + +`"Spheres"` : Array of sphere region refinement objects. All elements with a node inside +the sphere region will be marked for refinement. + +`"Levels" [0]` : Levels of parallel mesh refinement inside the specified refinement region. + +`"BoundingBoxMin" [None]` : Floating point array of length equal to the model spatial +dimension specifying the minimum coordinates of the axis-aligned bounding box for this +refinement region. Specified in mesh length units. + +`"BoundingBoxMax" [None]` : Floating point array of length equal to the model spatial +dimension specifying the maximum coordinates of the axis-aligned bounding box for this +refinement region. Specified in mesh length units. + +`"Center" [None]` : Floating point array of length equal to the model spatial dimension +specifying the center coordinates of the sphere for this sphere refinement region. +Specified in mesh length units. + +`"Radius" [None]` : The radius of the sphere for this sphere refinement region, specified in +mesh length units. + +### Advanced model options + + - `"RemoveCurvature" [false]` + - `"MakeSimplex" [false]` + - `"MakeHexahedral" [false]` + - `"ReorderElements" [false]` + - `"CleanUnusedElements" [true]` + - `"CrackInternalBoundaryElements" [true]` + - `"RefineCrackElements" [true]` + - `"CrackDisplacementFactor" [1.0e-12]` + - `"AddInterfaceBoundaryElements" [true]` + - `"ExportPrerefinedMesh" [false]` + - `"ReorientTetMesh" [false]` + - `"Partitioning" [""]` + - `"MaxNCLevels" [1]` + - `"MaximumImbalance" [1.1]` + - `"SaveAdaptIterations" [true]` + - `"SaveAdaptMesh" [false]` + - `"SerialUniformLevels" [0]` diff --git a/docs/src/config/problem.md b/docs/src/config/problem.md index c1a852cc28..8b306bc9e8 100644 --- a/docs/src/config/problem.md +++ b/docs/src/config/problem.md @@ -1,31 +1,53 @@ -```@raw html - - -``` - -# `config["Problem"]` - -```json -"Problem": -{ - "Type": - "Verbose": , - "Output": -} -``` - -with - -`"Type" [None]` : Controls the simulation type. The available options are: - - - `"Eigenmode"` : Perform a undamped or damped eigenfrequency analysis. - - `"Driven"` : Perform a frequency response simulation. - - `"Transient"` : Perform a time domain excitation response simulation. - - `"Electrostatic"` : Perform an electrostatic analysis to compute the capacitance matrix - for a set of voltage terminals. - - `"Magnetostatic"` : Perform a magnetostatic analysis to compute the inductance matrix - for a set of current sources. - -`"Verbose" [1]` : Controls the level of log file printing. - -`"Output" [None]` : Directory path for saving postprocessing outputs. +```@raw html + + +``` + +# `config["Problem"]` + +```json +"Problem": +{ + "Type": , + "Verbose": , + "Output": , + "OutputFormats": + { + ... + } +} +``` + +with + +`"Type" [None]` : Controls the simulation type. The available options are: + + - `"Eigenmode"` : Perform a undamped or damped eigenfrequency analysis. + - `"Driven"` : Perform a frequency response simulation. + - `"Transient"` : Perform a time domain excitation response simulation. + - `"Electrostatic"` : Perform an electrostatic analysis to compute the capacitance matrix + for a set of voltage terminals. 
+ - `"Magnetostatic"` : Perform a magnetostatic analysis to compute the inductance matrix + for a set of current sources. + +`"Verbose" [1]` : Controls the level of log file printing. + +`"Output" [None]` : Directory path for saving postprocessing outputs. + +`"OutputFormats"` : Top-level object for configuring the field output formats. + +## `problem["OutputFormats"]` + +```json +"OutputFormats": +{ + "Paraview": , + "GridFunction": +} +``` + +with + +`"Paraview" [true]` : Set to true to output fields in Paraview format. + +`"GridFunction" [false]` : Set to true to output fields in MFEM grid function format to visualize with GLVis. diff --git a/docs/src/config/solver.md b/docs/src/config/solver.md index 4735c612d8..351843be00 100644 --- a/docs/src/config/solver.md +++ b/docs/src/config/solver.md @@ -1,467 +1,564 @@ -```@raw html - - -``` - -# `config["Solver"]` - -```json -"Solver": -{ - "Order": , - "PartialAssemblyOrder": , - "Device": , - "Backend": , - "Eigenmode": - { - ... - }, - "Driven": - { - ... - }, - "Transient": - { - ... - }, - "Electrostatic": - { - ... - }, - "Magnetostatic": - { - ... - }, - "Linear": - { - ... - } -} -``` - -with - -`"Order" [1]` : Finite element order (degree). Arbitrary high-order spaces are supported. - -`"PartialAssemblyOrder" [100]` : Order at which to switch from full assembly of finite -element operators to [partial assembly](https://mfem.org/howto/assembly_levels/). Setting -this parameter equal to 1 will fully activate operator partial assembly on all levels. - -`"Device" ["CPU"]` : The runtime device configuration passed to [MFEM] -(https://mfem.org/howto/assembly_levels/) in order to activate different options specified -during configuration. The available options are: - - - `"CPU"` - - `"GPU"` - - `"Debug"` - -The `"GPU"` option will automatically activate the `cuda` or `hip` device based on whether -MFEM is built with CUDA (`MFEM_USE_CUDA=ON`) or HIP (`MFEM_USE_HIP=ON`) support. When -*Palace* is built with OpenMP support (`PALACE_WITH_OPENMP=ON`), `omp` is automatically -added to the list of activated MFEM devices. The `"Debug"` option for MFEM's `debug` device -is useful for debugging issues associated with GPU-based runs of *Palace*. - -`"Backend" [""]` : Specifies the [libCEED backend] -(https://libceed.org/en/latest/gettingstarted/#backends) to use for the simulation. If no -backend is specified, a suitable default backend is selected based on the given -`config["Solver"]["Device"]`. - -`"Eigenmode"` : Top-level object for configuring the eigenvalue solver for the eigenmode -simulation type. Thus, this object is only relevant for -[`config["Problem"]["Type"]: "Eigenmode"`](problem.md#config%5B%22Problem%22%5D). - -`"Driven"` : Top-level object for configuring the frequency domain driven simulation type. -Thus, this object is only relevant for [`config["Problem"]["Type"]: "Driven"`] -(problem.md#config%5B%22Problem%22%5D). - -`"Transient"` : Top-level object for configuring the time domain driven simulation type. -Thus, this object is only relevant for [`config["Problem"]["Type"]: "Transient"`] -(problem.md#config%5B%22Problem%22%5D). - -`"Electrostatic"` : Top-level object for configuring the electrostatic simulation type. -Thus, this object is only relevant for [`config["Problem"]["Type"]: "Electrostatic"`] -(problem.md#config%5B%22Problem%22%5D). - -`"Magnetostatic"` : Top-level object for configuring the magnetostatic simulation type. 
-Thus, this object is only relevant for [`config["Problem"]["Type"]: "Magnetostatic"`] -(problem.md#config%5B%22Problem%22%5D). - -`"Linear"` : Top-level object for configuring the linear solver employed by all simulation -types. - -## `solver["Eigenmode"]` - -```json -"Eigenmode": -{ - "Target": , - "Tol": , - "MaxIts": , - "MaxSize": , - "N": , - "Save": , - "Type": , - "ContourTargetUpper": , - "ContourAspectRatio": , - "ContourNPoints": -} -``` - -with - -`"Target" [None]` : (Nonzero) frequency target above which to search for eigenvalues, GHz. - -`"Tol" [1.0e-6]` : Relative convergence tolerance for the eigenvalue solver. - -`"MaxIts" [0]` : Maximum number of iterations for the iterative eigenvalue solver. A value -less than 1 uses the solver default. - -`"MaxSize" [0]` : Maximum subspace dimension for eigenvalue solver. A value less than 1 -uses the solver default. - -`"N" [1]` : Number of eigenvalues to compute. - -`"Save" [0]` : Number of computed field modes to save to disk for visualization with -[ParaView](https://www.paraview.org/). Files are saved in the `paraview/` directory under -the directory specified by [`config["Problem"]["Output"]`] -(problem.md#config%5B%22Problem%22%5D). - -`"Type" ["Default"]` : Specifies the eigenvalue solver to be used in computing the given -number of eigenmodes of the problem. The available options are: - - - `"SLEPc"` - - `"ARPACK"` - - `"FEAST"` - - `"Default"` : Use the default eigensolver. Currently, this is the Krylov-Schur - eigenvalue solver from `"SLEPc"`. - -`"ContourTargetUpper" [None]` : Specifies the upper frequency target of the contour used -for the FEAST eigenvalue solver, GHz. This option is relevant only for `"Type": "FEAST"`. - -`"ContourAspectRatio" [None]` : Specifies the aspect ratio of the contour used for the -FEAST eigenvalue solver. This should be greater than zero, where the aspect ratio is the -ratio of the contour width to the frequency range(`"ContourTargetUpper"` - `"Target"`). -This option is relevant only for `"Type": "FEAST"`. - -`"ContourNPoints" [4]` : Number of contour integration points used for the FEAST eigenvalue -solver. This option is relevant only for `"Type": "FEAST"`. - -### Advanced eigenmode solver options - - - `"PEPLinear" [true]` - - `"Scaling" [true]` - - `"StartVector" [true]` - - `"StartVectorConstant" [false]` - - `"MassOrthogonal" [false]` - -## `solver["Driven"]` - -```json -"Driven": -{ - "MinFreq": , - "MaxFreq": , - "FreqStep": , - "SaveStep": , - "SaveOnlyPorts": , - "AdaptiveTol": , - "AdaptiveMaxSamples": , - "AdaptiveMaxCandidates": , - "Restart": -} -``` - -with - -`"MinFreq" [None]` : Lower bound of frequency sweep interval, GHz. - -`"MaxFreq" [None]` : Upper bound of frequency sweep interval, GHz. - -`"FreqStep" [None]` : Frequency step size for frequency sweep, GHz. - -`"SaveStep" [0]` : Controls how often, in number of frequency steps, to save computed -fields to disk for visualization with [ParaView](https://www.paraview.org/). Files are -saved in the `paraview/` directory under the directory specified by -[`config["Problem"]["Output"]`](problem.md#config%5B%22Problem%22%5D). - -`"SaveOnlyPorts" [false]` : If set to `true`, postprocessing is only performed for port -boundaries and skipped for quantities depending on, for example, field integrals over all -or part of the interior of the computational domain. This can be useful in speeding up -simulations if only port boundary quantities are required. 
- -`"AdaptiveTol" [0.0]` : Relative error convergence tolerance for adaptive frequency sweep. -If zero, adaptive frequency sweep is disabled and the full-order model is solved at each -frequency step in the specified interval. If positive, this tolerance is used to ensure the -reliability of the reduced-order model relative to the full-order one in the frequency band -of interest. - -`"AdaptiveMaxSamples" [10]` : Maximum number of frequency samples used to construct the -reduced-order model for adaptive fast frequency sweep, if the specified tolerance -(`"AdaptiveTol"`) is not met first. - -`"AdaptiveMaxCandidates" [NumFreq/5]` : Maximum number of frequency samples to consider as -candidates for computing the reduced-order model error when adaptively sampling new points -in order to construct the reduced-order for adaptive fast frequency sweep. The default is -less than the requested number of frequency points in the sweep. - -`"Restart" [1]` : Iteration (1-based) from which to restart for a partial frequency sweep -simulation. That is, the initial frequency will be computed as -`"MinFreq" + ("Restart" - 1) * "FreqStep"`. - -### Advanced driven solver options - - - `"AdaptiveAPosterioriError" [false]` - -## `solver["Transient"]` - -```json -"Transient": -{ - "Type": , - "Excitation": , - "ExcitationFreq": , - "ExcitationWidth": , - "MaxTime": , - "TimeStep": , - "SaveStep": , - "SaveOnlyPorts": -} -``` - -with - -`"Type" ["Default"]` : Specifies the time integration scheme used for the discretization of -the second-order system of differential equations. The available options are: - - - `"GeneralizedAlpha"` : The second-order implicit generalized-``\alpha`` method with - ``\rho_\inf = 1.0``. This scheme is unconditionally stable. - - `"NewmarkBeta"` : The second-order implicit Newmark-``\beta`` method with - ``\beta = 1/4`` and ``\gamma = 1/2``. This scheme is unconditionally stable. - - `"CentralDifference"` : The second-order explicit central difference method, obtained - by setting ``\beta = 0`` and ``\gamma = 1/2`` in the Newmark-``\beta`` method. In this - case, the maximum eigenvalue of the system operator is estimated at the start of the - simulation and used to restrict the simulation time step to below the maximum stability - time step. - - `"Default"` : Use the default `"GeneralizedAlpha"` time integration scheme. - -`"Excitation" [None]` : Controls the time dependence of the source excitation. The -available options are: - - - `"Sinusoidal"` : A sinusoidal excitation at a user specified frequency. - - `"Gaussian"` : A Gaussian pulse with a user specified width which defines the - bandwidth. - - `"DifferentiatedGaussian"` : A differentiated Gaussian pulse with a user specified - width which defines the bandwidth. - - `"ModulatedGaussian"` : A modulated Gaussian pulse at a user specified center frequency - and width used to excite the system without any DC component. - - `"Ramp"` : A differentiable unit step function to model the ramp up to a DC signal. - - `"SmoothStep"` : A smoother many-times differentiable unit step function to model the - ramp up to a DC signal over a specified width of time. - -`"ExcitationFreq" [None]` : Center frequency used for harmonic source excitations, GHz. -Only relevant when `"Excitation"` is one of `"Sinusoidal"`, `"Gaussian"`, -`"DifferentiatedGaussian"`, or `"ModulatedGaussian"`. - -`"ExcitationWidth" [None]` : Pulse width for Gaussian-type source excitations, ns. 
Only -relevant when `"Excitation"` is one of `"Gaussian"`, `"DifferentiatedGaussian"`, -`"ModulatedGaussian"`, or `"SmoothStep"`. - -`"MaxTime" [None]` : End of simulation time interval, ns. Transient simulations always -start from rest at ``t = 0.0``. - -`"TimeStep" [None]` : Uniform time step size for time integration, ns. - -`"SaveStep" [0]` : Controls how often, in number of time steps, to save computed fields to -disk for visualization with [ParaView](https://www.paraview.org/). Files are saved in the -`paraview/` directory under the directory specified by [`config["Problem"]["Output"]`] -(problem.md#config%5B%22Problem%22%5D). - -`"SaveOnlyPorts" [false]` : If set to `true`, postprocessing is only performed for port -boundaries and skipped for quantities depending on, for example, field integrals over all -or part of the interior of the computational domain. This can be useful in speeding up -simulations if only port boundary quantities are required. - -## `solver["Electrostatic"]` - -```json -"Electrostatic": -{ - "Save": -} -``` - -with - -`"Save" [0]` : Number of computed electric field solutions to save to disk for -visualization with [ParaView](https://www.paraview.org/), ordered by the entries in the -computed capacitance matrix. Files are saved in the `paraview/` directory under the -directory specified by [`config["Problem"]["Output"]`] -(problem.md#config%5B%22Problem%22%5D). - -## `solver["Magnetostatic"]` - -```json -"Magnetostatic": -{ - "Save": -} -``` - -with - -`"Save" [0]` : Number of computed magnetic field solutions to save to disk for -visualization with [ParaView](https://www.paraview.org/), ordered by the entries in the -computed inductance matrix. Files are saved in the `paraview/` directory under the -directory specified by [`config["Problem"]["Output"]`] -(problem.md#config%5B%22Problem%22%5D). - -## `solver["Linear"]` - -```json -"Linear": -{ - "Type": , - "KSPType": , - "Tol": , - "MaxIts": , - "MaxSize": , - "MGMaxLevels": , - "MGCoarsenType": , - "MGCycleIts": , - "MGSmoothIts": , - "MGSmoothOrder": , - "PCMatReal": , - "PCMatShifted": , - "PCSide": , - "DivFreeTol": , - "DivFreeMaxIts": , - "GSOrthogonalization": -} -``` - -with - -`"Type" ["Default"]` : Specifies the solver used for [preconditioning] -(https://en.wikipedia.org/wiki/Preconditioner) the linear system of equations to be solved -for each simulation type. The available options are: - - - `"SuperLU"` : The [SuperLU_DIST](https://github.com/xiaoyeli/superlu_dist) sparse - direct solver in real double precision is used to factorize the system matrix. For - frequency domain problems this uses a real approximation to the true complex linear - system matrix. This option is only available when *Palace* has been - [built with SuperLU_DIST support](../install.md#Configuration-options). - - `"STRUMPACK"` : The [STRUMPACK](https://portal.nersc.gov/project/sparse/strumpack) - sparse direct solver in real double precision is used to factorize the system matrix. - For frequency domain problems this uses a real approximation to the true complex linear - system matrix. This option is only available when *Palace* has been - [built with STRUMPACK support](../install.md#Configuration-options). - - `"MUMPS"` : The [MUMPS](http://mumps.enseeiht.fr/) sparse direct solver in real double - precision is used to factorize the system matrix. For frequency domain problems this - uses a real approximation to the true complex linear system matrix. 
This option is only - available when *Palace* has been [built with MUMPS support] - (../install.md#Configuration-options). - - `"AMS"` : Hypre's [Auxiliary-space Maxwell Solver (AMS)] - (https://hypre.readthedocs.io/en/latest/solvers-ams.html), an algebraic multigrid - (AMG)-based preconditioner. - - `"BoomerAMG"` : The [BoomerAMG] - (https://hypre.readthedocs.io/en/latest/solvers-boomeramg.html) algebraic multigrid - solver from Hypre. - - `"Default"` : Use the default `"AMS"` solver for simulation types involving definite or - semi-definite curl-curl operators (time domain problems as well as magnetostatics). For - frequency domain problems, use a sparse direct solver if available, otherwise uses - `"AMS"`. For electrostatic problems, uses `"BoomerAMG"`. - -`"KSPType" ["Default"]` : Specifies the iterative [Krylov subspace] -(https://en.wikipedia.org/wiki/Krylov_subspace) solver type for solving linear systems of -equations arising for each simulation type. The available options are: - - - `"CG"` - - `"GMRES"` - - `"FGMRES"` - - `"Default"` : Use the default `"GMRES"` Krylov subspace solver for frequency domain - problems, that is when [`config["Problem"]["Type"]`] - (problem.md#config%5B%22Problem%22%5D) is `"Eigenmode"` or `"Driven"`. For the other - simulation types, the linear system matrix is always real and symmetric positive - definite (SPD) and the preconditioned conjugate gradient method (`"CG"`) is used as the - Krylov solver. - -`"Tol" [1.0e-6]` : Relative residual convergence tolerance for the iterative linear solver. - -`"MaxIts" [100]` : Maximum number of iterations for the iterative linear solver. - -`"MaxSize" [0]` : Maximum Krylov space size for the GMRES and FGMRES solvers. A value less -than 1 defaults to the value specified by `"MaxIts"`. - -`"MGMaxLevels" [100]` : Chose whether to enable [geometric multigrid preconditioning] -(https://en.wikipedia.org/wiki/Multigrid_method) which uses p- and h-multigrid coarsening as -available to construct the multigrid hierarchy. The solver specified by `"Type"` is used on -the coarsest level. Relaxation on the fine levels is performed with Chebyshev smoothing. - -`"MGCoarsenType" ["Logarithmic"]` : Coarsening to create p-multigrid levels. - - - `"Logarithmic"` - - `"Linear"` - -`"MGCycleIts" [1]` : Number of V-cycle iterations per preconditioner application for -multigrid preconditioners (when `"UseMultigrid"` is `true` or `"Type"` is `"AMS"` or -`"BoomerAMG"`). - -`"MGSmoothIts" [1]` : Number of pre- and post-smooth iterations used for multigrid -preconditioners (when `"UseMultigrid"` is `true` or `"Type"` is `"AMS"` or `"BoomerAMG"`). - -`"MGSmoothOrder" [0]` : Order of polynomial smoothing for geometric multigrid -preconditioning (when `"UseMultigrid"` is `true`). A value less than 1 defaults to twice -the solution order given in [`config["Solver"]["Order"]`] -(problem.md#config%5B%22Solver%22%5D) or 4, whichever is larger. - -`"PCMatReal" [false]` : When set to `true`, constructs the preconditioner for frequency -domain problems using a real-valued approximation of the system matrix. This is always -performed for the coarsest multigrid level regardless of the setting of `"PCMatReal"`. - -`"PCMatShifted" [false]` : When set to `true`, constructs the preconditioner for frequency -domain problems using a positive definite approximation of the system matrix by flipping -the sign for the mass matrix contribution, which can help performance at high frequencies -(relative to the lowest nonzero eigenfrequencies of the model). 
- -`"PCSide" ["Default"]` : Side for preconditioning. Not all options are available for all -iterative solver choices, and the default choice depends on the iterative solver used. - - - `"Left"` - - `"Right"` - - `"Default"` - -`"DivFreeTol" [1.0e-12]` : Relative tolerance for divergence-free cleaning used in the -eigenmode simulation type. - -`"DivFreeMaxIts" [1000]` : Maximum number of iterations for divergence-free cleaning use in -the eigenmode simulation type. - -`"EstimatorTol" [1e-6]` : Relative tolerance for flux projection used in the -error estimate calculation. - -`"EstimatorMaxIts" [100]` : Maximum number of iterations for flux projection use in -the error estimate calculation. - -`"GSOrthogonalization" ["MGS"]` : Gram-Schmidt variant used to explicitly orthogonalize -vectors in Krylov subspace methods or other parts of the code. - - - `"MGS"` : Modified Gram-Schmidt - - `"CGS"` : Classical Gram-Schmidt - - `"CGS2"` : Two-step classical Gram-Schmidt with reorthogonalization - -### Advanced linear solver options - - - `"InitialGuess" [true]` - - `"MGAuxiliarySmoother" [true]` - - `"MGSmoothEigScaleMax" [1.0]` - - `"MGSmoothEigScaleMin" [0.0]` - - `"MGSmoothChebyshev4th" [true]` - - `"ColumnOrdering" ["Default"]` : `"METIS"`, `"ParMETIS"`,`"Scotch"`, `"PTScotch"`, - `"Default"` - - `"STRUMPACKCompressionType" ["None"]` : `"None"`, `"BLR"`, `"HSS"`, `"HODLR"`, `"ZFP"`, - `"BLR-HODLR"`, `"ZFP-BLR-HODLR"` - - `"STRUMPACKCompressionTol" [1.0e-3]` - - `"STRUMPACKLossyPrecision" [16]` - - `"STRUMPACKButterflyLevels" [1]` - - `"SuperLU3D" [false]` - - `"AMSVector" [false]` +```@raw html + + +``` + +# `config["Solver"]` + +```json +"Solver": +{ + "Order": , + "PartialAssemblyOrder": , + "Device": , + "Backend": , + "Eigenmode": + { + ... + }, + "Driven": + { + ... + }, + "Transient": + { + ... + }, + "Electrostatic": + { + ... + }, + "Magnetostatic": + { + ... + }, + "Linear": + { + ... + } +} +``` + +with + +`"Order" [1]` : Finite element order (degree). Arbitrary high-order spaces are supported. + +`"PartialAssemblyOrder" [1]` : Order at which to switch from full assembly of finite +element operators to [partial assembly](https://mfem.org/howto/assembly_levels/). Setting +this parameter equal to 1 will fully activate operator partial assembly on all levels, while +setting it to some large number (greater than the finite element order) will result in +fully assembled operators as sparse matrices. + +`"Device" ["CPU"]` : The runtime device configuration passed to +[MFEM](https://mfem.org/howto/assembly_levels/) in order to activate different options +specified during configuration. The available options are: + + - `"CPU"` + - `"GPU"` + - `"Debug"` + +The `"GPU"` option will automatically activate the `cuda` or `hip` device based on whether +MFEM is built with CUDA (`MFEM_USE_CUDA=ON`) or HIP (`MFEM_USE_HIP=ON`) support. When +*Palace* is built with OpenMP support (`PALACE_WITH_OPENMP=ON`), `omp` is automatically +added to the list of activated MFEM devices. The `"Debug"` option for MFEM's `debug` device +is useful for debugging issues associated with GPU-based runs of *Palace*. + +`"Backend" [""]` : Specifies the +[libCEED backend](https://libceed.org/en/latest/gettingstarted/#backends) to use for the +simulation. If no backend is specified, a suitable default backend is selected based on the +given `config["Solver"]["Device"]`. + +`"Eigenmode"` : Top-level object for configuring the eigenvalue solver for the eigenmode +simulation type. 
Thus, this object is only relevant for +[`config["Problem"]["Type"]: "Eigenmode"`](problem.md#config%5B%22Problem%22%5D). + +`"Driven"` : Top-level object for configuring the frequency domain driven simulation type. +Thus, this object is only relevant for +[`config["Problem"]["Type"]: "Driven"`](problem.md#config%5B%22Problem%22%5D). + +`"Transient"` : Top-level object for configuring the time domain driven simulation type. +Thus, this object is only relevant for +[`config["Problem"]["Type"]: "Transient"`](problem.md#config%5B%22Problem%22%5D). + +`"Electrostatic"` : Top-level object for configuring the electrostatic simulation type. +Thus, this object is only relevant for +[`config["Problem"]["Type"]: "Electrostatic"`](problem.md#config%5B%22Problem%22%5D). + +`"Magnetostatic"` : Top-level object for configuring the magnetostatic simulation type. +Thus, this object is only relevant for +[`config["Problem"]["Type"]: "Magnetostatic"`](problem.md#config%5B%22Problem%22%5D). + +`"Linear"` : Top-level object for configuring the linear solver employed by all simulation +types. + +### Advanced solver options + + - `"QuadratureOrderJacobian" [false]` + - `"ExtraQuadratureOrder" [0]` + +## `solver["Eigenmode"]` + +```json +"Eigenmode": +{ + "Target": , + "Tol": , + "MaxIts": , + "MaxSize": , + "N": , + "Save": , + "Type": , + "NonlinearType" : , +} +``` + +with + +`"Target" [None]` : (Nonzero) frequency target above which to search for eigenvalues, GHz. + +`"Tol" [1.0e-6]` : Relative convergence tolerance for the eigenvalue solver. + +`"MaxIts" [0]` : Maximum number of iterations for the iterative eigenvalue solver. A value +less than 1 uses the solver default. + +`"MaxSize" [0]` : Maximum subspace dimension for eigenvalue solver. A value less than 1 +uses the solver default. + +`"N" [1]` : Number of eigenvalues to compute. + +`"Save" [0]` : Number of computed field modes to save to disk for +[visualization with ParaView](../guide/postprocessing.md#Visualization). Files are saved in +the `paraview/` (and/or `gridfunction/`) directory under the directory specified by +[`config["Problem"]["Output"]`](problem.md#config%5B%22Problem%22%5D). + +`"Type" ["Default"]` : Specifies the eigenvalue solver to be used in computing the given +number of eigenmodes of the problem. The available options are: + + - `"SLEPc"` + - `"ARPACK"` + - `"Default"` : Use the default eigensolver. Currently, this is the Krylov-Schur + eigenvalue solver from `"SLEPc"`. + +`"NonlinearType" ["Hybrid"]` : Specifies the nonlinear eigenvalue solver to be used for nonlinear problems (e.g. frequency-dependent boundary conditions). The available options are: + + - `"Hybrid"` : Hybrid algorithm where a (quadratic) polynomial approximation of the nonlinear problem is first solved and the eigenmodes are then refined with a quasi-Newton nonlinear eigensolver. + - `"SLP"` : SLEPc's Successive Linear Problem nonlinear eigensolver. + +`"TargetUpper" [3 * Target]` : Upper end of the frequency target range in which to search for eigenvalues, GHz. Only used in nonlinear problems. Using an inaccurate upper bound (significantly smaller or greater than the largest eigenvalue sought) can negatively affect the convergence of the nonlinear eigensolver. 
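+
+For illustration, a minimal `"Eigenmode"` block for a standard (linear) eigenvalue solve,
+using only the parameters documented above, might look like the following sketch. The
+numerical values are placeholders rather than recommendations:
+
+```json
+"Eigenmode":
+{
+  "Target": 2.0,
+  "N": 15,
+  "Tol": 1.0e-8,
+  "Save": 2,
+  "Type": "SLEPc"
+}
+```
+
+Here the solver computes the `"N"` lowest frequency modes above the 2.0 GHz target, and
+`"Save": 2` writes the first two computed modes to disk for visualization.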
+ +### Advanced eigenmode solver options + + - `"PEPLinear" [true]` + - `"Scaling" [true]` + - `"StartVector" [true]` + - `"StartVectorConstant" [false]` + - `"MassOrthogonal" [false]` + - `"RefineNonlinear" [true]` + - `"LinearTol" [1e-3]` + - `"PreconditionerLag" [10]` + - `"PreconditionerLagTol" [1e-4]` + - `"MaxRestart" [2]` + +## `solver["Driven"]` + +```json +"Driven": +{ + "MinFreq": , + "MaxFreq": , + "FreqStep": , + "SaveStep": , + "Samples": [ ... ], + "Save": [], + "Restart": , + "AdaptiveTol": , + "AdaptiveMaxSamples": , + "AdaptiveConvergenceMemory": +} +``` + +with + +`"MinFreq" [None]` : Lower bound of frequency sweep interval, GHz. + +`"MaxFreq" [None]` : Upper bound of frequency sweep interval, GHz. + +`"FreqStep" [None]` : Frequency step size for frequency sweep, GHz. + +`"SaveStep" [0]` : Controls how often, in number of frequency steps, to save computed +fields to disk for [visualization with ParaView](../guide/postprocessing.md#Visualization). +Files are saved in the `paraview/` (and/or `gridfunction/`) directory under the directory specified by +[`config["Problem"]["Output"]`](problem.md#config%5B%22Problem%22%5D). + +`"Samples" [None]` : Array of [sample +specifications](solver.md#solver%5B%22Driven%22%5D%5B%22Samples%22%5D) that specify how to +construct frequency samples. These are all combined to form a sorted and unique collection +of samples. These samples can be instead of, or in addition to, the interface provided by +`"MinFreq"`, `"MaxFreq"`, `"FreqStep"` and `"SaveStep"`. See +[`solver["Driven"]["Samples"]`](solver.md#solver%5B%22Driven%22%5D%5B%22Samples%22%5D) for +the construction of each of these structs. + +`"Save" [None]` : Array of frequencies to save computed fields to disk for [visualization +with ParaView](../guide/postprocessing.md#Visualization), in addition to those specified by +`"SaveStep"` in any sample specification. Files are saved in the `paraview/` (and/or `gridfunction/`) +directory under the directory specified by +[`config["Problem"]["Output"]`](problem.md#config%5B%22Problem%22%5D). + +`"Restart" [1]` : Iteration (1-based) from which to restart for a partial frequency sweep +simulation. That is `"Restart": x` will start the frequency sweep from the ``x``-th sample +rather than the first sample. This indexing is from the *combined* set of frequency samples. +Not valid for an adaptive fast frequency sweep. + +`"AdaptiveTol" [0.0]` : Relative error convergence tolerance for adaptive frequency sweep. +If zero, adaptive frequency sweep is disabled and the full-order model is solved at each +frequency step in the specified interval. If positive, this tolerance is used to ensure the +reliability of the reduced-order model relative to the full-order one in the frequency band +of interest. + +`"AdaptiveMaxSamples" [20]` : Maximum number of frequency samples used to construct the +reduced-order model for adaptive fast frequency sweep, if the specified tolerance (`"AdaptiveTol"`) +is not met first. In simulations with multiple excitations, this is the maximum number of samples +per excitation. + +`"AdaptiveConvergenceMemory" [2]` : Memory used for assessing convergence of the adaptive +sampling algorithm for constructing the reduced-order model for adaptive fast frequency +sweep. For example, a memory of "2" requires two consecutive samples which satisfy the +error tolerance. 
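+
+As a purely illustrative sketch (placeholder values), the block below requests an adaptive
+fast frequency sweep over the 2 to 12 GHz band with output on a 0.1 GHz grid, saving fields
+every 20 frequency steps; setting `"AdaptiveTol": 0.0` instead would solve the full-order
+model at every step. The same range could equivalently be expressed with the `"Samples"`
+interface described in the following section.
+
+```json
+"Driven":
+{
+  "MinFreq": 2.0,
+  "MaxFreq": 12.0,
+  "FreqStep": 0.1,
+  "SaveStep": 20,
+  "AdaptiveTol": 1.0e-3
+}
+```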
+ +### `solver["Driven"]["Samples"]` + +```json +"Samples": +{ + "Type": , + "MinFreq": , + "MaxFreq": , + "FreqStep": , + "NSample": , + "Freq": [], + "SaveStep": , + "AddToPROM": +} +``` + +`"Type" [None]` : The type of range being specified. The list of valid options are +`"Linear"`, `"Point"`, `"Log"`. For non-ambiguous combinations of other fields, this can be +inferred for convenience. + +`"MinFreq" [None]` : Lower bound of frequency sweep interval, GHz. Valid for `"Linear"` and `"Log"`. + +`"MaxFreq" [None]` : Upper bound of frequency sweep interval, GHz. Valid for `"Linear"` and `"Log"`. + +`"FreqStep" [None]` : Frequency step size for frequency sweep, GHz. Valid for `"Linear"` only. +Mutually exclusive with `"NSample"` + +`"NSample" [None]` : Number of frequency samples over the specified range. Valid for `"Linear"` and `"Log"`. +Mutually exclusive with `"FreqStep"`. + +`"Freq" [None]` : Explicit frequencies to be sample, GHz. Valid for `"Point"` only. + +`"SaveStep" [0]` : Controls how often, in number of frequency steps, to save computed +fields to disk for [visualization with ParaView](../guide/postprocessing.md#Visualization). +Files are saved in the `paraview/` (and/or `gridfunction/`) directory under the directory specified by +[`config["Problem"]["Output"]`](problem.md#config%5B%22Problem%22%5D). + +`"AddToPROM" [false]` : Advanced option to force the inclusion of this sample into the PROM +when performing an adaptive sweep. This is primarily a debugging tool as the error +estimation procedure will in general make more efficient selections of sampling points, and +using this mechanism can result in a significantly larger and less efficient PROM. + +## `solver["Transient"]` + +```json +"Transient": +{ + "Type": , + "Excitation": , + "ExcitationFreq": , + "ExcitationWidth": , + "MaxTime": , + "TimeStep": , + "SaveStep": , + "Order": , + "RelTol": , + "AbsTol": +} +``` + +with + +`"Type" ["Default"]` : Specifies the time integration scheme used for the discretization of +the second-order system of differential equations. The available options are: + + - `"GeneralizedAlpha"` : The second-order implicit generalized-``\alpha`` method with + ``\rho_{\inf} = 1.0``. This scheme is unconditionally stable. + - `"ARKODE"` : SUNDIALS ARKode implicit Runge-Kutta scheme applied to the first-order + ODE system for the electric field with adaptive time-stepping. This option is only available when *Palace* has been [built with SUNDIALS support](../install.md#Configuration-options). + - `"CVODE"` : SUNDIALS CVODE implicit multistep method scheme applied to the first-order + ODE system for the electric field with adaptive time-stepping. This option is only available when *Palace* has been [built with SUNDIALS support](../install.md#Configuration-options). + - `"RungeKutta"` : Two stage, singly diagonal implicit Runge-Kutta (SDIRK) method. Second order and L-stable. + - `"Default"` : Use the default `"GeneralizedAlpha"` time integration scheme. + +`"Excitation" [None]` : Controls the time dependence of the source excitation. The +available options are: + + - `"Sinusoidal"` : A sinusoidal excitation at a user specified frequency. + - `"Gaussian"` : A Gaussian pulse with a user specified width which defines the + bandwidth. + - `"DifferentiatedGaussian"` : A differentiated Gaussian pulse with a user specified + width which defines the bandwidth. 
+ - `"ModulatedGaussian"` : A modulated Gaussian pulse at a user specified center frequency + and width used to excite the system without any DC component. + - `"Ramp"` : A differentiable unit step function to model the ramp up to a DC signal. + - `"SmoothStep"` : A smoother many-times differentiable unit step function to model the + ramp up to a DC signal over a specified width of time. + +`"ExcitationFreq" [None]` : Center frequency used for harmonic source excitations, GHz. +Only relevant when `"Excitation"` is one of `"Sinusoidal"`, `"Gaussian"`, +`"DifferentiatedGaussian"`, or `"ModulatedGaussian"`. + +`"ExcitationWidth" [None]` : Pulse width for Gaussian-type source excitations, ns. Only +relevant when `"Excitation"` is one of `"Gaussian"`, `"DifferentiatedGaussian"`, +`"ModulatedGaussian"`, or `"SmoothStep"`. + +`"MaxTime" [None]` : End of simulation time interval, ns. Transient simulations always +start from rest at ``t = 0.0``. + +`"TimeStep" [None]` : Uniform time step size for time integration, ns. + +`"SaveStep" [0]` : Controls how often, in number of time steps, to save computed fields to +disk for [visualization with ParaView](../guide/postprocessing.md#Visualization). Files are +saved in the `paraview/` (and/or `gridfunction/`) directory under the directory specified by +[`config["Problem"]["Output"]`](problem.md#config%5B%22Problem%22%5D). + +`"Order" [2]` : Order of the adaptive Runge-Kutta integrators or maximum order of the +multistep method, must be within `[2,5]`. Should only be specified if `"Type"` is `"ARKODE"` +or `"CVODE"`. + +`"RelTol" [1e-4]` : Relative tolerance used in adaptive time-stepping schemes. Should only +be specified if `"Type"` is `"ARKODE"` or `"CVODE"`. + +`"AbsTol" [1e-9]` : Absolute tolerance used in adaptive time-stepping schemes. Should only +be specified if `"Type"` is `"ARKODE"` or `"CVODE"`. + +## `solver["Electrostatic"]` + +```json +"Electrostatic": +{ + "Save": +} +``` + +with + +`"Save" [0]` : Number of computed electric field solutions to save to disk for +[visualization with ParaView](../guide/postprocessing.md#Visualization), ordered by the +entries in the computed capacitance matrix. Files are saved in the `paraview/` (and/or `gridfunction/`) directory +under the directory specified by +[`config["Problem"]["Output"]`](problem.md#config%5B%22Problem%22%5D). + +## `solver["Magnetostatic"]` + +```json +"Magnetostatic": +{ + "Save": +} +``` + +with + +`"Save" [0]` : Number of computed magnetic field solutions to save to disk for +[visualization with ParaView](../guide/postprocessing.md#Visualization)), ordered by the +entries in the computed inductance matrix. Files are saved in the `paraview/` (and/or `gridfunction/`) directory +under the directory specified by +[`config["Problem"]["Output"]`](problem.md#config%5B%22Problem%22%5D). + +## `solver["Linear"]` + +```json +"Linear": +{ + "Type": , + "KSPType": , + "Tol": , + "MaxIts": , + "MaxSize": , + "MGMaxLevels": , + "MGCoarsenType": , + "MGCycleIts": , + "MGSmoothIts": , + "MGSmoothOrder": , + "PCMatReal": , + "PCMatShifted": , + "ComplexCoarseSolve": , + "DropSmallEntries": , + "PCSide": , + "DivFreeTol": , + "DivFreeMaxIts": , + "EstimatorTol": , + "EstimatorMaxIts": , + "EstimatorMG": , + "GSOrthogonalization": +} +``` + +with + +`"Type" ["Default"]` : Specifies the solver used for +[preconditioning](https://en.wikipedia.org/wiki/Preconditioner) the linear system of +equations to be solved for each simulation type. 
The available options are: + + - `"SuperLU"` : The [SuperLU_DIST](https://github.com/xiaoyeli/superlu_dist) sparse + direct solver in real double precision is used to factorize the system matrix. For + frequency domain problems this uses a real approximation to the true complex linear + system matrix. This option is only available when *Palace* has been + [built with SuperLU_DIST support](../install.md#Configuration-options). + - `"STRUMPACK"` : The [STRUMPACK](https://portal.nersc.gov/project/sparse/strumpack) + sparse direct solver in real double precision is used to factorize the system matrix. + For frequency domain problems this uses a real approximation to the true complex linear + system matrix. This option is only available when *Palace* has been + [built with STRUMPACK support](../install.md#Configuration-options). + - `"MUMPS"` : The [MUMPS](http://mumps.enseeiht.fr/) sparse direct solver in real double + precision is used to factorize the system matrix. For frequency domain problems this + uses a real approximation to the true complex linear system matrix. This option is only + available when *Palace* has been + [built with MUMPS support](../install.md#Configuration-options). + - `"AMS"` : Hypre's + [Auxiliary-space Maxwell Solver (AMS)](https://hypre.readthedocs.io/en/latest/solvers-ams.html), + an algebraic multigrid (AMG)-based preconditioner. + - `"BoomerAMG"` : The + [BoomerAMG](https://hypre.readthedocs.io/en/latest/solvers-boomeramg.html) AMG solver + from Hypre. + - `"Jacobi"` : Diagonal scaling with a simple Jacobi preconditioner (not recommended in + general). + - `"Default"` : Use the default `"AMS"` solver for simulation types involving definite or + semi-definite curl-curl operators (time domain problems as well as magnetostatics). For + frequency domain problems, use a sparse direct solver if available, otherwise uses + `"AMS"`. For electrostatic problems, uses `"BoomerAMG"`. + +`"KSPType" ["Default"]` : Specifies the iterative +[Krylov subspace](https://en.wikipedia.org/wiki/Krylov_subspace) solver type for solving +linear systems of equations arising for each simulation type. The available options are: + + - `"CG"` + - `"GMRES"` + - `"FGMRES"` + - `"Default"` : Use the default `"GMRES"` Krylov subspace solver for frequency domain + problems, that is when + [`config["Problem"]["Type"]`](problem.md#config%5B%22Problem%22%5D) is `"Eigenmode"` or + `"Driven"`. For the other simulation types, the linear system matrix is always real and + symmetric positive definite (SPD) and the preconditioned conjugate gradient method + (`"CG"`) is used as the Krylov solver. + +`"Tol" [1.0e-6]` : Relative residual convergence tolerance for the iterative linear solver. + +`"MaxIts" [100]` : Maximum number of iterations for the iterative linear solver. + +`"MaxSize" [0]` : Maximum Krylov space size for the GMRES and FGMRES solvers. A value less +than 1 defaults to the value specified by `"MaxIts"`. + +`"MGMaxLevels" [100]` : When greater than 1, enable the [geometric multigrid +preconditioning](https://en.wikipedia.org/wiki/Multigrid_method), which uses p- +and h-multigrid coarsening as available to construct the multigrid hierarchy. +The solver specified by `"Type"` is used on the coarsest level. Relaxation on +the fine levels is performed with Chebyshev smoothing. + +`"MGCoarsenType" ["Logarithmic"]` : Coarsening to create p-multigrid levels. 
+
+ - `"Logarithmic"`
+ - `"Linear"`
+
+`"MGCycleIts" [1]` : Number of V-cycle iterations per preconditioner application
+for multigrid preconditioners (when the geometric multigrid preconditioner is
+enabled, i.e. when `MGMaxLevels` > 1, or when `"Type"` is `"AMS"` or
+`"BoomerAMG"`).
+
+`"MGSmoothIts" [1]` : Number of pre- and post-smooth iterations used for
+multigrid preconditioners (when the geometric multigrid preconditioner is
+enabled, i.e. when `MGMaxLevels` > 1, or when `"Type"` is `"AMS"` or
+`"BoomerAMG"`).
+
+`"MGSmoothOrder" [0]` : Order of polynomial smoothing for geometric multigrid
+preconditioning. A value less than 1 defaults to twice
+the solution order given in
+[`config["Solver"]["Order"]`](problem.md#config%5B%22Solver%22%5D) or 4, whichever is
+larger.
+
+`"PCMatReal" [false]` : When set to `true`, constructs the preconditioner for frequency
+domain problems using a real-valued approximation of the system matrix. This is always
+performed for the coarsest multigrid level regardless of the setting of `"PCMatReal"`.
+
+`"PCMatShifted" [false]` : When set to `true`, constructs the preconditioner for frequency
+domain problems using a positive definite approximation of the system matrix by flipping
+the sign for the mass matrix contribution, which can help performance at high frequencies
+(relative to the lowest nonzero eigenfrequencies of the model).
+
+`"ComplexCoarseSolve" [false]` : When set to `true`, the coarse-level solver uses the true
+complex-valued system matrix. When set to `false`, the real-valued approximation is used.
+
+`"DropSmallEntries" [false]` : When set to `true`, entries smaller than the double precision
+machine epsilon are dropped from the system matrix used in the sparse direct solver.
+
+`"PCSide" ["Default"]` : Side for preconditioning. Not all options are available for all
+iterative solver choices, and the default choice depends on the iterative solver used.
+
+ - `"Left"`
+ - `"Right"`
+ - `"Default"`
+
+`"DivFreeTol" [1.0e-12]` : Relative tolerance for divergence-free cleaning used in the
+eigenmode simulation type. Ignored if non-zero Floquet wave vector is specified in
+[`config["Boundaries"]["Periodic"]["FloquetWaveVector"]`](boundaries.md#boundaries%5B%22Periodic%22%5D%5B%22FloquetWaveVector%22%5D)
+or
+[`config["Boundaries"]["FloquetWaveVector"]`](boundaries.md#boundaries%5B%22FloquetWaveVector%22%5D),
+or non-zero
+[`config["Domains"]["Materials"]["LondonDepth"]`](domains.md#domains%5B%22Materials%22%5D%5B%22LondonDepth%22%5D)
+is specified.
+
+`"DivFreeMaxIts" [1000]` : Maximum number of iterations for divergence-free cleaning used in
+the eigenmode simulation type. Ignored if non-zero Floquet wave vector is specified in
+[`config["Boundaries"]["Periodic"]["FloquetWaveVector"]`](boundaries.md#boundaries%5B%22Periodic%22%5D%5B%22FloquetWaveVector%22%5D)
+or
+[`config["Boundaries"]["FloquetWaveVector"]`](boundaries.md#boundaries%5B%22FloquetWaveVector%22%5D),
+or non-zero
+[`config["Domains"]["Materials"]["LondonDepth"]`](domains.md#domains%5B%22Materials%22%5D%5B%22LondonDepth%22%5D)
+is specified.
+
+`"EstimatorTol" [1.0e-6]` : Relative tolerance for flux projection used in the
+error estimate calculation.
+
+`"EstimatorMaxIts" [10000]` : Maximum number of iterations for flux projection used in the
+error estimate calculation.
+
+`"EstimatorMG" [false]` : Set to `true` in order to enable a multigrid preconditioner with
+AMG coarse solve for the error estimate linear solver, instead of just Jacobi.
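+
+To show how a few of these fields combine, the hypothetical sketch below keeps the default
+preconditioner selection but tightens the Krylov solver tolerance and limits the geometric
+multigrid hierarchy to two levels (the values are placeholders, not recommendations):
+
+```json
+"Linear":
+{
+  "Type": "Default",
+  "KSPType": "GMRES",
+  "Tol": 1.0e-8,
+  "MaxIts": 200,
+  "MGMaxLevels": 2
+}
+```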
+ +`"GSOrthogonalization" ["MGS"]` : Gram-Schmidt variant used to explicitly orthogonalize +vectors in Krylov subspace methods or other parts of the code. + + - `"MGS"` : Modified Gram-Schmidt + - `"CGS"` : Classical Gram-Schmidt + - `"CGS2"` : Two-step classical Gram-Schmidt with reorthogonalization + +### Advanced linear solver options + + - `"InitialGuess" [true]` + - `"MGUseMesh" [true]` + - `"MGAuxiliarySmoother" [true]` + - `"MGSmoothEigScaleMax" [1.0]` + - `"MGSmoothEigScaleMin" [0.0]` + - `"MGSmoothChebyshev4th" [true]` + - `"ReorderingReuse" [true]` + - `"ColumnOrdering" ["Default"]` : `"METIS"`, `"ParMETIS"`,`"Scotch"`, `"PTScotch"`, + `"PORD"`, `"AMD"`, `"RCM"`, `"Default"` + - `"STRUMPACKCompressionType" ["None"]` : `"None"`, `"BLR"`, `"HSS"`, `"HODLR"`, `"ZFP"`, + `"BLR-HODLR"`, `"ZFP-BLR-HODLR"` + - `"STRUMPACKCompressionTol" [1.0e-3]` + - `"STRUMPACKLossyPrecision" [16]` + - `"STRUMPACKButterflyLevels" [1]` + - `"SuperLU3DCommunicator" [false]` + - `"AMSVectorInterpolation" [false]` + - `"AMSSingularOperator" [false]` + - `"AMGAggressiveCoarsening" [false]` diff --git a/docs/src/developer.md b/docs/src/developer.md index 0b26aec0d0..5e30c4ad9f 100644 --- a/docs/src/developer.md +++ b/docs/src/developer.md @@ -1,106 +1,106 @@ -```@raw html - - -``` - -# Developer Notes - -## Style guide - -Automated source code formatting is performed using [`clang-format`] -(https://clang.llvm.org/docs/ClangFormat.html). Run: - -```bash -./scripts/format_source -``` - -in the repository root directory to automatically use `clang-format` to format `C++` source -as well as [`JuliaFormatter.jl`](https://github.com/domluna/JuliaFormatter.jl) for Julia and -Markdown files. The script can be viewed [in the repository] -(https://github.com/awslabs/palace/blob/main/scripts/format_source). - -The following conventions also apply: - - - `PascalCase` for classes and function names. - - Follow 'include what you use' (IWYU), with the include order dictated by the - [Google C++ Style Guide] - (https://google.github.io/styleguide/cppguide.html#Names_and_Order_of_Includes). This - order should be automatically enforced by the `clang-format` [style file] - (https://github.com/awslabs/palace/blob/main/.clang-format). - - Code comments should be full sentences, with punctuation. At this time, no Doxygen API - reference is generated and so comments generally do not need to conform to Doxygen - syntax. - -## Static analysis - -During the `cmake` configuration step, definining the variables `ANALYZE_SOURCES_CLANG_TIDY` -and `ANALYZE_SOURCES_CPPCHECK` to `ON` will turn on static analysis using [`clang-tidy`] -(https://clang.llvm.org/extra/clang-tidy/) and [`cppcheck`] -(https://cppcheck.sourceforge.io/), respectively, during the build step. This requires the -executables to be installed and findable by CMake on your system. - -## JSON Schema for configuration files - -A JSON format [configuration file](config/config.md), for example named `config.json`, can -be validated against the provided Schema using: - -```bash -./scripts/validate_config config.json -``` - -[This script](https://github.com/awslabs/palace/blob/main/scripts/validate_config) uses -Julia's [`JSONSchema.jl`](https://github.com/fredo-dedup/JSONSchema.jl) and the Schema -provided in [`scripts/schema/`] -(https://github.com/awslabs/palace/blob/main/scripts/schema) to parse the configuration -file and check that the fields are correctly specified. This script and the associated -Schema are also installed and can be accessed in `/bin`. 
- -## Timing - -Timing facilities are provided by the `Timer` and `BlockTimer` classes. - -Creating a block as `BlockTimer b(idx)` where `idx` is a category like `CONSTRUCT`, `SOLVE`, -etc. will record time so long as `b` is in scope; however, timing may be interrupted by -creation of another `BlockTimer` object. It will resume whenever the new block is destroyed. -Only one category is timed at once. This enables functions can declare how calls within them -are timed without needing to know how timing may be done by the calling function. - -The `BlockTimer` implementation relies upon a static member object of the `Timer` class, -which behaves as a stopwatch with some memory functions. It is the responsibility of this -`BlockTimer::timer` object to record time spent in each recorded category. Other `Timer` -objects may be created for local timing purposes, but these will not count toward time -reported at the end of a log file or in the metadata JSON. - -## Testing - -We use [Catch2](https://github.com/catchorg/Catch2) to perform unit testing of the [libCEED] -(https://libceed.org/en/latest/) integration in Palace against the legacy MFEM assembly -routines. The unit tests source code is located in the [`test/unit/`] -(https://github.com/awslabs/palace/blob/main/test/unit/) directory, and can be built from -within the *Palace* build directory using `make unit-tests`, or from the superbuild as -`make palace-tests`. The unit tests can be accelerated using MPI and/or OpenMP parallelism -(when configured with `PALACE_WITH_OPENMP=ON`), but in all cases they are only testing the -local operator assembly on each process. The 2D and 3D sample meshes in [`test/unit/mesh/`] -(https://github.com/awslabs/palace/blob/main/test/unit/mesh/) come from the -[MFEM repository](https://github.com/mfem/mfem/tree/master/data). - -The unit test application also includes a small number of benchmarks to compare performance -between MFEM's legacy assembly backend, MFEM's partial assembly backend, and the specified -libCEED backend (specified with the `--backend` option, use `-h`/`--help` to list all -command line options for the `unit-tests` executable). These can be run using, for -example: - -```bash -./unit-tests "[Benchmark]" --benchmark-samples 10 -``` - -The unit tests are run automatically as part of the project's continuous integration (CI) -workflows. Also run as part of the CI are regression tests based on the provided example -applications in the [`examples/`](https://github.com/awslabs/palace/blob/main/examples/) -directory. These are executed based on the code in [`test/examples/`] -(https://github.com/awslabs/palace/blob/main/test/examples/). - -## Changelog - -Code contributions should generally be accompanied by an entry in the [changelog] -(https://github.com/awslabs/palace/blob/main/CHANGELOG.md). +```@raw html + + +``` + +# Developer Notes + +## Style guide + +Automated source code formatting is performed using [`clang-format`] +(https://clang.llvm.org/docs/ClangFormat.html). Run: + +```bash +./scripts/format_source +``` + +in the repository root directory to automatically use `clang-format` to format `C++` source +as well as [`JuliaFormatter.jl`](https://github.com/domluna/JuliaFormatter.jl) for Julia and +Markdown files. The script can be viewed [in the repository] +(https://github.com/awslabs/palace/blob/main/scripts/format_source). + +The following conventions also apply: + + - `PascalCase` for classes and function names. 
+ - Follow 'include what you use' (IWYU), with the include order dictated by the
+ [Google C++ Style Guide]
+ (https://google.github.io/styleguide/cppguide.html#Names_and_Order_of_Includes). This
+ order should be automatically enforced by the `clang-format` [style file]
+ (https://github.com/awslabs/palace/blob/main/.clang-format).
+ - Code comments should be full sentences, with punctuation. At this time, no Doxygen API
+ reference is generated and so comments generally do not need to conform to Doxygen
+ syntax.
+
+## Static analysis
+
+During the `cmake` configuration step, defining the variables `ANALYZE_SOURCES_CLANG_TIDY`
+and `ANALYZE_SOURCES_CPPCHECK` to `ON` will turn on static analysis using [`clang-tidy`]
+(https://clang.llvm.org/extra/clang-tidy/) and [`cppcheck`]
+(https://cppcheck.sourceforge.io/), respectively, during the build step. This requires the
+executables to be installed and findable by CMake on your system.
+
+## JSON Schema for configuration files
+
+A JSON format [configuration file](config/config.md), for example named `config.json`, can
+be validated against the provided Schema using:
+
+```bash
+./scripts/validate_config config.json
+```
+
+[This script](https://github.com/awslabs/palace/blob/main/scripts/validate_config) uses
+Julia's [`JSONSchema.jl`](https://github.com/fredo-dedup/JSONSchema.jl) and the Schema
+provided in [`scripts/schema/`]
+(https://github.com/awslabs/palace/blob/main/scripts/schema) to parse the configuration
+file and check that the fields are correctly specified. This script and the associated
+Schema are also installed and can be accessed in `/bin`.
+
+## Timing
+
+Timing facilities are provided by the `Timer` and `BlockTimer` classes.
+
+Creating a block as `BlockTimer b(idx)` where `idx` is a category like `CONSTRUCT`, `SOLVE`,
+etc. will record time so long as `b` is in scope; however, timing may be interrupted by
+creation of another `BlockTimer` object. It will resume whenever the new block is destroyed.
+Only one category is timed at once. This enables functions to declare how calls within them
+are timed without needing to know how timing may be done by the calling function.
+
+The `BlockTimer` implementation relies upon a static member object of the `Timer` class,
+which behaves as a stopwatch with some memory functions. It is the responsibility of this
+`BlockTimer::timer` object to record time spent in each recorded category. Other `Timer`
+objects may be created for local timing purposes, but these will not count toward time
+reported at the end of a log file or in the metadata JSON.
+
+## Testing
+
+We use [Catch2](https://github.com/catchorg/Catch2) to perform unit testing of the [libCEED]
+(https://libceed.org/en/latest/) integration in Palace against the legacy MFEM assembly
+routines. The unit tests source code is located in the [`test/unit/`]
+(https://github.com/awslabs/palace/blob/main/test/unit/) directory, and can be built from
+within the *Palace* build directory using `make unit-tests`, or from the superbuild as
+`make palace-tests`. The unit tests can be accelerated using MPI and/or OpenMP parallelism
+(when configured with `PALACE_WITH_OPENMP=ON`), but in all cases they are only testing the
+local operator assembly on each process. The 2D and 3D sample meshes in [`test/unit/mesh/`]
+(https://github.com/awslabs/palace/blob/main/test/unit/mesh/) come from the
+[MFEM repository](https://github.com/mfem/mfem/tree/master/data).
+ +The unit test application also includes a small number of benchmarks to compare performance +between MFEM's legacy assembly backend, MFEM's partial assembly backend, and the specified +libCEED backend (specified with the `--backend` option, use `-h`/`--help` to list all +command line options for the `unit-tests` executable). These can be run using, for +example: + +```bash +./unit-tests "[Benchmark]" --benchmark-samples 10 +``` + +The unit tests are run automatically as part of the project's continuous integration (CI) +workflows. Also run as part of the CI are regression tests based on the provided example +applications in the [`examples/`](https://github.com/awslabs/palace/blob/main/examples/) +directory. These are executed based on the code in [`test/examples/`] +(https://github.com/awslabs/palace/blob/main/test/examples/). + +## Changelog + +Code contributions should generally be accompanied by an entry in the [changelog] +(https://github.com/awslabs/palace/blob/main/CHANGELOG.md). diff --git a/docs/src/examples/cavity.md b/docs/src/examples/cavity.md index e84f8b74f8..f7d959d8f0 100644 --- a/docs/src/examples/cavity.md +++ b/docs/src/examples/cavity.md @@ -1,232 +1,232 @@ -```@raw html - - -``` - -# Eigenmodes of a Cylindrical Cavity - -!!! note - - The files for this example can be found in the [`examples/cavity/`] - (https://github.com/awslabs/palace/blob/main/examples/cavity) directory of the *Palace* - source code. - -This example demonstrates *Palace*'s eigenmode simulation type to solve for the lowest -frequency modes of a cylindrical cavity resonator. In particular, we consider a cylindrical -cavity filled with Teflon (``\varepsilon_r = 2.08``, -``\tan\delta = 4\times 10^{-4}``), with radius ``a = 2.74\text{ cm}`` and height -``d = 2a``. From [[1]](#References), the frequencies of the ``\text{TE}_{nml}`` and -``\text{TM}_{nml}`` modes are given by - -```math -\begin{aligned} -f_{\text{TE},nml} &= \frac{1}{2\pi\sqrt{\mu\varepsilon}} - \sqrt{\left(\frac{p'_{nm}}{a}\right)^2 + - \left(\frac{l\pi}{d}\right)^2} \\ -f_{\text{TM},nml} &= \frac{1}{2\pi\sqrt{\mu\varepsilon}} - \sqrt{\left(\frac{p_{nm}}{a}\right)^2 + - \left(\frac{l\pi}{d}\right)^2} \\ -\end{aligned} -``` - -where ``p_{nm}`` and ``p'_{nm}`` denote the ``m``-th root (``m\geq 1``) of the ``n``-th -order Bessel function (``n\geq 0``) of the first kind, ``J_n``, and its derivative, -``J'_n``, respectively. - -In addition, we have analytic expressions for the unloaded quality factors due to dielectric -loss, ``Q_d``, and imperfectly conducting walls, ``Q_c``. In particular, - -```math -Q_d = \frac{1}{\tan\delta} -``` - -and, for a surface resistance ``R_s``, - -```math -Q_c = \frac{(ka)^3\eta ad}{4(p'_{nm})^2 R_s} - \left[1-\left(\frac{n}{p'_{nm}}\right)^2\right] - \left\{\frac{ad}{2} - \left[1+\left(\frac{\beta an}{(p'_{nm})^2}\right)^2\right] + - \left(\frac{\beta a^2}{p'_{nm}}\right)^2 - \left(1-\frac{n^2}{(p'_{nm})^2}\right)\right\}^{-1} -``` - -where ``k=\omega\sqrt{\mu\varepsilon}``, ``\eta=\sqrt{\mu/\varepsilon}``, and -``\beta=l\pi/d``. - -The initial Gmsh mesh for this problem, from [`mesh/cavity.msh`] -(https://github.com/awslabs/palace/blob/main/examples/cavity/mesh/cavity.msh), is shown -below. We use quadratic triangular prism elements. - -```@raw html -

- -


-``` - -There are two configuration files for this problem, [`cavity_pec.json`] -(https://github.com/awslabs/palace/blob/main/examples/cavity/cavity_pec.json) and -[`cavity_impedance.json`] -(https://github.com/awslabs/palace/blob/main/examples/cavity/cavity_impedance.json). - -In both, the [`config["Problem"]["Type"]`](../config/problem.md#config%5B%22Problem%22%5D) - -field is set to `"Eigenmode"`, and we use the mesh shown above with a single level of -uniform mesh refinement (`"UniformLevels": 1`). The material properties for Teflon are -entered under [`config["Domains"]["Materials"]`] -(../config/domains.md#domains%5B%22Materials%22%5D). The -[`config["Domains"]["Postprocessing"]["Dielectric]"`] -(../config/domains.md#domains["Postprocessing"]["Dielectric"]) object is used to extract the -quality factor due to bulk dielectric loss; in this problem since there is only one domain -this is trivial, but in problems with multiple material domains this feature can be used to -isolate the energy-participation ratio (EPR) and associated quality factor due to different -domains in the model. - -The only difference between the two configuration files is in the `"Boundaries"` object: -`cavity_pec.json` prescribes a perfect electric conductor (`"PEC"`) boundary condition to -the cavity boundary surfaces, while `cavity_impedance.json` prescribes a surface impedance -condition with the surface resistance ``R_s = 0.0184\text{ }\Omega\text{/sq}``, for copper -at ``5\text{ GHz}``. - -In both cases, we configure the eigenvalue solver to solve for the ``15`` lowest frequency -modes above ``2.0\text{ GHz}`` (the dominant mode frequencies for both the -``\text{TE}`` and ``\text{TM}`` cases fall around ``2.9\text{ GHz}`` frequency for this -problem). A sparse direct solver is used for the solutions of the linear system resulting -from the spatial discretization of the governing equations, using in this case a second- -order finite element space. - -The frequencies for the lowest order ``\text{TE}`` and ``\text{TM}`` modes computed using -the above formula for this problem are listed in the table below. - -| ``(n,m,l)`` | ``f_{\text{TE}}`` | ``f_{\text{TM}}`` | -|:----------- | -----------------------:| -----------------------:| -| ``(0,1,0)`` | ---- | ``2.903605\text{ GHz}`` | -| ``(1,1,0)`` | ---- | ``4.626474\text{ GHz}`` | -| ``(2,1,0)`` | ---- | ``6.200829\text{ GHz}`` | -| ``(0,1,1)`` | ``5.000140\text{ GHz}`` | ``3.468149\text{ GHz}`` | -| ``(1,1,1)`` | ``2.922212\text{ GHz}`` | ``5.000140\text{ GHz}`` | -| ``(2,1,1)`` | ``4.146842\text{ GHz}`` | ``6.484398\text{ GHz}`` | -| ``(0,1,2)`` | ``5.982709\text{ GHz}`` | ``4.776973\text{ GHz}`` | -| ``(1,1,2)`` | ``4.396673\text{ GHz}`` | ``5.982709\text{ GHz}`` | -| ``(2,1,2)`` | ``5.290341\text{ GHz}`` | ``7.269033\text{ GHz}`` | - -First, we examine the output of the `cavity_pec.json` simulation. 
The file -`postpro/pec/eig.csv` contains information about the computed eigenfrequencies and -associated quality factors: - -``` - m, Re{f} (GHz), Im{f} (GHz), Q - 1.000000e+00, +2.904507338e+00, +5.809012262e-04, +2.500001089e+03 - 2.000000e+00, +2.922515466e+00, +5.845032101e-04, +2.499999550e+03 - 3.000000e+00, +2.922528546e+00, +5.845057488e-04, +2.499999880e+03 - 4.000000e+00, +3.468921611e+00, +6.937841360e-04, +2.500000721e+03 - 5.000000e+00, +4.147607819e+00, +8.295219962e-04, +2.499998747e+03 - 6.000000e+00, +4.147624590e+00, +8.295263017e-04, +2.499995880e+03 - 7.000000e+00, +4.397698897e+00, +8.795405799e-04, +2.499997775e+03 - 8.000000e+00, +4.397707609e+00, +8.795424791e-04, +2.499997329e+03 - 9.000000e+00, +4.630241197e+00, +9.260492789e-04, +2.499997243e+03 - 1.000000e+01, +4.631850092e+00, +9.263712403e-04, +2.499996752e+03 - 1.100000e+01, +4.778292314e+00, +9.556584905e-04, +2.499999978e+03 - 1.200000e+01, +5.002916952e+00, +1.000583103e-03, +2.500000769e+03 - 1.300000e+01, +5.003637424e+00, +1.000727996e-03, +2.499998774e+03 - 1.400000e+01, +5.005126280e+00, +1.001026744e-03, +2.499996334e+03 - 1.500000e+01, +5.291624557e+00, +1.058325143e-03, +2.499999503e+03 -``` - -Indeed we can find a correspondence between the analytic modes predicted and the solutions -obtained by *Palace*. Since the only source of loss in the simulation is the nonzero -dielectric loss tangent, we have ``Q = Q_d = 1/0.0004 = 2.50\times 10^3`` in all cases. - -Next, we run `cavity_impedance.json`, which adds the surface impedance boundary condition. -Examining `postpro/impedance/eig.csv` we see that the mode frequencies are roughly -unchanged but the quality factors have fallen due to the addition of imperfectly conducting -walls to the model: - -``` - m, Re{f} (GHz), Im{f} (GHz), Q - 1.000000e+00, +2.904507340e+00, +7.086038246e-04, +2.049457910e+03 - 2.000000e+00, +2.922515467e+00, +7.051671704e-04, +2.072214699e+03 - 3.000000e+00, +2.922528546e+00, +7.051734731e-04, +2.072205452e+03 - 4.000000e+00, +3.468921613e+00, +8.640197955e-04, +2.007431854e+03 - 5.000000e+00, +4.147607821e+00, +9.784798616e-04, +2.119414052e+03 - 6.000000e+00, +4.147624591e+00, +9.784941280e-04, +2.119391720e+03 - 7.000000e+00, +4.397698899e+00, +1.000289498e-03, +2.198213128e+03 - 8.000000e+00, +4.397707610e+00, +1.000292504e-03, +2.198210877e+03 - 9.000000e+00, +4.630241200e+00, +1.054149598e-03, +2.196197451e+03 - 1.000000e+01, +4.631850095e+00, +1.054707045e-03, +2.195799411e+03 - 1.100000e+01, +4.778292317e+00, +1.126015851e-03, +2.121769621e+03 - 1.200000e+01, +5.002916951e+00, +1.085882618e-03, +2.303617807e+03 - 1.300000e+01, +5.003637428e+00, +1.171361603e-03, +2.135821061e+03 - 1.400000e+01, +5.005126284e+00, +1.171895768e-03, +2.135482762e+03 - 1.500000e+01, +5.291624560e+00, +1.207338551e-03, +2.191441950e+03 -``` - -However, the bulk dielectric loss postprocessing results, written to -`postpro/impedance/domain-Q.csv`, still give ``Q_d = 2.50\times 10^3`` for every mode as -expected. - -Focusing on the ``\text{TE}_{011}`` mode with ``f_{\text{TE},010} = 5.00\text{ GHz}``, we -can read the mode quality factor ``Q = 2.30\times 10^3``. Subtracting out the contribution -of dielectric losses, we have - -```math -Q_c = \left(\frac{1}{Q}-\frac{1}{Q_d}\right)^{-1} = 2.93\times 10^4 -``` - -which agrees very closely to the analytical result of ``Q_c = 2.94\times 10^4`` -given in Example 6.4 from [[1]](#References) for this geometry. 
- -Finally, a clipped view of the electric field (left) and magnetic flux density magnitudes -for the ``\text{TE}_{011}`` mode is shown below. - -```@raw html -

- - -

-``` - -## Mesh convergence - -The effect of mesh size can be investigated for the cylindrical cavity resonator using -[`convergence_study.jl`] -(https://github.com/awslabs/palace/blob/main/examples/cavity/convergence_study.jl). For -a polynomial order of solution and refinement level, a mesh is generated using Gmsh using -polynomials of the same order to resolve the boundary geometry. The eigenvalue problem is -then solved for ``f_{\text{TM},010}`` and ``f_{\text{TE},111}``, and the relative error, -``\frac{f-f_{\text{true}}}{f_{\text{true}}}``, of each mode plotted against -``\text{DOF}^{-\frac{1}{3}}``, a notional mesh size. Three different element types are -considered: tetrahedra, prisms and hexahedra, and the results are plotted below. The -``x``-axis is a notional measure of the overall cost of the solve, accounting for -polynomial order. - -```@raw html -

- -


-``` - -```@raw html -

- -


-``` - -```@raw html -

- -


-``` - -The observed rate of convergence for the eigenvalues are ``p+1`` for odd polynomials and -``p+2`` for even polynomials. Given the eigenmodes are analytic functions, the theoretical -maximum convergence rate is ``2p`` [[2]](#References). The figures demonstrate that -increasing the polynomial order of the solution will give reduced error, however the effect -may only become significant on sufficiently refined meshes. - -## References - -[1] D. M. Pozar, _Microwave Engineering_, Wiley, Hoboken, NJ, 2012.\ -[2] A. Buffa, P. Houston, I. Perugia, _Discontinuous Galerkin computation of the Maxwell -eigenvalues on simplicial meshes_, Journal of Computational and Applied Mathematics 204 -(2007) 317-333. +```@raw html + + +``` + +# Eigenmodes of a Cylindrical Cavity + +!!! note + + The files for this example can be found in the [`examples/cavity/`] + (https://github.com/awslabs/palace/blob/main/examples/cavity) directory of the *Palace* + source code. + +This example demonstrates *Palace*'s eigenmode simulation type to solve for the lowest +frequency modes of a cylindrical cavity resonator. In particular, we consider a cylindrical +cavity filled with Teflon (``\varepsilon_r = 2.08``, +``\tan\delta = 4\times 10^{-4}``), with radius ``a = 2.74\text{ cm}`` and height +``d = 2a``. From [[1]](#References), the frequencies of the ``\text{TE}_{nml}`` and +``\text{TM}_{nml}`` modes are given by + +```math +\begin{aligned} +f_{\text{TE},nml} &= \frac{1}{2\pi\sqrt{\mu\varepsilon}} + \sqrt{\left(\frac{p'_{nm}}{a}\right)^2 + + \left(\frac{l\pi}{d}\right)^2} \\ +f_{\text{TM},nml} &= \frac{1}{2\pi\sqrt{\mu\varepsilon}} + \sqrt{\left(\frac{p_{nm}}{a}\right)^2 + + \left(\frac{l\pi}{d}\right)^2} \\ +\end{aligned} +``` + +where ``p_{nm}`` and ``p'_{nm}`` denote the ``m``-th root (``m\geq 1``) of the ``n``-th +order Bessel function (``n\geq 0``) of the first kind, ``J_n``, and its derivative, +``J'_n``, respectively. + +In addition, we have analytic expressions for the unloaded quality factors due to dielectric +loss, ``Q_d``, and imperfectly conducting walls, ``Q_c``. In particular, + +```math +Q_d = \frac{1}{\tan\delta} +``` + +and, for a surface resistance ``R_s``, + +```math +Q_c = \frac{(ka)^3\eta ad}{4(p'_{nm})^2 R_s} + \left[1-\left(\frac{n}{p'_{nm}}\right)^2\right] + \left\{\frac{ad}{2} + \left[1+\left(\frac{\beta an}{(p'_{nm})^2}\right)^2\right] + + \left(\frac{\beta a^2}{p'_{nm}}\right)^2 + \left(1-\frac{n^2}{(p'_{nm})^2}\right)\right\}^{-1} +``` + +where ``k=\omega\sqrt{\mu\varepsilon}``, ``\eta=\sqrt{\mu/\varepsilon}``, and +``\beta=l\pi/d``. + +The initial Gmsh mesh for this problem, from [`mesh/cavity.msh`] +(https://github.com/awslabs/palace/blob/main/examples/cavity/mesh/cavity.msh), is shown +below. We use quadratic triangular prism elements. + +```@raw html +
+``` + +There are two configuration files for this problem, [`cavity_pec.json`] +(https://github.com/awslabs/palace/blob/main/examples/cavity/cavity_pec.json) and +[`cavity_impedance.json`] +(https://github.com/awslabs/palace/blob/main/examples/cavity/cavity_impedance.json). + +In both, the [`config["Problem"]["Type"]`](../config/problem.md#config%5B%22Problem%22%5D) + +field is set to `"Eigenmode"`, and we use the mesh shown above with a single level of +uniform mesh refinement (`"UniformLevels": 1`). The material properties for Teflon are +entered under [`config["Domains"]["Materials"]`] +(../config/domains.md#domains%5B%22Materials%22%5D). The +[`config["Domains"]["Postprocessing"]["Dielectric]"`] +(../config/domains.md#domains["Postprocessing"]["Dielectric"]) object is used to extract the +quality factor due to bulk dielectric loss; in this problem since there is only one domain +this is trivial, but in problems with multiple material domains this feature can be used to +isolate the energy-participation ratio (EPR) and associated quality factor due to different +domains in the model. + +The only difference between the two configuration files is in the `"Boundaries"` object: +`cavity_pec.json` prescribes a perfect electric conductor (`"PEC"`) boundary condition to +the cavity boundary surfaces, while `cavity_impedance.json` prescribes a surface impedance +condition with the surface resistance ``R_s = 0.0184\text{ }\Omega\text{/sq}``, for copper +at ``5\text{ GHz}``. + +In both cases, we configure the eigenvalue solver to solve for the ``15`` lowest frequency +modes above ``2.0\text{ GHz}`` (the dominant mode frequencies for both the +``\text{TE}`` and ``\text{TM}`` cases fall around ``2.9\text{ GHz}`` frequency for this +problem). A sparse direct solver is used for the solutions of the linear system resulting +from the spatial discretization of the governing equations, using in this case a second- +order finite element space. + +The frequencies for the lowest order ``\text{TE}`` and ``\text{TM}`` modes computed using +the above formula for this problem are listed in the table below. + +| ``(n,m,l)`` | ``f_{\text{TE}}`` | ``f_{\text{TM}}`` | +|:----------- | -----------------------:| -----------------------:| +| ``(0,1,0)`` | ---- | ``2.903605\text{ GHz}`` | +| ``(1,1,0)`` | ---- | ``4.626474\text{ GHz}`` | +| ``(2,1,0)`` | ---- | ``6.200829\text{ GHz}`` | +| ``(0,1,1)`` | ``5.000140\text{ GHz}`` | ``3.468149\text{ GHz}`` | +| ``(1,1,1)`` | ``2.922212\text{ GHz}`` | ``5.000140\text{ GHz}`` | +| ``(2,1,1)`` | ``4.146842\text{ GHz}`` | ``6.484398\text{ GHz}`` | +| ``(0,1,2)`` | ``5.982709\text{ GHz}`` | ``4.776973\text{ GHz}`` | +| ``(1,1,2)`` | ``4.396673\text{ GHz}`` | ``5.982709\text{ GHz}`` | +| ``(2,1,2)`` | ``5.290341\text{ GHz}`` | ``7.269033\text{ GHz}`` | + +First, we examine the output of the `cavity_pec.json` simulation. 
The file +`postpro/pec/eig.csv` contains information about the computed eigenfrequencies and +associated quality factors: + +``` + m, Re{f} (GHz), Im{f} (GHz), Q + 1.000000e+00, +2.904507338e+00, +5.809012262e-04, +2.500001089e+03 + 2.000000e+00, +2.922515466e+00, +5.845032101e-04, +2.499999550e+03 + 3.000000e+00, +2.922528546e+00, +5.845057488e-04, +2.499999880e+03 + 4.000000e+00, +3.468921611e+00, +6.937841360e-04, +2.500000721e+03 + 5.000000e+00, +4.147607819e+00, +8.295219962e-04, +2.499998747e+03 + 6.000000e+00, +4.147624590e+00, +8.295263017e-04, +2.499995880e+03 + 7.000000e+00, +4.397698897e+00, +8.795405799e-04, +2.499997775e+03 + 8.000000e+00, +4.397707609e+00, +8.795424791e-04, +2.499997329e+03 + 9.000000e+00, +4.630241197e+00, +9.260492789e-04, +2.499997243e+03 + 1.000000e+01, +4.631850092e+00, +9.263712403e-04, +2.499996752e+03 + 1.100000e+01, +4.778292314e+00, +9.556584905e-04, +2.499999978e+03 + 1.200000e+01, +5.002916952e+00, +1.000583103e-03, +2.500000769e+03 + 1.300000e+01, +5.003637424e+00, +1.000727996e-03, +2.499998774e+03 + 1.400000e+01, +5.005126280e+00, +1.001026744e-03, +2.499996334e+03 + 1.500000e+01, +5.291624557e+00, +1.058325143e-03, +2.499999503e+03 +``` + +Indeed we can find a correspondence between the analytic modes predicted and the solutions +obtained by *Palace*. Since the only source of loss in the simulation is the nonzero +dielectric loss tangent, we have ``Q = Q_d = 1/0.0004 = 2.50\times 10^3`` in all cases. + +Next, we run `cavity_impedance.json`, which adds the surface impedance boundary condition. +Examining `postpro/impedance/eig.csv` we see that the mode frequencies are roughly +unchanged but the quality factors have fallen due to the addition of imperfectly conducting +walls to the model: + +``` + m, Re{f} (GHz), Im{f} (GHz), Q + 1.000000e+00, +2.904507340e+00, +7.086038246e-04, +2.049457910e+03 + 2.000000e+00, +2.922515467e+00, +7.051671704e-04, +2.072214699e+03 + 3.000000e+00, +2.922528546e+00, +7.051734731e-04, +2.072205452e+03 + 4.000000e+00, +3.468921613e+00, +8.640197955e-04, +2.007431854e+03 + 5.000000e+00, +4.147607821e+00, +9.784798616e-04, +2.119414052e+03 + 6.000000e+00, +4.147624591e+00, +9.784941280e-04, +2.119391720e+03 + 7.000000e+00, +4.397698899e+00, +1.000289498e-03, +2.198213128e+03 + 8.000000e+00, +4.397707610e+00, +1.000292504e-03, +2.198210877e+03 + 9.000000e+00, +4.630241200e+00, +1.054149598e-03, +2.196197451e+03 + 1.000000e+01, +4.631850095e+00, +1.054707045e-03, +2.195799411e+03 + 1.100000e+01, +4.778292317e+00, +1.126015851e-03, +2.121769621e+03 + 1.200000e+01, +5.002916951e+00, +1.085882618e-03, +2.303617807e+03 + 1.300000e+01, +5.003637428e+00, +1.171361603e-03, +2.135821061e+03 + 1.400000e+01, +5.005126284e+00, +1.171895768e-03, +2.135482762e+03 + 1.500000e+01, +5.291624560e+00, +1.207338551e-03, +2.191441950e+03 +``` + +However, the bulk dielectric loss postprocessing results, written to +`postpro/impedance/domain-Q.csv`, still give ``Q_d = 2.50\times 10^3`` for every mode as +expected. + +Focusing on the ``\text{TE}_{011}`` mode with ``f_{\text{TE},010} = 5.00\text{ GHz}``, we +can read the mode quality factor ``Q = 2.30\times 10^3``. Subtracting out the contribution +of dielectric losses, we have + +```math +Q_c = \left(\frac{1}{Q}-\frac{1}{Q_d}\right)^{-1} = 2.93\times 10^4 +``` + +which agrees very closely to the analytical result of ``Q_c = 2.94\times 10^4`` +given in Example 6.4 from [[1]](#References) for this geometry. 
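+
+As a quick cross-check (not part of the example files), the following Julia snippet
+evaluates the analytic frequency expressions above with the Bessel function roots entered
+by hand, and separates ``Q_c`` from the total quality factor of the
+``\approx 5.00\text{ GHz}`` mode reported in `postpro/impedance/eig.csv`:
+
+```julia
+# Standalone check of the analytic expressions above (not part of the example files).
+# Bessel roots are entered by hand: p01 is the first root of J_0, p01_prime of J_0'.
+c0 = 299792458.0      # speed of light (m/s)
+ε_r = 2.08            # Teflon relative permittivity
+a = 2.74e-2           # cavity radius (m)
+d = 2 * a             # cavity height (m)
+p01 = 2.404826        # first root of J_0 (TM_010)
+p01_prime = 3.831706  # first root of J_0' (TE_011)
+
+f_TM010 = c0 / (2π * sqrt(ε_r)) * (p01 / a)                            # ≈ 2.904e9 Hz
+f_TE011 = c0 / (2π * sqrt(ε_r)) * sqrt((p01_prime / a)^2 + (π / d)^2)  # ≈ 5.000e9 Hz
+
+Q = 2.303617807e3     # total Q of the ≈ 5.00 GHz mode with the impedance boundary
+Qd = 2.5e3            # bulk dielectric Q = 1 / tan δ
+Qc = 1 / (1 / Q - 1 / Qd)                                              # ≈ 2.93e4
+```
+
+The two frequencies reproduce the tabulated values above to within the precision of the
+hardcoded roots, and ``Q_c`` matches the value quoted from [[1]](#References).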
+
+Finally, a clipped view of the electric field (left) and magnetic flux density (right)
+magnitudes for the ``\text{TE}_{011}`` mode is shown below.
+
+```@raw html
+```
+
+## Mesh convergence
+
+The effect of mesh size can be investigated for the cylindrical cavity resonator using
+[`convergence_study.jl`](https://github.com/awslabs/palace/blob/main/examples/cavity/convergence_study.jl).
+For a given polynomial order of the solution and refinement level, a mesh is generated with
+Gmsh using polynomials of the same order to resolve the boundary geometry. The eigenvalue
+problem is then solved for ``f_{\text{TM},010}`` and ``f_{\text{TE},111}``, and the relative
+error, ``\frac{f-f_{\text{true}}}{f_{\text{true}}}``, of each mode is plotted against
+``\text{DOF}^{-\frac{1}{3}}``, a notional mesh size. Three different element types are
+considered: tetrahedra, prisms, and hexahedra, and the results are plotted below. The
+``x``-axis is a notional measure of the overall cost of the solve, accounting for
+polynomial order.
+
+```@raw html
+``` + +```@raw html +
+``` + +```@raw html +
+``` + +The observed rate of convergence for the eigenvalues are ``p+1`` for odd polynomials and +``p+2`` for even polynomials. Given the eigenmodes are analytic functions, the theoretical +maximum convergence rate is ``2p`` [[2]](#References). The figures demonstrate that +increasing the polynomial order of the solution will give reduced error, however the effect +may only become significant on sufficiently refined meshes. + +## References + +[1] D. M. Pozar, _Microwave Engineering_, Wiley, Hoboken, NJ, 2012.\ +[2] A. Buffa, P. Houston, I. Perugia, _Discontinuous Galerkin computation of the Maxwell +eigenvalues on simplicial meshes_, Journal of Computational and Applied Mathematics 204 +(2007) 317-333. diff --git a/docs/src/examples/coaxial.md b/docs/src/examples/coaxial.md index a9d69251f3..caa1c134d6 100644 --- a/docs/src/examples/coaxial.md +++ b/docs/src/examples/coaxial.md @@ -1,94 +1,93 @@ -```@raw html - - -``` - -# Signal Propagation in a Coaxial Cable - -!!! note - - The files for this example can be found in the [`examples/coaxial/`] - (https://github.com/awslabs/palace/blob/main/examples/coaxial) directory of the *Palace* - source code. - -*Palace* can perform transient electromagnetic modeling, acting as a so-called finite -element time domain (FETD) solver. To demonstrate this feature, we consider here the -propagation of an electromagnetic pulse through a section of coaxial cable. The model is -constructed based on a ``50\text{ }\Omega`` RG-401/U coaxial cable [[1]] -(#References), with outer and inner conductor diameters of ``0.215\text{ in}`` and -``0.0645\text{ in}``, respectively. The section length is roughly ``1.5\text{ in}``. The -Teflon dielectric material has ``\varepsilon_r = 2.08``, and we consider -``\tan\delta = 4\times 10^{-2}``, a factor of ``100`` above the actual value in order to -exaggerate losses in the transmission line. - -In this example we consider three different configurations of the model, all with a coaxial -lumped port excitation at one end of the line: an open termination at the opposite end -([`coaxial_open.json`] -(https://github.com/awslabs/palace/blob/main/examples/coaxial/coaxial_open.json)), a -shorted termination([`coaxial_short.json`] -(https://github.com/awslabs/palace/blob/main/examples/coaxial/coaxial_short.json)), and a -matched ``50\text{ }\Omega`` lumped port termination ([`coaxial_matched.json`] -(https://github.com/awslabs/palace/blob/main/examples/coaxial/coaxial_matched.json)). - -The mesh is generated using the Julia code in [`mesh/mesh.jl`] -(https://github.com/awslabs/palace/blob/main/examples/coaxial/mesh/mesh.jl) and consists of -quadratically-curved hexahedral elements, as depicted below. Third-order shape functions -are used to approximate the solution. - -```@raw html -
-``` - -Each configuration file sets the simulation `"Type"` to `"Transient"`. The different -termination configurations are specified by using a `"LumpedPort"` with matched impedance -for the matched termination, a `"PEC"` boundary for the shorted termination, leaving no -boundary condition specified for the open termination. This last case applies the natural -boundary condition for the finite element formulation which is a perfect magnetic conductor -boundary condition, enforcing zero tangential magnetic field and thus zero surface current -density. - -The excitation pulse is configured under [`config["Solver"]["Transient"]`] -(../config/solver.md#solver["Transient"]). Here, we use a modulated Gaussian pulse shape, -with time dependence given by the expression - -```math -g(t) = \sin{\left[\omega(t-t_0)\right]} e^{-\frac{(t-t_0)^2}{2\tau}^2} \,. -``` - -For this simulation, we use a center frequency ``f = \omega/2\pi = 10\text{ GHz}`` and pulse -width ``\tau = 0.05\text{ ns}``. The offset ``t_0`` is automatically chosen by *Palace* in -order to smoothly ramp up the excitation from the rest initial condition. Time integration -uses the second-order implicit Generalized-``\alpha`` scheme with a uniform time step -``\Delta t = 5\times 10^{-3}\text{ ns}``, and the solution is computed for the interval -``t\in[0.0,1.0]\text{ ns}``. The electric and magnetic field solutions are sampled every -``10`` time steps for visualization. - -Below, we plot the time histories of the port voltage at the excited coaxial lumped port for -the three simulation cases. - -```@raw html -
-``` - -We can observe that as expected, the matched termination absorbs the incident waveform -nearly perfectly, while it is reflected with the same polarity for the shorted termination -and opposite polarity for the open termination (phase shifted by ``\pi``). Furthermore, the -reflected wave is noticably attenuated due to the material loss of the transmission line -dielectric. - -Lastly, an animation of the signal propagation for the matched (left) and shorted -(right) simulations, constructed using the saved fields, is shown below. - -```@raw html -
-``` - -## References - -[1] D. M. Pozar, _Microwave Engineering_, Wiley, Hoboken, NJ, 2012. +```@raw html + + +``` + +# Signal Propagation in a Coaxial Cable + +!!! note + + The files for this example can be found in the + [`examples/coaxial/`](https://github.com/awslabs/palace/blob/main/examples/coaxial) + directory of the *Palace* source code. + +*Palace* can perform transient electromagnetic modeling, acting as a so-called finite +element time domain (FETD) solver. To demonstrate this feature, we consider here the +propagation of an electromagnetic pulse through a section of coaxial cable. The model is +constructed based on a ``50\text{ }\Omega`` RG-401/U coaxial cable [[1]](#References), with +outer and inner conductor diameters of ``0.215\text{ in}`` and ``0.0645\text{ in}``, +respectively. The section length is roughly ``1.5\text{ in}``. The Teflon dielectric +material has ``\varepsilon_r = 2.08``, and we consider ``\tan\delta = 4\times 10^{-2}``, a +factor of ``100`` above the actual value in order to exaggerate losses in the transmission +line. + +In this example we consider three different configurations of the model, all with a coaxial +lumped port excitation at one end of the line: an open termination at the opposite end +([`coaxial_open.json`](https://github.com/awslabs/palace/blob/main/examples/coaxial/coaxial_open.json)), +a shorted termination +([`coaxial_short.json`](https://github.com/awslabs/palace/blob/main/examples/coaxial/coaxial_short.json)), +and a matched ``50\text{ }\Omega`` lumped port termination +([`coaxial_matched.json`](https://github.com/awslabs/palace/blob/main/examples/coaxial/coaxial_matched.json)). + +The mesh is generated using the Julia code in +[`mesh/mesh.jl`](https://github.com/awslabs/palace/blob/main/examples/coaxial/mesh/mesh.jl) +and consists of quadratically-curved hexahedral elements, as depicted below. Third-order +shape functions are used to approximate the solution. + +```@raw html +
+``` + +Each configuration file sets the simulation `"Type"` to `"Transient"`. The different +termination configurations are specified by using a `"LumpedPort"` with matched impedance +for the matched termination, a `"PEC"` boundary for the shorted termination, leaving no +boundary condition specified for the open termination. This last case applies the natural +boundary condition for the finite element formulation which is a perfect magnetic conductor +boundary condition, enforcing zero tangential magnetic field and thus zero surface current +density. + +The excitation pulse is configured under +[`config["Solver"]["Transient"]`](../config/solver.md#solver%5B%22Transient%22%5D). Here, we +use a modulated Gaussian pulse shape, with time dependence given by the expression + +```math +g(t) = \sin{\left[\omega(t-t_0)\right]} e^{-\frac{(t-t_0)^2}{2\tau^2}} \,. +``` + +For this simulation, we use a center frequency ``f = \omega/2\pi = 10\text{ GHz}`` and pulse +width ``\tau = 0.05\text{ ns}``. The offset ``t_0`` is automatically chosen by *Palace* in +order to smoothly ramp up the excitation from the rest initial condition. Time integration +uses the second-order implicit Generalized-``\alpha`` scheme with a uniform time step +``\Delta t = 5\times 10^{-3}\text{ ns}``, and the solution is computed for the interval +``t\in[0.0,1.0]\text{ ns}``. The electric and magnetic field solutions are sampled every +``10`` time steps for visualization. + +Below, we plot the time histories of the port voltage at the excited coaxial lumped port for +the three simulation cases. + +```@raw html +
+``` + +We can observe that as expected, the matched termination absorbs the incident waveform +nearly perfectly, while it is reflected with the same polarity for the shorted termination +and opposite polarity for the open termination (phase shifted by ``\pi``). Furthermore, the +reflected wave is noticeably attenuated due to the material loss of the transmission line +dielectric. + +Lastly, an animation of the signal propagation for the matched (left) and shorted +(right) simulations, constructed using the saved fields, is shown below. + +```@raw html +
+``` + +## References + +[1] D. M. Pozar, _Microwave Engineering_, Wiley, Hoboken, NJ, 2012. diff --git a/docs/src/examples/cpw.md b/docs/src/examples/cpw.md index a6369ca584..dda9c5d690 100644 --- a/docs/src/examples/cpw.md +++ b/docs/src/examples/cpw.md @@ -1,123 +1,133 @@ -```@raw html - - -``` - -# Crosstalk Between Coplanar Waveguides - -!!! note - - The files for this example can be found in the [`examples/cpw/`] - (https://github.com/awslabs/palace/blob/main/examples/cpw) directory of the *Palace* - source code. - -In this example, we construct a frequency domain model to analyze the wave transmission, -reflection, near-end crosstalk, and far-end crosstalk for a four-port system comprised of -two side-by-side coplanar waveguides (CPW). Each CPW is characterized by a trace width -``w = 30\text{ μm}`` and gap width ``s = 18\text{ μm}``. The metal is modeled as an -infinitely thin, perfectly conducting boundary surface on top of a sapphire dielectric -substrate (parallel to C-axis: ``\varepsilon_r = 11.5``, -``\tan\delta = 8.6\times 10^{-5}``, perpendicular to C-axis: ``\varepsilon_r = 9.3``, -``\tan\delta = 3.0\times 10^{-5}``) of ``500\text{ μm}`` thickness with the -C-axis in the z-direction. This yields a characteristic impedance -``Z_0 = 56.02\text{ }\Omega`` for each of the lines [[1]](#References). The center-to-center -separating distance between the transmission lines on the substrate is ``266\text{ μm}``, -which means there is exactly ``200\text{ μm}`` of ground plane between them. - -A visualization of the computational domain is shown below. - -```@raw html -
-``` - -There are two different options for modeling the termination at the ends of the CPW: - - - Lumped port: A multielement uniform lumped port can be used to terminate the CPW by - connecting the center conductor to the ground plane on each side with impedance - ``Z = 2Z_0``. - - Wave port: We can solve a 2D boundary eigenvalue problem for the mode shape and - propagation constants for the characteristic CPW mode, and use this to terminate the - transmission line. - -Views of the mesh boundaries for these two configurations are shown below. In both cases the -computational domain is discretized using an unstructured tetrahedral mesh. The mesh -files are [mesh/cpw_wave.msh] -(https://github.com/awslabs/palace/blob/main/examples/cpw/mesh/cpw_wave.msh) and -[mesh/cpw_lumped.msh] -(https://github.com/awslabs/palace/blob/main/examples/cpw/mesh/cpw_lumped.msh), -respectively. - -```@raw html -
-``` - -Likewise, there are two different options for how the system response is calculated over the -desired frequency band: - - - Uniform: Sample the frequency band with the full-fidelity model at equally spaced - frequencies over the desired range. - - Adaptive: Use the full-fidelity model to sample the solution at a few adaptively - selected frequency points in the desired band, and then construct a low-cost surrogate - model which is used to compute the response over the entire band. - -This leads to four possible configurations, for which there are four configuration files in -the example directory: [`cpw_lumped_uniform.json`] -(https://github.com/awslabs/palace/blob/main/examples/cpw/cpw_lumped_uniform.json), -[`cpw_lumped_adaptive.json`] -(https://github.com/awslabs/palace/blob/main/examples/cpw/cpw_lumped_adaptive.json), -[`cpw_wave_uniform.json`] -(https://github.com/awslabs/palace/blob/main/examples/cpw/cpw_wave_uniform.json), -and [`cpw_wave_adaptive.json`] -(https://github.com/awslabs/palace/blob/main/examples/cpw/cpw_wave_adaptive.json). - -The frequency response is computed for the band ``f\in[2.0,30.0]\text{ GHz}``. For the -uniform sweep, a step size of ``\Delta f=2.0\text{ GHz}`` is used, while the adaptive sweep -employs a much finer step size ``\Delta f=0.1\text{ GHz}``. The adaptive fast frequency -sweep algorithm is given a tolerance of ``1\times10^{-3}`` for choosing the sampling -points; the simulation with uniform ports uses ``9`` frequency samples and that with wave -ports uses ``10``. Despite the much finer frequency resolution, the adaptive frequency -sweep simulations take roughly the same amount of time as the uniform ones where the -resulting resolution is worse by a factor of ``20``. Lastly, for all simulations, a single -level of uniform mesh refinement is applied to the initial mesh and a first-order finite -element approximation for the solution is used. - -The results from the four different simulations are presented in the plots below. - -```@raw html -
-``` - -The first remark is that in both the lumped port and wave port cases, the adaptive fast -frequency sweep results are very close to the true solutions sampled by the uniform -sweeps. - -Second, there is a discrepancy between the results using lumped ports and those with wave -ports, namely the lumped port excitation exhibits much higher reflection that that for wave -ports. This can be attributed to the coarse meshes used for these examples. Indeed, -refining the mesh or increasing the order of the solution approximation resolves this issue -and leads to better agreement between the lumped port and wave port results. See below for -the results with again a single level of mesh refinement but ``p = 2`` for the order of the -solution space. - -```@raw html -
-``` - -## References - -[1] H. J. Visser, _Antenna Theory and Applications_, Wiley, Hoboken, NJ, 2012. +```@raw html + + +``` + +# Crosstalk Between Coplanar Waveguides + +!!! note + + The files for this example can be found in the + [`examples/cpw/`](https://github.com/awslabs/palace/blob/main/examples/cpw) + directory of the *Palace* source code. + +In this example, we construct a frequency domain model to analyze the wave transmission, +reflection, near-end crosstalk, and far-end crosstalk for a four-port system comprised of +two side-by-side coplanar waveguides (CPW). Each CPW is characterized by a trace width +``w = 30\text{ μm}`` and gap width ``s = 18\text{ μm}``. The metal is modeled as an +infinitely thin, perfectly conducting boundary surface on top of a sapphire dielectric +substrate (parallel to C-axis: ``\varepsilon_r = 11.5``, +``\tan\delta = 8.6\times 10^{-5}``, perpendicular to C-axis: ``\varepsilon_r = 9.3``, +``\tan\delta = 3.0\times 10^{-5}``) of ``500\text{ μm}`` thickness with the +C-axis in the z-direction. This yields a characteristic impedance +``Z_0 = 56.02\text{ }\Omega`` for each of the lines [[1]](#References). The center-to-center +separating distance between the transmission lines on the substrate is ``266\text{ μm}``, +which means there is exactly ``200\text{ μm}`` of ground plane between them. + +A visualization of the computational domain is shown below. + +```@raw html +
+``` + +There are two different options for modeling the termination at the ends of the CPW: + + - Lumped port: A multielement uniform lumped port can be used to terminate the CPW by + connecting the center conductor to the ground plane on each side with impedance + ``Z = 2Z_0``. + - Wave port: We can solve a 2D boundary eigenvalue problem for the mode shape and + propagation constants for the characteristic CPW mode, and use this to terminate the + transmission line. + +Views of the mesh boundaries for these two configurations are shown below. In both cases the +computational domain is discretized using an unstructured tetrahedral mesh. The mesh files +are +[`mesh/cpw_wave_0.msh`](https://github.com/awslabs/palace/blob/main/examples/cpw/mesh/cpw_wave_0.msh) +and +[`mesh/cpw_lumped_0.msh`](https://github.com/awslabs/palace/blob/main/examples/cpw/mesh/cpw_lumped_0.msh), +respectively. In addition, this example includes two mesh files which include the thickness +of the metal trace: +[`mesh/cpw_wave.msh`](https://github.com/awslabs/palace/blob/main/examples/cpw/mesh/cpw_wave.msh) +and +[`mesh/cpw_lumped.msh`](https://github.com/awslabs/palace/blob/main/examples/cpw/mesh/cpw_lumped.msh). + +```@raw html +
+``` + +Likewise, there are two different options for how the system response is calculated over the +desired frequency band: + + - Uniform: Sample the frequency band with the full-fidelity model at equally spaced + frequencies over the desired range. + - Adaptive: Use the full-fidelity model to sample the solution at a few adaptively + selected frequency points in the desired band, and then construct a low-cost surrogate + model which is used to compute the response over the entire band. + +This leads to four possible configurations, for which there are four configuration files in +the example directory: +[`cpw_lumped_uniform.json`](https://github.com/awslabs/palace/blob/main/examples/cpw/cpw_lumped_uniform.json), +[`cpw_lumped_adaptive.json`](https://github.com/awslabs/palace/blob/main/examples/cpw/cpw_lumped_adaptive.json), +[`cpw_wave_uniform.json`](https://github.com/awslabs/palace/blob/main/examples/cpw/cpw_wave_uniform.json), +and +[`cpw_wave_adaptive.json`](https://github.com/awslabs/palace/blob/main/examples/cpw/cpw_wave_adaptive.json). + +The frequency response is computed for the band ``f\in[2.0,32.0]\text{ GHz}``. For the +uniform sweep, a step size of ``\Delta f=6.0\text{ GHz}`` is used, while the adaptive sweep +employs a much finer step size ``\Delta f=0.1\text{ GHz}``. Additionally both sweeps have an +explicit sample placed at ``17.0\text{ GHz}``. The adaptive fast frequency +sweep algorithm is given a tolerance of ``1\times10^{-3}`` for choosing the sampling +points; the simulation with uniform ports uses ``9`` frequency samples and that with wave +ports uses ``10``. Despite the much finer frequency resolution, the adaptive frequency +sweep simulations take roughly the same amount of time as the uniform ones where the +resulting resolution is worse by a factor of ``20``. Lastly, for all simulations, a +second-order finite element approximation for the solution is used. + +The results from the four different simulations are presented in the plots below. Note that +here, ``\text{dB}`` means ``20\log_{10}(|S_{ij}|)``: + +```@raw html +
+``` + +The first remark is that in both the lumped port and wave port cases, the adaptive fast +frequency sweep results are very close to the true solutions sampled by the uniform +sweeps. + +Second, there is a discrepancy between the results using lumped ports and those with wave +ports, namely the lumped port excitation exhibits much higher reflection than for wave +ports. This is expected when using a lumped port to approximate the termination of a CPW, +and refining the mesh or increasing the order of the solution approximation leads to less +reflection. See below for the results with again ``p = 4`` for the order of the solution +space, effectively doubling the spatial resolution from ``p = 2``. For the adaptive solver +in these plots, we have also reduced the adaptive tolerance to ``1\times10^{-5}`` due to the +small value of ``|S_{41}|``. + +```@raw html +
+``` + +!!! note + + The examples files for uniform sampling in `examples/cpw` actually specify excitations + on two ports ("multi-excitation"). The two excitation are run in sequence during a + single palace simulation. + +## References + +[1] H. J. Visser, _Antenna Theory and Applications_, Wiley, Hoboken, NJ, 2012. diff --git a/docs/src/examples/examples.md b/docs/src/examples/examples.md index 15b75384aa..81d553630f 100644 --- a/docs/src/examples/examples.md +++ b/docs/src/examples/examples.md @@ -1,23 +1,24 @@ -```@raw html - - -``` - -# Overview - -Some examples of using *Palace*, including configuration and mesh files, can be found in -the [`examples/`](https://github.com/awslabs/palace/blob/main/examples) directory of the -source code. The following sections provide complete tutorials for each of the available -example applications. - -These examples are also used by *Palace*'s regression testing suite. See the -[`test/examples/`](https://github.com/awslabs/palace/blob/main/test/examples/) directory for -more details. - -## Contents - - - [Capacitance Matrix for Two Spheres](spheres.md) - - [Inductance Matrix for a Pair of Concentric Rings](rings.md) - - [Eigenmodes of a Cylindrical Cavity](cavity.md) - - [Signal Propagation in a Coaxial Cable](coaxial.md) - - [Crosstalk Between Coplanar Waveguides](cpw.md) +```@raw html + + +``` + +# Overview + +Some examples of using *Palace*, including configuration and mesh files, can be found in +the [`examples/`](https://github.com/awslabs/palace/blob/main/examples) directory of the +source code. The following sections provide complete tutorials for each of the available +example applications. + +These examples are also used by *Palace*'s regression testing suite. See the +[`test/examples/`](https://github.com/awslabs/palace/blob/main/test/examples/) directory for +more details. + +## Contents + + - [Capacitance Matrix for Two Spheres](spheres.md) + - [Inductance Matrix for a Pair of Concentric Rings](rings.md) + - [Dipole Antenna and Radiation Fields](antenna.md) + - [Eigenmodes of a Cylinder](cylinder.md) + - [Signal Propagation in a Coaxial Cable](coaxial.md) + - [Crosstalk Between Coplanar Waveguides](cpw.md) diff --git a/docs/src/examples/rings.md b/docs/src/examples/rings.md index ecfa1cffc8..462a8e6192 100644 --- a/docs/src/examples/rings.md +++ b/docs/src/examples/rings.md @@ -1,115 +1,148 @@ -```@raw html - - -``` - -# Inductance Matrix for a Pair of Concentric Rings - -!!! note - - The files for this example can be found in the [`examples/rings/`] - (https://github.com/awslabs/palace/blob/main/examples/rings) directory of the *Palace* - source code. - -This example seeks to compute the inductance matrix for a system of two concentric -current-carrying rings of radii ``r_a`` and ``r_b``, each with width ``w``. As with the -previous example, the permeability of the surrounding medium is assumed to be the -permeability of free space. The mutual inductance, ``M_{ab}``, can be easily computed for -the case where ``r_a\ll r_b`` and ``w = 0`` using the Biot-Savart law as - -```math -M_{ab} = \frac{\mu_0\pi r_b^2}{2 r_a} \,. -``` - -Analytic expressions for the self inductance of this configuration can also be derived, for -example from [[1]](#References) we have - -```math -\begin{aligned} -M_{aa} &= \mu_0 r_a \left(\log{\frac{16 r_a}{w}}-1.75\right) \\ -M_{bb} &= \mu_0 r_b \left(\log{\frac{16 r_b}{w}}-1.75\right) \,. -\end{aligned} -``` - -We take in this case ``r_a = 10 \text{ μm}``, ``r_b = 100 \text{ μm}``, and -``w = 1 \text{ μm}``. 
The `mesh.jl` script in the [`mesh/`] -(https://github.com/awslabs/palace/blob/main/examples/rings/mesh) directory is used to -generate an unstructured tetrahedral mesh with Gmsh, saved to [`mesh/rings.msh`] -(https://github.com/awslabs/palace/blob/main/examples/rings/mesh/rings.msh), and a depiction -is shown below. - -```@raw html -
-``` - -The configuration file for the *Palace* simulation is [`rings.json`] -(https://github.com/awslabs/palace/blob/main/examples/rings/rings.json). The simulation -`"Type"` is `"Magnetostatic"`, and we add `"SurfaceCurrent"` boundaries for applying a -surface current to drive the inner or outer ring. The rest of the ring boundaries are -labeled as `"PEC"` boundaries, which prescibes a zero magnetic flux, or magnetic -insulation, boundary condition. The farfield is also prescribed the `"PEC"` boundary -condition. We seek a second-order solution and use the geometric multigrid AMS -solver. - -The computed inductance matrix is saved to disk as `postpro/terminal-M.csv`, and below we -show its contents: - -``` - i, M[i][1] (H), M[i][2] (H) - 1.000000e+00, +4.272291158e-11, +1.959927760e-12 - 2.000000e+00, +1.959927760e-12, +7.131293160e-10 -``` - -According to the analytic expressions above, for this geometry we should have - -```math -M_{ab} = 1.973921\text{ pH} -``` - -for the mutual inductance, and - -```math -\begin{aligned} -M_{aa} &= 41.78537\text{ pH}\\ -M_{bb} &= 707.2050\text{ pH} -\end{aligned} -``` - -for the self inductances. Thus, the *Palace* solution has approximately ``0.71\%`` error in -the mutual inductance and ``2.2\%`` and ``0.84\%`` errors in the self inductances versus the -analytic solutions. - -The typical approach used by *Palace* for lumped parameter extraction uses the computed -field energies, but one can also compute the inductance by explicitly integrating the -magnetic flux through a surface and dividing by the excitation current. This is configured -under [`config["Boundaries"]["Postprocessing"]["Inductance"]`] -(../config/boundaries.md#boundaries["Postprocessing"]["Inductance"]) in the configuration -file. The resulting postprocessed values are written to `postpro/surface-M.csv`: - -``` - i, M[1] (H), M[2] (H) - 1.000000e+00, +4.260888637e-11, +1.890068391e-12 - 2.000000e+00, +1.955578068e-12, +7.130510941e-10 -``` - -The values computed using the flux integral method are in close agreement to those above, as -expected. - -Lastly, we visualize the magnitude of the magnetic flux density field for the excitations of -the inner and outer rings. The files for this visualization are again saved to the -`postpro/paraview` directory. - -```@raw html -
-``` - -## References - -[1] M. R. Alizadeh Pahlavani and H. A. Mohammadpour, Inductance comparison of the solenoidal -coil of modular toroidal coils using the analytical and finite element method, Progress in -Electromagnetics Research 20 (2010) 337-352. +```@raw html + + +``` + +```@setup include_example +function include_example_file(example_path, filename) + print(read(joinpath(@__DIR__, "..", "..", "..", "test", "examples", "ref", example_path, filename), String)) +end +``` + +# Inductance Matrix for a Pair of Concentric Rings + +!!! note + + The files for this example can be found in the + [`examples/rings/`](https://github.com/awslabs/palace/blob/main/examples/rings) + directory of the *Palace* source code. + +This example seeks to compute the inductance matrix for a system of two concentric +current-carrying rings of radii ``r_a`` and ``r_b``, each with width ``w``. As with the +previous example, the permeability of the surrounding medium is assumed to be the +permeability of free space. The mutual inductance, ``M_{ab}``, can be easily computed for +the case where ``r_a\ll r_b`` and ``w = 0`` using the Biot-Savart law as + +```math +M_{ab} = \frac{\mu_0\pi r_b^2}{2 r_a} \,. +``` + +Analytic expressions for the self inductance of this configuration can also be derived, for +example from [[1]](#References) we have + +```math +\begin{aligned} +M_{aa} &= \mu_0 r_a \left(\log{\frac{16 r_a}{w}}-1.75\right) \\ +M_{bb} &= \mu_0 r_b \left(\log{\frac{16 r_b}{w}}-1.75\right) \,. +\end{aligned} +``` + +We take in this case ``r_a = 10 \text{ μm}``, ``r_b = 100 \text{ μm}``, and +``w = 1 \text{ μm}``. The `mesh.jl` script in the +[`mesh/`](https://github.com/awslabs/palace/blob/main/examples/rings/mesh) directory is used +to generate an unstructured tetrahedral mesh with Gmsh, saved to +[`mesh/rings.msh`](https://github.com/awslabs/palace/blob/main/examples/rings/mesh/rings.msh), +and a depiction is shown below. + +```@raw html +
+``` + +The configuration file for the *Palace* simulation is +[`rings.json`](https://github.com/awslabs/palace/blob/main/examples/rings/rings.json). The +simulation `"Type"` is `"Magnetostatic"`, and we add `"SurfaceCurrent"` boundaries for +applying a surface current to drive the inner or outer ring. The rest of the ring +boundaries are labeled as `"PEC"` boundaries, which prescribes a zero magnetic flux, or +magnetic insulation, boundary condition. The farfield is also prescribed the `"PEC"` +boundary condition. We seek a second-order solution and use the geometric multigrid AMS +solver. + +The computed inductance matrix is saved to disk as `postpro/terminal-M.csv`, and below we +show its contents: + +```@example include_example +include_example_file("rings", "terminal-M.csv") # hide +``` + +According to the analytic expressions above, for this geometry we should have + +```math +M_{ab} = 1.973921\text{ pH} +``` + +for the mutual inductance, and + +```math +\begin{aligned} +M_{aa} &= 41.78537\text{ pH}\\ +M_{bb} &= 707.2050\text{ pH} +\end{aligned} +``` + +for the self inductances. Thus, the *Palace* solution has percent-level errors +in the self inductances versus the analytic solutions. + +The typical approach used by *Palace* for lumped parameter extraction uses the computed +field energies, but one can also compute the inductance by explicitly integrating the +magnetic flux through a surface and dividing by the excitation current. This is configured +under +[`config["Boundaries"]["Postprocessing"]["Inductance"]`](../config/boundaries.md#boundaries%5B%22Postprocessing%22%5D%5B%22Inductance%22%5D) +in the configuration file. The postprocessed magnetic flux values are written to `postpro/surface-F.csv`: + +```@example include_example +include_example_file("rings", "surface-F.csv") # hide +``` + +Combining with the values in `postpro/terminal-I.csv` we can compute the +inductance matrix in this alternative fashion, + +```@example include_example +include_example_file("rings", "terminal-I.csv") # hide +``` + +we arrive at + +```@example +using DelimitedFiles: readdlm #hide +using Printf #hide +path = joinpath(@__DIR__, "..", "..", "..", "test", "examples", "ref", "rings") #hide +surface_F = readdlm(joinpath(path, "surface-F.csv"), ',', Float64, skipstart=1) #hide +terminal_I = readdlm(joinpath(path, "terminal-I.csv"), ',', Float64, skipstart=1) #hide +result = copy(surface_F) #hide +result[:, 2] ./= terminal_I[:, 2] #hide +result[:, 3] ./= terminal_I[:, 2] #hide +println(" i, M[i][1] (H), M[i][2] (H)") #hide +for i = 1:size(result, 1) #hide + @printf( + " %.2e, %+.12e, %+.12e\n", + result[i, 1], + result[i, 2], + result[i, 3] + ) #hide +end #hide +``` + +The values computed using the flux integral method are in close agreement to +those above, as expected. This method of calculating the inductance matrix +directly from flux values is in general less accurate than using the energy +method, due to convergence properties of finite element functional outputs, but +serves as a validation of the energy calculation. + +Lastly, we visualize the magnitude of the magnetic flux density field for the excitations of +the inner and outer rings. The files for this visualization are again saved to the +`postpro/paraview` directory. + +```@raw html +
+``` + +## References + +[1] M. R. Alizadeh Pahlavani and H. A. Mohammadpour, Inductance comparison of the solenoidal +coil of modular toroidal coils using the analytical and finite element method, _Progress in +Electromagnetics Research_ 20 (2010) 337-352. diff --git a/docs/src/examples/spheres.md b/docs/src/examples/spheres.md index a149a4c819..95f180c181 100644 --- a/docs/src/examples/spheres.md +++ b/docs/src/examples/spheres.md @@ -1,118 +1,116 @@ -```@raw html - - -``` - -# Capacitance Matrix for Two Spheres - -!!! note - - The files for this example can be found in the [`examples/spheres/`] - (https://github.com/awslabs/palace/blob/main/examples/spheres) directory of the *Palace* - source code. - -In this example, we consider two conducting spheres of radii ``a`` and ``b``, with centers -separated by a distance ``c > a + b``. The surrounding medium is vacuum. An analytic -solution for the capacitance matrix of this configuration exists and is given in -[[1]] (#References). The Maxwell capacitance matrix entries are given by the infinite series - -```math -\begin{aligned} -C_{aa} &= 4\pi\varepsilon_0 ab \sinh{u}\sum_{n=0}^{\infty} \frac{1}{a\sinh{nu}+b\sinh{(n+1)u}} \\ -C_{bb} &= 4\pi\varepsilon_0 ab \sinh{u}\sum_{n=0}^{\infty} \frac{1}{b\sinh{nu}+a\sinh{(n+1)u}} \\ -C_{ab} &= -4\pi\varepsilon_0 \frac{ab}{c} \sinh{u}\sum_{n=1}^{\infty} \frac{1}{\sinh{nu}} -\end{aligned} -``` - -where the subscript ``a`` refers to the sphere with radius ``a`` and likewise for ``b``. The -parameter ``u`` is given by - -```math -\cosh{u} = \frac{c^2-a^2-b^2}{2ab} \,. -``` - -Here we take the values ``a = 1\text{ cm}``, ``b = 2\text{ cm}``, and ``c = 5\text{ cm}``. A -mesh is generated with Gmsh using the `mesh.jl` Julia script found in the [`mesh/`] -(https://github.com/awslabs/palace/blob/main/examples/spheres/mesh) directory, which writes -the mesh to [`mesh/spheres.msh`] -(https://github.com/awslabs/palace/blob/main/examples/spheres/mesh/spheres.msh). The -resulting high-order mesh uses cubically-curved tetrahedral elements, and is pictured below. - -```@raw html -
-``` - -The configuration file for the *Palace* simulation is found in [`spheres.json`] -(https://github.com/awslabs/palace/blob/main/examples/spheres/spheres.json). We set the -simulation `"Type"` to `"Electrostatic"`, and add `"Terminal"` entries for the surface -boundary of each sphere, corresponding to the entries of the capacitance matrix we wish to -compute. The outer boundary of the computational domain, which is sufficiently far from the -spheres, is prescribed a `"Ground"` boundary condition. We set the `"Order"` of the finite -element approximation to ``3``. - -The resulting extracted Maxwell capacitance matrix is saved to disk in the CSV file -`postpro/terminal-C.csv`: - -``` - i, C[i][1] (F), C[i][2] (F) - 1.000000e+00, +1.237470440e-12, -4.771229193e-13 - 2.000000e+00, -4.771229193e-13, +2.478512278e-12 -``` - -In this case, the analytic solution yields - -```math -\begin{aligned} -C_{aa} &= +1.230518\text{ pF} \\ -C_{bb} &= +2.431543\text{ pF} \\ -C_{ab} &= -0.4945668\text{ pF} -\end{aligned} -``` - -which is computed using the first ``n=12`` terms in the series after which convergence to a -relative tolerance of ``10^{-12}`` is reached. Thus, the errors in the capacitance -coefficients by *Palace* are ``0.57\%``, ``1.9\%``, and ``3.5\%``, respectively. - -The mutual capacitance matrix can be computed from its Maxwell counterpart, and is saved in -`postpro/terminal-Cm.csv`: - -``` - i, C_m[i][1] (F), C_m[i][2] (F) - 1.000000e+00, +7.603475205e-13, +4.771229193e-13 - 2.000000e+00, +4.771229193e-13, +2.001389358e-12 -``` - -Additionally, while the typical approach used by *Palace* for lumped parameter extraction -uses the computed field energies, the capacitance can also be calculated by directly -integrating the charge on a boundary surface and dividing by the excitation voltage. The -configuration file for this example contains this information under -[`config["Boundaries"]["Postprocessing"]["Capacitance"]`] -(../config/boundaries.md#boundaries["Postprocessing"]["Capacitance"]). The resulting -capacitances are written to `postpro/surface-C.csv`: - -``` - i, C[1] (F), C[2] (F) - 1.000000e+00, +1.219433080e-12, -4.711763113e-13 - 2.000000e+00, -4.701805852e-13, +2.443652512e-12 -``` - -and agree closely with the values computed using the default method above, as expected. - -Finally, the `postpro/paraview` directory contains files for visualizing the computed field -solutions with ParaView. Below we present the electrostatic potential fields for each -terminal solution. - -```@raw html -
-``` - -## References - -[1] J. Lekner, Capacitance coefficients of two spheres, Journal of Electrostatics 69 -(2011) 11-14. +```@raw html + + +``` + +```@setup include_example +function include_example_file(example_path, filename) + print(read(joinpath(@__DIR__, "..", "..", "..", "test", "examples", "ref", example_path, filename), String)) +end +``` + +# Capacitance Matrix for Two Spheres + +!!! note + + The files for this example can be found in the + [`examples/spheres/`](https://github.com/awslabs/palace/blob/main/examples/spheres) + directory of the *Palace* source code. + +In this example, we consider two conducting spheres of radii ``a`` and ``b``, with centers +separated by a distance ``c > a + b``. The surrounding medium is vacuum. An analytic +solution for the capacitance matrix of this configuration exists and is given in +[[1]](#References). The Maxwell capacitance matrix entries are given by the infinite series + +```math +\begin{aligned} +C_{aa} &= 4\pi\varepsilon_0 ab \sinh{u}\sum_{n=0}^{\infty} \frac{1}{a\sinh{nu}+b\sinh{(n+1)u}} \\ +C_{bb} &= 4\pi\varepsilon_0 ab \sinh{u}\sum_{n=0}^{\infty} \frac{1}{b\sinh{nu}+a\sinh{(n+1)u}} \\ +C_{ab} &= -4\pi\varepsilon_0 \frac{ab}{c} \sinh{u}\sum_{n=1}^{\infty} \frac{1}{\sinh{nu}} +\end{aligned} +``` + +where the subscript ``a`` refers to the sphere with radius ``a`` and likewise for ``b``. The +parameter ``u`` is given by + +```math +\cosh{u} = \frac{c^2-a^2-b^2}{2ab} \,. +``` + +Here we take the values ``a = 1\text{ cm}``, ``b = 2\text{ cm}``, and ``c = 5\text{ cm}``. A +mesh is generated with Gmsh using the `mesh.jl` Julia script found in the +[`mesh/`](https://github.com/awslabs/palace/blob/main/examples/spheres/mesh) directory, +which writes the mesh to +[`mesh/spheres.msh`](https://github.com/awslabs/palace/blob/main/examples/spheres/mesh/spheres.msh). +The resulting high-order mesh uses cubically-curved tetrahedral elements, and is pictured +below. + +```@raw html +
+``` + +The configuration file for the *Palace* simulation is found in +[`spheres.json`](https://github.com/awslabs/palace/blob/main/examples/spheres/spheres.json). +We set the simulation `"Type"` to `"Electrostatic"`, and add `"Terminal"` entries for the +surface boundary of each sphere, corresponding to the entries of the capacitance matrix we +wish to compute. The outer boundary of the computational domain, which is sufficiently far +from the spheres, is prescribed a `"Ground"` boundary condition. We set the `"Order"` of +the finite element approximation to ``3``. + +The resulting extracted Maxwell capacitance matrix is saved to disk in the CSV file +`postpro/terminal-C.csv`: + +```@example include_example +include_example_file("spheres", "terminal-C.csv") # hide +``` + +In this case, the analytic solution yields + +```math +\begin{aligned} +C_{aa} &= +1.230518\text{ pF} \\ +C_{bb} &= +2.431543\text{ pF} \\ +C_{ab} &= -0.4945668\text{ pF} +\end{aligned} +``` + +which is computed using the first ``n=12`` terms in the series after which convergence to a +relative tolerance of ``10^{-12}`` is reached. Thus, the errors in the capacitance +coefficients by *Palace* are ``0.57\%``, ``1.9\%``, and ``3.5\%``, respectively. + +The mutual capacitance matrix can be computed from its Maxwell counterpart, and is saved in +`postpro/terminal-Cm.csv`: + +```@example include_example +include_example_file("spheres", "terminal-Cm.csv") # hide +``` + +Additionally, while the typical approach used by *Palace* for lumped parameter extraction +uses the computed field energies, the capacitance can also be calculated by directly +integrating the charge on a boundary surface and dividing by the excitation voltage. The +configuration file for this example contains this information under +[`config["Boundaries"]["Postprocessing"]["SurfaceFlux"]`](../config/boundaries.md#boundaries%5B%22Postprocessing%22%5D%5B%22SurfaceFlux%22%5D). +The resulting capacitances are written to `postpro/terminal-C.csv`: + +```@example include_example +include_example_file("spheres", "terminal-C.csv") # hide +``` + +Finally, the `postpro/paraview` directory contains files for visualizing the computed field +solutions with ParaView. Below we present the electrostatic potential fields for each +terminal solution. + +```@raw html +
+``` + +## References + +[1] J. Lekner, Capacitance coefficients of two spheres, _Journal of Electrostatics_ 69 +(2011) 11-14. diff --git a/docs/src/guide/boundaries.md b/docs/src/guide/boundaries.md index 7c684d4651..9be0364ebd 100644 --- a/docs/src/guide/boundaries.md +++ b/docs/src/guide/boundaries.md @@ -1,124 +1,174 @@ -```@raw html - - -``` - -# Boundary Conditions - -## Perfect electric conductor (PEC) boundary - -The perfect electric conductor (PEC) boundary condition (zero tangential electric field) is -specified using the `"PEC"` boundary keyword under -[`config["Boundaries"]`](../config/boundaries.md#boundaries%5B%22PEC%22%5D). It is a -homogeneous Dirichlet boundary condition for the frequency or time domain finite element -formulation, as well as the magnetostatic formulation. - -For electrostatic simulations, the homogeneous Dirichlet boundary condition is prescribed -using the [`"Ground"`](../config/boundaries.md#boundaries%5B%22Ground%22%5D) boundary -keyword which prescribes zero voltage at the boundary. - -## Perfect magnetic conductor (PMC) boundary - -The perfect magnetic conductor (PMC) boundary condition (zero tangential magnetic field) is -a homogenous Neumann boundary condition for the frequency or time domain finite element -formulation, as well as the magnetostatic formulation. It is the natural boundary condition -and thus it has the same effect as not specifying any additional boundary condition on -external boundary surfaces. It can also be explicitly specified using the `"PMC"` boundary -keyword under [`config["Boundaries"]`](../config/boundaries.md#boundaries%5B%22PMC%22%5D). - -Likewise, for electrostatic simulations, the homogeneous Neumann boundary condition implies -a zero-charge boundary, and thus zero gradient of the voltage in the direction normal to the -boundary. This is specified using the `"ZeroCharge"` boundary keyword under -[`config["Boundaries"]`](../config/boundaries.md#boundaries%5B%22ZeroCharge%22%5D). - -## Impedance boundary - -The impedance boundary condition is a mixed (Robin) boundary condition and is available for -the frequency or time domain finite element formulations and thus for eigenmode or frequency -or time domain driven simulation types. It is specified using the [`"Impedance"`] -(../config/boundaries.md#boundaries%5B%22Impedance%22%5D) boundary keyword. The surface -impedance relating the tangential electric and magnetic fields on the boundary is computed -from the parallel impedances due to the specified resistance, inductance, and capacitance -per square. - -## Absorbing (scattering) boundary - -Absorbing boundary conditions at farfield boundaries, also referred to as scattering -boundary conditions, can be applied using the `"Absorbing"` boundary keyword under -[`config["Boundaries"]`](../config/boundaries.md#boundaries%5B%22Absorbing%22%5D). The -first-order absorbing boundary condition is a special case of the above impedance boundary -and is available for eigenmode or frequency or time domain driven simulation types. The -second-order absorbing boundary condition is only available for frequency domain driven -simulations. - -[Perfectly matched layer (PML)](https://en.wikipedia.org/wiki/Perfectly_matched_layer) -boundaries for frequency and time domain electromagnetic formulations are not yet -implemented, but are [common] -(https://www.sciencedirect.com/science/article/abs/pii/S0021999112000344) in solvers for -computational electromagnetics and will be a useful addition. 
- -## Finite conductivity boundary - -A finite conductivity boundary condition can be specified using the [`"Conductivty"`] -(../config/boundaries.md#boundaries%5B%22Conductivity%22%5D) boundary keyword. This boundary -condition models the effect of a boundary with non-infinite conductivity (an imperfect -conductor) for conductors with thickness much larger than the skin depth. It is available -only for frequency domain driven simulations. - -## Lumped and wave port excitation - - - [`config["Boundaries"]["LumpedPort"]`] - (../config/boundaries.md#boundaries["LumpedPort"]) : A lumped port applies a similar - boundary condition to a [surface impedance](#Impedance-boundary) boundary, but takes on - a special meaning for each simulation type. - - For frequency domain driven simulations, ports are used to provide a lumped port - excitation and postprocess voltages, currents, and scattering parameters. Likewise, for - transient simulations, they perform a similar purpose but for time domain computed - quantities. - - For eigenmode simulations where there is no excitation, lumped ports are used to specify - properties and postprocess energy-participation ratios (EPRs) corresponding to - linearized circuit elements. - - Note that a single lumped port (given by a single integer `"Index"`) can be made up of - multiple boundary attributes in the mesh in order to model, for example, a multielement - lumped port. - - - [`config["Boundaries"]["WavePort"]`] - (../config/boundaries.md#boundaries["WavePort"]) : Numeric wave ports are available for - frequency domain driven simulations. In this case, a port boundary condition is applied - with an optional excitation using a modal field shape which is computed by solving a 2D - boundary mode eigenproblem on each wave port boundary. This allows for more accurate - scattering parameter calculations when modeling waveguides or transmission lines with - arbitrary cross sections. - - The homogenous Dirichlet boundary conditions for the wave port boundary mode analysis - are taken from the `"PEC"` boundaries of the full 3D model, as well as any optional - additional boundary attributes given under `"WavePortPEC"`. Any boundary of the wave - port not labeled with with a PEC condition has the natural boundary condition for zero - tangential magnetic field prescribed for the purpose of port mode calculation. - - Unlike lumped ports, wave port boundaries cannot be defined internal to the - computational domain and instead must exist only on the outer boundary of the domain - (they are to be "one-sided" in the sense that mesh elements only exist on one side of - the boundary). - - Wave ports are not currently compatible with nonconformal mesh refinement. - -The incident field excitation at a lumped or wave port is controlled by setting -[`config["Boundaries"]["LumpedPort"][]["Excitation"]: true`] -(../config/boundaries.md#boundaries["LumpedPort"]) or -[`config["WavePort"][]["Excitation"]: true`] -(../config/boundaries.md#boundaries%5B%22WavePort%22%5D) for that port. The excitation for -each port is defined to have unit incident power over the port boundary surface. - -## Surface current excitation - -An alternative source excitation to lumped or wave ports for frequency and time domain -driven simulations is a surface current excitation, specified under -[`config["Boundaries"]["SurfaceCurrent"]`] -(../config/boundaries.md#boundaries["SurfaceCurrent"]). This is the excitation used for -magnetostatic simulation types as well. 
This option prescribes a unit source surface current -excitation on the given boundary in order to excite the model. It does does not prescribe -any boundary condition to the model and only affects the source term on the right hand side. +```@raw html + + +``` + +# Boundary Conditions + +## Perfect electric conductor (PEC) boundary + +The perfect electric conductor (PEC) boundary condition (zero tangential electric field) is +specified using the `"PEC"` boundary keyword under +[`config["Boundaries"]`](../config/boundaries.md#boundaries%5B%22PEC%22%5D). It is a +homogeneous Dirichlet boundary condition for the frequency or time domain finite element +formulation, as well as the magnetostatic formulation. + +For electrostatic simulations, the homogeneous Dirichlet boundary condition is prescribed +using the [`"Ground"`](../config/boundaries.md#boundaries%5B%22Ground%22%5D) boundary +keyword which prescribes zero voltage at the boundary. + +## Perfect magnetic conductor (PMC) boundary + +The perfect magnetic conductor (PMC) boundary condition (zero tangential magnetic field) is +a homogeneous Neumann boundary condition for the frequency or time domain finite element +formulation, as well as the magnetostatic formulation. It is the natural boundary condition +and thus it has the same effect as not specifying any additional boundary condition on +external boundary surfaces. It can also be explicitly specified using the `"PMC"` boundary +keyword under [`config["Boundaries"]`](../config/boundaries.md#boundaries%5B%22PMC%22%5D). + +Likewise, for electrostatic simulations, the homogeneous Neumann boundary condition implies +a zero-charge boundary, and thus zero gradient of the voltage in the direction normal to the +boundary. This is specified using the `"ZeroCharge"` boundary keyword under +[`config["Boundaries"]`](../config/boundaries.md#boundaries%5B%22ZeroCharge%22%5D). + +## Impedance boundary + +The impedance boundary condition is a mixed (Robin) boundary condition and is available for +the frequency or time domain finite element formulations and thus for eigenmode or frequency +or time domain driven simulation types. It is specified using the +[`"Impedance"`](../config/boundaries.md#boundaries%5B%22Impedance%22%5D) boundary keyword. +The surface impedance relating the tangential electric and magnetic fields on the boundary +is computed from the parallel impedances due to the specified resistance, inductance, and +capacitance per square. + +## Absorbing (scattering) boundary + +Absorbing boundary conditions at farfield boundaries, also referred to as scattering +boundary conditions, can be applied using the `"Absorbing"` boundary keyword under +[`config["Boundaries"]`](../config/boundaries.md#boundaries%5B%22Absorbing%22%5D). The +first-order absorbing boundary condition is a special case of the above impedance boundary +and is available for eigenmode or frequency or time domain driven simulation types. The +second-order absorbing boundary condition is only available for frequency domain driven +and eigenmode simulations. + +[Perfectly matched layer (PML)](https://en.wikipedia.org/wiki/Perfectly_matched_layer) +boundaries for frequency and time domain electromagnetic formulations are not yet +implemented, but are +[common](https://www.sciencedirect.com/science/article/abs/pii/S0021999112000344) in solvers +for computational electromagnetics and will be a useful addition. 
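+
+As a small illustration of the parallel resistance, inductance, and capacitance per square
+described for the impedance boundary above, the following Julia sketch evaluates the
+resulting surface impedance at a single frequency. The numerical values of `Ls` and `Cs`
+below are arbitrary placeholders and the variable names are not *Palace* configuration
+parameters; see the configuration reference for the actual input specification.
+
+```julia
+# Sketch only: parallel R-L-C surface impedance per square for the impedance boundary.
+# Ls and Cs are placeholder values chosen purely for illustration.
+f = 5e9        # frequency (Hz)
+ω = 2π * f
+Rs = 0.0184    # resistance per square (Ω/sq), e.g. copper at 5 GHz
+Ls = 1e-12     # inductance per square (H/sq), placeholder
+Cs = 1e-15     # capacitance per square (F/sq), placeholder
+
+Ys = 1 / Rs + 1 / (im * ω * Ls) + im * ω * Cs  # admittances of R, L, C in parallel
+Zs = 1 / Ys                                    # surface impedance relating tangential E and H
+```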
+ +## Finite conductivity boundary + +A finite conductivity boundary condition can be specified using the +[`"Conductivity"`](../config/boundaries.md#boundaries%5B%22Conductivity%22%5D) boundary +keyword. This boundary condition models the effect of a boundary with non-infinite +conductivity (an imperfect conductor) for conductors with thickness much larger than the +skin depth. It is available only for frequency domain driven and eigenmode simulations. For more +information see the +[Other boundary conditions](../reference.md#Other-boundary-conditions) section of the +reference. + +## Periodic boundary + +Periodic boundary conditions on an existing mesh can be specified using the +["Periodic"](../config/boundaries.md#boundaries%5B%22Periodic%22%5D) boundary keyword. This +boundary condition enforces that the solution on the specified boundaries be exactly equal, +and requires that the surface meshes on the donor and receiver boundaries be identical up to +translation or rotation. Periodicity in *Palace* is also supported through meshes generated +incorporating periodicity as part of the meshing process. + +*Palace* also supports Floquet periodic boundary conditions, where a phase shift is imposed +between the fields on the donor and receiver boundaries. The phase shift is +``e^{-i \bm{k}_p \cdot (\bm{x}_{\textrm{receiver}}-\bm{x}_{\textrm{donor}})}``, where +``\bm{k}_p`` is the Floquet wave vector and ``\bm{x}`` is the position vector. See +[Floquet periodic boundary conditions](../reference.md#Floquet-periodic-boundary-conditions) +for implementation details. + +## Lumped and wave port excitation + + - [`config["Boundaries"]["LumpedPort"]`](../config/boundaries.md#boundaries%5B%22LumpedPort%22%5D) : + A lumped port applies a similar boundary condition to a + [surface impedance](#Impedance-boundary) boundary, but takes on a special meaning for + each simulation type. + + For frequency domain driven simulations, ports are used to provide a lumped port + excitation and postprocess voltages, currents, and scattering parameters. Likewise, for + transient simulations, they perform a similar purpose but for time domain computed + quantities. + + For eigenmode simulations where there is no excitation, lumped ports are used to specify + properties and postprocess energy-participation ratios (EPRs) corresponding to + linearized circuit elements. + + Note that a single lumped port (given by a single integer `"Index"`) can be made up of + multiple boundary attributes in the mesh in order to model, for example, a multielement + lumped port. To use this functionality, use the `"Elements"` object under + [`"LumpedPort"`](../config/boundaries.md#boundaries%5B%22LumpedPort%22%5D). + + - [`config["Boundaries"]["WavePort"]`](../config/boundaries.md#boundaries%5B%22WavePort%22%5D) : + Numeric wave ports are available for frequency domain driven and eigenmode simulations. In this case, + a port boundary condition is applied with an optional excitation using a modal field + shape which is computed by solving a 2D boundary mode eigenproblem on each wave port + boundary. This allows for more accurate scattering parameter calculations when modeling + waveguides or transmission lines with arbitrary cross sections. + + The 2D wave port eigenproblem only supports PEC and PMC boundary conditions. 
Boundaries
+   that are specified as `"PEC"` or `"Conductivity"` in the full 3D model and intersect the
+   wave port boundary will be considered as PEC in the 2D boundary mode analysis, as well as
+   any additional boundary attributes given under `"WavePortPEC"`.
+   [`config["Boundaries"]["WavePortPEC"]`](../config/boundaries.md#boundaries%5B%22WavePortPEC%22%5D)
+   allows the user to assign non-PEC attributes from the 3D model (e.g. impedance or
+   absorbing boundary conditions) as a PEC boundary condition for the 2D wave port solve. In
+   addition, when wave ports touch and share one or more edges, the boundaries of the other
+   wave ports are also treated as PEC in the boundary mode analysis of the port under
+   consideration. Boundaries of the wave port not labeled with a `"PEC"`, `"Conductivity"`,
+   `"WavePortPEC"`, or `"WavePort"` condition have the natural boundary condition of zero
+   tangential magnetic field (PMC) prescribed for the purpose of port mode calculation.
+
+   Unlike lumped ports, wave port boundaries cannot be defined internal to the
+   computational domain and instead must exist only on the outer boundary of the domain
+   (they are to be "one-sided" in the sense that mesh elements only exist on one side of
+   the boundary).
+
+For each port, the excitation is normalized to have unit incident power over the port
+boundary surface.
+
+The presence of an incident excitation at a port is controlled by the settings
+[`config["Boundaries"]["LumpedPort"][]["Excitation"]`](../config/boundaries.md#boundaries%5B%22LumpedPort%22%5D)
+and [`config["Boundaries"]["WavePort"][]["Excitation"]`](../config/boundaries.md#boundaries%5B%22WavePort%22%5D).
+The `"Excitation"` setting can be specified either as a non-negative integer or as a boolean.
+
+ - *Boolean setting*: `true`/`false` indicates the presence or absence of an incident
+   excitation. Usually, only a single port will be marked as excited. In that case, the
+   `"Excitation"` will be promoted to the port `"Index"`. If there are multiple excited
+   ports, the `"Excitation"` is `1`.
+
+ - *Integer setting*: Here the user manually assigns excitation indices to ports. The value
+   `0` corresponds to no excitation. A positive integer `i` means that the port is excited
+   during excitation `i`. If multiple ports share an excitation index `i`, they will be
+   excited at the same time. In the special, but common, case that each excitation consists
+   of only a single port, the port index and excitation index must be equal. This avoids
+   ambiguity in the scattering matrix.
+
+For frequency domain driven simulations only, it is possible to specify multiple excitations
+in the same simulation using different positive integers ("multi-excitation"). These
+excitations are simulated consecutively during the *Palace* run. The results are printed to
+shared CSV files. When there are multiple excitations, the columns of the CSV files are
+post-indexed by the excitation index (e.g. `Φ_elec[1][5] (C)` denoting the flux through
+surface 1 of excitation 5). Note that a port can only be part of one excitation.
+
+!!! warning "Indexing"
+
+    Any `"Index"` of [`"LumpedPort"`](../config/boundaries.md#boundaries%5B%22LumpedPort%22%5D),
+    [`"WavePort"`](../config/boundaries.md#boundaries%5B%22WavePort%22%5D),
+    [`"SurfaceCurrent"`](../config/boundaries.md#boundaries%5B%22SurfaceCurrent%22%5D), or
+    [`"Terminal"`](../config/boundaries.md#boundaries%5B%22Terminal%22%5D) must be unique, including
+    between different boundary condition types (e.g. 
you cannot have a lumped port and a wave port both with
+    `Index: 5`).
+
+## Surface current excitation
+
+An alternative source excitation to lumped or wave ports for frequency and time domain
+driven simulations is a surface current excitation, specified under
+[`config["Boundaries"]["SurfaceCurrent"]`](../config/boundaries.md#boundaries%5B%22SurfaceCurrent%22%5D).
+This is the excitation used for magnetostatic simulation types as well. This option
+prescribes a unit source surface current excitation on the given boundary in order to
+excite the model. It does not prescribe any boundary condition to the model and only
+affects the source term on the right hand side.
diff --git a/docs/src/guide/guide.md b/docs/src/guide/guide.md
index d3183c907d..b1368e9eb6 100644
--- a/docs/src/guide/guide.md
+++ b/docs/src/guide/guide.md
@@ -1,16 +1,17 @@
-```@raw html
-
-
-```
-
-# Overview
-
-This user guide provides an overview of the different types of electromagnetic simulations
-which can be performed with *Palace* and the various features available in the solver.
-
-## Contents
-
- - [Problem Types](problem.md)
- - [Simulation Models](model.md)
- - [Boundary Conditions](boundaries.md)
- - [Postprocessing and Visualization](postprocessing.md)
+```@raw html
+
+
+```
+
+# Overview
+
+This user guide provides an overview of the different types of electromagnetic simulations
+which can be performed with *Palace* and the various features available in the solver.
+
+## Contents
+
+ - [Problem Types](problem.md)
+ - [Simulation Models](model.md)
+ - [Boundary Conditions](boundaries.md)
+ - [Postprocessing and Visualization](postprocessing.md)
+ - [Parallelism and GPU Support](parallelism.md)
diff --git a/docs/src/guide/model.md b/docs/src/guide/model.md
index 9c85f5f08e..f9c0b7a259 100644
--- a/docs/src/guide/model.md
+++ b/docs/src/guide/model.md
@@ -1,69 +1,71 @@
-```@raw html
-
-
-```
-
-# Simulation Models
-
-## Supported mesh formats
-
-The [`config["Model"]`](../config/model.md#config%5B%22Model%22%5D) object is used to
-specify the mesh for the discretized computational domain. In general, inputs are expected
-to be dimensional nondimensionalized internally. A length scale, specified under
-[`config["Model"]["L0"]`](../config/model.md#config%5B%22Model%22%5D), describes the length
-units of the mesh relative to 1 meter (i.e. `config["Model"]["L0"]: 1.0e-6` if the mesh
-coordinates are in ``\mu``m, this is the default value). All other entries in the
-configuration file which have units of length should be specified in units of
-`config["Model"]["L0"]` m.
-
-MFEM supports a [wide variety](https://mfem.org/mesh-formats/) of mesh formats. In
-addition, *Palace* has built-in support for [Nastran (`.nas`, `.bdf`)]
-(https://docs.plm.automation.siemens.com/tdoc/scnastran/2020_1/help/#uid:index_element) and
-[COMSOL (`.mphtxt`, `.mphbin`)]
-(https://doc.comsol.com/6.0/doc/com.comsol.help.comsol/COMSOL_ProgrammingReferenceManual.pdf)
-format mesh files, for both linear and high-order curved elements.
-
-Geometric attributes for domains and boundaries in the mesh are used to define material
-properties and boundary conditions on the desired model regions and surfaces (see
-[`config["Domains"]`](../config/domains.md) and [`config["Boundaries"]`]
-(../config/boundaries.md)). These attribute integers correspond to tags for the domain and
-boundary elements in the mesh, and should be non-negative and start at 1. They do not need
-to be contiguous in the mesh file. 
Throughout the configuration file, the `"Attributes"` -keyword is used to indicate which domain or boundary attributes are relevant to the -material properties or boundary conditions being specified. - -## Mesh refinement - -Refinement of the input mesh file can be performed using levels of global uniform refinement -or region-based refinement, specified using the [`config["Model"]["Refinement"]`] -(../config/model.md#model["Refinement"]) object. The user can specify any combination of -uniform refinement levels as well as local refinement regions which refines the elements -inside of a certain box or sphere-shaped region. For simplex meshes, the refinement -maintains a conforming mesh but meshes containing hexahedra, prism, or pyramid elements -will be nonconforming after local refinement (this is not supported at this time). - -[Adaptive mesh refinement (AMR)](https://en.wikipedia.org/wiki/Adaptive_mesh_refinement) -according to error estimates calculated from the computed solution can also be specified -using the [`config["Model"]["Refinement"]`](../config/model.md#model%5B%22Refinement%22%5D) -object. Nonconformal refinement is supported for all mesh types, and additionally conformal -refinement is supported for simplex meshes. AMR is available for all problem types apart -from driven problems in the time domain. - -## Material models - -Material properties are handled by the [`config["Domains"]["Materials"]`] -(../config/domains.md#domains["Materials"]) object. *Palace* supports linear, frequency -independent constitutive laws for material modeling. - -Materials with scalar or general matrix-valued properties are supported. For most simulation -types, each material in the model requires a specified relative permittivity and relative -permeability (for electrostatic simulations, only the permittivity is required, while for -magnetostatics, only the permeability is required). For dielectric domains, a loss tangent -may be specified. Alternatively, for normal conducting domains, an electrical conductivity -may be specified which is used to relate the current density and electric field via Ohm's -law. - -Modeling of superconducting domains is performed using the current-field constitutive -relations given by the London equations. The user can specify a London penetration depth to -activate this model. It can also be used in conjunction with a materal conductivity when -wishing to model both superconducting and normal currents. +```@raw html + + +``` + +# Simulation Models + +## Supported mesh formats + +The [`config["Model"]`](../config/model.md#config%5B%22Model%22%5D) object is used to +specify the mesh for the discretized computational domain. In general, inputs are expected +to be dimensional nondimensionalized internally. A length scale, specified under +[`config["Model"]["L0"]`](../config/model.md#config%5B%22Model%22%5D), describes the length +units of the mesh relative to 1 meter (i.e. `config["Model"]["L0"]: 1.0e-6` if the mesh +coordinates are in ``\mu``m, this is the default value). All other entries in the +configuration file which have units of length should be specified in units of +`config["Model"]["L0"]` m. + +MFEM supports a [wide variety](https://mfem.org/mesh-formats/) of mesh formats. 
In +addition, *Palace* has built-in support for +[Nastran (`.nas`, `.bdf`)](https://docs.plm.automation.siemens.com/tdoc/scnastran/2020_1/help/#uid:index_element) +and +[COMSOL (`.mphtxt`, `.mphbin`)](https://doc.comsol.com/6.0/doc/com.comsol.help.comsol/COMSOL_ProgrammingReferenceManual.pdf) +format mesh files, for both linear and high-order curved elements. + +Geometric attributes for domains and boundaries in the mesh are used to define material +properties and boundary conditions on the desired model regions and surfaces (see +[`config["Domains"]`](../config/domains.md) and +[`config["Boundaries"]`](../config/boundaries.md)). These attribute integers correspond to +tags for the domain and boundary elements in the mesh, and should be non-negative and start +at 1. They do not need to be contiguous in the mesh file. Throughout the configuration +file, the `"Attributes"` keyword is used to indicate which domain or boundary attributes +are relevant to the material properties or boundary conditions being specified. + +## Mesh refinement + +Refinement of the input mesh file can be performed using levels of global uniform refinement +or region-based refinement, specified using the +[`config["Model"]["Refinement"]`](../config/model.md#model%5B%22Refinement%22%5D) object. +The user can specify any combination of uniform refinement levels as well as local +refinement regions which refines the elements inside of a certain box or sphere-shaped +region. For simplex meshes, the refinement maintains a conforming mesh but meshes +containing hexahedra, prism, or pyramid elements will be nonconforming after local +refinement (this is not supported at this time). + +[Adaptive mesh refinement (AMR)](https://en.wikipedia.org/wiki/Adaptive_mesh_refinement) +according to error estimates calculated from the computed solution can also be specified +using the [`config["Model"]["Refinement"]`](../config/model.md#model%5B%22Refinement%22%5D) +object. Nonconformal refinement is supported for all mesh types, and additionally conformal +refinement is supported for simplex meshes. AMR is available for all problem types apart +from driven problems in the time domain. + +## Material models + +Material properties are handled by the +[`config["Domains"]["Materials"]`](../config/domains.md#domains%5B%22Materials%22%5D) +object. *Palace* supports linear, frequency independent constitutive laws for material +modeling. + +Materials with scalar or symmetric matrix-valued material properties are supported. For most +simulation types, each material in the model requires a specified relative permittivity and +relative permeability (for electrostatic simulations, only the permittivity is required, +while for magnetostatics, only the permeability is required). For dielectric domains, a +loss tangent may be specified. Alternatively, for normal conducting domains, an electrical +conductivity may be specified which is used to relate the current density and electric +field via Ohm's law. + +Modeling of superconducting domains is performed using the current-field constitutive +relations given by the London equations. The user can specify a London penetration depth to +activate this model. It can also be used in conjunction with a material conductivity when +wishing to model both superconducting and normal currents. 
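+
+As a minimal sketch of how such a material model might be assembled, the following Python
+snippet builds two example entries for
+[`config["Domains"]["Materials"]`](../config/domains.md#domains%5B%22Materials%22%5D) and
+writes them to a JSON configuration fragment. The attribute numbers, numerical values, and
+property keywords used here are illustrative placeholders only; the exact keyword names and
+their units are documented in the configuration file reference.
+
+```python
+import json
+
+# Two hypothetical material definitions: a lossy dielectric substrate and a
+# superconducting metal domain. The property names below are placeholders for
+# illustration; consult the configuration file reference for the actual keywords.
+materials = [
+    {
+        "Attributes": [1],     # substrate domain attribute (hypothetical)
+        "Permittivity": 11.7,  # relative permittivity
+        "Permeability": 1.0,   # relative permeability
+        "LossTan": 1.2e-6      # bulk dielectric loss tangent
+    },
+    {
+        "Attributes": [2],     # superconducting metal domain attribute (hypothetical)
+        "Permittivity": 1.0,
+        "Permeability": 1.0,
+        "LondonDepth": 0.016   # London penetration depth, in mesh length units
+    }
+]
+
+config = {"Domains": {"Materials": materials}}
+with open("domains.json", "w") as f:
+    json.dump(config, f, indent=2)
+```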
diff --git a/docs/src/guide/postprocessing.md b/docs/src/guide/postprocessing.md index 1490bbbde4..9f26cd62e3 100644 --- a/docs/src/guide/postprocessing.md +++ b/docs/src/guide/postprocessing.md @@ -1,109 +1,152 @@ -```@raw html - - -``` - -# Postprocessing and Visualization - -As described in the section [Problem Types](problem.md), each simulation type writes -relevant postprocessed scalar quantities to disk in the directory specified by -[`config["Problem"]["Output"]`](../config/problem.md#config%5B%22Problem%22%5D), including -but not limited to computed values like eigenfrequencies, scattering parameters, or lumped -element parameters. In addition, each simulation type will write a file called -`domain-E.csv`, which includes information about the electric and magnetic field energies, -as well as lumped element energies, for each step of the simulation (eigenmode, frequency, -or time step, for examples). - -Models containing lumped or wave port boundaries or surface current excitations will -automatically postprocess quantities related to those boundaries. This is described in -[Ports and surface currents](#Ports-and-surface-currents). - -The participation ratios for bulk dielectrics and interface dielectric layers can be -computed for simulations involving the electric field. For model boundaries, the integrated -surface charge or magnetic flux can also be postprocessed. These features are described -in [Domain postprocessing](#Domain-postprocessing) and in [Boundary postprocessing] -(#Boundary-postprocessing). - -Additionally, the computed fields can be automatically probed for their vector values at one -or more points in space. This probe functionality is also described in -[Domain postprocessing](#Domain-postprocessing). - -Finally, as described further in [Visualization](#Visualization), various field quantities -on the 3D computational domain as well as 2D domain boundaries and material interfaces are -written to disk when requested using the relevant parameters under [`config["Solver"]`] -(../config/solver.md). These fields are meant to be visualized with [ParaView] -(https://www.paraview.org/). - -## Ports and surface currents - -When lumped ports are present in a model, the lumped port voltages and currents computed for -each step of the simulation (eigenmode, frequency, or time step) are written to ASCII files -named `port-V.csv` and `port-I.csv`, respectively. These files also include the excitation -voltage and current corresponding to the incident wave on excited port boundaries. - -Additionally, when surface current excitations are present, the excitations are written to -`surface-I.csv`. - -For frequency domain problems, the values output are the complex-valued peak voltages and -currents, computed from the field phasors. - -## Domain postprocessing - -Domain postprocessing capabilities are enabled by including objects under -[`config["Domains"]["Postprocessing"]`](../config/domains.md) in the configuration file. -These include: - - - [`config["Domains"]["Postprocessing"]["Dielectric"]`] - (../config/domains.md#domains["Postprocessing"]["Dielectric"]) : Postprocessess bulk - dielectric loss based on the participation ratio of the electric field in a lossy - region. The respective participation ratios and quality factors for each domain - (associated with the specified domain attributes and indexed by the specified integer - `"Index"`) are computed using the material properties provided and are written to - `domain-Q.csv` in the specified postprocessing output directory. 
- - [`config["Domains"]["Postprocessing"]["Probe"]`] - (../config/domains.md#domains["Postprocessing"]["Probe"]) : Probe the values of the - computed electric field and magnetic flux density solutions at specified locations in - the computational domain. The availability of the ``\bm{E}`` and ``\bm{B}`` fields - depends on the problem type (for example, for magnetostatic problems, only ``\bm{B}`` - is output and ``\bm{E}`` is not computed, whereas the inverse is true for - electrostatics). For each computed field, the postprocessed values are written to - `probe-E.csv` and `probe-B.csv` in the specified output directory. - -## Boundary postprocessing - -Boundary postprocessing capabilities are enabled by including objects under -`config["Boundaries"]["Postprocessing"]` in the configuration file. These include: - - - [`config["Boundaries"]["Postprocessing"]["Capacitance"]`] - (../config/boundaries.md#boundaries["Postprocessing"]["Capacitance"]) : Postprocess the - integral of the surface charge on a surface defined by a list of boundary attributes, - and divide by the excitation voltage to get the capacitive coupling. The resulting - capcitances are written to `surface-C.csv` in the specified output directory. - - [`config["Boundaries"]["Postprocessing"]["Inductance"]`] - (../config/boundaries.md#boundaries["Postprocessing"]["Inductance"]) : Postprocess the - magnetic flux through a surface defined by a list of boundary attributes, and divide by - the excitation current to the inductive coupling. The resulting inductances are written - to `surface-M.csv` in the specified output directory. - - [`config["Boundaries"]["Postprocessing"]["Dielectric"]`] - (../config/boundaries.md#boundaries["Postprocessing"]["Dielectric"]) : Postprocesses - interface dielectric loss at surfaces of the model by specifying the interface - thickness, permittivity, and loss tangent. See [https://arxiv.org/pdf/1509.01854.pdf] - (https://arxiv.org/pdf/1509.01854.pdf) or - [https://aip.scitation.org/doi/10.1063/1.3637047] - (https://aip.scitation.org/doi/10.1063/1.3637047) for more information. The - participation ratios and associated quality factors are written to the file - `surface-Q.csv` in the specified output directory. - -## Visualization - -When specified in the configuration file, the electric field and magnetic flux density -solutions are written to disk for 3D visualization with [ParaView] -(https://www.paraview.org/). Various other postprocessed fields are also written to the -ParaView database as available, including electric and magnetic energy density, surface -currents, and charge density. These files are found in the `paraview/` directory located in -the output directory specified under [`config["Problem"]["Output"]`] -(../config/problem.md#config["Problem"]). - -In addition to the full 3D fields, a ParaView data collection for the boundary mesh is also -written to disk. The boundary mesh includes all surfaces with prescribed boundary -conditions as well as any material interfaces in the computational domain. +```@raw html + + +``` + +# Postprocessing and Visualization + +As described in the section [Problem Types](problem.md), each simulation type writes +relevant postprocessed scalar quantities to disk in the directory specified by +[`config["Problem"]["Output"]`](../config/problem.md#config%5B%22Problem%22%5D), including +but not limited to computed values like eigenfrequencies, scattering parameters, or lumped +element parameters. 
In addition, each simulation type will write a file called
+`domain-E.csv`, which includes information about the electric and magnetic field energies,
+as well as lumped element energies, for each step of the simulation (eigenmode, frequency,
+or time step, for example).
+
+Models containing lumped or wave port boundaries or surface current excitations will
+automatically postprocess quantities related to those boundaries. This is described in
+[Ports and surface currents](#Ports-and-surface-currents).
+
+The participation ratios for bulk dielectrics and interface dielectric layers can be
+computed for simulations involving the electric field. For model boundaries, the integrated
+surface charge or magnetic flux can also be postprocessed. These features are described
+in [Domain postprocessing](#Domain-postprocessing) and in
+[Boundary postprocessing](#Boundary-postprocessing).
+
+Additionally, the computed fields can be automatically probed for their vector values at one
+or more points in space. This probe functionality is also described in
+[Domain postprocessing](#Domain-postprocessing).
+
+Finally, as described further in [Visualization](#Visualization), various field quantities
+on the 3D computational domain as well as 2D domain boundaries and material interfaces are
+written to disk when requested using the relevant parameters under
+[`config["Solver"]`](../config/solver.md). These fields are meant to be visualized with
+[ParaView](https://www.paraview.org/) or [GLVis](https://glvis.org/).
+
+## Ports and surface currents
+
+When lumped ports are present in a model, the lumped port voltages and currents computed for
+each step of the simulation (eigenmode, frequency, or time step) are written to ASCII files
+named `port-V.csv` and `port-I.csv`, respectively. These files also include the excitation
+voltage and current corresponding to the incident wave on excited port boundaries.
+
+Additionally, when surface current excitations are present, the excitations are written to
+`surface-I.csv`.
+
+For frequency domain problems, the values output are the complex-valued peak voltages and
+currents, computed from the field phasors.
+
+## Domain postprocessing
+
+Domain postprocessing capabilities are enabled by including objects under
+[`config["Domains"]["Postprocessing"]`](../config/domains.md) in the configuration file.
+These include:
+
+ - [`config["Domains"]["Postprocessing"]["Energy"]`](../config/domains.md#domains%5B%22Postprocessing%22%5D%5B%22Energy%22%5D) :
+   Postprocesses the electric and magnetic field energy inside of a given domain
+   (associated with the specified domain attributes and indexed by the specified integer
+   `"Index"`). These are computed from the electric and magnetic field solutions and written
+   to the same `domain-E.csv` file in the specified postprocessing output directory used for
+   the global energies (described above).
+ - [`config["Domains"]["Postprocessing"]["Probe"]`](../config/domains.md#domains%5B%22Postprocessing%22%5D%5B%22Probe%22%5D) :
+   Probe the values of the computed electric field and magnetic flux density solutions at
+   specified locations in the computational domain. The availability of the ``\bm{E}`` and
+   ``\bm{B}`` fields depends on the problem type (for example, for magnetostatic problems,
+   only ``\bm{B}`` is output and ``\bm{E}`` is not computed, whereas the inverse is true
+   for electrostatics). For each computed field, the postprocessed values are written to
+   `probe-E.csv` and `probe-B.csv` in the specified output directory (see the loading
+   example after this list).
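+
+Because these outputs are plain CSV files, they are straightforward to load for further
+analysis or plotting. The following Python sketch, which assumes the output directory is
+named `postpro` and that a `"Probe"` object was specified, prints the column names and rows
+of the electric field probe output (the exact columns depend on the simulation type):
+
+```python
+import csv
+import os
+
+# Directory given by config["Problem"]["Output"] (hypothetical name)
+output_dir = "postpro"
+
+# Read the electric field probe values written by the simulation
+with open(os.path.join(output_dir, "probe-E.csv"), newline="") as f:
+    rows = list(csv.reader(f))
+
+# The first row holds the column names; in each subsequent row, the first column
+# is the eigenmode, frequency, or time step and the rest are the probed values
+header, data = rows[0], rows[1:]
+print([name.strip() for name in header])
+for row in data:
+    print(row[0].strip(), [float(value) for value in row[1:]])
+```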
+ +## Boundary postprocessing + +Boundary postprocessing capabilities are enabled by including objects under +[`config["Boundaries"]["Postprocessing"]`](../config/boundaries.md) in the configuration +file. These include: + + - [`config["Boundaries"]["Postprocessing"]["SurfaceFlux"]`](../config/boundaries.md#boundaries%5B%22Postprocessing%22%5D%5B%22SurfaceFlux%22%5D) : + Postprocess the integrated flux through a surface defined by a list of boundary + attributes. Electric, magnetic, and power flux are all supported. Surface capacitance + can be computed by dividing the computed electric flux by the excitation voltage, while + inductance can be computed by dividing the computed magnetic flux by the excitation + current. The resulting fluxes are written to `surface-F.csv` in the specified output + directory. + - [`config["Boundaries"]["Postprocessing"]["Dielectric"]`](../config/boundaries.md#boundaries%5B%22Postprocessing%22%5D%5B%22Dielectric%22%5D) : + Postprocesses interface dielectric loss at surfaces of the model by specifying the + interface thickness, permittivity, and loss tangent. See the + [Bulk and interface dielectric loss](../reference.md#Bulk-and-interface-dielectric-loss) + section of the reference, or + [https://arxiv.org/pdf/1509.01854.pdf](https://arxiv.org/pdf/1509.01854.pdf) or + [https://aip.scitation.org/doi/10.1063/1.3637047](https://aip.scitation.org/doi/10.1063/1.3637047) + for more information. The participation ratios and associated quality factors are + written to the file `surface-Q.csv` in the specified output directory. + +## Visualization + +When specified in the configuration file, the electric field and magnetic flux density +solutions are written to disk for 3D visualization with [ParaView](https://www.paraview.org/) +or [GLVis](https://glvis.org/). Various other postprocessed fields are also written to the ParaView +or grid function (GLVis) database as available, including electric and magnetic energy density, +surface currents, and charge density. These files are found in the `paraview/` or `gridfunction/` +directories located in the output directory specified under +[`config["Problem"]["Output"]`](../config/problem.md#config%5B%22Problem%22%5D). The output +formats are specified in [`config["Problem"]["OutputFormats"]`](../config/problem.md#config%5B%22Problem%22%5D). + +ParaView is recommended to visualize large simulations in parallel. The grid function (GLVis) +format can be useful to embed visualizations in webpages with its +[Javascript version](https://github.com/GLVis/glvis-js/). + +All fields are written out in SI units and the post-processing mesh has the same units of `config["Model"]["L0"]` m +as the input mesh. 
The specific quantities available vary by [simulation type](problem.md#Problem-Types),
+but the variable names and corresponding units for the possible postprocessed scalar and
+vector fields are:
+
+ - Electric field: `E`, `E_real`, and `E_imag` (V/m)
+ - Magnetic flux density: `B`, `B_real`, and `B_imag` (Wb/m²)
+ - Electric potential: `V` (V)
+ - Magnetic vector potential: `A`, `A_real`, and `A_imag` (Wb/m)
+ - Electric energy density: `U_e` (J/m³)
+ - Magnetic energy density: `U_m` (J/m³)
+ - Poynting vector: `S` (W/m²)
+
+Also, at the final step of the simulation, the following element-wise quantities are written
+for visualization:
+
+ - Mesh partitioning (1-based): `Rank`
+ - Error indicator: `Indicator`
+
+When saving fields in the grid function (GLVis) format, the file names have the format
+`Field_xxxxxx.gf.yyyyyy`, where `Field` is the variable name of the postprocessed scalar
+or vector field, `xxxxxx` is the six-digit terminal index (electrostatic or magnetostatic),
+time step index (transient), or frequency index (driven or eigenmode), and `yyyyyy` is the
+six-digit index of the rank of the corresponding MPI process.
+
+In addition to the full 3D fields, a ParaView data collection for the boundary mesh and
+fields is also written to disk. The boundary mesh includes all surfaces with prescribed
+boundary conditions as well as any material interfaces in the computational domain. It is
+located in the same `paraview/` directory, with suffix `_boundary`. The boundary data
+collection is only available for the ParaView output format.
+
+The boundary data collection includes the 3D field values sampled on the boundary mesh as
+well as:
+
+ - Surface charge density: `Q_s`, `Q_s_real`, `Q_s_imag` (C/m²)
+ - Surface current density: `J_s`, `J_s_real`, `J_s_imag` (A/m)
+ - Wave port boundary mode electric field: `E0_real`, `E0_imag` (V/m)
+
+## Adaptive mesh refinement
+
+At the start of an adaptive mesh refinement (AMR) iteration, if
+[`config["Model"]["Refinement"]["SaveAdaptIterations"]`](../config/model.md#model%5B%22Refinement%22%5D)
+is enabled, the postprocessing results from the solve on the previous mesh will be saved
+within a subdirectory denoted `iterationX`, where `X` is the (1-based) iteration number.
+The results in the top-level directory will always be those from the most recent successful
+solve.
diff --git a/docs/src/guide/problem.md b/docs/src/guide/problem.md
index 81ee982ebd..f114d0390e 100644
--- a/docs/src/guide/problem.md
+++ b/docs/src/guide/problem.md
@@ -1,116 +1,154 @@
-```@raw html
-
-
-```
-
-# Problem Types
-
-## Eigenmode problems
-
-For eigenmode simulations, [`config["Problem"]["Type"]: "Eigenmode"`]
-(../config/problem.md#config["Problem"]), the user should specify a
-nonzero (but arbitrarily small) frequency above which to search for eigenmodes. The computed
-eigenvalues are written to an ASCII file named `eig.csv`, in the directory specified by
-[`config["Problem"]["Output"]`](../config/problem.md#config%5B%22Problem%22%5D). Also in
-this file are the mode quality factors and errors (absolute and backward) computed for each
-eigenpair.
-
-Calculations related to [energy-participation ratio (EPR) quantization]
-(https://www.nature.com/articles/s41534-021-00461-8) can be performed with *Palace* when
-the user specifies lumped ports corresponding to the linearized lumped circuit elements in
-the model. 
In this case, the participation matrix for inductive elements is automatically -generated for the specified number of modes and number of inductive lumped ports. The -participation matrix is output in an ASCII file named `port-EPR.csv`. - -The EPR framework can be used to characterize the dissipative elements in the model as well. -In particular, lumped ports with nonzero resistance in the model will trigger coupling rate -and quality factor calculations based on input-output (I-O) line coupling loss: By -specifying resistive lumped ports in the model, the mode coupling quality factors will be -computed as ``Q_{ml} = \omega_m/\kappa_{ml}``. The output file `port-Q.csv` will be created -in the output directory containing these mode qualty factor contributions. For bulk and -interface dielectric loss calculations, which are not unique to the eigenmode simulation -type, see the sections [Domain postprocessing](postprocessing.md#Domain-postprocessing) and -[Boundary postprocessing](postprocessing.md#Boundary-postprocessing). - -## Driven problems in the frequency domain - -For frequency domain driven simulations, [`config["Problem"]["Type"]: "Driven"`] -(../config/problem.md#config["Problem"]), the model is excited -by a time harmonic incident field (port boundary) or surface current. The user can specify -a port excitation using [lumped ports or numeric wave ports] -(boundaries.md#Lumped-and-wave-port-excitation). - -The default frequency sweep behavior for frequency domain driven simulations is to perform a -uniform sampling from the minimum to the maximum specified frequency of interest, using the -user specified step size. An adaptive fast frequency sweep strategy can also be used, -activated by specifying a nonzero value for `"AdaptiveTol"` under the -[`config["Solver"]["Driven"]`](../config/solver.md#solver%5B%22Driven%22%5D) object. In this -case, using the high-dimensional model solution computed at a few automatically selected -frequency samples, a low-cost model is constructed and used to compute the frequency -response over the entire frequency range of interest. The specified error tolerance ensures -that the approximate low-cost model is reliably accurate relative to the high-dimensional -model within the frequency band of interest. This is particularly useful for -fine-resolution sweeps containing many sample points, where it can yield a significant -speedup over the default strategy. - -Port scattering parameters, or S-parameters, are postprocessed for the column of the -scattering matrix corresponding to the driven port index automatically for this simulation -type and stored in an ASCII file named `port-S.csv`, in the directory specified by -[`config["Problem"]["Output"]`](../config/problem.md#config%5B%22Problem%22%5D). In the case -that more than a single lumped or wave port is excited or surface current excitations are -used, scattering parameter output will be disabled for the simulation (though other -quantities of interest are still postprocessed). Further postprocessing of quantities -related to ports in the model is described in the section on [Ports and surface currents] -(postprocessing.md#Ports-and-surface-currents). - -## Driven problems in the time domain - -The previous simulation types describe simulations based on frequency domain formulations of -Maxwell's equations. Time domain simulations are also possible through the transient -simulation type: [`config["Problem"]["Type"]: "Transient"`] -(../config/problem.md#config["Problem"]). 
Similar to the driven simulation type in the -frequency domain, transient simulations involve simulating the response of the system to a -time-dependent excitation field specified at lumped ports or surface current excitations in -the model. The system is always started from rest with zero initial conditions and -time-integrated for a user specified duration, given in nanoseconds. There are several -available excitation types which define the time dependence of the pulse or excitation -waveform. These are specified under the [`config["Solver"]["Transient"]`] -(../config/solver.md#solver["Transient"]) object using the `"Excitation"` keyword. - -As with the frequency domain driven case, postprocessing of quantities related to ports is -described in the section on [Ports and surface currents] -(postprocessing.md#Ports-and-surface-currents). - -## Electrostatic problems - -For electrostatic simulations, ([`config["Problem"]["Type"]: "Electrostatic"`] -(../config/problem.md#config["Problem"]), the user should specify a number of terminal -boundaries ([`config["Boundaries"]["Terminal"]`] -(../config/boundaries.md#boundaries%5B%22Terminal%22%5D)) as well as boundaries which are -grounded ([`config["Boundaries"]["Ground"]`] -(../config/boundaries.md#boundaries%5B%22Ground%22%5D)). For each terminal, an electrostatic -field is computed by assigning the terminal of interest a positive nonzero voltage and all -other terminals and grounded boundaries a zero voltage. The resulting fields are then used -to compute the Maxwell capacitance matrix and its inverse, which are written to an ASCII -file named `terminal-C.csv` and `terminal-Cinv.csv`, respectively, in the directory -specified by [`config["Problem"]["Output"]`] -(../config/problem.md#config%5B%22Problem%22%5D). The mutual capacitance matrix is also -computed and written to `terminal-Cm.csv` in the same directory. - -## Magnetostatic problems - -For magnetostatic simulations, ([`config["Problem"]["Type"]: "Magnetostatic"`] -(../config/problem.md#config["Problem"]), the user should specify -a number of source current boundaries. For each current source, a magnetostatic field is -computed by applying a unit current to the source index of interest, leaving all other -sources open with no excitation. Surfaces which are expected to carry current should be -labeled as perfectly conducting, which prescibes a zero magnetic flux, or -[magnetic insulation] -(https://doc.comsol.com/5.5/doc/com.comsol.help.comsol/comsol_ref_acdc.17.74.html), -boundary condition. The resulting fields are used to compute the inductance matrix and its -inverse, which are written to an ASCII file named `terminal-M.csv` and `terminal-Minv.csv`, -respectively, in the directory specified by [`config["Problem"]["Output"]`] -(../config/problem.md#config["Problem"]). A "mutual" inductance matrix which has the same -form as the mutual capacitance matrix (its entries are based on current differences between -ports rather than absolute currents) is computed and written to `terminal-Mm.csv` in the -same directory. +```@raw html + + +``` + +# Problem Types + +## Eigenmode problems + +For eigenmode simulations, +[`config["Problem"]["Type"]: "Eigenmode"`](../config/problem.md#config%5B%22Problem%22%5D), +the user should specify a nonzero (but arbitrarily small) frequency above which to search +for eigenmodes. The computed eigenvalues are written to an ASCII file named `eig.csv`, in +the directory specified by +[`config["Problem"]["Output"]`](../config/problem.md#config%5B%22Problem%22%5D). 
Also in +this file are the mode quality factors and errors (absolute and backward) computed for each +eigenpair. + +Calculations related to +[energy-participation ratio (EPR) quantization](https://www.nature.com/articles/s41534-021-00461-8) +can be performed with *Palace* when the user specifies lumped ports corresponding to the +linearized lumped circuit elements in the model. In this case, the participation matrix for +inductive elements is automatically generated for the specified number of modes and number +of inductive lumped ports. The participation matrix is output in an ASCII file named +`port-EPR.csv`. + +The EPR framework can be used to characterize the dissipative elements in the model as well. +In particular, lumped ports with nonzero resistance in the model will trigger coupling rate +and quality factor calculations based on input-output (I-O) line coupling loss: By +specifying resistive lumped ports in the model, the mode coupling quality factors will be +computed as ``Q_{ml} = \omega_m/\kappa_{ml}``. The output file `port-Q.csv` will be created +in the output directory containing these mode quality factor contributions. For bulk and +interface dielectric loss calculations, which are not unique to the eigenmode simulation +type, see the sections [Domain postprocessing](postprocessing.md#Domain-postprocessing) and +[Boundary postprocessing](postprocessing.md#Boundary-postprocessing) of this guide. + +## Driven problems in the frequency domain + +For frequency domain driven simulations, +[`config["Problem"]["Type"]: "Driven"`](../config/problem.md#config%5B%22Problem%22%5D), the +model is excited by a time harmonic incident field (port boundary) or surface current. +The user can specify a port excitation using +[lumped ports or numeric wave ports](boundaries.md#Lumped-and-wave-port-excitation). + +The default frequency sweep behavior for frequency domain driven simulations is to perform a +uniform sampling from the minimum to the maximum specified frequency of interest, using the +user specified step size. An adaptive fast frequency sweep strategy can also be used, +activated by specifying a nonzero value for `"AdaptiveTol"` under the +[`config["Solver"]["Driven"]`](../config/solver.md#solver%5B%22Driven%22%5D) object. In this +case, using the high-dimensional model solution computed at a few automatically selected +frequency samples, a low-cost model is constructed and used to compute the frequency +response over the entire frequency range of interest. The specified error tolerance ensures +that the approximate low-cost model is reliably accurate relative to the high-dimensional +model within the frequency band of interest. This is particularly useful for +fine-resolution sweeps containing many sample points, where it can yield a significant +speedup over the default strategy. + +Port scattering parameters, or S-parameters, are postprocessed for the column of the +scattering matrix corresponding to the driven port index automatically for this simulation +type and stored in an ASCII file named `port-S.csv`, in the directory specified by +[`config["Problem"]["Output"]`](../config/problem.md#config%5B%22Problem%22%5D). Both the +``\text{dB}`` magnitude (``20\log_{10}(|S_{ij}|)``) and the phase ``\angle(S_{ij})`` +(in degrees) are written to the file. 
In the case that more than a single lumped or wave
+port is excited or surface current excitations are used, scattering parameter output will
+be disabled for the simulation (though other quantities of interest are still
+postprocessed). When lumped ports are present, the peak complex lumped port voltages and
+currents computed for each excitation frequency are written to ASCII files named
+`port-V.csv` and `port-I.csv`, respectively. Additionally, the surface current excitations
+are written to `surface-I.csv`.
+
+It is often the case that a user wants to compute the entire scattering matrix rather than
+just a single column. In this case, each column can be computed in parallel by running
+*Palace* multiple times. For example, consider the following short Python code which
+modifies a base configuration file `config.json` to generate a complete 4x4 scattering
+matrix by running 4 *Palace* simulations, each with 2 MPI processes:
+
+```python
+import json
+import os
+import subprocess
+
+# Base configuration file
+config_path = "config.json"
+
+for i in range(4):
+    # Prepare configuration file for simulation, exciting only a single port
+    with open(config_path, "r") as f:
+        config_json = json.loads(f.read())
+    for port in config_json["Boundaries"]["LumpedPort"]:
+        port["Excitation"] = (1+i == port["Index"])
+
+    # Write new config file
+    config_path_i = os.path.splitext(config_path)[0] + f"-{1+i}.json"
+    with open(config_path_i, "w") as f:
+        f.write(json.dumps(config_json))
+
+    # Run Palace simulation (alternatively, use Popen and wait); note that all
+    # command line arguments must be passed as strings
+    subprocess.run(["palace", "-np", "2", config_path_i])
+```
+
+## Driven problems in the time domain
+
+The previous simulation types describe simulations based on frequency domain formulations of
+Maxwell's equations. Time domain simulations are also possible through the transient
+simulation type:
+[`config["Problem"]["Type"]: "Transient"`](../config/problem.md#config%5B%22Problem%22%5D).
+
+Similar to the driven simulation type in the frequency domain, transient simulations involve
+simulating the response of the system to a time-dependent excitation field specified at
+lumped ports or surface current excitations in the model. The system is always started from
+rest with zero initial conditions and time-integrated for a user specified duration, given
+in nanoseconds. There are several available excitation types which define the time
+dependence of the pulse or excitation waveform. These are specified under the
+[`config["Solver"]["Transient"]`](../config/solver.md#solver%5B%22Transient%22%5D) object
+using the `"Excitation"` keyword.
+
+The time histories of the lumped port voltages and currents are postprocessed and
+automatically written to ASCII files named `port-V.csv` and `port-I.csv`, respectively, in
+the directory specified by
+[`config["Problem"]["Output"]`](../config/problem.md#config%5B%22Problem%22%5D).
+Additionally, surface current excitation time histories are written to `surface-I.csv`.
+
+## Electrostatic problems
+
+For electrostatic simulations
+([`config["Problem"]["Type"]: "Electrostatic"`](../config/problem.md#config%5B%22Problem%22%5D)),
+the user should specify a number of terminal boundaries
+([`config["Boundaries"]["Terminal"]`](../config/boundaries.md#boundaries%5B%22Terminal%22%5D))
+as well as boundaries which are grounded
+([`config["Boundaries"]["Ground"]`](../config/boundaries.md#boundaries%5B%22Ground%22%5D)).
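+
+As an illustration, the following Python sketch assembles the relevant configuration file
+entries for a simple electrostatic setup with one grounded boundary and two terminals. The
+boundary attribute numbers are hypothetical, and only a fragment of a full configuration
+file is shown (the `"Model"`, `"Domains"`, and `"Solver"` sections are omitted):
+
+```python
+import json
+
+# Hypothetical boundary attributes: 1 is a ground plane, 2 and 3 are terminals
+config = {
+    "Problem": {"Type": "Electrostatic", "Output": "postpro"},
+    "Boundaries": {
+        "Ground": {"Attributes": [1]},
+        "Terminal": [
+            {"Index": 1, "Attributes": [2]},
+            {"Index": 2, "Attributes": [3]}
+        ]
+    }
+}
+
+with open("electrostatic.json", "w") as f:
+    json.dump(config, f, indent=2)
+```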
+For each terminal, an electrostatic field is computed by assigning the terminal of interest +a positive unit voltage and all other terminals and grounded boundaries a zero voltage. The +resulting fields are then used to compute the Maxwell capacitance matrix and its inverse, +which are written to an ASCII file named `terminal-C.csv` and `terminal-Cinv.csv`, +respectively, in the directory specified by +[`config["Problem"]["Output"]`](../config/problem.md#config%5B%22Problem%22%5D). The mutual +capacitance matrix is also computed and written to `terminal-Cm.csv` in the same directory. + +## Magnetostatic problems + +For magnetostatic simulations, +([`config["Problem"]["Type"]: "Magnetostatic"`](../config/problem.md#config%5B%22Problem%22%5D), +the user should specify a number of source current boundaries. For each current source, a +magnetostatic field is computed by applying a unit current to the source index of interest, +leaving all other sources open with no excitation. Surfaces which are expected to carry +current should be labeled as perfectly conducting, which prescribes a zero magnetic flux, or +[magnetic insulation](https://doc.comsol.com/5.5/doc/com.comsol.help.comsol/comsol_ref_acdc.17.74.html), +boundary condition. The resulting fields are used to compute the inductance matrix and its +inverse, which are written to an ASCII file named `terminal-M.csv` and `terminal-Minv.csv`, +respectively, in the directory specified by +[`config["Problem"]["Output"]`](../config/problem.md#config%5B%22Problem%22%5D). A "mutual" +inductance matrix which has the same form as the mutual capacitance matrix (its entries are +based on current differences between ports rather than absolute currents) is computed and +written to `terminal-Mm.csv` in the same directory. diff --git a/docs/src/index.md b/docs/src/index.md index cf7bd2c910..e280dded0a 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,62 +1,64 @@ -```@raw html - - -``` - -# Palace: 3D Finite Element Solver for Computational Electromagnetics - -*Palace*, for **PA**rallel **LA**rge-scale **C**omputational **E**lectromagnetics, is an -open-source, parallel finite element code for full-wave 3D electromagnetic simulations in -the frequency or time domain, using the [MFEM finite element discretization library] -(http://mfem.org). - -## Key features - - - [Eigenmode calculations](guide/problem.md#Eigenmode-problems) with optional material or - radiative loss including lumped impedance boundaries. Automatic postprocessing of - energy-participation ratios (EPRs) for [circuit quantization] - (https://www.nature.com/articles/s41534-021-00461-8) and interface or bulk - participation ratios for predicting dielectric loss. - - [Frequency domain driven simulations] - (guide/problem.md#Driven-problems-in-the-frequency-domain) with surface current - excitation and lumped or numeric wave port boundaries. Wideband frequency response - calculation using uniform frequency space sampling or an adaptive fast frequency sweep - algorithm. - - Explicit or fully-implicit [time domain solver] - (guide/problem.md#Driven-problems-in-the-time-domain) for transient electromagnetic - analysis. - - Lumped capacitance and inductance matrix extraction via [electrostatic] - (guide/problem.md#Electrostatic-problems) and [magnetostatic] - (guide/problem.md#Magnetostatic-problems) problem formulations. 
- - Support for a wide range of [mesh file formats] - (guide/model.md#Supported-mesh-formats) for structured and unstructured meshes, - with built-in uniform or region-based parallel [mesh refinement] - (guide/model.md#Mesh-refinement). - - Solution-based [Adaptive Mesh Refinement (AMR)](guide/model.md#Mesh-refinement) for all - simulation types aside from transient. Nonconformal refinement is supported for all - mesh types, and conformal refinement for simplex meshes. - - Arbitrary high-order finite element spaces and curvilinear mesh support thanks to - the [MFEM library](https://mfem.org/features/). - - Scalable algorithms for the [solution of linear systems of equations] - (config/solver.md#solver["Linear"]), including geometric multigrid (GMG), parallel - sparse direct solvers, and algebraic multigrid (AMG) preconditioners, for fast - performance on platforms ranging from laptops to HPC systems. - -## Contents - - - [Installation](install.md) - - [Execution](run.md) - - [User Guide](guide/guide.md) - - [Configuration File](config/config.md) - - [Examples](examples/examples.md) - - [Reference](reference.md) - - [Developer Notes](developer.md) - -## Coming soon - - - Improved adaptive mesh refinement (AMR) support for transient simulation type and - numeric wave ports on nonconformal meshes - - Efficient high-order operator assembly and GPU support - - Perfectly matched layer (PML) boundaries - - Periodic boundaries with phase delay constraints - - Automatic mesh generation +```@raw html + + +``` + +# Palace: 3D Finite Element Solver for Computational Electromagnetics + +*Palace*, for **PA**rallel **LA**rge-scale **C**omputational **E**lectromagnetics, is an +open-source, parallel finite element code for full-wave 3D electromagnetic simulations in +the frequency or time domain, using the +[MFEM finite element discretization library](http://mfem.org) and +[libCEED library](https://github.com/CEED/libCEED) for efficient exascale discretizations. + +## Key features + + - [Eigenmode calculations](guide/problem.md#Eigenmode-problems) with optional material or + radiative loss including lumped impedance boundaries. Automatic postprocessing of + energy-participation ratios (EPRs) for + [circuit quantization](https://www.nature.com/articles/s41534-021-00461-8) and interface + or bulk participation ratios for predicting dielectric loss. + - [Frequency domain driven simulations](guide/problem.md#Driven-problems-in-the-frequency-domain) + with surface current excitation and lumped or numeric wave port boundaries. Wideband + frequency response calculation using uniform frequency space sampling or an adaptive + fast frequency sweep algorithm. + - Explicit or fully-implicit + [time domain solver](guide/problem.md#Driven-problems-in-the-time-domain) for transient + electromagnetic analysis. + - Lumped capacitance and inductance matrix extraction via + [electrostatic](guide/problem.md#Electrostatic-problems) and + [magnetostatic](guide/problem.md#Magnetostatic-problems) problem formulations. + - Support for a wide range of [mesh file formats](guide/model.md#Supported-mesh-formats) + for structured and unstructured meshes, with built-in uniform or region-based parallel + [mesh refinement](guide/model.md#Mesh-refinement). + - Solution-based [Adaptive Mesh Refinement (AMR)](guide/model.md#Mesh-refinement) for all + simulation types aside from transient. Nonconformal refinement is supported for all mesh + types, and conformal refinement for simplex meshes. 
+ - Arbitrary high-order finite element spaces and curvilinear mesh support thanks to + the [MFEM library](https://mfem.org/features/). + - Scalable algorithms for the + [solution of linear systems of equations](config/solver.md#solver%5B%22Linear%22%5D), + including matrix-free $p$-multigrid utilizing + [high-order operator partial assembly](https://mfem.org/performance/), parallel sparse + direct solvers, and algebraic multigrid (AMG) preconditioners, for fast performance on + platforms ranging from laptops to HPC systems. + - Support for + [hardware acceleration using NVIDIA or AMD GPUs](https://libceed.org/en/latest/intro/), + including multi-GPU parallelism, using pure CUDA and HIP code as well as + [MAGMA](https://icl.utk.edu/magma/) and other libraries. + +## Contents + + - [Installation](install.md) + - [Execution](run.md) + - [User Guide](guide/guide.md) + - [Configuration File](config/config.md) + - [Examples](examples/examples.md) + - [Reference](reference.md) + - [Developer Notes](developer/notes.md) + +## Coming soon + + - Improved adaptive mesh refinement (AMR) support for transient simulations + - Perfectly matched layer (PML) boundaries + - Automatic mesh generation and optimization diff --git a/docs/src/install.md b/docs/src/install.md index ee5752bbb7..da0315a3ad 100644 --- a/docs/src/install.md +++ b/docs/src/install.md @@ -1,208 +1,244 @@ -```@raw html - - -``` - -# Installation - -*Palace* can be built and installed using the [Spack HPC package manager] -(https://spack.io/), following the instructions in the [Build using Spack] -(#Build-using-Spack) section. Containerized builds are possible with Singularity/Apptainer, -described in [Build using Singularity/Apptainer](#Build-using-Singularity/Apptainer). -Alternatively, compiling from source using [CMake](https://cmake.org/download) is described -in [Build from source](#Build-from-source). - -## Build using Spack - -*Palace* is a registered package in the built-in Spack package repository. To install the -solver, follow the [instructions for setting up Spack on your system] -(https://spack.readthedocs.io/en/latest/getting_started.html) and run: - -```bash -spack install palace -``` - -More information about about the available configuration options and dependencies can be -found using `spack info palace`. - -## Build using Singularity/Apptainer - -*Palace* can be built in a [Singularity/Apptainer] -(https://apptainer.org/docs/user/main/introduction.html) container for HPC environments -supporting the Singularity/Apptainer container system. To build the container using the -provided definition file in the [singularity/] -(https://github.com/awslabs/palace/blob/main/singularity) directory, first -[set up Singularity/Apptainer on your system] -(https://github.com/apptainer/apptainer/blob/main/INSTALL.md) and subsequently run: - -```bash -singularity build palace.sif /singularity/singularity.def -``` - -where the repository source code has been cloned to ``. For more information -about Singularity/Apptainer, see the [Quick Start] -(https://apptainer.org/docs/user/main/quick_start.html) guide in the Singularity/Apptainer -documentation. 
- -## Build from source - -A build from source requires the following prerequisites installed on your system: - - - [CMake](https://cmake.org/download) version 3.18.1 or later - - C++ compiler supporting C++17 - - C and (optionally) Fortran compilers for dependency builds - - MPI distribution - - BLAS, LAPACK libraries (described below in [Math libraries](#Math-libraries)) - -In addition, builds from source require the following system packages which are typically -already installed and are available from most package managers (`apt`, `dnf`, `brew`, etc.): - - - Python 3 - - [`pkg-config`](https://www.freedesktop.org/wiki/Software/pkg-config/) - - [`libunwind`](https://www.nongnu.org/libunwind/) (optional) - - [`zlib`](https://zlib.net/) (optional) - -### Quick start - -To start, clone the code using - -```bash -git clone https://github.com/awslabs/palace.git -``` - -Then, a build using the default options can be performed by running the following from -within the directory where the repository was cloned: - -```bash -mkdir build && cd build -cmake .. -make -j -``` - -This installs the binary executable in `build/bin/`. - -### Configuration options - -To configure a *Palace* build in `` using the source code in ``, -run: - -```bash -mkdir && cd -cmake [OPTIONS] -``` - -Here, `[OPTIONS]` is a list of options passed to `cmake` of the form `-D=`. -The *Palace* build respects standard CMake variables, including: - - - `CMAKE_CXX_COMPILER`, `CMAKE_C_COMPILER`, and `CMAKE_Fortran_COMPILER` which define the - desired compilers. - - `CMAKE_CXX_FLAGS`, `CMAKE_C_FLAGS`, and `CMAKE_Fortran_FLAGS` which define the - corresponding compiler flags. - - `CMAKE_INSTALL_PREFIX` which specifies the path for installation (if none is provided, - defaults to ``). - - `CMAKE_BUILD_TYPE` which defines the build type such as `Release`, `Debug`, - `RelWithDebInfo`, and `MinSizeRel` (`Release` if not otherwise specified). - - `BUILD_SHARED_LIBS` which is a flag to create shared libraries for dependency library - builds instead of static libraries (`OFF` by default). - - `CMAKE_PREFIX_PATH` which lists directories specifying installation prefixes to be - searched for dependencies. - - `CMAKE_INSTALL_RPATH` and `CMAKE_INSTALL_RPATH_USE_LINK_PATH` which configure the rpath - for installed library and executable targets. - -Additional build options are (with default values in brackets): - - - `PALACE_WITH_64BIT_INT [OFF]` : Build with 64-bit integer support - - `PALACE_WITH_OPENMP [OFF]` : Use OpenMP - - `PALACE_WITH_GSLIB [ON]` : Build with GSLIB library for high-order field interpolation - - `PALACE_WITH_SUPERLU [ON]` : Build with SuperLU_DIST sparse direct solver - - `PALACE_WITH_STRUMPACK [OFF]` : Build with STRUMPACK sparse direct solver - - `PALACE_WITH_MUMPS [OFF]` : Build with MUMPS sparse direct solver - - `PALACE_WITH_SLEPC [ON]` : Build with SLEPc eigenvalue solver - - `PALACE_WITH_ARPACK [OFF]` : Build with ARPACK eigenvalue solver - - `PALACE_WITH_LIBXSMM [ON]` : Build with LIBXSMM backend for libCEED - -The build step is invoked by running (for example with 4 `make` threads) - -```bash -make -j 4 -``` - -or - -```bash -cmake --build . -- -j 4 -``` - -which installs the binary executable in `${CMAKE_INSTALL_PREFIX}/bin/`. 
- -### Math libraries - -During the configure step, the build system will try to detect system installations of BLAS -and LAPACK libraries depending on the system architecture according to the following -procedure: - - - For `x86_64` systems: - - + If the `MKLROOT` environment variable is set, looks for an [Intel MKL] - (https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl.html) - installation. - + If the `AOCL_DIR` or `AOCLROOT` environment variables are set, looks for an - [AMD Optimizing CPU Libraries (AOCL)](https://developer.amd.com/amd-aocl) - installation of BLIS and libFLAME. - + Otherwise, tries to locate an installation of [OpenBLAS](https://www.openblas.net/) - which is permissively licensed and available from most package managers. - - - For `aarch64`/`arm64` systems: - - + If the `ARMPL_DIR` environment variable is set, looks for an - [Arm Performance Libraries (PL)] - (https://www.arm.com/products/development-tools/server-and-hpc/allinea-studio/performance-libraries) - installation. - + Otherwise, tries to locate an installation of [OpenBLAS](https://www.openblas.net/). - -If the installation path of OpenBLAS is non-standard or is not found by default, it can be -set using the `OPENBLAS_DIR` or `OPENBLASROOT` environment variables, or added to -`CMAKE_PREFIX_PATH` when calling CMake. - -It is recommended in most cases to use a serial BLAS and LAPACK builds (not multithreaded), -as the standard parallelization in approach in *Palace* is to use pure MPI parallelism. - -## Dependencies - -*Palace* leverages the [MFEM finite element discretization library](http://mfem.org). It -always configures and builds its own installation of MFEM internally in order to support -the most up to date features and patches. Likewise, Palace will always build its own -installation of [libCEED](https://github.com/CEED/libCEED), and [GSLIB] -(https://github.com/Nek5000/gslib), when `PALACE_WITH_GSLIB=ON`. - -As part of the [Build from source](#Build-from-source), the CMake build will automatically -build and install a small number of third-party dependencies before building *Palace*. The -source code for these dependencies is downloaded using using [Git submodules] -(https://git-scm.com/book/en/v2/Git-Tools-Submodules). These libraries include: - - - [METIS](http://glaros.dtc.umn.edu/gkhome/metis/metis/overview) and [ParMETIS] - (http://glaros.dtc.umn.edu/gkhome/metis/parmetis/overview) - - [Hypre](https://github.com/hypre-space/hypre) - - [SuperLU_DIST](https://github.com/xiaoyeli/superlu_dist) (optional, when - `PALACE_WITH_SUPERLU=ON`) - - [STRUMPACK](https://portal.nersc.gov/project/sparse/strumpack) (optional, when - `PALACE_WITH_STRUMPACK=ON`), including [ButterflyPACK] - (https://github.com/liuyangzhuan/ButterflyPACK) and [zfp](https://github.com/LLNL/zfp) - support - - [MUMPS](http://mumps.enseeiht.fr/) (optional, when `PALACE_WITH_MUMPS=ON`) - - [SLEPc](https://slepc.upv.es/) (optional, when `PALACE_WITH_SLEPC=ON`), including - [PETSc](https://petsc.org/release/) - - [ARPACK-NG](https://github.com/opencollab/arpack-ng) (optional, when - `PALACE_WITH_ARPACK=ON`) - - [LIBXSMM](https://github.com/libxsmm/libxsmm) (optional, when `PALACE_WITH_LIBXSMM=ON`) - - [nlohmann/json](https://github.com/nlohmann/json) - - [fmt](https://fmt.dev/latest) - - [Eigen](https://eigen.tuxfamily.org) - -For solving eigenvalue problems, at least one of SLEPc or ARPACK-NG must be specified. 
-Typically only one of the SuperLU_DIST, STRUMPACK, and MUMPS dependencies is required but
-all can be built so the user can decide at runtime which solver to use.
-
-For unit testing, Palace relies on the [Catch2 library](https://github.com/catchorg/Catch2),
-which is automatically downloaded and built when building the `unit-tests` target. See the
-[Developer Notes](developer.md#Testing) for more information.
+```@raw html
+
+
+```
+
+# Installation
+
+*Palace* can be built and installed using the
+[Spack HPC package manager](https://spack.io/), following the instructions in
+the [Build using Spack](#Build-using-Spack) section. Containerized builds are possible with
+Singularity/Apptainer, described in
+[Build using Singularity/Apptainer](#Build-using-Singularity/Apptainer). Alternatively,
+compiling from source using [CMake](https://cmake.org/download) is described in
+[Build from source](#Build-from-source).
+
+If you are a user, we recommend you install [*Palace* with
+Spack](#Build-using-Spack). If you intend to develop *Palace*, [build from
+source](#Build-from-source) instead.
+
+## Build using Spack
+
+*Palace* is a registered package in the built-in Spack package repository. To
+install the solver, follow the [instructions for setting up Spack on your
+system](https://spack.readthedocs.io/en/latest/getting_started.html). Note that
+Spack requires basic system utilities that may not be installed by default on
+certain systems (such as Ubuntu for Windows Subsystem for Linux). Consult the
+[Spack Prerequisites
+page](https://spack.readthedocs.io/en/latest/installing_prerequisites.html) to
+ensure all required utilities are installed.
+
+Once you have installed Spack, run:
+
+```bash
+spack install palace
+```
+
+This will install the default version of *Palace*. Spack supports installing
+_variants_ of *Palace*. For instance, if you want to install *Palace* with CUDA,
+MUMPS and SLEPc, call
+
+```bash
+spack install palace +mumps +slepc +cuda cuda_arch=90
+```
+
+where `cuda_arch` is determined by the [generation of your
+GPU](https://developer.nvidia.com/cuda-gpus). More information about the
+available configuration options and dependencies can be found using `spack info palace`. See the [official
+tutorial](https://spack-tutorial.readthedocs.io/en/latest/tutorial_basics.html)
+for an introduction.
+
+## Build using Singularity/Apptainer
+
+*Palace* can be built in a
+[Singularity/Apptainer](https://apptainer.org/docs/user/main/introduction.html) container
+for HPC environments
+supporting the Singularity/Apptainer container system. To build the container using the
+provided definition file in the
+[singularity/](https://github.com/awslabs/palace/blob/main/singularity) directory, first
+[set up Singularity/Apptainer on your system](https://github.com/apptainer/apptainer/blob/main/INSTALL.md)
+and subsequently run:
+
+```bash
+singularity build palace.sif <SOURCE_DIR>/singularity/singularity.def
+```
+
+where the repository source code has been cloned to `<SOURCE_DIR>`. For more information
+about Singularity/Apptainer, see the
+[Quick Start](https://apptainer.org/docs/user/main/quick_start.html) guide in the
+Singularity/Apptainer documentation.
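+
+As a quick check that the resulting image works (assuming the container was built as
+`palace.sif` above and that a valid configuration file `config.json` for an example
+problem is available), simulations can be launched through the container in the same way
+as for a native installation, for example:
+
+```bash
+singularity run palace.sif -np 2 config.json
+```
+
+See [Running *Palace*](run.md) for more details on launching simulations.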
+
+## Build from source
+
+A build from source requires the following prerequisites installed on your system:
+
+ - [CMake](https://cmake.org/download) version 3.24 or later
+ - C++17 compatible C++ compiler
+ - C and Fortran (optional) compilers for dependency builds
+ - MPI distribution
+ - BLAS, LAPACK libraries (described below in [Math libraries](#Math-libraries))
+ - [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit) or
+   [ROCm](https://rocm.docs.amd.com/en/latest/) installation (optional, for GPU support
+   only)
+
+In addition, builds from source require the following system packages which are typically
+already installed and are available from most package managers (`apt`, `dnf`, `brew`, etc.):
+
+ - Python 3
+ - [`pkg-config`](https://www.freedesktop.org/wiki/Software/pkg-config/)
+ - [`libunwind`](https://www.nongnu.org/libunwind/) (optional)
+ - [`zlib`](https://zlib.net/) (optional)
+
+### Quick start
+
+To start, clone the code using
+
+```bash
+git clone https://github.com/awslabs/palace.git
+```
+
+Then, a build using the default options can be performed by running the following from
+within the directory where the repository was cloned:
+
+```bash
+mkdir build && cd build
+cmake ..
+make -j
+```
+
+This installs the binary executable in `build/bin/`.
+
+### Configuration options
+
+To configure a *Palace* build in `<BUILD_DIR>` using the source code in `<SOURCE_DIR>`,
+run:
+
+```bash
+mkdir <BUILD_DIR> && cd <BUILD_DIR>
+cmake <SOURCE_DIR> [OPTIONS]
+```
+
+Here, `[OPTIONS]` is a list of options passed to `cmake` of the form `-D<VARIABLE>=<VALUE>`.
+The *Palace* build respects standard CMake variables, including:
+
+ - `CMAKE_CXX_COMPILER`, `CMAKE_C_COMPILER`, and `CMAKE_Fortran_COMPILER` which define the
+   desired compilers.
+ - `CMAKE_CXX_FLAGS`, `CMAKE_C_FLAGS`, and `CMAKE_Fortran_FLAGS` which define the
+   corresponding compiler flags.
+ - `CMAKE_CUDA_COMPILER`, `CMAKE_CUDA_FLAGS`, `CMAKE_CUDA_ARCHITECTURES`, and the
+   corresponding `CMAKE_HIP_COMPILER`, `CMAKE_HIP_FLAGS`, and `CMAKE_HIP_ARCHITECTURES` for
+   GPU-accelerated builds with CUDA or HIP.
+ - `CMAKE_INSTALL_PREFIX` which specifies the path for installation (if none is provided,
+   defaults to `<BUILD_DIR>`).
+ - `CMAKE_BUILD_TYPE` which defines the build type such as `Release`, `Debug`,
+   `RelWithDebInfo`, and `MinSizeRel` (`Release` if not otherwise specified).
+ - `BUILD_SHARED_LIBS` which is a flag to create shared libraries for dependency library
+   builds instead of static libraries (`OFF` by default).
+ - `CMAKE_PREFIX_PATH` which lists directories specifying installation prefixes to be
+   searched for dependencies.
+ - `CMAKE_INSTALL_RPATH` and `CMAKE_INSTALL_RPATH_USE_LINK_PATH` which configure the rpath
+   for installed library and executable targets.
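+
+For example, a configure step which explicitly selects the compilers, build type, and
+installation location might look like the following, where the compiler names and
+installation path are purely illustrative and should be adapted to the user's system:
+
+```bash
+cmake <SOURCE_DIR> \
+  -DCMAKE_CXX_COMPILER=g++ \
+  -DCMAKE_C_COMPILER=gcc \
+  -DCMAKE_Fortran_COMPILER=gfortran \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DCMAKE_INSTALL_PREFIX=$HOME/local/palace
+```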
+ +Additional build options are (with default values in brackets): + + - `PALACE_WITH_64BIT_INT [OFF]` : Build with 64-bit integer support + - `PALACE_WITH_OPENMP [OFF]` : Use OpenMP for shared-memory parallelism + - `PALACE_WITH_CUDA [OFF]` : Use CUDA for NVIDIA GPU support + - `PALACE_WITH_HIP [OFF]` : Use HIP for AMD or NVIDIA GPU support + - `PALACE_WITH_GPU_AWARE_MPI [OFF]` : Option to set if MPI distribution is GPU aware + - `PALACE_WITH_SUPERLU [ON]` : Build with SuperLU_DIST sparse direct solver + - `PALACE_WITH_STRUMPACK [OFF]` : Build with STRUMPACK sparse direct solver + - `PALACE_WITH_MUMPS [OFF]` : Build with MUMPS sparse direct solver + - `PALACE_WITH_SLEPC [ON]` : Build with SLEPc eigenvalue solver + - `PALACE_WITH_ARPACK [OFF]` : Build with ARPACK eigenvalue solver + - `PALACE_WITH_LIBXSMM [ON]` : Build with LIBXSMM backend for libCEED + - `PALACE_WITH_MAGMA [ON]` : Build with MAGMA backend for libCEED + - `PALACE_WITH_GSLIB [ON]` : Build with GSLIB library for high-order field interpolation + - `PALACE_WITH_SUNDIALS [ON]` : Build with SUNDIALS ODE solver library + +The build step is invoked by running (for example with 4 `make` threads) + +```bash +make -j 4 +``` + +or + +```bash +cmake --build . -- -j 4 +``` + +which installs the binary executable in `${CMAKE_INSTALL_PREFIX}/bin/`. + +### Math libraries + +During the configure step, the build system will try to detect system installations of BLAS +and LAPACK libraries depending on the system architecture according to the following +procedure: + + - For `x86_64` systems: + + + If the `MKLROOT` environment variable is set, looks for an + [Intel MKL](https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl.html) + installation. + + If the `AOCL_DIR` or `AOCLROOT` environment variables are set, looks for an + [AMD Optimizing CPU Libraries (AOCL)](https://developer.amd.com/amd-aocl) + installation of BLIS and libFLAME. + + Otherwise, tries to locate an installation of [OpenBLAS](https://www.openblas.net/) + which is permissively licensed and available from most package managers. + + - For `aarch64`/`arm64` systems: + + + If the `ARMPL_DIR` environment variable is set, looks for an + [Arm Performance Libraries (PL)](https://www.arm.com/products/development-tools/server-and-hpc/allinea-studio/performance-libraries) + installation. + + Otherwise, tries to locate an installation of [OpenBLAS](https://www.openblas.net/). + +If the installation path of OpenBLAS is non-standard or is not found by default, it can be +set using the `OPENBLAS_DIR` or `OPENBLASROOT` environment variables, or added to +`CMAKE_PREFIX_PATH` when calling CMake. + +It is recommended in most cases to use serial BLAS and LAPACK builds (not multithreaded), +as the standard parallelization approach in *Palace* is to use pure MPI parallelism. + +## Dependencies + +*Palace* leverages the [MFEM finite element discretization library](http://mfem.org). It +always configures and builds its own installation of MFEM internally in order to support +the most up to date features and patches. Likewise, *Palace* will always build its own +installation of [libCEED](https://github.com/CEED/libCEED), and +[GSLIB](https://github.com/Nek5000/gslib), when `PALACE_WITH_GSLIB=ON`. + +As part of the [Build from source](#Build-from-source), the CMake build will automatically +build and install a small number of third-party dependencies before building *Palace*. 
The +source code for these dependencies is downloaded during the build process: + + - [METIS](http://glaros.dtc.umn.edu/gkhome/metis/metis/overview) and + [ParMETIS](http://glaros.dtc.umn.edu/gkhome/metis/parmetis/overview) + - [Hypre](https://github.com/hypre-space/hypre) + - [SuperLU_DIST](https://github.com/xiaoyeli/superlu_dist) (optional, when + `PALACE_WITH_SUPERLU=ON`) + - [STRUMPACK](https://portal.nersc.gov/project/sparse/strumpack) (optional, when + `PALACE_WITH_STRUMPACK=ON`), including + [ButterflyPACK](https://github.com/liuyangzhuan/ButterflyPACK) and + [zfp](https://github.com/LLNL/zfp) support + - [MUMPS](http://mumps.enseeiht.fr/) (optional, when `PALACE_WITH_MUMPS=ON`) + - [SLEPc](https://slepc.upv.es/) (optional, when `PALACE_WITH_SLEPC=ON`), including + [PETSc](https://petsc.org/release/) + - [ARPACK-NG](https://github.com/opencollab/arpack-ng) (optional, when + `PALACE_WITH_ARPACK=ON`) + - [LIBXSMM](https://github.com/libxsmm/libxsmm) (optional, when `PALACE_WITH_LIBXSMM=ON`) + - [MAGMA](https://icl.utk.edu/magma/) (optional, when `PALACE_WITH_MAGMA=ON`) + - [SUNDIALS](https://github.com/LLNL/sundials) (optional, when `PALACE_WITH_SUNDIALS=ON`) + - [nlohmann/json](https://github.com/nlohmann/json) + - [fmt](https://fmt.dev/latest) + - [Eigen](https://eigen.tuxfamily.org) + +For solving eigenvalue problems, at least one of SLEPc or ARPACK-NG must be specified. +Typically only one of the SuperLU_DIST, STRUMPACK, and MUMPS dependencies is required but +all can be built so the user can decide at runtime which solver to use. + +For unit testing, *Palace* relies on the [Catch2 +library](https://github.com/catchorg/Catch2), which is automatically downloaded +and built when building the `unit-tests` target. See the [Developer +Notes](developer/testing.md) for more information. diff --git a/docs/src/reference.md b/docs/src/reference.md index 58b13d29f9..b5142fe427 100644 --- a/docs/src/reference.md +++ b/docs/src/reference.md @@ -1,392 +1,610 @@ -```@raw html - - -``` - -# Reference - -## Mathematical background - -The solver computes a finite element approximation to the three-dimensional, time-harmonic -Maxwell's equations in second-order form. The nondimensionalized, source-free, boundary -value problem for ``\bm{E}(\bm{x})\in\mathbb{C}^3``, ``\bm{x}\in\Omega``, -``\partial\Omega=\Gamma``, where -``\bm{\mathscr{E}}(\bm{x},t) = \text{Re}\{\bm{E}(\bm{x})e^{i\omega t}\}`` denotes the -electric field, is written as - -```math -\begin{aligned} -\nabla\times\mu_r^{-1}\nabla\times\bm{E} + i\omega\sigma\bm{E} - - \omega^2\varepsilon_r\bm{E} &= 0 \,,\; \bm{x}\in\Omega \\ -\bm{n}\times\bm{E} &= 0 \,,\; \bm{x}\in\Gamma_{PEC} \\ -\bm{n}\times(\mu_r^{-1}\nabla\times\bm{E}) &= 0 \,,\; \bm{x}\in\Gamma_{PMC} \\ -\bm{n}\times(\mu_r^{-1}\nabla\times\bm{E}) - + \gamma\bm{n}\times(\bm{n}\times\bm{E}) &= \bm{U}^{inc} \,,\; \bm{x}\in\Gamma_{Z} -\end{aligned} -``` - -where the nondimensionalization has been performed with respect to a characteristic length -``L_0``, time ``L_0/c_0``, magnetic field strength ``H_0``, and electric field strength -``Z_0 H_0``. Here, ``c_0`` and ``Z_0`` are the speed of light and impedance of free space, -respectively. Given the electric field solution, the time-harmonic magnetic flux density -can be calculated as - -```math -\bm{B} = -\frac{1}{i\omega}\nabla\times\bm{E} \,. -``` - -The flux density is related to the magnetic field, ``\bm{H}``, by the standard linear -constitutive relationship ``\bm{H} = \mu_r^{-1}\bm{B}``. 
- -For a general isotropic lossy dielectric, the relative permittivity ``\varepsilon_r`` is a -complex scalar: - -```math -\varepsilon_r = \varepsilon_r' (1-i\tan{\delta}) -``` - -where ``\varepsilon_r'`` is the real relative permittivity and ``\tan{\delta}`` is the loss -tangent. Alternatively, conductor loss is modeled by Ohm's law ``\bm{J}=\sigma\bm{E}`` with -electrical conductivity ``\sigma>0.0``. For a superconducting domain, the constitive -current-field relationship given by Ohm's law is replaced by that given by the London -equations: - -```math -\frac{\partial \bm{J}}{\partial t}=\frac{1}{\mu_r\lambda_L^2}\bm{E} -``` - -where ``\lambda_L = \sqrt{m/\mu n_s e^2}/L_0`` is the nondimensionalized London penetration -depth. In this case, the term ``+i\omega\sigma \bm{E}`` arising for a normal conductor in -the time-harmonic Maxwell's equations becomes ``+(\mu_r \lambda_L^2)^{-1}\bm{E}``. - -The domain boundary ``\Gamma=\Gamma_{PEC}\cup\Gamma_{PMC}\cup\Gamma_{Z}``, is separated into -perfect electric conductor (PEC), perfect magnetic conductor (PMC), and impedance -boundaries, respectively. The PEC boundary condition is a homogenous Dirichlet condition, -while the PMC boundary condition is the natural boundary condition for the problem and is -satisfied at all exterior boundaries by the finite element formulation. Impedance -boundaries are modeled using a Robin boundary condition with ``\gamma = i\omega/Z_s``, in -which ``Z_s`` the surface impedance of the boundary, with units of impedance per square. - -## Time domain formulation - -A time-dependent formulation is also available to compute the electric field response -``\bm{E}(\bm{x},t)`` for a given time-dependent source excitation -``\bm{U}^{inc}(\bm{x},t)``. The governing equations in this case are - -```math -\nabla\times\mu_r^{-1}\nabla\times\bm{E} + \sigma\frac{\partial\bm{E}}{\partial t} - + \varepsilon_r\frac{\partial^2\bm{E}}{\partial t^2} = 0 \,,\; \bm{x}\in\Omega -``` - -subject to the same boundary conditions as the frequency-dependent case except for the Robin -boundary condition which is written for a lumped resistive port boundary, for example, as - -```math -\bm{n}\times(\mu_r^{-1}\nabla\times\bm{E}) - + Z_s^{-1}\bm{n}\times\left(\bm{n}\times\frac{\partial\bm{E}}{\partial t}\right) - = \bm{U}^{inc} \,,\; \bm{x}\in\Gamma_{Z} \,. -``` - -The second-order electric field formulation is chosen to take advantage of unconditionally -stable implicit time-integration schemes without the expense of a coupled block system -solve for ``\bm{E}(\bm{x},t)`` and ``\bm{B}(\bm{x},t)``. It offers the additional benefit -of sharing many similarities in the spatial discretization as the frequency domain -formulation outlined above. - -## Lumped ports and wave ports - -For lumped port boundaries, the surface impedance can be related to an equivalent circuit -impedance, ``Z``. There are two common cases: - - 1. *Rectangular ports*: ``Z = Z_s l / w``, where ``l`` and ``w`` are the length and width - of the port, respectively (length here is defined as the distance between the two - conductors). - - 2. *Coaxial ports*: ``Z = Z_s \ln(b/a) / 2\pi``, where ``a`` and ``b`` denote the inner and - outer radii of the port, respectively. - -A lumped parallel RLC circuit boundary has a circuit impedance - -```math -\frac{1}{Z} = \frac{1}{R}+\frac{1}{i\omega L}+i\omega C \,. 
-``` - -Thus, the relationships between the circuit and surface element parameters for the user to -specify are given by ``R_s = \alpha R``, ``L_s = \alpha L``, and ``C_s = C/\alpha``, where -``\alpha = w/l`` for a rectangular port or ``\alpha = 2\pi / \ln(b/a)`` for a coaxial -port. - -For multielement lumped ports, the effective circuit impedance is given by - -```math -\frac{1}{Z} = \sum_k \frac{1}{Z_k} \,. -``` - -That is, the circuit impedances of each port contributing to the multielement port add in -parallel. For the specific case of a two element multielement port with two identical -lumped elements, we have ``Z = (1/Z_1 + 1/Z_2)^{-1} = Z_k / 2``, where ``Z_k`` is the -circuit impedance of a single port element. - -The source term ``\bm{U}^{inc}`` in a driven frequency-response problem is related to the -incident field at an excited port boundary by - -```math -\bm{U}^{inc} = -2\gamma(\bm{n}\times\bm{E}^{inc})\times\bm{n} -``` - -where ``(\bm{n}\times\bm{E}^{inc})\times\bm{n}`` is just the projection of the excitation -field onto the port surface. The incident fields for lumped ports depend on the port -shape: - - 1. *Rectangular ports*: ``\bm{E}^{inc} = E_0 \, \hat{\bm{l}}``, where ``E_0`` is a uniform - constant field strength and ``\hat{\bm{l}}`` a unit vector defining the direction of - polarization on the port (typically should be the direction between the two conductors). - - 2. *Coaxial ports*: ``\bm{E}^{inc} = \frac{E_0 r_0}{r} \, \hat{\bm{r}}``, where ``E_0`` is - again a uniform constant field strength, ``r_0`` is a characteristic length for the - port, ``r`` is the distance from the port center, and ``\hat{\bm{r}}`` a unit vector - specifying the port radial direction. - -In the time domain formulation, the source term ``\bm{U}^{inc}`` appears as - -```math -\bm{U}^{inc} = -2 Z_s^{-1}\left(\bm{n}\times\frac{\partial\bm{E}^{inc}}{\partial t}\right) - \times\bm{n} \,. -``` - -The incident field ``\bm{E}^{inc}(\bm{x},t)`` is - -```math -\bm{E}^{inc}(\bm{x},t) = p(t)\bm{E}^{inc}(\bm{x}) -``` - -where ``\bm{E}^{inc}(\bm{x})`` is identical to the spatial excitation in the frequency -domain formulation, and ``p(t)`` describes the temporal shape of the excitation. Possible -options include a sinusoidal, Gaussian, modulated Gaussian, or step excitation. - -In the frequency domain, the scattering parameters can be postprocessed from the computed -electric field for each lumped port with boundary ``\Gamma_i`` as - -```math -S_{ij} = \frac{\displaystyle\int_{\Gamma_i}\bm{E}\cdot\bm{E}^{inc}_i\,dS} - {\displaystyle\int_{\Gamma_i}\bm{E}^{inc}_i\cdot\bm{E}^{inc}_i\,dS} - \delta_{ij} \,. -``` - -In the time domain, the time histories of the port voltages can be Fourier-transformed to -get their frequency domain representation for scattering parameter calculation. - -Numeric wave ports assume a field with known normal-direction dependence -``\bm{E}(\bm{x})=\bm{e}(\bm{x}_t)e^{ik_n x_n}`` where ``k_n`` is the propagation constant. -For each operating frequency ``\omega``, a two-dimensional eigenvalue problem is solved on -the port yielding the mode shapes ``\bm{e}_m`` and associated propagation constants -``k_{n,m}``. These are used in the full 3D model where the Robin port boundary condition has -coefficient ``\gamma=i\text{Re}\{k_{n,m}\}/\mu_r`` and the computed mode is used to compute -the incident field in the source term ``\bm{U}^{inc}`` at excited ports. Scattering -parameter postprocessing takes the same form as the lumped port counterpart using the -computed modal solutions. 
Since the propagation constants are known for each wave port, -scattering parameter de-embedding can be performed by specifying an offset distance ``d`` -for each port: - -```math -\tilde{S}_{ij} = S_{ij}e^{ik_{n,i}d_i}e^{ik_{n,j}d_j} \,. -``` - -## Eigenmode calculations - -For eigenmode problems, the source term is zero and a quadratic eigenvalue problem for the -eigenvalues ``\omega`` is solved: - -```math -(\bm{K}+i\omega\bm{C}-\omega^2\bm{M})\bm{x} = 0 -``` - -where the matrix ``\bm{K}`` represents the discretized curl-curl operator, ``\bm{M}`` the -mass term, and ``\bm{C}`` the port impedance boundary conditions. The damped frequency -``\omega_d`` and quality factor ``Q`` is postprocessed from each of the resulting -eigenvalues as - -```math -\omega_d = \text{Re}\{\omega\} \,, \qquad Q = \frac{|\omega|}{2|\text{Im}\{\omega\}|} \,. -``` - -## Energy-participation ratios - -The energy-participation ratio (EPR) for lumped inductive elements is computed from the -electric and magnetic fields corresponding to eigenmode ``m``, ``\bm{E}_m`` and -``\bm{H}_m``, using the formula - -```math -p_{mj} = \frac{1}{\mathcal{E}^{elec}_m} \, \frac{1}{2} \, L_j I_{mj}^2 -``` - -where ``p_{mj}\in[-1,1]`` denotes the signed participation ratio for junction ``j`` in mode -``m``, ``L_j`` is the provided junction circuit inductance, ``I_ {mj}`` is the peak -junction current for mode ``m``, and ``\mathcal{E}^{elec}_m`` is the electric energy in -mode ``m``. The junction current is computed using the mean voltage across the port, -``\overline{V}_{mj}``, as ``I_{mj} = \overline{V}_{mj}/Z_{mj}``, where -``Z_{mj} = 1/(i\omega_m L_j)`` is the impedance of the inductive branch of the lumped -circuit. The mean port voltage depends on the computed electric field mode and the shape of -the port: - - 1. *Rectangular ports*: - ``\overline{V}_{mj} = \frac{1}{w_j}\int_{\Gamma_j}\bm{E}_m\cdot\hat{\bm{l}}_j\,dS``. - - 2. *Coaxial ports*: - ``\overline{V}_{mj} = \frac{1}{2\pi}\int_{\Gamma_j}\frac{\bm{E}_m}{r}\cdot\hat{\bm{r}}_j\,dS``. - -Finally, the total electric energy in mode ``m`` is - -```math -\mathcal{E}^{elec}_m - = \frac{1}{2} \, \text{Re}\left\{\int_\Omega\bm{D}_m^*\cdot\bm{E}_m\,dV\right\} - + \sum_j \frac{1}{2} \, C_jV_{mj}^2 -``` - -where ``\bm{D}_m=\varepsilon_r\bm{E}_m`` is the electric flux density for mode ``m`` and the -second term on the right-hand side accounts for any lumped capacitive boundaries with -nonzero circuit capacitance ``C_j``. - -The EPR can also be used to estimate mode quality factors due to input-output(I-O) line -coupling. The mode coupling quality factor due to the ``j``-th I-O port is given by - -```math -Q_{mj} = \frac{\omega_m}{\kappa_{mj}} -``` - -where the port coupling rate ``\kappa_{mj}`` is calculated as - -```math -\kappa_{mj} = \frac{1}{\mathcal{E}^{elec}_m} \, \frac{1}{2}\,R_j I_{mj}^2 \,. -``` - -## Bulk and interface dielectric loss - -The quality factor due to bulk dielectric loss resulting from an electric field ``\bm{E}`` -present in domain ``j`` with associated loss tangent ``\tan{\delta}_j`` is given by - -```math -\frac{1}{Q_j} = p_j \tan{\delta}_j = - \frac{1}{\mathcal{E}^{elec}} \, \frac{1}{2} \, \tan{\delta}_j \, - \text{Re}\left\{\int_{\Omega_j}\bm{D}^*\cdot\bm{E}\,dV\right\} -``` - -where, as above, ``\mathcal{E}^{elec}`` is the total electric field energy in the domain, -including the contributions due to capacitive lumped elements. 
- -Likewise, the quality factor due to surface interface dielectric loss for interface ``j`` is -given by - -```math -\frac{1}{Q_j} = p_j \tan{\delta}_j = - \frac{1}{\mathcal{E}^{elec}} \, \frac{1}{2} \, t_j\tan{\delta}_j \, - \text{Re}\left\{\int_{\Gamma_j}\bm{D}^*\cdot\bm{E}\,dS\right\} -``` - -where ``t_j`` is the thickness of the layer and ``\bm{D} = \varepsilon_{r,j}\bm{E}`` is the -electric displacement field in the layer evaluated using the relative permittivity of the -interface ``\varepsilon_{r,j}``. For an internal boundary, this integral is evaluated on a -single side to resolve abiguity due to the discontinuity of ``\bm{E}`` across the boundary. - -The above formula for interface dielectric loss can be specialized for the case of a -metal-air, metal-substrate, or substrate-air interface. In each case, the quality factor -for interface ``j`` is given by - - - *Metal-air*: - -```math -\frac{1}{Q^{MA}_j} = - \frac{1}{\mathcal{E}^{elec}} \, \frac{1}{2} \, - \frac{t_j\tan{\delta}_j}{\varepsilon_{r,j}^{MA}} \, - \text{Re}\left\{\int_{\Gamma_j}\bm{E}_n^*\cdot\bm{E}_n\,dS\right\} -``` - - - *Metal-substrate*: - -```math -\frac{1}{Q^{MS}_j} = - \frac{1}{\mathcal{E}^{elec}} \, \frac{1}{2} \, - \frac{t_j\tan{\delta}_j(\varepsilon_{r,j}^{S})^2}{\varepsilon_{r,j}^{MA}} \, - \text{Re}\left\{\int_{\Gamma_j}\bm{E}_n^*\cdot\bm{E}_n\,dS\right\} -``` - - - *Substrate-air*: - -```math -\frac{1}{Q^{SA}_j} = - \frac{1}{\mathcal{E}^{elec}} \, \frac{1}{2} \, - t_j\tan{\delta}_j\left(\varepsilon_{r,j}^{SA} \, - \text{Re}\left\{\int_{\Gamma_j}\bm{E}_t^*\cdot\bm{E}_t\,dS\right\} - + \frac{1}{\varepsilon_{r,j}^{SA}} \, - \text{Re}\left\{\int_{\Gamma_j}\bm{E}_n^*\cdot\bm{E}_n\,dS\right\}\right) -``` - -where ``\bm{E}_n`` denotes the normal field to the interface and -``\bm{E}_t=\bm{E}-\bm{E}_n`` denotes the tangential field. - -## Lumped parameter extraction - -For electrostatic simulations, the Maxwell capacitance matrix is computed in the following -manner. First, the Laplace equation subject to Dirichlet boundary conditions is solved for -each terminal with boundary ``\Gamma_i`` in the model, yielding an associated voltage field -``V_i(\bm{x})``: - -```math -\begin{aligned} -\nabla\cdot(\varepsilon_r\nabla V_i) &= 0 \,,\; \bm{x}\in\Omega \\ -V_i &= 1 \,,\; \bm{x}\in\Gamma_i \\ -V_i &= 0 \,,\; \bm{x}\in\Gamma_j \,,\; j\neq i \,. -\end{aligned} -``` - -The energy of the electric field associated with any solution is - -```math -\mathcal{E}(V_i) = \frac{1}{2}\int_\Omega\varepsilon_r\bm{E}_i\cdot\bm{E}_i\,dV -``` - -where ``\bm{E}_i=-\nabla V_i`` is the electric field. Then, the entries of the Maxwell -capacitance matrix, ``\bm{C}``, are given by - -```math -\bm{C}_{ij} = \mathcal{E}(V_i+V_j)-\frac{1}{2}(\bm{C}_{ii}+\bm{C}_{jj}) \,. -``` - -Magnetostatic problems for inductance matrix extraction are based on the magnetic vector -potential formulation: - -```math -\begin{aligned} -\nabla\times(\mu_r^{-1}\nabla\times\bm{A}_i) &= 0 \,,\; \bm{x}\in\Omega \\ -\bm{n}\times\bm{A}_i &= 0 \,,\; \bm{x}\in\Gamma_{PEC} \\ -\bm{n}\times(\mu_r^{-1}\nabla\times\bm{A}_i) = - \bm{n}\times\bm{H}_i &= \bm{J}_s^{inc} \,,\; \bm{x}\in\Gamma_i \\ -\bm{n}\times(\mu_r^{-1}\nabla\times\bm{A}_i) &= 0 \,,\; \bm{x}\in\Gamma_j \,,\; j\neq i \,. -\end{aligned} -``` - -For each port with boundary ``\Gamma_i``, a unit source surface current ``\bm{J}_s^{inc}`` -is applied, yielding an associated vector potential solution ``\bm{A}_i(\bm{x})``. 
-Homogeneous Dirichlet boundary conditions ``\bm{n}\times\bm{A}_i=0`` are also imposed on -specified surfaces of the model. The magnetic field energy associated with any solution is - -```math -\mathcal{E}(\bm{A}_i) = \frac{1}{2}\int_\Omega\mu_r^{-1}\bm{B}_i\cdot\bm{B}_i\,dV -``` - -where ``\bm{B}_i=\nabla\times\bm{A}_i`` is the magnetic flux density. Then, the entries of -the inductance matrix, ``\bm{M}``, are given by - -```math -\bm{M}_{ij} = \frac{1}{I_i I_j}\mathcal{E}(\bm{A}_i+\bm{A}_j) - - \frac{1}{2}\left(\frac{I_i}{I_j}\bm{M}_{ii}+\frac{I_j}{I_i}\bm{M}_{jj}\right) -``` - -where ``I_i`` is the excitation current for port ``i``, computed by integrating the source -surface current ``\bm{J}_s^{inc}`` over the surface of the port. - -## References - -[1] J.-M. Jin, _The Finite Element Method in Electromagnetics_, Wiley-IEEE Press, Hoboken, -NJ, Third edition, 2014.\ -[2] P. Monk, _Finite Element Methods for Maxwell's Equations_, -Oxford University Press, Oxford, 2003. +```@raw html + + +``` + +# Reference + +## Mathematical background + +The solver computes a finite element approximation to the three-dimensional, time-harmonic +Maxwell's equations in second-order form. The nondimensionalized, source-free, boundary +value problem for ``\bm{E}(\bm{x})\in\mathbb{C}^3``, ``\bm{x}\in\Omega``, +``\partial\Omega = \Gamma``, where +``\bm{\mathscr{E}}(\bm{x},t) = \text{Re}\{\bm{E}(\bm{x})e^{i\omega t}\}`` denotes the +electric field, is written as + +```math +\begin{aligned} +\nabla\times\mu_r^{-1}\nabla\times\bm{E} + i\omega\sigma\bm{E} + - \omega^2\varepsilon_r\bm{E} &= 0 \,,\; \bm{x}\in\Omega \\ +\bm{n}\times\bm{E} &= 0 \,,\; \bm{x}\in\Gamma_{PEC} \\ +\bm{n}\times(\mu_r^{-1}\nabla\times\bm{E}) &= 0 \,,\; \bm{x}\in\Gamma_{PMC} \\ +\bm{n}\times(\mu_r^{-1}\nabla\times\bm{E}) + + \gamma\bm{n}\times(\bm{n}\times\bm{E}) &= \bm{U}^{inc} \,,\; \bm{x}\in\Gamma_{Z} +\end{aligned} +``` + +where the nondimensionalization has been performed with respect to a characteristic length +``L_0``, time ``L_0/c_0``, magnetic field strength ``H_0``, and electric field strength +``Z_0 H_0``. Here, ``c_0`` and ``Z_0`` are the speed of light and impedance of free space, +respectively. This nondimensionalization will be used throughout this entire reference. For +more details, see [[1]](#References) and [[2]](#References). + +Given the electric field solution, the time-harmonic magnetic flux density can be calculated +as + +```math +\bm{B} = -\frac{1}{i\omega}\nabla\times\bm{E} \,. +``` + +The flux density is related to the magnetic field, ``\bm{H}``, by the standard linear +constitutive relationship ``\bm{H} = \mu_r^{-1}\bm{B}``. + +In general, the material property coefficients may be scalar- or matrix-valued. In the +matrix-valued (anisotropic) case, the material property coefficients should still always be +symmetric. + +For a general isotropic lossy dielectric, the relative permittivity ``\varepsilon_r`` is a +complex-valued quantity: + +```math +\varepsilon_r = \varepsilon_r' (1-i\tan{\delta}) +``` + +where ``\varepsilon_r'`` is the real relative permittivity and ``\tan{\delta}`` is the loss +tangent. Alternatively, conductor loss is modeled by Ohm's law ``\bm{J} = \sigma\bm{E}`` +with electrical conductivity ``\sigma > 0.0``. 
For a superconducting domain, the constitutive
+current-field relationship given by Ohm's law is replaced by that given by the London
+equations:
+
+```math
+\frac{\partial \bm{J}}{\partial t} = \frac{1}{\mu_r\lambda_L^2}\bm{E}
+```
+
+where ``\lambda_L = \sqrt{m/\mu n_s e^2}/L_0`` is the nondimensionalized London penetration
+depth. In this case, the term ``+i\omega\sigma \bm{E}`` arising for a normal conductor in
+the time-harmonic Maxwell's equations becomes ``+(\mu_r \lambda_L^2)^{-1}\bm{E}``.
+
+The domain boundary ``\Gamma = \Gamma_{PEC}\cup\Gamma_{PMC}\cup\Gamma_{Z}`` is separated
+into perfect electric conductor (PEC), perfect magnetic conductor (PMC), and impedance
+boundaries, respectively. The PEC boundary condition is a homogeneous Dirichlet condition,
+while the PMC boundary condition is the natural boundary condition for the problem and is
+satisfied at all exterior boundaries by the finite element formulation. Impedance
+boundaries are modeled using a Robin boundary condition with ``\gamma = i\omega/Z_s``, in
+which ``Z_s`` is the surface impedance of the boundary, with units of impedance per square.
+
+## Floquet periodic boundary conditions
+
+When applying Floquet periodic boundary conditions, the phase delay is incorporated into
+the time-harmonic Maxwell equations and exact periodic boundary conditions are applied.
+The modified Maxwell equations are obtained by substituting
+``\bm{E}(\bm{x}) = \bm{E}_p(\bm{x})e^{-i \bm{k}_p \cdot \bm{x}}``, where ``\bm{E}_p`` is
+the periodic electric field and ``\bm{k}_p`` is the user-specified Bloch wavevector.
+The resulting equation is
+
+```math
+\begin{aligned}
+\nabla\times\mu_r^{-1}\nabla\times\bm{E}_p
+- i\bm{k}_p\times\mu_r^{-1}\nabla\times\bm{E}_p
+- i\nabla\times(\mu_r^{-1}\bm{k}_p\times\bm{E}_p) & \\
+- \bm{k}_p\times\mu_r^{-1}\bm{k}_p\times\bm{E}_p
++ i\omega\sigma\bm{E}_p
+- \omega^2\varepsilon_r\bm{E}_p &= 0 \,,\; \bm{x}\in\Omega
+\end{aligned}
+```
+
+and given the electric field solution, the time-harmonic magnetic flux density can be calculated
+as
+
+```math
+\bm{B}_p = -\frac{1}{i\omega}\nabla\times\bm{E}_p + \frac{1}{\omega} \bm{k}_p \times \bm{E}_p \,.
+```
+
+## Time domain formulation
+
+A time-dependent formulation is also available to compute the electric field response
+``\bm{E}(\bm{x},t)`` for a given time-dependent source excitation
+``\bm{U}^{inc}(\bm{x},t)``. The governing equations in this case are
+
+```math
+\nabla\times\mu_r^{-1}\nabla\times\bm{E} + \sigma\frac{\partial\bm{E}}{\partial t}
+  + \varepsilon_r\frac{\partial^2\bm{E}}{\partial t^2} = 0 \,,\; \bm{x}\in\Omega
+```
+
+subject to the same boundary conditions as the frequency-dependent case except for the Robin
+boundary condition which is written for a lumped resistive port boundary, for example, as
+
+```math
+\bm{n}\times(\mu_r^{-1}\nabla\times\bm{E})
+  + Z_s^{-1}\bm{n}\times\left(\bm{n}\times\frac{\partial\bm{E}}{\partial t}\right)
+  = \bm{U}^{inc} \,,\; \bm{x}\in\Gamma_{Z} \,.
+```
+
+The second-order electric field differential equation is transformed into a first-order
+ODE system which is solved along with the equation for the magnetic flux density
+
+```math
+\left(\begin{matrix} \varepsilon_r & 0 & 0 \\ 0 & I & 0 \\ 0 & 0 & I\end{matrix}\right)
+  \left(\begin{matrix} \ddot{\bm{E}} \\ \dot{\bm{E}} \\ \dot{\bm{B}}\end{matrix} \right)
+  = \left(\begin{matrix} -\sigma & -\nabla\times\mu_r^{-1}\nabla\times & 0\\ I & 0 & 0 \\ 0 & -\nabla\times & 0\end{matrix}\right)
+  \left(\begin{matrix}\dot{\bm{E}}\\ \bm{E} \\ \bm{B} \end{matrix}\right) \,.
+``` + +The first-order ODE system formulation is chosen to take advantage of implicit adaptive +time-stepping integration schemes. The ``3 \times 3`` system can be block-eliminated to +avoid an expensive coupled block system solve. It offers the additional benefit +of sharing many similarities in the spatial discretization as the frequency domain +formulation outlined above. + +## Eigenmode calculations + +For eigenmode problems, the source term is zero and a quadratic eigenvalue problem for the +eigenvalues ``\omega`` is solved: + +```math +(\bm{K}+i\omega\bm{C}-\omega^2\bm{M})\bm{x} = 0 +``` + +where the matrix ``\bm{K}`` represents the discretized curl-curl operator, ``\bm{M}`` the +mass term, and ``\bm{C}`` the port impedance boundary conditions. The damped frequency +``\omega_d`` and quality factor ``Q`` are postprocessed from each of the resulting +eigenvalues as + +```math +\omega_d = \text{Re}\{\omega\} \,, \qquad Q = \frac{|\omega|}{2|\text{Im}\{\omega\}|} \,. +``` + +When wave port, surface conductivity, or second-order absorbing boundary conditions are used, +a nonlinear eigenvalue problem is solved: + +```math +(\bm{K}+i\omega\bm{C}-\omega^2\bm{M}+\bm{A}_2(\omega))\bm{x} = 0 +``` + +where the matrix ``\bm{A}_2`` represents the nonlinear frequency-dependent boundary conditions. + +The eigenmodes are normalized such that they have unit norm and their mean phase is a positive real number. + +## Lumped ports and wave ports + +For lumped port boundaries, the surface impedance can be related to an equivalent circuit +impedance, ``Z``. There are two common cases: + + 1. *Rectangular ports*: ``Z = Z_s l / w``, where ``l`` and ``w`` are the length and width + of the port, respectively (length here is defined as the distance between the two + conductors). + + 2. *Coaxial ports*: ``Z = Z_s \ln(b/a) / 2\pi``, where ``a`` and ``b`` denote the inner and + outer radii of the port, respectively. + +A lumped parallel RLC circuit boundary has a circuit impedance + +```math +\frac{1}{Z} = \frac{1}{R}+\frac{1}{i\omega L}+i\omega C \,. +``` + +Thus, the relationships between the circuit and surface element parameters for the user to +specify are given by ``R_s = \alpha R``, ``L_s = \alpha L``, and ``C_s = C/\alpha``, where +``\alpha = w/l`` for a rectangular port or ``\alpha = 2\pi / \ln(b/a)`` for a coaxial +port. + +For multielement lumped ports, the effective circuit impedance is given by + +```math +\frac{1}{Z} = \sum_k \frac{1}{Z_k} \,. +``` + +That is, the circuit impedances of each port contributing to the multielement port add in +parallel. For the specific case of a two element multielement port with two identical +lumped elements, we have ``Z = (1/Z_1 + 1/Z_2)^{-1} = Z_k / 2``, where ``Z_k`` is the +circuit impedance of a single port element. + +The source term ``\bm{U}^{inc}`` in a driven frequency-response problem is related to the +incident field at an excited port boundary by + +```math +\bm{U}^{inc} = -2\gamma(\bm{n}\times\bm{E}^{inc})\times\bm{n} +``` + +where ``(\bm{n}\times\bm{E}^{inc})\times\bm{n}`` is just the projection of the excitation +field onto the port surface. The incident fields for lumped ports depend on the port +shape: + + 1. *Rectangular ports*: ``\bm{E}^{inc} = E_0 \, \hat{\bm{l}}``, where ``E_0`` is a uniform + constant field strength and ``\hat{\bm{l}}`` a unit vector defining the direction of + polarization on the port (typically should be the direction between the two conductors). + + 2. 
*Coaxial ports*: ``\bm{E}^{inc} = \frac{E_0 r_0}{r} \, \hat{\bm{r}}``, where ``E_0`` is + again a uniform constant field strength, ``r_0`` is a characteristic length for the + port, ``r`` is the distance from the port center, and ``\hat{\bm{r}}`` a unit vector + specifying the port radial direction. + +In the time domain formulation, the source term ``\bm{U}^{inc}`` appears as + +```math +\bm{U}^{inc} = -2 Z_s^{-1}\left(\bm{n}\times\frac{\partial\bm{E}^{inc}}{\partial t}\right) + \times\bm{n} \,. +``` + +The incident field ``\bm{E}^{inc}(\bm{x},t)`` is + +```math +\bm{E}^{inc}(\bm{x},t) = p(t)\bm{E}^{inc}(\bm{x}) +``` + +where ``\bm{E}^{inc}(\bm{x})`` is identical to the spatial excitation in the frequency +domain formulation, and ``p(t)`` describes the temporal shape of the excitation. Possible +options include a sinusoidal, Gaussian, modulated Gaussian, or step excitation. + +In the frequency domain, the scattering parameters can be postprocessed from the computed +electric field for each lumped port with boundary ``\Gamma_i`` as + +```math +S_{ij} = \frac{\displaystyle\int_{\Gamma_i}\bm{E}\cdot\bm{E}^{inc}_i\,dS} + {\displaystyle\int_{\Gamma_i}\bm{E}^{inc}_i\cdot\bm{E}^{inc}_i\,dS} - \delta_{ij} \,. +``` + +In the time domain, the time histories of the port voltages can be Fourier-transformed to +get their frequency domain representation for scattering parameter calculation. + +Numeric wave ports assume a field with known normal-direction dependence +``\bm{E}(\bm{x}) = \bm{e}(\bm{x}_t)e^{ik_n x_n}`` where ``k_n`` is the propagation constant. +For each operating frequency ``\omega``, a two-dimensional eigenvalue problem is solved on +the port yielding the mode shapes ``\bm{e}_m`` and associated propagation constants +``k_{n,m}``. These are used in the full 3D model where the Robin port boundary condition has +coefficient ``\gamma = i\text{Re}\{k_{n,m}\}/\mu_r`` and the computed mode is used to +compute the incident field in the source term ``\bm{U}^{inc}`` at excited ports. Scattering +parameter postprocessing takes the same form as the lumped port counterpart using the +computed modal solutions. Since the propagation constants are known for each wave port, +scattering parameter de-embedding can be performed by specifying an offset distance ``d`` +for each port: + +```math +\tilde{S}_{ij} = S_{ij}e^{ik_{n,i}d_i}e^{ik_{n,j}d_j} \,. +``` + +For more information on the implementation of numeric wave ports, see [[3]](#References). + +## Other boundary conditions + +The first-order absorbing boundary condition, also referred to as a scattering boundary +condition, is a special case of the general impedance boundary condition described above: + +```math +\bm{n}\times(\mu_r^{-1}\nabla\times\bm{E}) + + i\omega\sqrt{\mu_r^{-1}\varepsilon_r}\bm{n}\times(\bm{n}\times\bm{E}) = 0 \,. +``` + +This is also known as the Sommerfeld radiation condition, and one can recognize the +dependence on the impedance of free space ``Z_0^{-1} = \sqrt{\mu_r^{-1}\varepsilon_r}``. The +second-order absorbing boundary condition is + +```math +\bm{n}\times(\mu_r^{-1}\nabla\times\bm{E}) + + i\omega\sqrt{\mu_r^{-1}\varepsilon_r}\bm{n}\times(\bm{n}\times\bm{E}) + - \beta\nabla\times[(\nabla\times\bm{E})_n\bm{n}] = 0 +``` + +where assuming an infinite radius of curvature ``\beta = \mu_r^{-1}c_0/(2i\omega)``, and the +contribution depending on ``(\nabla\cdot\bm{E}_t)`` has been neglected. 
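+
+To make the connection with the impedance boundary condition above explicit, one can match
+the coefficient of the first-order condition against ``\gamma = i\omega/Z_s``, which
+identifies the absorbing boundary with an impedance boundary whose surface impedance equals
+the wave impedance of the adjacent medium,
+
+```math
+Z_s = \frac{1}{\sqrt{\mu_r^{-1}\varepsilon_r}} = \sqrt{\frac{\mu_r}{\varepsilon_r}} \,,
+```
+
+reducing to the impedance of free space for a vacuum region.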
+ +Additionally, while metals with finite conductivity can be modeled using an impedance +boundary condition with constant impedance ``Z_s``, a more accurate model taking into +account the frequency dependence of the skin depth is + +```math +Z_s = \frac{1+i}{\delta\sigma} +``` + +where ``\delta = \sqrt{2/\mu_r\sigma\omega}`` is the skin depth and ``\sigma`` is the +conductivity of the metal. Another model, which takes into account finite thickness effects, +is given by + +```math +Z_s = \frac{1}{\delta\sigma}\left(\frac{\sinh{\nu}+\sin{\nu}}{\cosh{\nu}+\cos{\nu}} + + i\frac{\sinh{\nu}-\sin{\nu}}{\cosh{\nu}+\cos{\nu}}\right) +``` + +where ``\nu = h/\delta`` and ``h`` is the layer thickness. This model correctly produces the +DC limit when ``h\ll\delta``. + +## Energy-participation ratios + +The energy-participation ratio (EPR) for lumped inductive elements is computed from the +electric and magnetic fields corresponding to eigenmode ``m``, ``\bm{E}_m`` and +``\bm{H}_m``, using the formula + +```math +p_{mj} = \frac{1}{\mathcal{E}^{elec}_m} \, \frac{1}{2} \, L_j I_{mj}^2 +``` + +where ``p_{mj}\in[-1,1]`` denotes the signed participation ratio for junction ``j`` in mode +``m``, ``L_j`` is the provided junction circuit inductance, ``I_ {mj}`` is the peak +junction current for mode ``m``, and ``\mathcal{E}^{elec}_m`` is the electric energy in +mode ``m``. The junction current is computed using the mean voltage across the port, +``\overline{V}_{mj}``, as ``I_{mj} = \overline{V}_{mj}/Z_{mj}``, where +``Z_{mj} = 1/(i\omega_m L_j)`` is the impedance of the inductive branch of the lumped +circuit. The mean port voltage depends on the computed electric field mode and the shape of +the port: + + 1. *Rectangular ports*: + ``\overline{V}_{mj} = \frac{1}{w_j}\int_{\Gamma_j}\bm{E}_m\cdot\hat{\bm{l}}_j\,dS``. + + 2. *Coaxial ports*: + ``\overline{V}_{mj} = \frac{1}{2\pi}\int_{\Gamma_j}\frac{\bm{E}_m}{r}\cdot\hat{\bm{r}}_j\,dS``. + +Finally, the total electric energy in mode ``m`` is + +```math +\mathcal{E}^{elec}_m + = \frac{1}{2} \, \text{Re}\left\{\int_\Omega\bm{D}_m^*\cdot\bm{E}_m\,dV\right\} + + \sum_j \frac{1}{2} \, C_jV_{mj}^2 +``` + +where ``\bm{D}_m = \varepsilon_r\bm{E}_m`` is the electric flux density for mode ``m`` and +the second term on the right-hand side accounts for any lumped capacitive boundaries with +nonzero circuit capacitance ``C_j``. + +The EPR can also be used to estimate mode quality factors due to input-output (I-O) line +coupling. The mode coupling quality factor due to the ``j``-th I-O port is given by + +```math +Q_{mj} = \frac{\omega_m}{\kappa_{mj}} +``` + +where the port coupling rate ``\kappa_{mj}`` is calculated as + +```math +\kappa_{mj} = \frac{1}{\mathcal{E}^{elec}_m} \, \frac{1}{2}\,R_j I_{mj}^2 \,. +``` + +## Bulk and interface dielectric loss + +The quality factor due to bulk dielectric loss resulting from an electric field ``\bm{E}`` +present in domain ``j`` with associated loss tangent ``\tan{\delta}_j`` is given by + +```math +\frac{1}{Q_j} = p_j \tan{\delta}_j = + \frac{1}{\mathcal{E}^{elec}} \, \frac{1}{2} \, \tan{\delta}_j \, + \text{Re}\left\{\int_{\Omega_j}\bm{D}^*\cdot\bm{E}\,dV\right\} +``` + +where, as above, ``\mathcal{E}^{elec}`` is the total electric field energy in the domain, +including the contributions due to capacitive lumped elements. 
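+
+As an illustrative numerical example (the values here are arbitrary and not tied to any
+particular model), a bulk participation ratio of ``p_j = 0.9`` in a dielectric with loss
+tangent ``\tan{\delta}_j = 10^{-6}`` limits the corresponding quality factor to
+
+```math
+Q_j = \frac{1}{p_j\tan{\delta}_j} = \frac{1}{0.9\times 10^{-6}} \approx 1.1\times 10^{6} \,.
+```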
+ +Likewise, the quality factor due to surface interface dielectric loss for interface ``j`` is +given by + +```math +\frac{1}{Q_j} = p_j \tan{\delta}_j = + \frac{1}{\mathcal{E}^{elec}} \, \frac{1}{2} \, t_j\tan{\delta}_j \, + \text{Re}\left\{\int_{\Gamma_j}\bm{D}^*\cdot\bm{E}\,dS\right\} +``` + +where ``t_j`` is the thickness of the layer and ``\bm{D} = \varepsilon_{r,j}\bm{E}`` is the +electric displacement field in the layer evaluated using the relative permittivity of the +interface ``\varepsilon_{r,j}``. For an internal boundary, this integral is evaluated on a +single side to resolve ambiguity due to the discontinuity of ``\bm{E}`` across the boundary. + +The above formula for interface dielectric loss can be specialized for the case of a +metal-air, metal-substrate, or substrate-air interface [[4]](#References). In each case, the +quality factor for interface ``j`` is given by + + - *Metal-air*: + +```math +\frac{1}{Q^{MA}_j} = + \frac{1}{\mathcal{E}^{elec}} \, \frac{1}{2} \, + \frac{t_j\tan{\delta}_j}{\varepsilon_{r,j}^{MA}} \, + \text{Re}\left\{\int_{\Gamma_j}\bm{E}_n^*\cdot\bm{E}_n\,dS\right\} +``` + + - *Metal-substrate*: + +```math +\frac{1}{Q^{MS}_j} = + \frac{1}{\mathcal{E}^{elec}} \, \frac{1}{2} \, + \frac{t_j\tan{\delta}_j(\varepsilon_{r,j}^{S})^2}{\varepsilon_{r,j}^{MS}} \, + \text{Re}\left\{\int_{\Gamma_j}\bm{E}_n^*\cdot\bm{E}_n\,dS\right\} +``` + + - *Substrate-air*: + +```math +\frac{1}{Q^{SA}_j} = + \frac{1}{\mathcal{E}^{elec}} \, \frac{1}{2} \, + t_j\tan{\delta}_j\left(\varepsilon_{r,j}^{SA} \, + \text{Re}\left\{\int_{\Gamma_j}\bm{E}_t^*\cdot\bm{E}_t\,dS\right\} + + \frac{1}{\varepsilon_{r,j}^{SA}} \, + \text{Re}\left\{\int_{\Gamma_j}\bm{E}_n^*\cdot\bm{E}_n\,dS\right\}\right) +``` + +where ``\bm{E}_n`` denotes the normal field to the interface and +``\bm{E}_t = \bm{E}-\bm{E}_n`` denotes the tangential field. + +## Lumped parameter extraction + +For electrostatic simulations, the Maxwell capacitance matrix is computed in the following +manner. First, the Laplace equation subject to Dirichlet boundary conditions is solved for +each terminal with boundary ``\Gamma_i`` in the model, yielding an associated voltage field +``V_i(\bm{x})``: + +```math +\begin{aligned} +\nabla\cdot(\varepsilon_r\nabla V_i) &= 0 \,,\; \bm{x}\in\Omega \\ +V_i &= 1 \,,\; \bm{x}\in\Gamma_i \\ +V_i &= 0 \,,\; \bm{x}\in\Gamma_j \,,\; j\neq i \,. +\end{aligned} +``` + +The energy of the electric field associated with any solution is + +```math +\mathcal{E}(V_i) = \frac{1}{2}\int_\Omega\varepsilon_r\bm{E}_i\cdot\bm{E}_i\,dV +``` + +where ``\bm{E}_i=-\nabla V_i`` is the electric field. Then, the entries of the Maxwell +capacitance matrix, ``\bm{C}``, are given by + +```math +\bm{C}_{ij} = \mathcal{E}(V_i+V_j)-\frac{1}{2}(\bm{C}_{ii}+\bm{C}_{jj}) \,. +``` + +Magnetostatic problems for inductance matrix extraction are based on the magnetic vector +potential formulation: + +```math +\begin{aligned} +\nabla\times(\mu_r^{-1}\nabla\times\bm{A}_i) &= 0 \,,\; \bm{x}\in\Omega \\ +\bm{n}\times(\mu_r^{-1}\nabla\times\bm{A}_i) = + \bm{n}\times\bm{H}_i &= \bm{J}_s^{inc} \,,\; \bm{x}\in\Gamma_i \\ +\bm{n}\times(\mu_r^{-1}\nabla\times\bm{A}_i) &= 0 \,,\; \bm{x}\in\Gamma_j \,,\; j\neq i \,. +\end{aligned} +``` + +For each port with boundary ``\Gamma_i``, a unit source surface current ``\bm{J}_s^{inc}`` +is applied, yielding an associated vector potential solution ``\bm{A}_i(\bm{x})``. +Homogeneous Dirichlet boundary conditions ``\bm{n}\times\bm{A}_i=0`` are also imposed on +specified surfaces of the model. 
The magnetic field energy associated with any solution is + +```math +\mathcal{E}(\bm{A}_i) = \frac{1}{2}\int_\Omega\mu_r^{-1}\bm{B}_i\cdot\bm{B}_i\,dV +``` + +where ``\bm{B}_i = \nabla\times\bm{A}_i`` is the magnetic flux density. Then, the entries of +the inductance matrix, ``\bm{M}``, are given by + +```math +\bm{M}_{ij} = \frac{1}{I_i I_j}\mathcal{E}(\bm{A}_i+\bm{A}_j) + - \frac{1}{2}\left(\frac{I_i}{I_j}\bm{M}_{ii}+\frac{I_j}{I_i}\bm{M}_{jj}\right) +``` + +where ``I_i`` is the excitation current for port ``i``, computed by integrating the source +surface current ``\bm{J}_s^{inc}`` over the surface of the port. + +## Error estimation and adaptive mesh refinement (AMR) + +Error estimation is used to provide element-wise error estimates for AMR, as well as a +global error indicator used to terminate AMR iterations or provide an estimate for solution +accuracy. A Zienkiewicz–Zhu (ZZ) error estimator based on [[5]](#References) is +implemented, which measures the error in the recovered magnetic field and electric flux +density. On element ``K``, we have + +```math +\eta^2_K = \eta_{m,2}^2+\eta_{e,K}^2 = + \|\mu_r^{1/2}\bm{R}_{ND}(\mu^{-1}\bm{B}) + - (\mu_r^{-1/2}\bm{B})\|_{L^2(\Omega_K)}^2 + + \|\varepsilon_r^{-1/2}\bm{R}_{RT}(\varepsilon_r\bm{E}) + - (\varepsilon_r^{1/2}\bm{E})\|_{L^2(\Omega_K)}^2 +``` + +where ``\bm{R}_{ND}`` and ``\bm{R}_{RT}`` are the smooth-space recovery operators which +orthogonally project their argument onto ``H(\text{curl})`` and ``H(\text{div})``, +discretized by Nédélec and Raviart-Thomas elements, respectively. + +## Far-field extraction + +This feature is based upon Stratton-Chu's transformations [6] in the limit of ``kr \gg 1`` +(with ``k`` wave number and ``r`` observation distance). One can show (see below) that, in +this limit, + +```math +r \mathbf{E}_p(\mathbf{r}_0) = \frac{ik}{4\pi} \mathbf{r}_0 \times \int_S [\mathbf{n} \times \mathbf{E} - Z \mathbf{r}_0 \times (\mathbf{n} \times \mathbf{H})] \exp(ik\mathbf{r} \cdot \mathbf{r}_0) dS +``` + +where: + + - ``E_p`` is the electric field at the observation point + - ``k`` is the wave number + - ``r₀`` is the unit vector from source to observation point, parameterized by ``(\theta, \phi)`` + - ``n`` is the surface normal (to ``S``) + - ``E, H`` are the tangential fields on the surface + - ``Z`` is the impedance + +The integral is over the exterior surface ``S``. + +Note, we obtain ``r \mathbf{E}_p`` because the electric field decays with +``exp(ikr)/r``, so multiplying it by ``r`` ensures that the quantity is finite. +Note also that the solution is defined up to a global phase factor. + +This equation relies on an analytic form for Green's function and is only valid +in 3D and if ``S`` only crosses isotropic materials. + +From ``r \mathbf{E}_p``, one can obtain the magnetic field assuming that the +waves are propagating in free space, + +```math +r \mathbf{H}_p = \frac{r_0 \times r \mathbf{E}_p}{Z_0}\,, +``` + +with ``Z_0`` impedance of free space. + +With this, one can immediately compute the far-field relative radiation pattern +as ``|r \mathbf{E}_p|``. 
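+
+As an example of a derived quantity (this is the standard far-field relation for transverse
+fields in free space, rather than anything specific to *Palace*), the time-averaged power
+radiated per unit solid angle follows from the Poynting vector of the far fields,
+
+```math
+\frac{dP}{d\Omega} = \frac{1}{2}\,\text{Re}\left\{r\mathbf{E}_p\times(r\mathbf{H}_p)^*\right\}\cdot\mathbf{r}_0
+  = \frac{|r \mathbf{E}_p|^2}{2 Z_0} \,,
+```
+
+which is why ``|r \mathbf{E}_p|`` directly gives the relative radiation pattern.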
+
+#### How to obtain the equation above from Stratton-Chu's original equations
+
+Let us start from Stratton-Chu's transformation for the electric field (we will get the
+magnetic field from ``\mathbf{E}``):
+
+```math
+\mathbf{E}(\mathbf{r}_0) = \int_S \left[ i \omega \mu (\mathbf{n} \times \mathbf{H}) g(\mathbf{r}, \mathbf{r}_0) +
+(\mathbf{n} \times \mathbf{E}) \times \nabla g(\mathbf{r}, \mathbf{r}_0) + (\mathbf{n} \cdot \mathbf{E}) \nabla g(\mathbf{r}, \mathbf{r}_0) \right] dS
+```
+
+with the Green's function (this is where the assumption of isotropic materials comes in):
+
+```math
+g(\mathbf{r}, \mathbf{r}_0) = \frac{e^{-i k |\mathbf{r} - \mathbf{r}_0|}}{4 \pi |\mathbf{r} - \mathbf{r}_0|}.
+```
+
+Let us take the limit in which the observation point recedes to infinity, with ``r`` its
+distance from ``S``, and let ``R`` denote the distance between the integration point
+``\mathbf{r}`` on ``S`` and the observation point (``R \to \infty`` when ``r \to \infty``).
+When ``r`` is much larger than the extent of ``S`` (far-field approximation):
+
+```math
+R \approx r - \mathbf{r}\cdot\mathbf{r}_0
+```
+
+where ``\mathbf{r}_0`` now denotes the unit vector pointing from ``S`` toward the
+observation point, which is the only dependence on the observation point that survives in
+the far field.
+
+The far-field approximation for the Green's function becomes:
+
+```math
+g(\mathbf{r}, \mathbf{r}_0) \approx \frac{e^{-i k r}}{4 \pi r} e^{i k \mathbf{r}_0\cdot\mathbf{r}}.
+```
+
+For the gradient of ``g``, differentiating the exact expression gives:
+
+```math
+\nabla g(\mathbf{r}, \mathbf{r}_0) = -\frac{e^{-i k R}}{4 \pi R}\left(\frac{1}{R} + i k\right)\hat{R}
+```
+
+where ``\hat{R}`` is the unit vector pointing from the integration point ``\mathbf{r}``
+toward the observation point.
+
+In the far-field limit, ``R \approx r`` and ``\hat{R} \approx \mathbf{r}_0``, so:
+
+```math
+\nabla g(\mathbf{r}, \mathbf{r}_0) \approx -i k \mathbf{r}_0 g(\mathbf{r}, \mathbf{r}_0)
+```
+
+where we have neglected the ``1/R`` term since ``k R \gg 1`` in the far field.
+
+With these ingredients, one then uses the triple vector product rule and drops
+the radial terms (i.e., those proportional to ``\mathbf{r}_0``; in the wave zone
+there are only transverse fields) to arrive at the equation presented in the
+previous section and implemented in *Palace*.
+
+## References
+
+[1] J.-M. Jin, _The Finite Element Method in Electromagnetics_, Wiley-IEEE Press, Hoboken,
+NJ, Third edition, 2014.\
+[2] P. Monk, _Finite Element Methods for Maxwell's Equations_, Oxford University Press,
+Oxford, 2003.\
+[3] L. Vardapetyan and L. Demkowicz, Full-wave analysis of dielectric waveguides at a given
+frequency, _Mathematics of Computation_ 72 (2003) 105-129.\
+[4] J. Wenner, R. Barends, R. C. Bialczak, et al., Surface loss of superconducting coplanar
+waveguide resonators, _Applied Physics Letters_ 99, 113513 (2011).\
+[5] S. Nicaise, On Zienkiewicz-Zhu error estimators for Maxwell’s equations, _Comptes Rendus
+Mathematique_ 340 (2005) 697-702.\
+[6] J. A. Stratton and L. J. Chu, Diffraction theory of electromagnetic waves,
+_Physical Review_ 56 (1939) 99-107.
diff --git a/docs/src/run.md b/docs/src/run.md
index 912bb5ab1e..7635c43a7c 100644
--- a/docs/src/run.md
+++ b/docs/src/run.md
@@ -1,60 +1,59 @@
-```@raw html
-
-
-```
-
-# Running *Palace*
-
-Once installed into a directory `<INSTALL_DIR>`, a parallel simulation using *Palace* can
-be started with the following command:
-
-```bash
-<INSTALL_DIR>/bin/palace -np <NP> config.json
-```
-
-where
-
-  - The installed [`palace`]
-    (https://github.com/awslabs/palace/blob/main/scripts/palace) script wraps
-    a call to the desired MPI launcher (`mpirun` by default).
-  - `<NP>` is the number of MPI processes to use for the simulation.
-  - `config.json` is the JSON format configuration file used to specify the simulation
-    parameters. The structure of this configuration file is outlined in detail in the
-    section [Configuration File](config/config.md).
-
-A full list of available script options is available using the `-h` or `--help` flag.
-
-During the course of a simulation, the solver will write a number of useful statistics and
-logging information to standard output. It is often helpful to save this information to a
-file, for example with:
-
-```bash
-<INSTALL_DIR>/bin/palace ... | tee log.out
-```
-
-Of course, the interested user can explicitly run the *Palace* binary in parallel,
-supplying options directly to their MPI launcher of choice, as:
-
-```bash
-<MPIEXEC> [OPTIONS] <INSTALL_DIR>/bin/palace-<ARCH>.bin config.json
-```
-
-where `<MPIEXEC>` is the MPI launcher command, `[OPTIONS]` is a list of command line options
-passed to the MPI launcher, and `<ARCH>` is the machine architecture (`x86_64` or
-`arm64`).
-
-## Singularity/Apptainer
-
-Assuming *Palace* was built using Singularity/Apptainer to `palace.sif`, running:
-
-```bash
-singularity run palace.sif <ARGS>
-```
-
-corresponds to running a *Palace* simulation with command line arguments `<ARGS>` using:
-
-```bash
-<INSTALL_DIR>/bin/palace <ARGS>
-```
-
-as described above.
+```@raw html
+
+
+```
+
+# Running *Palace*
+
+Once installed into a directory `<INSTALL_DIR>`, a parallel simulation using *Palace* can
+be started with the following command:
+
+```bash
+<INSTALL_DIR>/bin/palace -np <NP> config.json
+```
+
+where
+
+  - The installed [`palace`](https://github.com/awslabs/palace/blob/main/scripts/palace)
+    script wraps a call to the desired MPI launcher (`mpirun` by default).
+  - `<NP>` is the number of MPI processes to use for the simulation.
+  - `config.json` is the JSON format configuration file used to specify the simulation
+    parameters. The structure of this configuration file is outlined in detail in the
+    section [Configuration File](config/config.md).
+
+A full list of available script options is available using the `-h` or `--help` flag.
+
+During the course of a simulation, the solver will write a number of useful statistics and
+logging information to standard output. It is often helpful to save this information to a
+file, for example with:
+
+```bash
+<INSTALL_DIR>/bin/palace ... | tee log.out
+```
+
+Of course, the interested user can explicitly run the *Palace* binary in parallel,
+supplying options directly to their MPI launcher of choice, as:
+
+```bash
+<MPIEXEC> [OPTIONS] <INSTALL_DIR>/bin/palace-<ARCH>.bin config.json
+```
+
+where `<MPIEXEC>` is the MPI launcher command, `[OPTIONS]` is a list of command line options
+passed to the MPI launcher, and `<ARCH>` is the machine architecture (`x86_64` or
+`arm64`).
+
+## Singularity/Apptainer
+
+Assuming *Palace* was built using Singularity/Apptainer to `palace.sif`, running:
+
+```bash
+singularity run palace.sif <ARGS>
+```
+
+corresponds to running a *Palace* simulation with command line arguments `<ARGS>` using:
+
+```bash
+<INSTALL_DIR>/bin/palace <ARGS>
+```
+
+as described above.
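+
+The solver can also be driven programmatically, in the same spirit as the scripts under
+`examples/` in the repository. The following is a minimal sketch, assuming the installed
+`palace` wrapper script is on the `PATH`; the configuration file name and process count
+are illustrative only:
+
+```julia
+# Illustrative sketch: run Palace on a configuration file and keep the solver
+# output, similar to piping through `tee log.out` on the command line.
+config = "config.json"  # hypothetical configuration file in the working directory
+np = 4                  # number of MPI processes
+output = read(`palace -np $np $config`, String)
+print(output)
+write("log.out", output)
+```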
diff --git a/examples/Project.toml b/examples/Project.toml index 3c3a9c5122..a1ce4caad7 100644 --- a/examples/Project.toml +++ b/examples/Project.toml @@ -1,12 +1,12 @@ -[deps] -CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" -DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" -Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" -DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" -Gmsh = "705231aa-382f-11e9-3f0c-b7cb4346fdeb" -JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" -Measures = "442fdcdd-2543-5da2-b0f3-8c86c306513e" -Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" -PyPlot = "d330b81b-6aea-500a-939a-2ce795aea3ee" -Roots = "f2b01f46-fcfa-551c-844a-d8ac1e96c665" -SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" +[deps] +CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" +CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" +DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" +Gmsh = "705231aa-382f-11e9-3f0c-b7cb4346fdeb" +JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +Measures = "442fdcdd-2543-5da2-b0f3-8c86c306513e" +Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" +Roots = "f2b01f46-fcfa-551c-844a-d8ac1e96c665" +SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" diff --git a/examples/cavity/cavity.jl b/examples/cavity/cavity.jl index b18c0b0cf2..c6bd0f2e1b 100644 --- a/examples/cavity/cavity.jl +++ b/examples/cavity/cavity.jl @@ -1,290 +1,290 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -using CSV -using DataFrames -using Dates -using ForwardDiff -using JSON -using Roots -using SpecialFunctions - -include(joinpath(@__DIR__, "mesh", "mesh.jl")) - -""" - solve_cavity_resonator( - params; - order, - refinement, - mesh_type=0, - radius=2.74, - aspect_ratio=1.0, - num_processors=1, - cleanup_files=true, - ) - -Solve the cavity mode problem, with an automatically generated Gmsh mesh - -See also [`generate_cylindrical_cavity_mesh`](@ref) - -# Arguments - - - params - dictionary storing the parsed json parameter file - - order - the polynomial order used in the solution representation - - geo_order - the polynomial order used in the mesh representation - - refinement - the level of mesh refinement - - mesh_type - 0 = tetrahedral mesh, 1 = prism mesh, 2 = hexahedral mesh - - radius - the radius of the cavity resonator - - aspect_ratio - the ratio of the DIAMETER of the cavity to the height - - num_processors - number of processors to use for the simulation - - cleanup_files - delete temporary mesh and configuration files after simulation -""" -function solve_cavity_resonator( - params::Dict; - order::Integer, - geo_order::Integer, - refinement::Integer, - mesh_type::Integer=0, - radius::Real=2.74, - aspect_ratio::Real=1.0, - num_processors::Integer=1, - cleanup_files::Bool=true -) - @assert refinement >= 0 - @assert order > 0 - @assert geo_order > 0 - @assert mesh_type ∈ [0, 1, 2] - - cavity_dir = @__DIR__ - fileroot = string("cavity_p", order, "_h", refinement) - - # Generate a mesh - mesh_filename = joinpath(cavity_dir, "mesh", string(fileroot, ".msh")) - generate_cylindrical_cavity_mesh( - refinement=refinement, - order=geo_order, - mesh_type=mesh_type, - radius=radius, - aspect_ratio=aspect_ratio, - filename=mesh_filename, - verbose=0 - ) - - # Generate solver parameter file - params["Solver"]["Order"] = order - params["Model"]["Mesh"] = mesh_filename - json_filename = joinpath(cavity_dir, string(fileroot, 
".json")) - open(json_filename, "w") do f - return JSON.print(f, params) - end - - # Call the solver, storing the terminal output - call_command = `palace -np $num_processors -wdir $cavity_dir $json_filename` - log_file = read(call_command, String) - # println(log_file) - - # Search through for the DOF count - start_ind = findfirst("number of global unknowns: ", log_file)[end] - end_ind = findfirst("\n", log_file[start_ind:end])[1] - dof = parse(Int, filter(isdigit, log_file[start_ind:(start_ind + end_ind)])) - - # Extract the top two frequency modes - eig_df = CSV.read(joinpath(cavity_dir, "postpro", "convergence", "eig.csv"), DataFrame) - eig = Matrix(eig_df[:, 2:end])[:, 1] - - # Clean up the parameter and mesh file - if cleanup_files - rm(mesh_filename) - rm(json_filename) - end - - return dof, eig -end - -∂besselj = (ν, x) -> ForwardDiff.derivative(y -> besselj(ν, y), x) - -""" - besselj_roots(ν, n::Integer)::Float64 - -Compute the n-th root of the Bessel functions of first kind, J_ν - -Note: Compare against https://mathworld.wolfram.com/BesselFunctionZeros.html -with besselj_roots.((0:5)', 1:5) -""" -function besselj_roots(ν, n::Integer)::Float64 - upper_bound = 10 - roots = find_zeros(x -> besselj(ν, x), 0, upper_bound) - while length(roots) < n + 1 - upper_bound *= 2 - roots = find_zeros(x -> besselj(ν, x), 0, upper_bound) - end - if roots[1] < 1e-10 - # If the first root is marked as 0, ignore - popfirst!(roots) - end - return roots[n] -end - -""" - ∂besselj_roots(ν, n::Integer)::Float64 - -Compute the n-th root of the first derivative of the Bessel functions of first kind, J'_ν - -Note: Compare against https://mathworld.wolfram.com/BesselFunctionZeros.html with -∂besselj_roots.((0:5)', 1:5) -""" -function ∂besselj_roots(ν, n::Integer)::Float64 - upper_bound = 10 - roots = find_zeros(x -> ∂besselj(ν, x), 0, upper_bound) - while length(roots) < n + 1 - upper_bound *= 2 - roots = find_zeros(x -> ∂besselj(ν, x), 0, upper_bound) - end - if roots[1] < 1e-10 - # If the first root is marked as 0, ignore - popfirst!(roots) - end - return roots[n] -end - -""" - frequency_transverse(n, m, l; ϵᵣ, μᵣ, a, d) - -Compute the resonant frequency of the transverse electric and magnetic mode indexed by n, m, -l, in GHz - -# Arguments - - - n - mode number in the circumferential direction - - m - mode number in the radial direction - - l - mode number in the z direction - - ϵᵣ - relative electric permittivity - - μᵣ - relative magnetic permeability - - a - radius of cavity in centimeters - - d - height of cavity in centimeters -""" -function frequency_transverse(n, m, l; ϵᵣ, μᵣ, a_cm, d_cm) - ϵ₀ = 8.8541878176e-12 - μ₀ = 4e-7 * π - - a = a_cm * 0.01 - d = d_cm * 0.01 - - c = 1.0 / sqrt(ϵ₀ * μ₀) - p_nm = besselj_roots(n, m) - ∂p_nm = ∂besselj_roots(n, m) - - C = (c / (2 * π * sqrt(ϵᵣ * μᵣ))) - - f_M = C * sqrt((p_nm / a)^2 + (l * π / d)^2) / 1e9 - f_E = C * sqrt((∂p_nm / a)^2 + (l * π / d)^2) / 1e9 - - return f_E, f_M -end - -""" - generate_cavity_convergence_data( - p_min::Integer=1, - p_max::Integer=3, - ref_min::Integer=0, - ref_max::Integer=3, - mesh_type::Integer=0, - num_processors::Integer=1 - ) - -Generate the data for the cavity convergence study - -# Arguments - - - p_min - minimum polynomial order - - p_max - maximum polynomial order - - ref_min - minimum number of levels of uniform mesh refinement - - ref_max - maximum number of levels of uniform mesh refinement - - mesh_type - 0 = tetrahedral mesh, 1 = prism mesh, 2 = hexahedral mesh - - num_processors - number of processors to use for 
the simulation -""" -function generate_cavity_convergence_data(; - p_min::Integer=1, - p_max::Integer=3, - ref_min::Integer=0, - ref_max::Integer=3, - mesh_type::Integer=0, - num_processors::Integer=1 -) - # Load the default JSON script (the file contains comments and we need to sanitize them) - cavity_dir = @__DIR__ - params = open(joinpath(cavity_dir, "cavity_pec.json"), "r") do f - return JSON.parse(join(getindex.(split.(eachline(f), "//"), 1), "\n")) - end - - # Update the dictionary - params["Problem"]["Verbose"] = 2 - params["Problem"]["Output"] = joinpath(cavity_dir, "postpro", "convergence") - params["Model"]["Refinement"]["UniformLevels"] = 0 # Don't perform any mesh refinement - params["Solver"]["Eigenmode"]["Save"] = 0 # Don't write any fields to file - params["Solver"]["Eigenmode"]["N"] = 4 # Look only for the top 4 modes - params["Solver"]["Eigenmode"]["Tol"] = 1e-12 - params["Solver"]["Eigenmode"]["Target"] = 2.0 - params["Solver"]["Eigenmode"]["StartVectorConstant"] = true - params["Solver"]["Linear"]["Tol"] = 1e-14 - - # Compute the exact solution for reference - radius = 2.74 - aspect_ratio = 1 / sqrt(2) - ~, f_TM_010_true = frequency_transverse( - 0, - 1, - 0; - ϵᵣ = 2.08, - μᵣ = 1.0, - a_cm = radius, - d_cm = aspect_ratio * 2 * radius - ) - f_TE_111_true, ~ = frequency_transverse( - 1, - 1, - 1; - ϵᵣ = 2.08, - μᵣ = 1.0, - a_cm = radius, - d_cm = aspect_ratio * 2 * radius - ) - - dof = Vector{Vector{Int}}() - f_TM_010 = Vector{Vector{Float64}}() - f_TE_111 = Vector{Vector{Float64}}() - - # Generate the data - for p = p_min:p_max - push!(dof, eltype(dof)()) - push!(f_TM_010, eltype(f_TM_010)()) - push!(f_TE_111, eltype(f_TE_111)()) - ref_lower = (p == 1 && mesh_type == 2) ? max(1, ref_min) : ref_min - ref_upper = (p > 3) ? min(3, ref_max) : ref_max - for ref = ref_lower:ref_upper - print("p = ", p, ", ref = ", ref, ": ") - results = solve_cavity_resonator( - params, - order=p, - geo_order=p, - refinement=ref, - mesh_type=mesh_type, - radius=radius, - aspect_ratio=aspect_ratio, - num_processors=ref < 2 ? 1 : num_processors - ) - println("Success! $(results[1]) dofs, finished at $(now())") - push!(dof[end], results[1]) - - # The first dominant frequency should be the magnetic, and the second the - # electric, but just in case we search for the closest - push!(f_TM_010[end], results[2][argmin(abs.(results[2] .- f_TM_010_true))]) - push!(f_TE_111[end], results[2][argmin(abs.(results[2] .- f_TE_111_true))]) - end - end - - f_TM_010_rel_error = map(x -> abs.(x .- f_TM_010_true) ./ f_TM_010_true, f_TM_010) - f_TE_111_rel_error = map(x -> abs.(x .- f_TE_111_true) ./ f_TE_111_true, f_TE_111) - - return dof, f_TM_010_rel_error, f_TE_111_rel_error -end +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +using CSV +using DataFrames +using Dates +using ForwardDiff +using JSON +using Roots +using SpecialFunctions + +include(joinpath(@__DIR__, "mesh", "mesh.jl")) + +""" + solve_cavity_resonator( + params; + order, + refinement, + mesh_type=0, + radius=2.74, + aspect_ratio=1.0, + num_processors=1, + cleanup_files=true, + ) + +Solve the cavity mode problem, with an automatically generated Gmsh mesh + +See also [`generate_cylindrical_cavity_mesh`](@ref) + +# Arguments + + - params - dictionary storing the parsed json parameter file + - order - the polynomial order used in the solution representation + - geo_order - the polynomial order used in the mesh representation + - refinement - the level of mesh refinement + - mesh_type - 0 = tetrahedral mesh, 1 = prism mesh, 2 = hexahedral mesh + - radius - the radius of the cavity resonator + - aspect_ratio - the ratio of the DIAMETER of the cavity to the height + - num_processors - number of processors to use for the simulation + - cleanup_files - delete temporary mesh and configuration files after simulation +""" +function solve_cavity_resonator( + params::Dict; + order::Integer, + geo_order::Integer, + refinement::Integer, + mesh_type::Integer=0, + radius::Real=2.74, + aspect_ratio::Real=1.0, + num_processors::Integer=1, + cleanup_files::Bool=true +) + @assert refinement >= 0 + @assert order > 0 + @assert geo_order > 0 + @assert mesh_type ∈ [0, 1, 2] + + cavity_dir = @__DIR__ + fileroot = string("cavity_p", order, "_h", refinement) + + # Generate a mesh + mesh_filename = joinpath(cavity_dir, "mesh", string(fileroot, ".msh")) + generate_cylindrical_cavity_mesh( + refinement=refinement, + order=geo_order, + mesh_type=mesh_type, + radius=radius, + aspect_ratio=aspect_ratio, + filename=mesh_filename, + verbose=0 + ) + + # Generate solver parameter file + params["Solver"]["Order"] = order + params["Model"]["Mesh"] = mesh_filename + json_filename = joinpath(cavity_dir, string(fileroot, ".json")) + open(json_filename, "w") do f + return JSON.print(f, params) + end + + # Call the solver, storing the terminal output + call_command = `palace -np $num_processors -wdir $cavity_dir $json_filename` + log_file = read(call_command, String) + # println(log_file) + + # Search through for the DOF count + start_ind = findfirst("number of global unknowns: ", log_file)[end] + end_ind = findfirst("\n", log_file[start_ind:end])[1] + dof = parse(Int, filter(isdigit, log_file[start_ind:(start_ind + end_ind)])) + + # Extract the top two frequency modes + eig_df = CSV.read(joinpath(cavity_dir, "postpro", "convergence", "eig.csv"), DataFrame) + eig = Matrix(eig_df[:, 2:end])[:, 1] + + # Clean up the parameter and mesh file + if cleanup_files + rm(mesh_filename) + rm(json_filename) + end + + return dof, eig +end + +∂besselj = (ν, x) -> ForwardDiff.derivative(y -> besselj(ν, y), x) + +""" + besselj_roots(ν, n::Integer)::Float64 + +Compute the n-th root of the Bessel functions of first kind, J_ν + +Note: Compare against https://mathworld.wolfram.com/BesselFunctionZeros.html +with besselj_roots.((0:5)', 1:5) +""" +function besselj_roots(ν, n::Integer)::Float64 + upper_bound = 10 + roots = find_zeros(x -> besselj(ν, x), 0, upper_bound) + while length(roots) < n + 1 + upper_bound *= 2 + roots = find_zeros(x -> besselj(ν, x), 0, upper_bound) + end + if roots[1] < 1e-10 + # If the first root is marked as 0, ignore + popfirst!(roots) + end + return roots[n] +end + +""" + ∂besselj_roots(ν, n::Integer)::Float64 + +Compute the n-th root of the first 
derivative of the Bessel functions of first kind, J'_ν + +Note: Compare against https://mathworld.wolfram.com/BesselFunctionZeros.html with +∂besselj_roots.((0:5)', 1:5) +""" +function ∂besselj_roots(ν, n::Integer)::Float64 + upper_bound = 10 + roots = find_zeros(x -> ∂besselj(ν, x), 0, upper_bound) + while length(roots) < n + 1 + upper_bound *= 2 + roots = find_zeros(x -> ∂besselj(ν, x), 0, upper_bound) + end + if roots[1] < 1e-10 + # If the first root is marked as 0, ignore + popfirst!(roots) + end + return roots[n] +end + +""" + frequency_transverse(n, m, l; ϵᵣ, μᵣ, a, d) + +Compute the resonant frequency of the transverse electric and magnetic mode indexed by n, m, +l, in GHz + +# Arguments + + - n - mode number in the circumferential direction + - m - mode number in the radial direction + - l - mode number in the z direction + - ϵᵣ - relative electric permittivity + - μᵣ - relative magnetic permeability + - a - radius of cavity in centimeters + - d - height of cavity in centimeters +""" +function frequency_transverse(n, m, l; ϵᵣ, μᵣ, a_cm, d_cm) + ϵ₀ = 8.8541878176e-12 + μ₀ = 4e-7 * π + + a = a_cm * 0.01 + d = d_cm * 0.01 + + c = 1.0 / sqrt(ϵ₀ * μ₀) + p_nm = besselj_roots(n, m) + ∂p_nm = ∂besselj_roots(n, m) + + C = (c / (2 * π * sqrt(ϵᵣ * μᵣ))) + + f_M = C * sqrt((p_nm / a)^2 + (l * π / d)^2) / 1e9 + f_E = C * sqrt((∂p_nm / a)^2 + (l * π / d)^2) / 1e9 + + return f_E, f_M +end + +""" + generate_cavity_convergence_data( + p_min::Integer=1, + p_max::Integer=3, + ref_min::Integer=0, + ref_max::Integer=3, + mesh_type::Integer=0, + num_processors::Integer=1 + ) + +Generate the data for the cavity convergence study + +# Arguments + + - p_min - minimum polynomial order + - p_max - maximum polynomial order + - ref_min - minimum number of levels of uniform mesh refinement + - ref_max - maximum number of levels of uniform mesh refinement + - mesh_type - 0 = tetrahedral mesh, 1 = prism mesh, 2 = hexahedral mesh + - num_processors - number of processors to use for the simulation +""" +function generate_cavity_convergence_data(; + p_min::Integer=1, + p_max::Integer=3, + ref_min::Integer=0, + ref_max::Integer=3, + mesh_type::Integer=0, + num_processors::Integer=1 +) + # Load the default JSON script (the file contains comments and we need to sanitize them) + cavity_dir = @__DIR__ + params = open(joinpath(cavity_dir, "cavity_pec.json"), "r") do f + return JSON.parse(join(getindex.(split.(eachline(f), "//"), 1), "\n")) + end + + # Update the dictionary + params["Problem"]["Verbose"] = 2 + params["Problem"]["Output"] = joinpath(cavity_dir, "postpro", "convergence") + params["Model"]["Refinement"]["UniformLevels"] = 0 # Don't perform any mesh refinement + params["Solver"]["Eigenmode"]["Save"] = 0 # Don't write any fields to file + params["Solver"]["Eigenmode"]["N"] = 4 # Look only for the top 4 modes + params["Solver"]["Eigenmode"]["Tol"] = 1e-12 + params["Solver"]["Eigenmode"]["Target"] = 2.0 + params["Solver"]["Eigenmode"]["StartVectorConstant"] = true + params["Solver"]["Linear"]["Tol"] = 1e-14 + + # Compute the exact solution for reference + radius = 2.74 + aspect_ratio = 1 / sqrt(2) + ~, f_TM_010_true = frequency_transverse( + 0, + 1, + 0; + ϵᵣ = 2.08, + μᵣ = 1.0, + a_cm = radius, + d_cm = aspect_ratio * 2 * radius + ) + f_TE_111_true, ~ = frequency_transverse( + 1, + 1, + 1; + ϵᵣ = 2.08, + μᵣ = 1.0, + a_cm = radius, + d_cm = aspect_ratio * 2 * radius + ) + + dof = Vector{Vector{Int}}() + f_TM_010 = Vector{Vector{Float64}}() + f_TE_111 = Vector{Vector{Float64}}() + + # Generate the data + for p = 
p_min:p_max + push!(dof, eltype(dof)()) + push!(f_TM_010, eltype(f_TM_010)()) + push!(f_TE_111, eltype(f_TE_111)()) + ref_lower = (p == 1 && mesh_type == 2) ? max(1, ref_min) : ref_min + ref_upper = (p > 3) ? min(3, ref_max) : ref_max + for ref = ref_lower:ref_upper + print("p = ", p, ", ref = ", ref, ": ") + results = solve_cavity_resonator( + params, + order=p, + geo_order=p, + refinement=ref, + mesh_type=mesh_type, + radius=radius, + aspect_ratio=aspect_ratio, + num_processors=ref < 2 ? 1 : num_processors + ) + println("Success! $(results[1]) dofs, finished at $(now())") + push!(dof[end], results[1]) + + # The first dominant frequency should be the magnetic, and the second the + # electric, but just in case we search for the closest + push!(f_TM_010[end], results[2][argmin(abs.(results[2] .- f_TM_010_true))]) + push!(f_TE_111[end], results[2][argmin(abs.(results[2] .- f_TE_111_true))]) + end + end + + f_TM_010_rel_error = map(x -> abs.(x .- f_TM_010_true) ./ f_TM_010_true, f_TM_010) + f_TE_111_rel_error = map(x -> abs.(x .- f_TE_111_true) ./ f_TE_111_true, f_TE_111) + + return dof, f_TM_010_rel_error, f_TE_111_rel_error +end diff --git a/examples/cavity/cavity_impedance.json b/examples/cavity/cavity_impedance.json index 93f184ea09..d99a481be7 100644 --- a/examples/cavity/cavity_impedance.json +++ b/examples/cavity/cavity_impedance.json @@ -1,68 +1,68 @@ -{ - "Problem": - { - "Type": "Eigenmode", - "Verbose": 2, - "Output": "D:/WelSimLLC/executable28/_palace_examples/cavity/postpro/impedance" - }, - "Model": - { - "Mesh": "D:/WelSimLLC/executable28/_palace_examples/cavity/mesh/cavity.msh", - "L0": 1.0e-2, // cm - "Refinement": - { - "UniformLevels": 1 - } - }, - "Domains": - { - "Materials": - [ - { - "Attributes": [1], - "Permeability": 1.0, - "Permittivity": 2.08, - "LossTan": 0.0004 - } - ], - "Postprocessing": - { - "Dielectric": - [ - { - "Index": 1, - "Attributes": [1] - } - ] - } - }, - "Boundaries": - { - "Impedance": - [ - { - "Attributes": [2], - "Rs": 0.0184 // Ω, surface resistance of Cu @ 5 GHz - } - ] - }, - "Solver": - { - "Order": 2, - "Eigenmode": - { - "N": 15, - "Tol": 1.0e-8, - "Target": 2.0, // TE f111 ~ 2.9 GHz - "Save": 15 - }, - "Linear": - { - "Type": "Default", - "KSPType": "GMRES", - "Tol": 1.0e-8, - "MaxIts": 100 - } - } -} - +{ + "Problem": + { + "Type": "Eigenmode", + "Verbose": 2, + "Output": "D:/WelSimLLC/executable28/_palace_examples/cavity/postpro/impedance" + }, + "Model": + { + "Mesh": "D:/WelSimLLC/executable28/_palace_examples/cavity/mesh/cavity.msh", + "L0": 1.0e-2, // cm + "Refinement": + { + "UniformLevels": 1 + } + }, + "Domains": + { + "Materials": + [ + { + "Attributes": [1], + "Permeability": 1.0, + "Permittivity": 2.08, + "LossTan": 0.0004 + } + ], + "Postprocessing": + { + "Dielectric": + [ + { + "Index": 1, + "Attributes": [1] + } + ] + } + }, + "Boundaries": + { + "Impedance": + [ + { + "Attributes": [2], + "Rs": 0.0184 // Ω, surface resistance of Cu @ 5 GHz + } + ] + }, + "Solver": + { + "Order": 2, + "Eigenmode": + { + "N": 15, + "Tol": 1.0e-8, + "Target": 2.0, // TE f111 ~ 2.9 GHz + "Save": 15 + }, + "Linear": + { + "Type": "Default", + "KSPType": "GMRES", + "Tol": 1.0e-8, + "MaxIts": 100 + } + } +} + diff --git a/examples/cavity/cavity_pec.json b/examples/cavity/cavity_pec.json index 755871cdfa..5c2f3643f4 100644 --- a/examples/cavity/cavity_pec.json +++ b/examples/cavity/cavity_pec.json @@ -1,65 +1,65 @@ -{ - "Problem": - { - "Type": "Eigenmode", - "Verbose": 2, - "Output": 
"D:/WelSimLLC/executable28/_palace_examples/cavity/postpro/pec" - }, - "Model": - { - "Mesh": "D:/WelSimLLC/executable28/_palace_examples/cavity/mesh/cavity.msh", - "L0": 1.0e-2, // cm - "Refinement": - { - "UniformLevels": 1 - } - }, - "Domains": - { - "Materials": - [ - { - "Attributes": [1], - "Permeability": 1.0, - "Permittivity": 2.08, - "LossTan": 0.0004 - } - ], - "Postprocessing": - { - "Dielectric": - [ - { - "Index": 1, - "Attributes": [1] - } - ] - } - }, - "Boundaries": - { - "PEC": - { - "Attributes": [2] - } - }, - "Solver": - { - "Order": 2, - "Eigenmode": - { - "N": 15, - "Tol": 1.0e-8, - "Target": 2.0, // TE f111 ~ 2.9 GHz - "Save": 15 - }, - "Linear": - { - "Type": "Default", - "KSPType": "GMRES", - "Tol": 1.0e-8, - "MaxIts": 100 - } - } -} - +{ + "Problem": + { + "Type": "Eigenmode", + "Verbose": 2, + "Output": "D:/WelSimLLC/executable28/_palace_examples/cavity/postpro/pec" + }, + "Model": + { + "Mesh": "D:/WelSimLLC/executable28/_palace_examples/cavity/mesh/cavity.msh", + "L0": 1.0e-2, // cm + "Refinement": + { + "UniformLevels": 1 + } + }, + "Domains": + { + "Materials": + [ + { + "Attributes": [1], + "Permeability": 1.0, + "Permittivity": 2.08, + "LossTan": 0.0004 + } + ], + "Postprocessing": + { + "Dielectric": + [ + { + "Index": 1, + "Attributes": [1] + } + ] + } + }, + "Boundaries": + { + "PEC": + { + "Attributes": [2] + } + }, + "Solver": + { + "Order": 2, + "Eigenmode": + { + "N": 15, + "Tol": 1.0e-8, + "Target": 2.0, // TE f111 ~ 2.9 GHz + "Save": 15 + }, + "Linear": + { + "Type": "Default", + "KSPType": "GMRES", + "Tol": 1.0e-8, + "MaxIts": 100 + } + } +} + diff --git a/examples/cavity/convergence_study.jl b/examples/cavity/convergence_study.jl index 510b84ddd7..e850a08b34 100644 --- a/examples/cavity/convergence_study.jl +++ b/examples/cavity/convergence_study.jl @@ -1,129 +1,129 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -#= - This script performs a convergence study of the cavity resonator problem - - Loops over mesh refinement level and order - a) Generates a mesh that corresponds, by calling `mesh.jl`, - b) Writes a JSON for driving the simulation, - c) Calls the solver and records the number of DOF, - d) Extracts from the written CSV files the eigenfrequency. - - Once these are generated, plots DOF^(-1/3) against error. 
-=# - -using DelimitedFiles -using Measures -using Plots -using PyPlot: matplotlib - -include(joinpath(@__DIR__, "cavity.jl")) - -# Plot settings -pyplot() -rcParams = PyPlot.PyDict(matplotlib["rcParams"]) -plotsz = (800, 400) -fntsz = 12 -fnt = font(fntsz) -rcParams["mathtext.fontset"] = "stix" -default( - size=plotsz, - palette=:Set1_9, - dpi=300, - tickfont=fnt, - guidefont=fnt, - legendfontsize=fntsz - 2, - margin=10mm -) - -mkrsz = 8 -markers = [ - (:circle, mkrsz, stroke(0)), - (:utriangle, mkrsz, stroke(0)), - (:square, mkrsz, stroke(0)), - (:dtriangle, mkrsz, stroke(0)), - (:star, mkrsz, stroke(0)) -] - -# Compute the convergence data -p_min = 1 -p_max = 3 -ref_min = 0 -ref_max = 3 -for mesh_type ∈ [0, 1, 2] - if mesh_type == 0 - mesh_name = "Tetrahedra" - elseif mesh_type == 1 - mesh_name = "Prism" - elseif mesh_type == 2 - mesh_name = "Hexahedra" - end - println("$mesh_name:") - - # Run simulations - dof, f_TM_010_relative_error, f_TE_111_relative_error = - generate_cavity_convergence_data( - p_min=p_min, - p_max=p_max, - ref_min=ref_min, - ref_max=ref_max, - mesh_type=mesh_type, - num_processors=6 - ) - - # Plot the convergence - xlbl = "\$DOF^{-1/3}\$" - ylbl = "Relative error, $mesh_name" - pp = plot(xlabel=xlbl, ylabel=ylbl, legend=:bottomright) - for p ∈ p_min:p_max - plot!( - pp, - dof[p] .^ (-1 / 3), - f_TM_010_relative_error[p], - label=string("\$f^{TM}_{010}, p = ", p, "\$"), - linestyle=:solid, - markers=markers[p], - color=p - ) - plot!( - pp, - dof[p] .^ (-1 / 3), - f_TE_111_relative_error[p], - label=string("\$f^{TE}_{111}, p = ", p, "\$"), - linestyle=:dash, - markers=markers[p], - color=p - ) - end - plot!(pp, xaxis=:log, yaxis=:log) - - # Compute the rate from the final entries in the relative error - # Let that e ~ C * h^k, where h ~ DOF^(-1/3), then log and compute the slopes between - # points - Δlogh = map(x -> log.(x[2:end] .^ (-1 / 3)) - log.(x[1:(end - 1)] .^ (-1 / 3)), dof) - Δlogf_TM_010 = map(x -> log.(x[2:end]) - log.(x[1:(end - 1)]), f_TM_010_relative_error) - Δlogf_TE_111 = map(x -> log.(x[2:end]) - log.(x[1:(end - 1)]), f_TE_111_relative_error) - - k_f_TM_010 = map((x, y) -> x ./ y, Δlogf_TM_010, Δlogh) - k_f_TE_111 = map((x, y) -> x ./ y, Δlogf_TE_111, Δlogh) - - println("k_f_TM_010 =", map(x -> round.(x, digits=2), k_f_TM_010)) - println("k_f_TE_111 =", map(x -> round.(x, digits=2), k_f_TE_111)) - - output_dir = joinpath(@__DIR__, "postpro", "convergence") - lmesh_name = lowercase(mesh_name) - - savefig(pp, joinpath(output_dir, string("cavity_error_", lmesh_name, ".png"))) - display(pp) - - writedlm(joinpath(output_dir, string("dof_", lmesh_name, ".csv")), dof) - writedlm( - joinpath(output_dir, string("f_TM_010_error_", lmesh_name, ".csv")), - f_TM_010_relative_error - ) - writedlm( - joinpath(output_dir, string("f_TE_111_error_", lmesh_name, ".csv")), - f_TE_111_relative_error - ) -end +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + +#= + This script performs a convergence study of the cavity resonator problem + + Loops over mesh refinement level and order + a) Generates a mesh that corresponds, by calling `mesh.jl`, + b) Writes a JSON for driving the simulation, + c) Calls the solver and records the number of DOF, + d) Extracts from the written CSV files the eigenfrequency. + + Once these are generated, plots DOF^(-1/3) against error. 
+=# + +using DelimitedFiles +using Measures +using Plots +using PyPlot: matplotlib + +include(joinpath(@__DIR__, "cavity.jl")) + +# Plot settings +pyplot() +rcParams = PyPlot.PyDict(matplotlib["rcParams"]) +plotsz = (800, 400) +fntsz = 12 +fnt = font(fntsz) +rcParams["mathtext.fontset"] = "stix" +default( + size=plotsz, + palette=:Set1_9, + dpi=300, + tickfont=fnt, + guidefont=fnt, + legendfontsize=fntsz - 2, + margin=10mm +) + +mkrsz = 8 +markers = [ + (:circle, mkrsz, stroke(0)), + (:utriangle, mkrsz, stroke(0)), + (:square, mkrsz, stroke(0)), + (:dtriangle, mkrsz, stroke(0)), + (:star, mkrsz, stroke(0)) +] + +# Compute the convergence data +p_min = 1 +p_max = 3 +ref_min = 0 +ref_max = 3 +for mesh_type ∈ [0, 1, 2] + if mesh_type == 0 + mesh_name = "Tetrahedra" + elseif mesh_type == 1 + mesh_name = "Prism" + elseif mesh_type == 2 + mesh_name = "Hexahedra" + end + println("$mesh_name:") + + # Run simulations + dof, f_TM_010_relative_error, f_TE_111_relative_error = + generate_cavity_convergence_data( + p_min=p_min, + p_max=p_max, + ref_min=ref_min, + ref_max=ref_max, + mesh_type=mesh_type, + num_processors=6 + ) + + # Plot the convergence + xlbl = "\$DOF^{-1/3}\$" + ylbl = "Relative error, $mesh_name" + pp = plot(xlabel=xlbl, ylabel=ylbl, legend=:bottomright) + for p ∈ p_min:p_max + plot!( + pp, + dof[p] .^ (-1 / 3), + f_TM_010_relative_error[p], + label=string("\$f^{TM}_{010}, p = ", p, "\$"), + linestyle=:solid, + markers=markers[p], + color=p + ) + plot!( + pp, + dof[p] .^ (-1 / 3), + f_TE_111_relative_error[p], + label=string("\$f^{TE}_{111}, p = ", p, "\$"), + linestyle=:dash, + markers=markers[p], + color=p + ) + end + plot!(pp, xaxis=:log, yaxis=:log) + + # Compute the rate from the final entries in the relative error + # Let that e ~ C * h^k, where h ~ DOF^(-1/3), then log and compute the slopes between + # points + Δlogh = map(x -> log.(x[2:end] .^ (-1 / 3)) - log.(x[1:(end - 1)] .^ (-1 / 3)), dof) + Δlogf_TM_010 = map(x -> log.(x[2:end]) - log.(x[1:(end - 1)]), f_TM_010_relative_error) + Δlogf_TE_111 = map(x -> log.(x[2:end]) - log.(x[1:(end - 1)]), f_TE_111_relative_error) + + k_f_TM_010 = map((x, y) -> x ./ y, Δlogf_TM_010, Δlogh) + k_f_TE_111 = map((x, y) -> x ./ y, Δlogf_TE_111, Δlogh) + + println("k_f_TM_010 =", map(x -> round.(x, digits=2), k_f_TM_010)) + println("k_f_TE_111 =", map(x -> round.(x, digits=2), k_f_TE_111)) + + output_dir = joinpath(@__DIR__, "postpro", "convergence") + lmesh_name = lowercase(mesh_name) + + savefig(pp, joinpath(output_dir, string("cavity_error_", lmesh_name, ".png"))) + display(pp) + + writedlm(joinpath(output_dir, string("dof_", lmesh_name, ".csv")), dof) + writedlm( + joinpath(output_dir, string("f_TM_010_error_", lmesh_name, ".csv")), + f_TM_010_relative_error + ) + writedlm( + joinpath(output_dir, string("f_TE_111_error_", lmesh_name, ".csv")), + f_TE_111_relative_error + ) +end diff --git a/examples/cavity/mesh/cavity.msh b/examples/cavity/mesh/cavity.msh index 5e36000db4..1d18c8f610 100644 --- a/examples/cavity/mesh/cavity.msh +++ b/examples/cavity/mesh/cavity.msh @@ -1,585 +1,585 @@ -$MeshFormat -2.2 0 8 -$EndMeshFormat -$PhysicalNames -2 -2 2 "boundaries" -3 1 "cylinder" -$EndPhysicalNames -$Nodes -423 -1 2.74 0 0 -2 2.74 0 5.48 -3 2.216706564587355 1.610531591281378 0 -4 0.8467065645873526 2.605894854648722 0 -5 -0.8467065645873628 2.605894854648719 0 -6 -2.216706564587362 1.610531591281368 0 -7 -2.74 -1.061568669193617e-14 0 -8 -2.21670656458735 -1.610531591281385 0 -9 -0.8467065645873448 -2.605894854648724 0 -10 0.8467065645873671 
-2.605894854648718 0 -11 2.216706564587358 -1.610531591281373 0 -12 2.605894854648721 0.8467065645873567 0 -13 1.610531591281374 2.216706564587358 0 -14 -5.307843345968085e-15 2.74 0 -15 -1.610531591281384 2.21670656458735 0 -16 -2.605894854648724 0.8467065645873459 0 -17 -2.605894854648718 -0.8467065645873662 0 -18 -1.610531591281367 -2.216706564587363 0 -19 1.166471451544216e-14 -2.74 0 -20 1.610531591281384 -2.21670656458735 0 -21 2.605894854648721 -0.8467065645873543 0 -22 2.74 0 1.37 -23 2.74 0 2.74 -24 2.74 0 4.11 -25 2.74 0 0.6850000000000001 -26 2.74 0 2.055 -27 2.74 0 3.425 -28 2.74 0 4.795 -29 2.216706564587355 1.610531591281378 5.48 -30 0.8467065645873526 2.605894854648722 5.48 -31 -0.8467065645873628 2.605894854648719 5.48 -32 -2.216706564587362 1.610531591281368 5.48 -33 -2.74 -1.061568669193617e-14 5.48 -34 -2.21670656458735 -1.610531591281385 5.48 -35 -0.8467065645873448 -2.605894854648724 5.48 -36 0.8467065645873671 -2.605894854648718 5.48 -37 2.216706564587358 -1.610531591281373 5.48 -38 2.605894854648721 0.8467065645873567 5.48 -39 1.610531591281374 2.216706564587358 5.48 -40 -5.307843345968085e-15 2.74 5.48 -41 -1.610531591281384 2.21670656458735 5.48 -42 -2.605894854648724 0.8467065645873459 5.48 -43 -2.605894854648718 -0.8467065645873662 5.48 -44 -1.610531591281367 -2.216706564587363 5.48 -45 1.166471451544216e-14 -2.74 5.48 -46 1.610531591281384 -2.21670656458735 5.48 -47 2.605894854648721 -0.8467065645873543 5.48 -48 0.5795539226760824 -0.9482384955974738 0 -49 -0.6331807601024405 0.7144594569041222 0 -50 1.289670577110049 0.3070664825272769 0 -51 -1.098636832574902 -0.8260373617931617 0 -52 0.50042300124461 1.392933945380423 0 -53 0.9346122498930655 -0.3205860065350984 0 -54 0.3282449085038041 0.5107629697156995 0 -55 -0.02681341871317906 -0.1168895193466757 0 -56 -1.919318416287451 -0.4130186808965861 0 -57 -0.8659087963386712 -0.0557889524445197 0 -58 -1.68659038005122 0.3572297284520558 0 -59 -0.2595414549494097 -0.8871379286953177 0 -60 -0.9726716985811233 -1.715966108220943 0 -61 -0.1335763209556312 -1.777066675123099 0 -62 -1.424943662344901 1.162495524092745 0 -63 0.7131302436317247 -1.777066675123096 0 -64 2.014835288555024 0.1535332412636384 0 -65 1.753188570848704 -0.6517325543770481 0 -66 1.398130243631721 -1.279385043439423 0 -67 0.8950467891773294 0.8500002139538499 0 -68 -0.06637887942891524 1.053696701142273 0 -69 -1.657671698581126 -1.218284476537273 0 -70 -0.7399436623449017 1.66017715577642 0 -71 -0.1731417816713763 1.999414400014571 0 -72 0.6735647829159813 1.999414400014573 0 -73 1.358564782915983 1.5017327683309 0 -74 1.753188570848702 0.9587990369043272 0 -75 2.216706564587355 1.610531591281378 1.37 -76 2.216706564587355 1.610531591281378 2.74 -77 2.216706564587355 1.610531591281378 4.11 -78 0.8467065645873526 2.605894854648722 1.37 -79 0.8467065645873526 2.605894854648722 2.74 -80 0.8467065645873526 2.605894854648722 4.11 -81 -0.8467065645873628 2.605894854648719 1.37 -82 -0.8467065645873628 2.605894854648719 2.74 -83 -0.8467065645873628 2.605894854648719 4.11 -84 -2.216706564587362 1.610531591281368 1.37 -85 -2.216706564587362 1.610531591281368 2.74 -86 -2.216706564587362 1.610531591281368 4.11 -87 -2.74 -1.061568669193617e-14 1.37 -88 -2.74 -1.061568669193617e-14 2.74 -89 -2.74 -1.061568669193617e-14 4.11 -90 -2.21670656458735 -1.610531591281385 1.37 -91 -2.21670656458735 -1.610531591281385 2.74 -92 -2.21670656458735 -1.610531591281385 4.11 -93 -0.8467065645873448 -2.605894854648724 1.37 -94 -0.8467065645873448 -2.605894854648724 2.74 -95 
-0.8467065645873448 -2.605894854648724 4.11 -96 0.8467065645873671 -2.605894854648718 1.37 -97 0.8467065645873671 -2.605894854648718 2.74 -98 0.8467065645873671 -2.605894854648718 4.11 -99 2.216706564587358 -1.610531591281373 1.37 -100 2.216706564587358 -1.610531591281373 2.74 -101 2.216706564587358 -1.610531591281373 4.11 -102 2.216706564587355 1.610531591281378 0.6850000000000001 -103 2.60589485464872 0.8467065645873568 1.37 -104 2.605894854648721 0.8467065645873568 0.6850000000000001 -105 2.216706564587355 1.610531591281378 2.055 -106 2.60589485464872 0.8467065645873568 2.74 -107 2.605894854648721 0.8467065645873568 2.055 -108 2.216706564587355 1.610531591281378 3.425 -109 2.60589485464872 0.8467065645873568 4.11 -110 2.605894854648721 0.8467065645873568 3.425000000000001 -111 2.216706564587355 1.610531591281378 4.795 -112 2.605894854648721 0.8467065645873568 4.795 -113 0.8467065645873522 2.605894854648723 0.6850000000000001 -114 1.610531591281374 2.216706564587358 1.37 -115 1.610531591281374 2.216706564587358 0.6850000000000001 -116 0.8467065645873522 2.605894854648723 2.055 -117 1.610531591281374 2.216706564587358 2.74 -118 1.610531591281374 2.216706564587358 2.055 -119 0.8467065645873522 2.605894854648723 3.425 -120 1.610531591281374 2.216706564587358 4.11 -121 1.610531591281374 2.216706564587358 3.425000000000001 -122 0.8467065645873522 2.605894854648723 4.795 -123 1.610531591281374 2.216706564587358 4.795 -124 -0.8467065645873628 2.605894854648719 0.6850000000000001 -125 1.677766114831874e-16 2.74 1.37 -126 -2.775557561562891e-15 2.74 0.6850000000000001 -127 -0.8467065645873628 2.605894854648719 2.055 -128 1.677766114831874e-16 2.74 2.74 -129 0 2.74 2.055 -130 -0.8467065645873628 2.605894854648719 3.425 -131 1.677766114831874e-16 2.74 4.11 -132 0 2.74 3.425000000000001 -133 -0.8467065645873628 2.605894854648719 4.795 -134 -2.775557561562891e-15 2.74 4.795 -135 -2.216706564587362 1.610531591281368 0.6850000000000001 -136 -1.610531591281384 2.21670656458735 1.37 -137 -1.610531591281384 2.21670656458735 0.6850000000000001 -138 -2.216706564587362 1.610531591281368 2.055 -139 -1.610531591281384 2.21670656458735 2.74 -140 -1.610531591281384 2.21670656458735 2.055 -141 -2.216706564587362 1.610531591281368 3.425 -142 -1.610531591281384 2.21670656458735 4.11 -143 -1.610531591281384 2.21670656458735 3.425000000000001 -144 -2.216706564587362 1.610531591281368 4.795 -145 -1.610531591281384 2.21670656458735 4.795 -146 -2.74 3.355532229663748e-16 0.6850000000000001 -147 -2.605894854648724 0.8467065645873459 1.37 -148 -2.605894854648724 0.8467065645873515 0.6850000000000001 -149 -2.74 3.355532229663748e-16 2.055 -150 -2.605894854648724 0.8467065645873459 2.74 -151 -2.605894854648724 0.8467065645873515 2.055 -152 -2.74 3.355532229663748e-16 3.425 -153 -2.605894854648724 0.8467065645873459 4.11 -154 -2.605894854648724 0.8467065645873515 3.425000000000001 -155 -2.74 3.355532229663748e-16 4.795 -156 -2.605894854648724 0.8467065645873515 4.795 -157 -2.21670656458735 -1.610531591281384 0.6850000000000001 -158 -2.605894854648718 -0.8467065645873649 1.37 -159 -2.605894854648718 -0.8467065645873595 0.6850000000000001 -160 -2.21670656458735 -1.610531591281384 2.055 -161 -2.605894854648718 -0.8467065645873649 2.74 -162 -2.605894854648719 -0.8467065645873589 2.055 -163 -2.21670656458735 -1.610531591281384 3.425 -164 -2.605894854648718 -0.8467065645873649 4.11 -165 -2.605894854648719 -0.8467065645873589 3.425000000000001 -166 -2.21670656458735 -1.610531591281384 4.795 -167 -2.605894854648718 
-0.8467065645873595 4.795 -168 -0.8467065645873448 -2.605894854648724 0.6850000000000001 -169 -1.610531591281369 -2.216706564587362 1.37 -170 -1.610531591281368 -2.216706564587362 0.6850000000000001 -171 -0.8467065645873448 -2.605894854648724 2.055 -172 -1.610531591281369 -2.216706564587362 2.74 -173 -1.61053159128137 -2.216706564587361 2.055 -174 -0.8467065645873448 -2.605894854648724 3.425 -175 -1.610531591281369 -2.216706564587362 4.11 -176 -1.61053159128137 -2.216706564587361 3.425000000000001 -177 -0.8467065645873448 -2.605894854648724 4.795 -178 -1.610531591281368 -2.216706564587362 4.795 -179 0.8467065645873671 -2.605894854648718 0.6850000000000001 -180 -5.033298344495622e-16 -2.74 1.37 -181 5.551115123125783e-15 -2.74 0.6850000000000001 -182 0.8467065645873671 -2.605894854648718 2.055 -183 -5.033298344495622e-16 -2.74 2.74 -184 -5.551115123125783e-16 -2.74 2.055 -185 0.8467065645873671 -2.605894854648718 3.425 -186 -5.033298344495622e-16 -2.74 4.11 -187 -5.551115123125783e-16 -2.74 3.425000000000001 -188 0.8467065645873671 -2.605894854648718 4.795 -189 5.551115123125783e-15 -2.74 4.795 -190 2.216706564587358 -1.610531591281373 0.6850000000000001 -191 1.610531591281382 -2.216706564587352 1.37 -192 1.610531591281383 -2.216706564587351 0.6850000000000001 -193 2.216706564587358 -1.610531591281373 2.055 -194 1.610531591281382 -2.216706564587352 2.74 -195 1.610531591281382 -2.216706564587352 2.055 -196 2.216706564587358 -1.610531591281373 3.425 -197 1.610531591281382 -2.216706564587352 4.11 -198 1.610531591281382 -2.216706564587352 3.425000000000001 -199 2.216706564587358 -1.610531591281373 4.795 -200 1.610531591281383 -2.216706564587351 4.795 -201 2.605894854648721 -0.8467065645873543 1.37 -202 2.605894854648721 -0.8467065645873543 0.6850000000000001 -203 2.605894854648721 -0.8467065645873543 2.74 -204 2.605894854648721 -0.8467065645873543 2.055 -205 2.605894854648721 -0.8467065645873543 4.11 -206 2.605894854648721 -0.8467065645873543 3.425000000000001 -207 2.605894854648721 -0.8467065645873543 4.795 -208 0.5795539226760824 -0.9482384955974738 5.48 -209 -0.6331807601024405 0.7144594569041222 5.48 -210 1.289670577110049 0.3070664825272769 5.48 -211 -1.098636832574902 -0.8260373617931617 5.48 -212 0.50042300124461 1.392933945380423 5.48 -213 0.9346122498930655 -0.3205860065350984 5.48 -214 0.3282449085038041 0.5107629697156995 5.48 -215 -0.02681341871317906 -0.1168895193466757 5.48 -216 -1.919318416287451 -0.4130186808965861 5.48 -217 -0.8659087963386712 -0.0557889524445197 5.48 -218 -1.68659038005122 0.3572297284520558 5.48 -219 -0.2595414549494097 -0.8871379286953177 5.48 -220 -0.9726716985811233 -1.715966108220943 5.48 -221 -0.1335763209556312 -1.777066675123099 5.48 -222 -1.424943662344901 1.162495524092745 5.48 -223 0.7131302436317247 -1.777066675123096 5.48 -224 2.014835288555024 0.1535332412636384 5.48 -225 1.753188570848704 -0.6517325543770481 5.48 -226 1.398130243631721 -1.279385043439423 5.48 -227 0.8950467891773294 0.8500002139538499 5.48 -228 -0.06637887942891524 1.053696701142273 5.48 -229 -1.657671698581126 -1.218284476537273 5.48 -230 -0.7399436623449017 1.66017715577642 5.48 -231 -0.1731417816713763 1.999414400014571 5.48 -232 0.6735647829159813 1.999414400014573 5.48 -233 1.358564782915983 1.5017327683309 5.48 -234 1.753188570848702 0.9587990369043272 5.48 -235 0.5795539226760824 -0.9482384955974738 1.37 -236 0.5795539226760824 -0.9482384955974738 2.74 -237 0.5795539226760824 -0.9482384955974738 4.11 -238 -0.6331807601024405 0.7144594569041222 1.37 -239 
-0.6331807601024405 0.7144594569041222 2.74 -240 -0.6331807601024405 0.7144594569041222 4.11 -241 1.289670577110049 0.3070664825272769 1.37 -242 1.289670577110049 0.3070664825272769 2.74 -243 1.289670577110049 0.3070664825272769 4.11 -244 -1.098636832574902 -0.8260373617931617 1.37 -245 -1.098636832574902 -0.8260373617931617 2.74 -246 -1.098636832574902 -0.8260373617931617 4.11 -247 0.50042300124461 1.392933945380423 1.37 -248 0.50042300124461 1.392933945380423 2.74 -249 0.50042300124461 1.392933945380423 4.11 -250 -0.6331807601024405 0.7144594569041222 0.6850000000000001 -251 0.5795539226760824 -0.9482384955974738 0.6850000000000001 -252 1.289670577110049 0.3070664825272769 0.6850000000000001 -253 -0.02681341871317905 -0.1168895193466758 1.37 -254 0.3282449085038041 0.5107629697156995 1.37 -255 0.9346122498930656 -0.3205860065350984 1.37 -256 -0.02681341871317905 -0.1168895193466757 0.6850000000000001 -257 0.3282449085038042 0.5107629697156995 0.6850000000000001 -258 0.9346122498930656 -0.3205860065350984 0.6850000000000001 -259 -0.6331807601024405 0.7144594569041222 2.055 -260 0.5795539226760824 -0.9482384955974738 2.055 -261 1.289670577110049 0.3070664825272769 2.055 -262 -0.02681341871317905 -0.1168895193466758 2.74 -263 0.3282449085038041 0.5107629697156995 2.74 -264 0.9346122498930656 -0.3205860065350984 2.74 -265 -0.02681341871317905 -0.1168895193466758 2.055 -266 0.3282449085038042 0.5107629697156995 2.055 -267 0.9346122498930656 -0.3205860065350984 2.055 -268 -0.6331807601024405 0.7144594569041222 3.425 -269 0.5795539226760824 -0.9482384955974738 3.425 -270 1.289670577110049 0.3070664825272769 3.425 -271 -0.02681341871317905 -0.1168895193466758 4.11 -272 0.3282449085038041 0.5107629697156995 4.11 -273 0.9346122498930656 -0.3205860065350984 4.11 -274 -0.02681341871317905 -0.1168895193466758 3.425000000000001 -275 0.3282449085038042 0.5107629697156995 3.425 -276 0.9346122498930656 -0.3205860065350984 3.425000000000001 -277 -0.6331807601024405 0.7144594569041222 4.795 -278 0.5795539226760824 -0.9482384955974738 4.795 -279 1.289670577110049 0.3070664825272769 4.795 -280 -0.0268134187131791 -0.1168895193466757 4.795 -281 0.3282449085038042 0.5107629697156995 4.795 -282 0.9346122498930656 -0.3205860065350984 4.795 -283 -1.098636832574902 -0.8260373617931617 0.6850000000000001 -284 -1.68659038005122 0.3572297284520558 1.37 -285 -0.8659087963386711 -0.05578895244451976 1.37 -286 -1.919318416287451 -0.4130186808965862 1.37 -287 -1.68659038005122 0.3572297284520612 0.6850000000000001 -288 -0.8659087963386711 -0.05578895244451973 0.6850000000000001 -289 -1.919318416287451 -0.4130186808965806 0.6850000000000001 -290 -1.098636832574902 -0.8260373617931617 2.055 -291 -1.68659038005122 0.3572297284520558 2.74 -292 -0.8659087963386711 -0.05578895244451976 2.74 -293 -1.919318416287451 -0.4130186808965862 2.74 -294 -1.68659038005122 0.3572297284520612 2.055 -295 -0.8659087963386711 -0.05578895244451976 2.055 -296 -1.919318416287451 -0.4130186808965807 2.055 -297 -1.098636832574902 -0.8260373617931617 3.425 -298 -1.68659038005122 0.3572297284520558 4.11 -299 -0.8659087963386711 -0.05578895244451976 4.11 -300 -1.919318416287451 -0.4130186808965862 4.11 -301 -1.68659038005122 0.3572297284520612 3.425000000000001 -302 -0.8659087963386711 -0.05578895244451976 3.425 -303 -1.919318416287451 -0.4130186808965807 3.425000000000001 -304 -1.098636832574902 -0.8260373617931617 4.795 -305 -1.68659038005122 0.3572297284520612 4.795 -306 -0.8659087963386711 -0.05578895244451976 4.795 -307 -1.919318416287451 
-0.4130186808965806 4.795 -308 -0.1335763209556312 -1.777066675123099 1.37 -309 -0.9726716985811233 -1.715966108220943 1.37 -310 -0.2595414549494097 -0.8871379286953177 1.37 -311 -0.1335763209556312 -1.777066675123099 0.6850000000000001 -312 -0.9726716985811233 -1.715966108220943 0.6850000000000001 -313 -0.2595414549494097 -0.8871379286953177 0.6850000000000001 -314 -0.1335763209556312 -1.777066675123099 2.74 -315 -0.9726716985811233 -1.715966108220943 2.74 -316 -0.2595414549494097 -0.8871379286953177 2.74 -317 -0.1335763209556312 -1.777066675123099 2.055 -318 -0.9726716985811233 -1.715966108220943 2.055 -319 -0.2595414549494097 -0.8871379286953177 2.055 -320 -0.1335763209556312 -1.777066675123099 4.11 -321 -0.9726716985811233 -1.715966108220943 4.11 -322 -0.2595414549494097 -0.8871379286953177 4.11 -323 -0.1335763209556312 -1.777066675123099 3.425000000000001 -324 -0.9726716985811233 -1.715966108220943 3.425 -325 -0.2595414549494097 -0.8871379286953177 3.425000000000001 -326 -0.1335763209556312 -1.777066675123099 4.795 -327 -0.9726716985811233 -1.715966108220943 4.795 -328 -0.2595414549494097 -0.8871379286953177 4.795 -329 -1.424943662344901 1.162495524092745 1.37 -330 -1.424943662344901 1.162495524092745 0.6850000000000001 -331 -1.424943662344901 1.162495524092745 2.74 -332 -1.424943662344901 1.162495524092745 2.055 -333 -1.424943662344901 1.162495524092745 4.11 -334 -1.424943662344901 1.162495524092745 3.425 -335 -1.424943662344901 1.162495524092745 4.795 -336 0.7131302436317248 -1.777066675123096 1.37 -337 0.7131302436317247 -1.777066675123096 0.6850000000000001 -338 0.7131302436317248 -1.777066675123096 2.74 -339 0.7131302436317248 -1.777066675123096 2.055 -340 0.7131302436317248 -1.777066675123096 4.11 -341 0.7131302436317248 -1.777066675123096 3.425000000000001 -342 0.7131302436317247 -1.777066675123096 4.795 -343 1.753188570848704 -0.651732554377048 1.37 -344 2.014835288555024 0.1535332412636384 1.37 -345 1.753188570848704 -0.651732554377048 0.6850000000000001 -346 2.014835288555024 0.1535332412636384 0.6850000000000001 -347 1.753188570848704 -0.651732554377048 2.74 -348 2.014835288555024 0.1535332412636384 2.74 -349 1.753188570848704 -0.651732554377048 2.055 -350 2.014835288555024 0.1535332412636384 2.055 -351 1.753188570848704 -0.651732554377048 4.11 -352 2.014835288555024 0.1535332412636384 4.11 -353 1.753188570848704 -0.651732554377048 3.425 -354 2.014835288555024 0.1535332412636384 3.425000000000001 -355 1.753188570848704 -0.651732554377048 4.795 -356 2.014835288555024 0.1535332412636384 4.795 -357 1.39813024363172 -1.279385043439423 1.37 -358 1.398130243631721 -1.279385043439423 0.6850000000000001 -359 1.39813024363172 -1.279385043439423 2.74 -360 1.398130243631721 -1.279385043439423 2.055 -361 1.39813024363172 -1.279385043439423 4.11 -362 1.398130243631721 -1.279385043439423 3.425000000000001 -363 1.398130243631721 -1.279385043439423 4.795 -364 0.50042300124461 1.392933945380423 0.6850000000000001 -365 -0.06637887942891524 1.053696701142273 1.37 -366 0.8950467891773294 0.8500002139538499 1.37 -367 -0.06637887942891527 1.053696701142273 0.6850000000000001 -368 0.8950467891773294 0.8500002139538501 0.6850000000000001 -369 0.50042300124461 1.392933945380423 2.055 -370 -0.06637887942891524 1.053696701142273 2.74 -371 0.8950467891773294 0.8500002139538499 2.74 -372 -0.06637887942891527 1.053696701142273 2.055 -373 0.8950467891773294 0.8500002139538501 2.055 -374 0.50042300124461 1.392933945380423 3.425 -375 -0.06637887942891524 1.053696701142273 4.11 -376 0.8950467891773294 
0.8500002139538499 4.11 -377 -0.06637887942891527 1.053696701142273 3.425 -378 0.8950467891773294 0.8500002139538501 3.425000000000001 -379 0.50042300124461 1.392933945380423 4.795 -380 -0.06637887942891527 1.053696701142273 4.795 -381 0.8950467891773294 0.8500002139538501 4.795 -382 -1.657671698581126 -1.218284476537273 1.37 -383 -1.657671698581126 -1.218284476537273 0.6850000000000001 -384 -1.657671698581126 -1.218284476537273 2.74 -385 -1.657671698581126 -1.218284476537273 2.055 -386 -1.657671698581126 -1.218284476537273 4.11 -387 -1.657671698581126 -1.218284476537273 3.425000000000001 -388 -1.657671698581126 -1.218284476537273 4.795 -389 -0.7399436623449016 1.66017715577642 1.37 -390 -0.7399436623449017 1.66017715577642 0.6850000000000001 -391 -0.7399436623449016 1.66017715577642 2.74 -392 -0.7399436623449016 1.66017715577642 2.055 -393 -0.7399436623449016 1.66017715577642 4.11 -394 -0.7399436623449016 1.66017715577642 3.425 -395 -0.7399436623449017 1.66017715577642 4.795 -396 -0.1731417816713764 1.999414400014571 1.37 -397 -0.1731417816713764 1.999414400014571 0.6850000000000001 -398 -0.1731417816713764 1.999414400014571 2.74 -399 -0.1731417816713764 1.999414400014571 2.055 -400 -0.1731417816713764 1.999414400014571 4.11 -401 -0.1731417816713764 1.999414400014571 3.425 -402 -0.1731417816713763 1.999414400014571 4.795 -403 0.6735647829159813 1.999414400014573 1.37 -404 0.673564782915981 1.999414400014573 0.6850000000000001 -405 0.6735647829159813 1.999414400014573 2.74 -406 0.673564782915981 1.999414400014573 2.055 -407 0.6735647829159813 1.999414400014573 4.11 -408 0.673564782915981 1.999414400014573 3.425 -409 0.673564782915981 1.999414400014573 4.795 -410 1.358564782915983 1.5017327683309 1.37 -411 1.358564782915983 1.5017327683309 0.6850000000000001 -412 1.358564782915983 1.5017327683309 2.74 -413 1.358564782915983 1.5017327683309 2.055 -414 1.358564782915983 1.5017327683309 4.11 -415 1.358564782915983 1.5017327683309 3.425 -416 1.358564782915983 1.5017327683309 4.795 -417 1.753188570848702 0.9587990369043273 1.37 -418 1.753188570848702 0.9587990369043273 0.6850000000000001 -419 1.753188570848702 0.9587990369043273 2.74 -420 1.753188570848702 0.9587990369043273 2.055 -421 1.753188570848702 0.9587990369043273 4.11 -422 1.753188570848702 0.9587990369043273 3.425000000000001 -423 1.753188570848702 0.9587990369043273 4.795 -$EndNodes -$Elements -148 -1 9 2 2 1 48 50 49 53 54 55 -2 9 2 2 1 7 51 49 56 57 58 -3 9 2 2 1 48 51 9 59 60 61 -4 9 2 2 1 49 51 48 57 59 55 -5 9 2 2 1 7 49 6 58 62 16 -6 9 2 2 1 10 48 9 63 61 19 -7 9 2 2 1 1 50 11 64 65 21 -8 9 2 2 1 11 50 48 65 53 66 -9 9 2 2 1 50 52 49 67 68 54 -10 9 2 2 1 8 51 7 69 56 17 -11 9 2 2 1 9 51 8 60 69 18 -12 9 2 2 1 6 49 5 62 70 15 -13 9 2 2 1 11 48 10 66 63 20 -14 9 2 2 1 49 52 5 68 71 70 -15 9 2 2 1 5 52 4 71 72 14 -16 9 2 2 1 4 52 3 72 73 13 -17 9 2 2 1 3 50 1 74 64 12 -18 9 2 2 1 3 52 50 73 67 74 -19 9 2 2 3 208 210 209 213 214 215 -20 9 2 2 3 33 211 209 216 217 218 -21 9 2 2 3 208 211 35 219 220 221 -22 9 2 2 3 209 211 208 217 219 215 -23 9 2 2 3 33 209 32 218 222 42 -24 9 2 2 3 36 208 35 223 221 45 -25 9 2 2 3 2 210 37 224 225 47 -26 9 2 2 3 37 210 208 225 213 226 -27 9 2 2 3 210 212 209 227 228 214 -28 9 2 2 3 34 211 33 229 216 43 -29 9 2 2 3 35 211 34 220 229 44 -30 9 2 2 3 32 209 31 222 230 41 -31 9 2 2 3 37 208 36 226 223 46 -32 9 2 2 3 209 212 31 228 231 230 -33 9 2 2 3 31 212 30 231 232 40 -34 9 2 2 3 30 212 29 232 233 39 -35 9 2 2 3 29 210 2 234 224 38 -36 9 2 2 3 29 212 210 233 227 234 -37 10 2 2 2 1 3 75 22 12 102 103 25 
104 -38 10 2 2 2 22 75 76 23 103 105 106 26 107 -39 10 2 2 2 23 76 77 24 106 108 109 27 110 -40 10 2 2 2 24 77 29 2 109 111 38 28 112 -41 10 2 2 2 3 4 78 75 13 113 114 102 115 -42 10 2 2 2 75 78 79 76 114 116 117 105 118 -43 10 2 2 2 76 79 80 77 117 119 120 108 121 -44 10 2 2 2 77 80 30 29 120 122 39 111 123 -45 10 2 2 2 4 5 81 78 14 124 125 113 126 -46 10 2 2 2 78 81 82 79 125 127 128 116 129 -47 10 2 2 2 79 82 83 80 128 130 131 119 132 -48 10 2 2 2 80 83 31 30 131 133 40 122 134 -49 10 2 2 2 5 6 84 81 15 135 136 124 137 -50 10 2 2 2 81 84 85 82 136 138 139 127 140 -51 10 2 2 2 82 85 86 83 139 141 142 130 143 -52 10 2 2 2 83 86 32 31 142 144 41 133 145 -53 10 2 2 2 6 7 87 84 16 146 147 135 148 -54 10 2 2 2 84 87 88 85 147 149 150 138 151 -55 10 2 2 2 85 88 89 86 150 152 153 141 154 -56 10 2 2 2 86 89 33 32 153 155 42 144 156 -57 10 2 2 2 7 8 90 87 17 157 158 146 159 -58 10 2 2 2 87 90 91 88 158 160 161 149 162 -59 10 2 2 2 88 91 92 89 161 163 164 152 165 -60 10 2 2 2 89 92 34 33 164 166 43 155 167 -61 10 2 2 2 8 9 93 90 18 168 169 157 170 -62 10 2 2 2 90 93 94 91 169 171 172 160 173 -63 10 2 2 2 91 94 95 92 172 174 175 163 176 -64 10 2 2 2 92 95 35 34 175 177 44 166 178 -65 10 2 2 2 9 10 96 93 19 179 180 168 181 -66 10 2 2 2 93 96 97 94 180 182 183 171 184 -67 10 2 2 2 94 97 98 95 183 185 186 174 187 -68 10 2 2 2 95 98 36 35 186 188 45 177 189 -69 10 2 2 2 10 11 99 96 20 190 191 179 192 -70 10 2 2 2 96 99 100 97 191 193 194 182 195 -71 10 2 2 2 97 100 101 98 194 196 197 185 198 -72 10 2 2 2 98 101 37 36 197 199 46 188 200 -73 10 2 2 2 11 1 22 99 21 25 201 190 202 -74 10 2 2 2 99 22 23 100 201 26 203 193 204 -75 10 2 2 2 100 23 24 101 203 27 205 196 206 -76 10 2 2 2 101 24 2 37 205 28 47 199 207 -77 13 2 1 1 49 48 50 238 235 241 55 54 250 53 251 252 253 254 255 256 257 258 -78 13 2 1 1 238 235 241 239 236 242 253 254 259 255 260 261 262 263 264 265 266 267 -79 13 2 1 1 239 236 242 240 237 243 262 263 268 264 269 270 271 272 273 274 275 276 -80 13 2 1 1 240 237 243 209 208 210 271 272 277 273 278 279 215 214 213 280 281 282 -81 13 2 1 1 49 7 51 238 87 244 58 57 250 56 146 283 284 285 286 287 288 289 -82 13 2 1 1 238 87 244 239 88 245 284 285 259 286 149 290 291 292 293 294 295 296 -83 13 2 1 1 239 88 245 240 89 246 291 292 268 293 152 297 298 299 300 301 302 303 -84 13 2 1 1 240 89 246 209 33 211 298 299 277 300 155 304 218 217 216 305 306 307 -85 13 2 1 1 9 48 51 93 235 244 61 60 168 59 251 283 308 309 310 311 312 313 -86 13 2 1 1 93 235 244 94 236 245 308 309 171 310 260 290 314 315 316 317 318 319 -87 13 2 1 1 94 236 245 95 237 246 314 315 174 316 269 297 320 321 322 323 324 325 -88 13 2 1 1 95 237 246 35 208 211 320 321 177 322 278 304 221 220 219 326 327 328 -89 13 2 1 1 48 49 51 235 238 244 55 59 251 57 250 283 253 310 285 256 313 288 -90 13 2 1 1 235 238 244 236 239 245 253 310 260 285 259 290 262 316 292 265 319 295 -91 13 2 1 1 236 239 245 237 240 246 262 316 269 292 268 297 271 322 299 274 325 302 -92 13 2 1 1 237 240 246 208 209 211 271 322 278 299 277 304 215 219 217 280 328 306 -93 13 2 1 1 6 7 49 84 87 238 16 62 135 58 146 250 147 329 284 148 330 287 -94 13 2 1 1 84 87 238 85 88 239 147 329 138 284 149 259 150 331 291 151 332 294 -95 13 2 1 1 85 88 239 86 89 240 150 331 141 291 152 268 153 333 298 154 334 301 -96 13 2 1 1 86 89 240 32 33 209 153 333 144 298 155 277 42 222 218 156 335 305 -97 13 2 1 1 9 10 48 93 96 235 19 61 168 63 179 251 180 308 336 181 311 337 -98 13 2 1 1 93 96 235 94 97 236 180 308 171 336 182 260 183 314 338 184 317 339 -99 13 2 1 1 94 97 236 95 98 237 
183 314 174 338 185 269 186 320 340 187 323 341 -100 13 2 1 1 95 98 237 35 36 208 186 320 177 340 188 278 45 221 223 189 326 342 -101 13 2 1 1 11 1 50 99 22 241 21 65 190 64 25 252 201 343 344 202 345 346 -102 13 2 1 1 99 22 241 100 23 242 201 343 193 344 26 261 203 347 348 204 349 350 -103 13 2 1 1 100 23 242 101 24 243 203 347 196 348 27 270 205 351 352 206 353 354 -104 13 2 1 1 101 24 243 37 2 210 205 351 199 352 28 279 47 225 224 207 355 356 -105 13 2 1 1 48 11 50 235 99 241 66 53 251 65 190 252 357 255 343 358 258 345 -106 13 2 1 1 235 99 241 236 100 242 357 255 260 343 193 261 359 264 347 360 267 349 -107 13 2 1 1 236 100 242 237 101 243 359 264 269 347 196 270 361 273 351 362 276 353 -108 13 2 1 1 237 101 243 208 37 210 361 273 278 351 199 279 226 213 225 363 282 355 -109 13 2 1 1 49 50 52 238 241 247 54 68 250 67 252 364 254 365 366 257 367 368 -110 13 2 1 1 238 241 247 239 242 248 254 365 259 366 261 369 263 370 371 266 372 373 -111 13 2 1 1 239 242 248 240 243 249 263 370 268 371 270 374 272 375 376 275 377 378 -112 13 2 1 1 240 243 249 209 210 212 272 375 277 376 279 379 214 228 227 281 380 381 -113 13 2 1 1 7 8 51 87 90 244 17 56 146 69 157 283 158 286 382 159 289 383 -114 13 2 1 1 87 90 244 88 91 245 158 286 149 382 160 290 161 293 384 162 296 385 -115 13 2 1 1 88 91 245 89 92 246 161 293 152 384 163 297 164 300 386 165 303 387 -116 13 2 1 1 89 92 246 33 34 211 164 300 155 386 166 304 43 216 229 167 307 388 -117 13 2 1 1 8 9 51 90 93 244 18 69 157 60 168 283 169 382 309 170 383 312 -118 13 2 1 1 90 93 244 91 94 245 169 382 160 309 171 290 172 384 315 173 385 318 -119 13 2 1 1 91 94 245 92 95 246 172 384 163 315 174 297 175 386 321 176 387 324 -120 13 2 1 1 92 95 246 34 35 211 175 386 166 321 177 304 44 229 220 178 388 327 -121 13 2 1 1 5 6 49 81 84 238 15 70 124 62 135 250 136 389 329 137 390 330 -122 13 2 1 1 81 84 238 82 85 239 136 389 127 329 138 259 139 391 331 140 392 332 -123 13 2 1 1 82 85 239 83 86 240 139 391 130 331 141 268 142 393 333 143 394 334 -124 13 2 1 1 83 86 240 31 32 209 142 393 133 333 144 277 41 230 222 145 395 335 -125 13 2 1 1 10 11 48 96 99 235 20 63 179 66 190 251 191 336 357 192 337 358 -126 13 2 1 1 96 99 235 97 100 236 191 336 182 357 193 260 194 338 359 195 339 360 -127 13 2 1 1 97 100 236 98 101 237 194 338 185 359 196 269 197 340 361 198 341 362 -128 13 2 1 1 98 101 237 36 37 208 197 340 188 361 199 278 46 223 226 200 342 363 -129 13 2 1 1 5 49 52 81 238 247 70 71 124 68 250 364 389 396 365 390 397 367 -130 13 2 1 1 81 238 247 82 239 248 389 396 127 365 259 369 391 398 370 392 399 372 -131 13 2 1 1 82 239 248 83 240 249 391 398 130 370 268 374 393 400 375 394 401 377 -132 13 2 1 1 83 240 249 31 209 212 393 400 133 375 277 379 230 231 228 395 402 380 -133 13 2 1 1 4 5 52 78 81 247 14 72 113 71 124 364 125 403 396 126 404 397 -134 13 2 1 1 78 81 247 79 82 248 125 403 116 396 127 369 128 405 398 129 406 399 -135 13 2 1 1 79 82 248 80 83 249 128 405 119 398 130 374 131 407 400 132 408 401 -136 13 2 1 1 80 83 249 30 31 212 131 407 122 400 133 379 40 232 231 134 409 402 -137 13 2 1 1 3 4 52 75 78 247 13 73 102 72 113 364 114 410 403 115 411 404 -138 13 2 1 1 75 78 247 76 79 248 114 410 105 403 116 369 117 412 405 118 413 406 -139 13 2 1 1 76 79 248 77 80 249 117 412 108 405 119 374 120 414 407 121 415 408 -140 13 2 1 1 77 80 249 29 30 212 120 414 111 407 122 379 39 233 232 123 416 409 -141 13 2 1 1 1 3 50 22 75 241 12 64 25 74 102 252 103 344 417 104 346 418 -142 13 2 1 1 22 75 241 23 76 242 103 344 26 417 105 261 106 348 419 107 350 420 -143 13 2 1 1 
23 76 242 24 77 243 106 348 27 419 108 270 109 352 421 110 354 422 -144 13 2 1 1 24 77 243 2 29 210 109 352 28 421 111 279 38 224 234 112 356 423 -145 13 2 1 1 50 3 52 241 75 247 74 67 252 73 102 364 417 366 410 418 368 411 -146 13 2 1 1 241 75 247 242 76 248 417 366 261 410 105 369 419 371 412 420 373 413 -147 13 2 1 1 242 76 248 243 77 249 419 371 270 412 108 374 421 376 414 422 378 415 -148 13 2 1 1 243 77 249 210 29 212 421 376 279 414 111 379 234 227 233 423 381 416 -$EndElements +$MeshFormat +2.2 0 8 +$EndMeshFormat +$PhysicalNames +2 +2 2 "boundaries" +3 1 "cylinder" +$EndPhysicalNames +$Nodes +423 +1 2.74 0 0 +2 2.74 0 5.48 +3 2.216706564587355 1.610531591281378 0 +4 0.8467065645873526 2.605894854648722 0 +5 -0.8467065645873628 2.605894854648719 0 +6 -2.216706564587362 1.610531591281368 0 +7 -2.74 -1.061568669193617e-14 0 +8 -2.21670656458735 -1.610531591281385 0 +9 -0.8467065645873448 -2.605894854648724 0 +10 0.8467065645873671 -2.605894854648718 0 +11 2.216706564587358 -1.610531591281373 0 +12 2.605894854648721 0.8467065645873567 0 +13 1.610531591281374 2.216706564587358 0 +14 -5.307843345968085e-15 2.74 0 +15 -1.610531591281384 2.21670656458735 0 +16 -2.605894854648724 0.8467065645873459 0 +17 -2.605894854648718 -0.8467065645873662 0 +18 -1.610531591281367 -2.216706564587363 0 +19 1.166471451544216e-14 -2.74 0 +20 1.610531591281384 -2.21670656458735 0 +21 2.605894854648721 -0.8467065645873543 0 +22 2.74 0 1.37 +23 2.74 0 2.74 +24 2.74 0 4.11 +25 2.74 0 0.6850000000000001 +26 2.74 0 2.055 +27 2.74 0 3.425 +28 2.74 0 4.795 +29 2.216706564587355 1.610531591281378 5.48 +30 0.8467065645873526 2.605894854648722 5.48 +31 -0.8467065645873628 2.605894854648719 5.48 +32 -2.216706564587362 1.610531591281368 5.48 +33 -2.74 -1.061568669193617e-14 5.48 +34 -2.21670656458735 -1.610531591281385 5.48 +35 -0.8467065645873448 -2.605894854648724 5.48 +36 0.8467065645873671 -2.605894854648718 5.48 +37 2.216706564587358 -1.610531591281373 5.48 +38 2.605894854648721 0.8467065645873567 5.48 +39 1.610531591281374 2.216706564587358 5.48 +40 -5.307843345968085e-15 2.74 5.48 +41 -1.610531591281384 2.21670656458735 5.48 +42 -2.605894854648724 0.8467065645873459 5.48 +43 -2.605894854648718 -0.8467065645873662 5.48 +44 -1.610531591281367 -2.216706564587363 5.48 +45 1.166471451544216e-14 -2.74 5.48 +46 1.610531591281384 -2.21670656458735 5.48 +47 2.605894854648721 -0.8467065645873543 5.48 +48 0.5795539226760824 -0.9482384955974738 0 +49 -0.6331807601024405 0.7144594569041222 0 +50 1.289670577110049 0.3070664825272769 0 +51 -1.098636832574902 -0.8260373617931617 0 +52 0.50042300124461 1.392933945380423 0 +53 0.9346122498930655 -0.3205860065350984 0 +54 0.3282449085038041 0.5107629697156995 0 +55 -0.02681341871317906 -0.1168895193466757 0 +56 -1.919318416287451 -0.4130186808965861 0 +57 -0.8659087963386712 -0.0557889524445197 0 +58 -1.68659038005122 0.3572297284520558 0 +59 -0.2595414549494097 -0.8871379286953177 0 +60 -0.9726716985811233 -1.715966108220943 0 +61 -0.1335763209556312 -1.777066675123099 0 +62 -1.424943662344901 1.162495524092745 0 +63 0.7131302436317247 -1.777066675123096 0 +64 2.014835288555024 0.1535332412636384 0 +65 1.753188570848704 -0.6517325543770481 0 +66 1.398130243631721 -1.279385043439423 0 +67 0.8950467891773294 0.8500002139538499 0 +68 -0.06637887942891524 1.053696701142273 0 +69 -1.657671698581126 -1.218284476537273 0 +70 -0.7399436623449017 1.66017715577642 0 +71 -0.1731417816713763 1.999414400014571 0 +72 0.6735647829159813 1.999414400014573 0 +73 1.358564782915983 1.5017327683309 
0 +74 1.753188570848702 0.9587990369043272 0 +75 2.216706564587355 1.610531591281378 1.37 +76 2.216706564587355 1.610531591281378 2.74 +77 2.216706564587355 1.610531591281378 4.11 +78 0.8467065645873526 2.605894854648722 1.37 +79 0.8467065645873526 2.605894854648722 2.74 +80 0.8467065645873526 2.605894854648722 4.11 +81 -0.8467065645873628 2.605894854648719 1.37 +82 -0.8467065645873628 2.605894854648719 2.74 +83 -0.8467065645873628 2.605894854648719 4.11 +84 -2.216706564587362 1.610531591281368 1.37 +85 -2.216706564587362 1.610531591281368 2.74 +86 -2.216706564587362 1.610531591281368 4.11 +87 -2.74 -1.061568669193617e-14 1.37 +88 -2.74 -1.061568669193617e-14 2.74 +89 -2.74 -1.061568669193617e-14 4.11 +90 -2.21670656458735 -1.610531591281385 1.37 +91 -2.21670656458735 -1.610531591281385 2.74 +92 -2.21670656458735 -1.610531591281385 4.11 +93 -0.8467065645873448 -2.605894854648724 1.37 +94 -0.8467065645873448 -2.605894854648724 2.74 +95 -0.8467065645873448 -2.605894854648724 4.11 +96 0.8467065645873671 -2.605894854648718 1.37 +97 0.8467065645873671 -2.605894854648718 2.74 +98 0.8467065645873671 -2.605894854648718 4.11 +99 2.216706564587358 -1.610531591281373 1.37 +100 2.216706564587358 -1.610531591281373 2.74 +101 2.216706564587358 -1.610531591281373 4.11 +102 2.216706564587355 1.610531591281378 0.6850000000000001 +103 2.60589485464872 0.8467065645873568 1.37 +104 2.605894854648721 0.8467065645873568 0.6850000000000001 +105 2.216706564587355 1.610531591281378 2.055 +106 2.60589485464872 0.8467065645873568 2.74 +107 2.605894854648721 0.8467065645873568 2.055 +108 2.216706564587355 1.610531591281378 3.425 +109 2.60589485464872 0.8467065645873568 4.11 +110 2.605894854648721 0.8467065645873568 3.425000000000001 +111 2.216706564587355 1.610531591281378 4.795 +112 2.605894854648721 0.8467065645873568 4.795 +113 0.8467065645873522 2.605894854648723 0.6850000000000001 +114 1.610531591281374 2.216706564587358 1.37 +115 1.610531591281374 2.216706564587358 0.6850000000000001 +116 0.8467065645873522 2.605894854648723 2.055 +117 1.610531591281374 2.216706564587358 2.74 +118 1.610531591281374 2.216706564587358 2.055 +119 0.8467065645873522 2.605894854648723 3.425 +120 1.610531591281374 2.216706564587358 4.11 +121 1.610531591281374 2.216706564587358 3.425000000000001 +122 0.8467065645873522 2.605894854648723 4.795 +123 1.610531591281374 2.216706564587358 4.795 +124 -0.8467065645873628 2.605894854648719 0.6850000000000001 +125 1.677766114831874e-16 2.74 1.37 +126 -2.775557561562891e-15 2.74 0.6850000000000001 +127 -0.8467065645873628 2.605894854648719 2.055 +128 1.677766114831874e-16 2.74 2.74 +129 0 2.74 2.055 +130 -0.8467065645873628 2.605894854648719 3.425 +131 1.677766114831874e-16 2.74 4.11 +132 0 2.74 3.425000000000001 +133 -0.8467065645873628 2.605894854648719 4.795 +134 -2.775557561562891e-15 2.74 4.795 +135 -2.216706564587362 1.610531591281368 0.6850000000000001 +136 -1.610531591281384 2.21670656458735 1.37 +137 -1.610531591281384 2.21670656458735 0.6850000000000001 +138 -2.216706564587362 1.610531591281368 2.055 +139 -1.610531591281384 2.21670656458735 2.74 +140 -1.610531591281384 2.21670656458735 2.055 +141 -2.216706564587362 1.610531591281368 3.425 +142 -1.610531591281384 2.21670656458735 4.11 +143 -1.610531591281384 2.21670656458735 3.425000000000001 +144 -2.216706564587362 1.610531591281368 4.795 +145 -1.610531591281384 2.21670656458735 4.795 +146 -2.74 3.355532229663748e-16 0.6850000000000001 +147 -2.605894854648724 0.8467065645873459 1.37 +148 -2.605894854648724 0.8467065645873515 
0.6850000000000001 +149 -2.74 3.355532229663748e-16 2.055 +150 -2.605894854648724 0.8467065645873459 2.74 +151 -2.605894854648724 0.8467065645873515 2.055 +152 -2.74 3.355532229663748e-16 3.425 +153 -2.605894854648724 0.8467065645873459 4.11 +154 -2.605894854648724 0.8467065645873515 3.425000000000001 +155 -2.74 3.355532229663748e-16 4.795 +156 -2.605894854648724 0.8467065645873515 4.795 +157 -2.21670656458735 -1.610531591281384 0.6850000000000001 +158 -2.605894854648718 -0.8467065645873649 1.37 +159 -2.605894854648718 -0.8467065645873595 0.6850000000000001 +160 -2.21670656458735 -1.610531591281384 2.055 +161 -2.605894854648718 -0.8467065645873649 2.74 +162 -2.605894854648719 -0.8467065645873589 2.055 +163 -2.21670656458735 -1.610531591281384 3.425 +164 -2.605894854648718 -0.8467065645873649 4.11 +165 -2.605894854648719 -0.8467065645873589 3.425000000000001 +166 -2.21670656458735 -1.610531591281384 4.795 +167 -2.605894854648718 -0.8467065645873595 4.795 +168 -0.8467065645873448 -2.605894854648724 0.6850000000000001 +169 -1.610531591281369 -2.216706564587362 1.37 +170 -1.610531591281368 -2.216706564587362 0.6850000000000001 +171 -0.8467065645873448 -2.605894854648724 2.055 +172 -1.610531591281369 -2.216706564587362 2.74 +173 -1.61053159128137 -2.216706564587361 2.055 +174 -0.8467065645873448 -2.605894854648724 3.425 +175 -1.610531591281369 -2.216706564587362 4.11 +176 -1.61053159128137 -2.216706564587361 3.425000000000001 +177 -0.8467065645873448 -2.605894854648724 4.795 +178 -1.610531591281368 -2.216706564587362 4.795 +179 0.8467065645873671 -2.605894854648718 0.6850000000000001 +180 -5.033298344495622e-16 -2.74 1.37 +181 5.551115123125783e-15 -2.74 0.6850000000000001 +182 0.8467065645873671 -2.605894854648718 2.055 +183 -5.033298344495622e-16 -2.74 2.74 +184 -5.551115123125783e-16 -2.74 2.055 +185 0.8467065645873671 -2.605894854648718 3.425 +186 -5.033298344495622e-16 -2.74 4.11 +187 -5.551115123125783e-16 -2.74 3.425000000000001 +188 0.8467065645873671 -2.605894854648718 4.795 +189 5.551115123125783e-15 -2.74 4.795 +190 2.216706564587358 -1.610531591281373 0.6850000000000001 +191 1.610531591281382 -2.216706564587352 1.37 +192 1.610531591281383 -2.216706564587351 0.6850000000000001 +193 2.216706564587358 -1.610531591281373 2.055 +194 1.610531591281382 -2.216706564587352 2.74 +195 1.610531591281382 -2.216706564587352 2.055 +196 2.216706564587358 -1.610531591281373 3.425 +197 1.610531591281382 -2.216706564587352 4.11 +198 1.610531591281382 -2.216706564587352 3.425000000000001 +199 2.216706564587358 -1.610531591281373 4.795 +200 1.610531591281383 -2.216706564587351 4.795 +201 2.605894854648721 -0.8467065645873543 1.37 +202 2.605894854648721 -0.8467065645873543 0.6850000000000001 +203 2.605894854648721 -0.8467065645873543 2.74 +204 2.605894854648721 -0.8467065645873543 2.055 +205 2.605894854648721 -0.8467065645873543 4.11 +206 2.605894854648721 -0.8467065645873543 3.425000000000001 +207 2.605894854648721 -0.8467065645873543 4.795 +208 0.5795539226760824 -0.9482384955974738 5.48 +209 -0.6331807601024405 0.7144594569041222 5.48 +210 1.289670577110049 0.3070664825272769 5.48 +211 -1.098636832574902 -0.8260373617931617 5.48 +212 0.50042300124461 1.392933945380423 5.48 +213 0.9346122498930655 -0.3205860065350984 5.48 +214 0.3282449085038041 0.5107629697156995 5.48 +215 -0.02681341871317906 -0.1168895193466757 5.48 +216 -1.919318416287451 -0.4130186808965861 5.48 +217 -0.8659087963386712 -0.0557889524445197 5.48 +218 -1.68659038005122 0.3572297284520558 5.48 +219 -0.2595414549494097 
-0.8871379286953177 5.48 +220 -0.9726716985811233 -1.715966108220943 5.48 +221 -0.1335763209556312 -1.777066675123099 5.48 +222 -1.424943662344901 1.162495524092745 5.48 +223 0.7131302436317247 -1.777066675123096 5.48 +224 2.014835288555024 0.1535332412636384 5.48 +225 1.753188570848704 -0.6517325543770481 5.48 +226 1.398130243631721 -1.279385043439423 5.48 +227 0.8950467891773294 0.8500002139538499 5.48 +228 -0.06637887942891524 1.053696701142273 5.48 +229 -1.657671698581126 -1.218284476537273 5.48 +230 -0.7399436623449017 1.66017715577642 5.48 +231 -0.1731417816713763 1.999414400014571 5.48 +232 0.6735647829159813 1.999414400014573 5.48 +233 1.358564782915983 1.5017327683309 5.48 +234 1.753188570848702 0.9587990369043272 5.48 +235 0.5795539226760824 -0.9482384955974738 1.37 +236 0.5795539226760824 -0.9482384955974738 2.74 +237 0.5795539226760824 -0.9482384955974738 4.11 +238 -0.6331807601024405 0.7144594569041222 1.37 +239 -0.6331807601024405 0.7144594569041222 2.74 +240 -0.6331807601024405 0.7144594569041222 4.11 +241 1.289670577110049 0.3070664825272769 1.37 +242 1.289670577110049 0.3070664825272769 2.74 +243 1.289670577110049 0.3070664825272769 4.11 +244 -1.098636832574902 -0.8260373617931617 1.37 +245 -1.098636832574902 -0.8260373617931617 2.74 +246 -1.098636832574902 -0.8260373617931617 4.11 +247 0.50042300124461 1.392933945380423 1.37 +248 0.50042300124461 1.392933945380423 2.74 +249 0.50042300124461 1.392933945380423 4.11 +250 -0.6331807601024405 0.7144594569041222 0.6850000000000001 +251 0.5795539226760824 -0.9482384955974738 0.6850000000000001 +252 1.289670577110049 0.3070664825272769 0.6850000000000001 +253 -0.02681341871317905 -0.1168895193466758 1.37 +254 0.3282449085038041 0.5107629697156995 1.37 +255 0.9346122498930656 -0.3205860065350984 1.37 +256 -0.02681341871317905 -0.1168895193466757 0.6850000000000001 +257 0.3282449085038042 0.5107629697156995 0.6850000000000001 +258 0.9346122498930656 -0.3205860065350984 0.6850000000000001 +259 -0.6331807601024405 0.7144594569041222 2.055 +260 0.5795539226760824 -0.9482384955974738 2.055 +261 1.289670577110049 0.3070664825272769 2.055 +262 -0.02681341871317905 -0.1168895193466758 2.74 +263 0.3282449085038041 0.5107629697156995 2.74 +264 0.9346122498930656 -0.3205860065350984 2.74 +265 -0.02681341871317905 -0.1168895193466758 2.055 +266 0.3282449085038042 0.5107629697156995 2.055 +267 0.9346122498930656 -0.3205860065350984 2.055 +268 -0.6331807601024405 0.7144594569041222 3.425 +269 0.5795539226760824 -0.9482384955974738 3.425 +270 1.289670577110049 0.3070664825272769 3.425 +271 -0.02681341871317905 -0.1168895193466758 4.11 +272 0.3282449085038041 0.5107629697156995 4.11 +273 0.9346122498930656 -0.3205860065350984 4.11 +274 -0.02681341871317905 -0.1168895193466758 3.425000000000001 +275 0.3282449085038042 0.5107629697156995 3.425 +276 0.9346122498930656 -0.3205860065350984 3.425000000000001 +277 -0.6331807601024405 0.7144594569041222 4.795 +278 0.5795539226760824 -0.9482384955974738 4.795 +279 1.289670577110049 0.3070664825272769 4.795 +280 -0.0268134187131791 -0.1168895193466757 4.795 +281 0.3282449085038042 0.5107629697156995 4.795 +282 0.9346122498930656 -0.3205860065350984 4.795 +283 -1.098636832574902 -0.8260373617931617 0.6850000000000001 +284 -1.68659038005122 0.3572297284520558 1.37 +285 -0.8659087963386711 -0.05578895244451976 1.37 +286 -1.919318416287451 -0.4130186808965862 1.37 +287 -1.68659038005122 0.3572297284520612 0.6850000000000001 +288 -0.8659087963386711 -0.05578895244451973 0.6850000000000001 +289 
-1.919318416287451 -0.4130186808965806 0.6850000000000001 +290 -1.098636832574902 -0.8260373617931617 2.055 +291 -1.68659038005122 0.3572297284520558 2.74 +292 -0.8659087963386711 -0.05578895244451976 2.74 +293 -1.919318416287451 -0.4130186808965862 2.74 +294 -1.68659038005122 0.3572297284520612 2.055 +295 -0.8659087963386711 -0.05578895244451976 2.055 +296 -1.919318416287451 -0.4130186808965807 2.055 +297 -1.098636832574902 -0.8260373617931617 3.425 +298 -1.68659038005122 0.3572297284520558 4.11 +299 -0.8659087963386711 -0.05578895244451976 4.11 +300 -1.919318416287451 -0.4130186808965862 4.11 +301 -1.68659038005122 0.3572297284520612 3.425000000000001 +302 -0.8659087963386711 -0.05578895244451976 3.425 +303 -1.919318416287451 -0.4130186808965807 3.425000000000001 +304 -1.098636832574902 -0.8260373617931617 4.795 +305 -1.68659038005122 0.3572297284520612 4.795 +306 -0.8659087963386711 -0.05578895244451976 4.795 +307 -1.919318416287451 -0.4130186808965806 4.795 +308 -0.1335763209556312 -1.777066675123099 1.37 +309 -0.9726716985811233 -1.715966108220943 1.37 +310 -0.2595414549494097 -0.8871379286953177 1.37 +311 -0.1335763209556312 -1.777066675123099 0.6850000000000001 +312 -0.9726716985811233 -1.715966108220943 0.6850000000000001 +313 -0.2595414549494097 -0.8871379286953177 0.6850000000000001 +314 -0.1335763209556312 -1.777066675123099 2.74 +315 -0.9726716985811233 -1.715966108220943 2.74 +316 -0.2595414549494097 -0.8871379286953177 2.74 +317 -0.1335763209556312 -1.777066675123099 2.055 +318 -0.9726716985811233 -1.715966108220943 2.055 +319 -0.2595414549494097 -0.8871379286953177 2.055 +320 -0.1335763209556312 -1.777066675123099 4.11 +321 -0.9726716985811233 -1.715966108220943 4.11 +322 -0.2595414549494097 -0.8871379286953177 4.11 +323 -0.1335763209556312 -1.777066675123099 3.425000000000001 +324 -0.9726716985811233 -1.715966108220943 3.425 +325 -0.2595414549494097 -0.8871379286953177 3.425000000000001 +326 -0.1335763209556312 -1.777066675123099 4.795 +327 -0.9726716985811233 -1.715966108220943 4.795 +328 -0.2595414549494097 -0.8871379286953177 4.795 +329 -1.424943662344901 1.162495524092745 1.37 +330 -1.424943662344901 1.162495524092745 0.6850000000000001 +331 -1.424943662344901 1.162495524092745 2.74 +332 -1.424943662344901 1.162495524092745 2.055 +333 -1.424943662344901 1.162495524092745 4.11 +334 -1.424943662344901 1.162495524092745 3.425 +335 -1.424943662344901 1.162495524092745 4.795 +336 0.7131302436317248 -1.777066675123096 1.37 +337 0.7131302436317247 -1.777066675123096 0.6850000000000001 +338 0.7131302436317248 -1.777066675123096 2.74 +339 0.7131302436317248 -1.777066675123096 2.055 +340 0.7131302436317248 -1.777066675123096 4.11 +341 0.7131302436317248 -1.777066675123096 3.425000000000001 +342 0.7131302436317247 -1.777066675123096 4.795 +343 1.753188570848704 -0.651732554377048 1.37 +344 2.014835288555024 0.1535332412636384 1.37 +345 1.753188570848704 -0.651732554377048 0.6850000000000001 +346 2.014835288555024 0.1535332412636384 0.6850000000000001 +347 1.753188570848704 -0.651732554377048 2.74 +348 2.014835288555024 0.1535332412636384 2.74 +349 1.753188570848704 -0.651732554377048 2.055 +350 2.014835288555024 0.1535332412636384 2.055 +351 1.753188570848704 -0.651732554377048 4.11 +352 2.014835288555024 0.1535332412636384 4.11 +353 1.753188570848704 -0.651732554377048 3.425 +354 2.014835288555024 0.1535332412636384 3.425000000000001 +355 1.753188570848704 -0.651732554377048 4.795 +356 2.014835288555024 0.1535332412636384 4.795 +357 1.39813024363172 -1.279385043439423 1.37 +358 
1.398130243631721 -1.279385043439423 0.6850000000000001 +359 1.39813024363172 -1.279385043439423 2.74 +360 1.398130243631721 -1.279385043439423 2.055 +361 1.39813024363172 -1.279385043439423 4.11 +362 1.398130243631721 -1.279385043439423 3.425000000000001 +363 1.398130243631721 -1.279385043439423 4.795 +364 0.50042300124461 1.392933945380423 0.6850000000000001 +365 -0.06637887942891524 1.053696701142273 1.37 +366 0.8950467891773294 0.8500002139538499 1.37 +367 -0.06637887942891527 1.053696701142273 0.6850000000000001 +368 0.8950467891773294 0.8500002139538501 0.6850000000000001 +369 0.50042300124461 1.392933945380423 2.055 +370 -0.06637887942891524 1.053696701142273 2.74 +371 0.8950467891773294 0.8500002139538499 2.74 +372 -0.06637887942891527 1.053696701142273 2.055 +373 0.8950467891773294 0.8500002139538501 2.055 +374 0.50042300124461 1.392933945380423 3.425 +375 -0.06637887942891524 1.053696701142273 4.11 +376 0.8950467891773294 0.8500002139538499 4.11 +377 -0.06637887942891527 1.053696701142273 3.425 +378 0.8950467891773294 0.8500002139538501 3.425000000000001 +379 0.50042300124461 1.392933945380423 4.795 +380 -0.06637887942891527 1.053696701142273 4.795 +381 0.8950467891773294 0.8500002139538501 4.795 +382 -1.657671698581126 -1.218284476537273 1.37 +383 -1.657671698581126 -1.218284476537273 0.6850000000000001 +384 -1.657671698581126 -1.218284476537273 2.74 +385 -1.657671698581126 -1.218284476537273 2.055 +386 -1.657671698581126 -1.218284476537273 4.11 +387 -1.657671698581126 -1.218284476537273 3.425000000000001 +388 -1.657671698581126 -1.218284476537273 4.795 +389 -0.7399436623449016 1.66017715577642 1.37 +390 -0.7399436623449017 1.66017715577642 0.6850000000000001 +391 -0.7399436623449016 1.66017715577642 2.74 +392 -0.7399436623449016 1.66017715577642 2.055 +393 -0.7399436623449016 1.66017715577642 4.11 +394 -0.7399436623449016 1.66017715577642 3.425 +395 -0.7399436623449017 1.66017715577642 4.795 +396 -0.1731417816713764 1.999414400014571 1.37 +397 -0.1731417816713764 1.999414400014571 0.6850000000000001 +398 -0.1731417816713764 1.999414400014571 2.74 +399 -0.1731417816713764 1.999414400014571 2.055 +400 -0.1731417816713764 1.999414400014571 4.11 +401 -0.1731417816713764 1.999414400014571 3.425 +402 -0.1731417816713763 1.999414400014571 4.795 +403 0.6735647829159813 1.999414400014573 1.37 +404 0.673564782915981 1.999414400014573 0.6850000000000001 +405 0.6735647829159813 1.999414400014573 2.74 +406 0.673564782915981 1.999414400014573 2.055 +407 0.6735647829159813 1.999414400014573 4.11 +408 0.673564782915981 1.999414400014573 3.425 +409 0.673564782915981 1.999414400014573 4.795 +410 1.358564782915983 1.5017327683309 1.37 +411 1.358564782915983 1.5017327683309 0.6850000000000001 +412 1.358564782915983 1.5017327683309 2.74 +413 1.358564782915983 1.5017327683309 2.055 +414 1.358564782915983 1.5017327683309 4.11 +415 1.358564782915983 1.5017327683309 3.425 +416 1.358564782915983 1.5017327683309 4.795 +417 1.753188570848702 0.9587990369043273 1.37 +418 1.753188570848702 0.9587990369043273 0.6850000000000001 +419 1.753188570848702 0.9587990369043273 2.74 +420 1.753188570848702 0.9587990369043273 2.055 +421 1.753188570848702 0.9587990369043273 4.11 +422 1.753188570848702 0.9587990369043273 3.425000000000001 +423 1.753188570848702 0.9587990369043273 4.795 +$EndNodes +$Elements +148 +1 9 2 2 1 48 50 49 53 54 55 +2 9 2 2 1 7 51 49 56 57 58 +3 9 2 2 1 48 51 9 59 60 61 +4 9 2 2 1 49 51 48 57 59 55 +5 9 2 2 1 7 49 6 58 62 16 +6 9 2 2 1 10 48 9 63 61 19 +7 9 2 2 1 1 50 11 64 65 21 +8 9 2 2 1 11 
50 48 65 53 66 +9 9 2 2 1 50 52 49 67 68 54 +10 9 2 2 1 8 51 7 69 56 17 +11 9 2 2 1 9 51 8 60 69 18 +12 9 2 2 1 6 49 5 62 70 15 +13 9 2 2 1 11 48 10 66 63 20 +14 9 2 2 1 49 52 5 68 71 70 +15 9 2 2 1 5 52 4 71 72 14 +16 9 2 2 1 4 52 3 72 73 13 +17 9 2 2 1 3 50 1 74 64 12 +18 9 2 2 1 3 52 50 73 67 74 +19 9 2 2 3 208 210 209 213 214 215 +20 9 2 2 3 33 211 209 216 217 218 +21 9 2 2 3 208 211 35 219 220 221 +22 9 2 2 3 209 211 208 217 219 215 +23 9 2 2 3 33 209 32 218 222 42 +24 9 2 2 3 36 208 35 223 221 45 +25 9 2 2 3 2 210 37 224 225 47 +26 9 2 2 3 37 210 208 225 213 226 +27 9 2 2 3 210 212 209 227 228 214 +28 9 2 2 3 34 211 33 229 216 43 +29 9 2 2 3 35 211 34 220 229 44 +30 9 2 2 3 32 209 31 222 230 41 +31 9 2 2 3 37 208 36 226 223 46 +32 9 2 2 3 209 212 31 228 231 230 +33 9 2 2 3 31 212 30 231 232 40 +34 9 2 2 3 30 212 29 232 233 39 +35 9 2 2 3 29 210 2 234 224 38 +36 9 2 2 3 29 212 210 233 227 234 +37 10 2 2 2 1 3 75 22 12 102 103 25 104 +38 10 2 2 2 22 75 76 23 103 105 106 26 107 +39 10 2 2 2 23 76 77 24 106 108 109 27 110 +40 10 2 2 2 24 77 29 2 109 111 38 28 112 +41 10 2 2 2 3 4 78 75 13 113 114 102 115 +42 10 2 2 2 75 78 79 76 114 116 117 105 118 +43 10 2 2 2 76 79 80 77 117 119 120 108 121 +44 10 2 2 2 77 80 30 29 120 122 39 111 123 +45 10 2 2 2 4 5 81 78 14 124 125 113 126 +46 10 2 2 2 78 81 82 79 125 127 128 116 129 +47 10 2 2 2 79 82 83 80 128 130 131 119 132 +48 10 2 2 2 80 83 31 30 131 133 40 122 134 +49 10 2 2 2 5 6 84 81 15 135 136 124 137 +50 10 2 2 2 81 84 85 82 136 138 139 127 140 +51 10 2 2 2 82 85 86 83 139 141 142 130 143 +52 10 2 2 2 83 86 32 31 142 144 41 133 145 +53 10 2 2 2 6 7 87 84 16 146 147 135 148 +54 10 2 2 2 84 87 88 85 147 149 150 138 151 +55 10 2 2 2 85 88 89 86 150 152 153 141 154 +56 10 2 2 2 86 89 33 32 153 155 42 144 156 +57 10 2 2 2 7 8 90 87 17 157 158 146 159 +58 10 2 2 2 87 90 91 88 158 160 161 149 162 +59 10 2 2 2 88 91 92 89 161 163 164 152 165 +60 10 2 2 2 89 92 34 33 164 166 43 155 167 +61 10 2 2 2 8 9 93 90 18 168 169 157 170 +62 10 2 2 2 90 93 94 91 169 171 172 160 173 +63 10 2 2 2 91 94 95 92 172 174 175 163 176 +64 10 2 2 2 92 95 35 34 175 177 44 166 178 +65 10 2 2 2 9 10 96 93 19 179 180 168 181 +66 10 2 2 2 93 96 97 94 180 182 183 171 184 +67 10 2 2 2 94 97 98 95 183 185 186 174 187 +68 10 2 2 2 95 98 36 35 186 188 45 177 189 +69 10 2 2 2 10 11 99 96 20 190 191 179 192 +70 10 2 2 2 96 99 100 97 191 193 194 182 195 +71 10 2 2 2 97 100 101 98 194 196 197 185 198 +72 10 2 2 2 98 101 37 36 197 199 46 188 200 +73 10 2 2 2 11 1 22 99 21 25 201 190 202 +74 10 2 2 2 99 22 23 100 201 26 203 193 204 +75 10 2 2 2 100 23 24 101 203 27 205 196 206 +76 10 2 2 2 101 24 2 37 205 28 47 199 207 +77 13 2 1 1 49 48 50 238 235 241 55 54 250 53 251 252 253 254 255 256 257 258 +78 13 2 1 1 238 235 241 239 236 242 253 254 259 255 260 261 262 263 264 265 266 267 +79 13 2 1 1 239 236 242 240 237 243 262 263 268 264 269 270 271 272 273 274 275 276 +80 13 2 1 1 240 237 243 209 208 210 271 272 277 273 278 279 215 214 213 280 281 282 +81 13 2 1 1 49 7 51 238 87 244 58 57 250 56 146 283 284 285 286 287 288 289 +82 13 2 1 1 238 87 244 239 88 245 284 285 259 286 149 290 291 292 293 294 295 296 +83 13 2 1 1 239 88 245 240 89 246 291 292 268 293 152 297 298 299 300 301 302 303 +84 13 2 1 1 240 89 246 209 33 211 298 299 277 300 155 304 218 217 216 305 306 307 +85 13 2 1 1 9 48 51 93 235 244 61 60 168 59 251 283 308 309 310 311 312 313 +86 13 2 1 1 93 235 244 94 236 245 308 309 171 310 260 290 314 315 316 317 318 319 +87 13 2 1 1 94 236 245 95 237 246 314 315 174 316 269 297 
320 321 322 323 324 325 +88 13 2 1 1 95 237 246 35 208 211 320 321 177 322 278 304 221 220 219 326 327 328 +89 13 2 1 1 48 49 51 235 238 244 55 59 251 57 250 283 253 310 285 256 313 288 +90 13 2 1 1 235 238 244 236 239 245 253 310 260 285 259 290 262 316 292 265 319 295 +91 13 2 1 1 236 239 245 237 240 246 262 316 269 292 268 297 271 322 299 274 325 302 +92 13 2 1 1 237 240 246 208 209 211 271 322 278 299 277 304 215 219 217 280 328 306 +93 13 2 1 1 6 7 49 84 87 238 16 62 135 58 146 250 147 329 284 148 330 287 +94 13 2 1 1 84 87 238 85 88 239 147 329 138 284 149 259 150 331 291 151 332 294 +95 13 2 1 1 85 88 239 86 89 240 150 331 141 291 152 268 153 333 298 154 334 301 +96 13 2 1 1 86 89 240 32 33 209 153 333 144 298 155 277 42 222 218 156 335 305 +97 13 2 1 1 9 10 48 93 96 235 19 61 168 63 179 251 180 308 336 181 311 337 +98 13 2 1 1 93 96 235 94 97 236 180 308 171 336 182 260 183 314 338 184 317 339 +99 13 2 1 1 94 97 236 95 98 237 183 314 174 338 185 269 186 320 340 187 323 341 +100 13 2 1 1 95 98 237 35 36 208 186 320 177 340 188 278 45 221 223 189 326 342 +101 13 2 1 1 11 1 50 99 22 241 21 65 190 64 25 252 201 343 344 202 345 346 +102 13 2 1 1 99 22 241 100 23 242 201 343 193 344 26 261 203 347 348 204 349 350 +103 13 2 1 1 100 23 242 101 24 243 203 347 196 348 27 270 205 351 352 206 353 354 +104 13 2 1 1 101 24 243 37 2 210 205 351 199 352 28 279 47 225 224 207 355 356 +105 13 2 1 1 48 11 50 235 99 241 66 53 251 65 190 252 357 255 343 358 258 345 +106 13 2 1 1 235 99 241 236 100 242 357 255 260 343 193 261 359 264 347 360 267 349 +107 13 2 1 1 236 100 242 237 101 243 359 264 269 347 196 270 361 273 351 362 276 353 +108 13 2 1 1 237 101 243 208 37 210 361 273 278 351 199 279 226 213 225 363 282 355 +109 13 2 1 1 49 50 52 238 241 247 54 68 250 67 252 364 254 365 366 257 367 368 +110 13 2 1 1 238 241 247 239 242 248 254 365 259 366 261 369 263 370 371 266 372 373 +111 13 2 1 1 239 242 248 240 243 249 263 370 268 371 270 374 272 375 376 275 377 378 +112 13 2 1 1 240 243 249 209 210 212 272 375 277 376 279 379 214 228 227 281 380 381 +113 13 2 1 1 7 8 51 87 90 244 17 56 146 69 157 283 158 286 382 159 289 383 +114 13 2 1 1 87 90 244 88 91 245 158 286 149 382 160 290 161 293 384 162 296 385 +115 13 2 1 1 88 91 245 89 92 246 161 293 152 384 163 297 164 300 386 165 303 387 +116 13 2 1 1 89 92 246 33 34 211 164 300 155 386 166 304 43 216 229 167 307 388 +117 13 2 1 1 8 9 51 90 93 244 18 69 157 60 168 283 169 382 309 170 383 312 +118 13 2 1 1 90 93 244 91 94 245 169 382 160 309 171 290 172 384 315 173 385 318 +119 13 2 1 1 91 94 245 92 95 246 172 384 163 315 174 297 175 386 321 176 387 324 +120 13 2 1 1 92 95 246 34 35 211 175 386 166 321 177 304 44 229 220 178 388 327 +121 13 2 1 1 5 6 49 81 84 238 15 70 124 62 135 250 136 389 329 137 390 330 +122 13 2 1 1 81 84 238 82 85 239 136 389 127 329 138 259 139 391 331 140 392 332 +123 13 2 1 1 82 85 239 83 86 240 139 391 130 331 141 268 142 393 333 143 394 334 +124 13 2 1 1 83 86 240 31 32 209 142 393 133 333 144 277 41 230 222 145 395 335 +125 13 2 1 1 10 11 48 96 99 235 20 63 179 66 190 251 191 336 357 192 337 358 +126 13 2 1 1 96 99 235 97 100 236 191 336 182 357 193 260 194 338 359 195 339 360 +127 13 2 1 1 97 100 236 98 101 237 194 338 185 359 196 269 197 340 361 198 341 362 +128 13 2 1 1 98 101 237 36 37 208 197 340 188 361 199 278 46 223 226 200 342 363 +129 13 2 1 1 5 49 52 81 238 247 70 71 124 68 250 364 389 396 365 390 397 367 +130 13 2 1 1 81 238 247 82 239 248 389 396 127 365 259 369 391 398 370 392 399 372 +131 13 2 1 1 82 239 248 83 240 
249 391 398 130 370 268 374 393 400 375 394 401 377 +132 13 2 1 1 83 240 249 31 209 212 393 400 133 375 277 379 230 231 228 395 402 380 +133 13 2 1 1 4 5 52 78 81 247 14 72 113 71 124 364 125 403 396 126 404 397 +134 13 2 1 1 78 81 247 79 82 248 125 403 116 396 127 369 128 405 398 129 406 399 +135 13 2 1 1 79 82 248 80 83 249 128 405 119 398 130 374 131 407 400 132 408 401 +136 13 2 1 1 80 83 249 30 31 212 131 407 122 400 133 379 40 232 231 134 409 402 +137 13 2 1 1 3 4 52 75 78 247 13 73 102 72 113 364 114 410 403 115 411 404 +138 13 2 1 1 75 78 247 76 79 248 114 410 105 403 116 369 117 412 405 118 413 406 +139 13 2 1 1 76 79 248 77 80 249 117 412 108 405 119 374 120 414 407 121 415 408 +140 13 2 1 1 77 80 249 29 30 212 120 414 111 407 122 379 39 233 232 123 416 409 +141 13 2 1 1 1 3 50 22 75 241 12 64 25 74 102 252 103 344 417 104 346 418 +142 13 2 1 1 22 75 241 23 76 242 103 344 26 417 105 261 106 348 419 107 350 420 +143 13 2 1 1 23 76 242 24 77 243 106 348 27 419 108 270 109 352 421 110 354 422 +144 13 2 1 1 24 77 243 2 29 210 109 352 28 421 111 279 38 224 234 112 356 423 +145 13 2 1 1 50 3 52 241 75 247 74 67 252 73 102 364 417 366 410 418 368 411 +146 13 2 1 1 241 75 247 242 76 248 417 366 261 410 105 369 419 371 412 420 373 413 +147 13 2 1 1 242 76 248 243 77 249 419 371 270 412 108 374 421 376 414 422 378 415 +148 13 2 1 1 243 77 249 210 29 212 421 376 279 414 111 379 234 227 233 423 381 416 +$EndElements diff --git a/examples/cavity/mesh/mesh.jl b/examples/cavity/mesh/mesh.jl index 24115b0db5..e49bdea505 100644 --- a/examples/cavity/mesh/mesh.jl +++ b/examples/cavity/mesh/mesh.jl @@ -1,119 +1,119 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -using Gmsh: gmsh - -""" - generate_cylindrical_cavity_mesh(; - refinement=0, - order=1, - mesh_type=0, - radius=2.74, - aspect_ratio=1.0, - filename, - verbose=1, - ) - -Generate a mesh for the cylindrical cavity resonator example using Gmsh - -# Arguments - - - refinement - measure of how many elements to include, 0 is least - - order - the polynomial order of the approximation, minimum 1 - - mesh_type - 0 = tetrahedral mesh, 1 = prism mesh, 2 = hexahedral mesh - - radius - the radius of the cavity resonator - - aspect_ratio - the ratio of the DIAMETER of the cavity to the height - - filename - the filename to use for the generated mesh - - verbose - flag to dictate the level of print to REPL, passed to Gmsh -""" -function generate_cylindrical_cavity_mesh(; - refinement::Integer=0, - order::Integer=1, - mesh_type::Integer=0, - radius::Real=2.74, - aspect_ratio::Real=1.0, - filename::AbstractString, - verbose::Integer=1 -) - @assert refinement >= 0 - @assert order > 0 - - kernel = gmsh.model.occ - - gmsh.initialize() - gmsh.option.setNumber("General.Verbosity", verbose) - - # Add model - if "cavity" in gmsh.model.list() - gmsh.model.setCurrent("cavity") - gmsh.model.remove() - end - gmsh.model.add("cavity") - - # Geometry parameters (in cm) - height = aspect_ratio * 2 * radius # Cylinder height - - # Mesh parameters - n_height = 2 * 2^refinement # Minimum two elements in vertical - n_circum = 4 * 2^refinement # Minimum four elements on round - - # Geometry - base_circle = kernel.addDisk(0.0, 0.0, 0.0, radius, radius) - if mesh_type > 0 - cylinder_dimtags = - kernel.extrude([(2, base_circle)], 0.0, 0.0, height, [n_height], [1.0], true) - else - cylinder_dimtags = kernel.extrude([(2, base_circle)], 0.0, 0.0, height) - end - cylinder = filter(x -> x[1] == 3, cylinder_dimtags) - 
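# The regenerated mesh above is a second-order prism mesh of the default cavity
# (radius 2.74 cm, aspect ratio 1.0, so height 5.48 cm), written as ASCII MSH 2.2.
# It appears consistent with a call to generate_cylindrical_cavity_mesh along the
# following lines; the filename and refinement level are illustrative, since the
# exact arguments used are not recorded here:
#
#   julia> include("mesh.jl")
#   julia> generate_cylindrical_cavity_mesh(filename="cavity_prism.msh",
#                                           refinement=1, order=2, mesh_type=1)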
@assert length(cylinder) == 1 && first(cylinder)[1] == 3 - cylinder = first(cylinder)[2] - - kernel.synchronize() - - # Add physical groups - cylinder_group = gmsh.model.addPhysicalGroup(3, [cylinder], -1, "cylinder") - - _, boundaries = gmsh.model.getAdjacencies(3, cylinder) - boundary_group = gmsh.model.addPhysicalGroup(2, boundaries, -1, "boundaries") - - # Generate mesh - gmsh.option.setNumber("Mesh.MinimumCurveNodes", 2) - gmsh.option.setNumber("Mesh.MinimumCircleNodes", 0) - - gmsh.option.setNumber("Mesh.MeshSizeMin", 2π * radius / n_circum) - gmsh.option.setNumber("Mesh.MeshSizeMax", 2π * radius / n_circum) - - gmsh.option.setNumber("Mesh.MeshSizeFromPoints", 0) - gmsh.option.setNumber("Mesh.MeshSizeFromCurvature", n_circum) - gmsh.option.setNumber("Mesh.MeshSizeExtendFromBoundary", 1) - if mesh_type > 1 - gmsh.model.mesh.setRecombine(2, base_circle) - end - - gmsh.option.setNumber("Mesh.Algorithm", 6) - gmsh.option.setNumber("Mesh.Algorithm3D", 10) - - gmsh.model.mesh.generate(3) # Dimension of the mesh - gmsh.model.mesh.setOrder(order) # Polynomial order of the mesh - - # Save mesh - gmsh.option.setNumber("Mesh.MshFileVersion", 2.2) - gmsh.option.setNumber("Mesh.Binary", 0) - gmsh.write(joinpath(@__DIR__, filename)) - - # Print some information - if verbose > 0 - println("\nFinished generating mesh. Physical group tags:") - println("Cylinder: ", cylinder_group) - println("Boundaries: ", boundary_group) - println() - end - - # Optionally launch GUI - if "gui" in lowercase.(ARGS) - gmsh.fltk.run() - end - - return gmsh.finalize() -end +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + +using Gmsh: gmsh + +""" + generate_cylindrical_cavity_mesh(; + refinement=0, + order=1, + mesh_type=0, + radius=2.74, + aspect_ratio=1.0, + filename, + verbose=1, + ) + +Generate a mesh for the cylindrical cavity resonator example using Gmsh + +# Arguments + + - refinement - measure of how many elements to include, 0 is least + - order - the polynomial order of the approximation, minimum 1 + - mesh_type - 0 = tetrahedral mesh, 1 = prism mesh, 2 = hexahedral mesh + - radius - the radius of the cavity resonator + - aspect_ratio - the ratio of the DIAMETER of the cavity to the height + - filename - the filename to use for the generated mesh + - verbose - flag to dictate the level of print to REPL, passed to Gmsh +""" +function generate_cylindrical_cavity_mesh(; + refinement::Integer=0, + order::Integer=1, + mesh_type::Integer=0, + radius::Real=2.74, + aspect_ratio::Real=1.0, + filename::AbstractString, + verbose::Integer=1 +) + @assert refinement >= 0 + @assert order > 0 + + kernel = gmsh.model.occ + + gmsh.initialize() + gmsh.option.setNumber("General.Verbosity", verbose) + + # Add model + if "cavity" in gmsh.model.list() + gmsh.model.setCurrent("cavity") + gmsh.model.remove() + end + gmsh.model.add("cavity") + + # Geometry parameters (in cm) + height = aspect_ratio * 2 * radius # Cylinder height + + # Mesh parameters + n_height = 2 * 2^refinement # Minimum two elements in vertical + n_circum = 4 * 2^refinement # Minimum four elements on round + + # Geometry + base_circle = kernel.addDisk(0.0, 0.0, 0.0, radius, radius) + if mesh_type > 0 + cylinder_dimtags = + kernel.extrude([(2, base_circle)], 0.0, 0.0, height, [n_height], [1.0], true) + else + cylinder_dimtags = kernel.extrude([(2, base_circle)], 0.0, 0.0, height) + end + cylinder = filter(x -> x[1] == 3, cylinder_dimtags) + @assert length(cylinder) == 1 && first(cylinder)[1] == 3 + cylinder 
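# Note: kernel.extrude above returns (dim, tag) pairs for every entity it creates
# (roughly: the translated top face, the new volume, and the lateral faces), so
# filtering on dim == 3 keeps only the cylinder volume, whose tag is extracted next.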
= first(cylinder)[2] + + kernel.synchronize() + + # Add physical groups + cylinder_group = gmsh.model.addPhysicalGroup(3, [cylinder], -1, "cylinder") + + _, boundaries = gmsh.model.getAdjacencies(3, cylinder) + boundary_group = gmsh.model.addPhysicalGroup(2, boundaries, -1, "boundaries") + + # Generate mesh + gmsh.option.setNumber("Mesh.MinimumCurveNodes", 2) + gmsh.option.setNumber("Mesh.MinimumCircleNodes", 0) + + gmsh.option.setNumber("Mesh.MeshSizeMin", 2π * radius / n_circum) + gmsh.option.setNumber("Mesh.MeshSizeMax", 2π * radius / n_circum) + + gmsh.option.setNumber("Mesh.MeshSizeFromPoints", 0) + gmsh.option.setNumber("Mesh.MeshSizeFromCurvature", n_circum) + gmsh.option.setNumber("Mesh.MeshSizeExtendFromBoundary", 1) + if mesh_type > 1 + gmsh.model.mesh.setRecombine(2, base_circle) + end + + gmsh.option.setNumber("Mesh.Algorithm", 6) + gmsh.option.setNumber("Mesh.Algorithm3D", 10) + + gmsh.model.mesh.generate(3) # Dimension of the mesh + gmsh.model.mesh.setOrder(order) # Polynomial order of the mesh + + # Save mesh + gmsh.option.setNumber("Mesh.MshFileVersion", 2.2) + gmsh.option.setNumber("Mesh.Binary", 0) + gmsh.write(joinpath(@__DIR__, filename)) + + # Print some information + if verbose > 0 + println("\nFinished generating mesh. Physical group tags:") + println("Cylinder: ", cylinder_group) + println("Boundaries: ", boundary_group) + println() + end + + # Optionally launch GUI + if "gui" in lowercase.(ARGS) + gmsh.fltk.run() + end + + return gmsh.finalize() +end diff --git a/examples/coaxial/coaxial.jl b/examples/coaxial/coaxial.jl index d81419bf7c..b17bebd8e7 100644 --- a/examples/coaxial/coaxial.jl +++ b/examples/coaxial/coaxial.jl @@ -1,79 +1,132 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -using CSV -using DataFrames -using Measures -using Plots -using PyPlot: matplotlib - -""" - generate_coaxial_data(; num_processors::Integer=1) - -Generate the data for the coaxial cable example - -# Arguments - - - num_processors - number of processors to use for the simulation -""" -function generate_coaxial_data(; num_processors::Integer=1) - # Call the solver, discarding the terminal output - coaxial_dir = @__DIR__ - for sim ∈ ["matched", "open", "short"] - call_command = `palace -np $num_processors -wdir $coaxial_dir coaxial_$sim.json` - run(call_command) - end - - # Parse simulation data - postpro_dir = joinpath(@__DIR__, "postpro") - - file = joinpath(postpro_dir, "matched", "port-V.csv") - data_matched = CSV.File(file, header=1) |> DataFrame |> Matrix - t = data_matched[:, 1] - data_matched = data_matched[:, 3] - n_t = size(t, 1) - - file = joinpath(postpro_dir, "open", "port-V.csv") - data_open = CSV.File(file, header=1) |> DataFrame |> Matrix - data_open = data_open[:, 3] - - file = joinpath(postpro_dir, "short", "port-V.csv") - data_short = CSV.File(file, header=1) |> DataFrame |> Matrix - data_short = data_short[:, 3] - - # Plot settings - pyplot() - rcParams = PyPlot.PyDict(matplotlib["rcParams"]) - plotsz = (800, 400) - fntsz = 12 - fnt = font(fntsz) - rcParams["mathtext.fontset"] = "stix" - default( - size=plotsz, - palette=:Set1_9, - dpi=300, - tickfont=fnt, - guidefont=fnt, - legendfontsize=fntsz - 2, - margin=10mm - ) - - # Make plots - xlim = (minimum(t) - 0.1, maximum(t) + 0.1) - xlbl = "\$t\$ (ns)" - ylbl = string("\$V\\ /\\ V_0\$") - - pp = plot(xlims=xlim, xlabel=xlbl, ylabel=ylbl, legend=:bottomright) - - lbl = "Open" - plot!(pp, t, data_open, label=lbl) - - lbl = "Short" - plot!(pp, t, 
data_short, label=lbl) - - lbl = "Matched" - plot!(pp, t, data_matched, label=lbl) - - savefig(pp, joinpath(postpro_dir, "coaxial.png")) - return display(pp) -end +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + +#= +# README + +This Julia script processes and visualizes results from the coaxial cable +transient simulation example. + +The script runs three Palace simulations with different termination conditions: +1. Matched termination (50Ω load) +2. Open circuit termination +3. Short circuit termination + +It then plots the voltage response at the port for all three cases, showing the +effect of reflections from different boundary conditions. + +## Prerequisites + +This script requires Julia packages. Install them with: + +```bash +julia --project=examples -e 'using Pkg; Pkg.instantiate()' +``` + +## How to run + +From the repository root, run: +```bash +julia --project=examples -e 'include("examples/coaxial/coaxial.jl"); generate_coaxial_data(num_processors=4)' +``` + +This requires `palace` to be a runnable command. If it is not, you can pass the +path to the executable, e.g. +```bash +julia --project=examples -e 'include("examples/coaxial/coaxial.jl"); generate_coaxial_data(palace_exec="build/bin/palace", num_processors=4)' +``` + +The script will: +1. Run Palace simulations for all three termination cases +2. Parse the voltage data from CSV output files +3. Generate a plot comparing the three cases +4. Save the plot to `postpro/coaxial.png` + +## Output + +The generated plot shows normalized voltage (V/V₀) vs time for the three +termination conditions, illustrating transmission line reflection behavior. +=# + +using CSV +using DataFrames +using Measures +using Plots + +""" + generate_coaxial_data(; + palace_exec::String="palace", + num_processors::Integer=1 + ) + +Generate the data for the coaxial cable example and visualize the results. + +# Arguments + + - palace_exec - executable for Palace + - num_processors - number of processors to use for the simulation +""" +function generate_coaxial_data(; palace_exec="palace", num_processors::Integer=1) + palace_exec_is_path = occursin(Base.Filesystem.path_separator, palace_exec) + if palace_exec_is_path + # Convert palace_exec to absolute path if it's relative + palace_exec = isabspath(palace_exec) ? 
palace_exec : abspath(palace_exec) + end + + # Call the solver, discarding the terminal output + coaxial_dir = @__DIR__ + for sim ∈ ["matched", "open", "short"] + base_cmd = `$palace_exec -np $num_processors coaxial_$sim.json` + call_command = Cmd(base_cmd; dir=coaxial_dir) + run(call_command) + end + + # Parse simulation data + file = joinpath(coaxial_dir, "postpro", "matched", "port-V.csv") + data_matched = CSV.File(file, header=1) |> DataFrame |> Matrix + t = data_matched[:, 1] + data_matched = data_matched[:, 3] + + file = joinpath(coaxial_dir, "postpro", "open", "port-V.csv") + data_open = CSV.File(file, header=1) |> DataFrame |> Matrix + data_open = data_open[:, 3] + + file = joinpath(coaxial_dir, "postpro", "short", "port-V.csv") + data_short = CSV.File(file, header=1) |> DataFrame |> Matrix + data_short = data_short[:, 3] + + # Plot settings + plotsz = (800, 400) + fntsz = 12 + fnt = font(fntsz) + default( + size=plotsz, + palette=:Set1_9, + dpi=300, + tickfont=fnt, + guidefont=fnt, + legendfontsize=fntsz - 2, + margin=10mm + ) + + # Make plots + xlim = (minimum(t) - 0.1, maximum(t) + 0.1) + xlbl = "\$t\$ (ns)" + ylbl = string("\$V\\ /\\ V_0\$") + + pp = plot(xlims=xlim, xlabel=xlbl, ylabel=ylbl, legend=:bottomright) + + lbl = "Open" + plot!(pp, t, data_open, label=lbl) + + lbl = "Short" + plot!(pp, t, data_short, label=lbl) + + lbl = "Matched" + plot!(pp, t, data_matched, label=lbl) + + savefig(pp, joinpath(coaxial_dir, "postpro", "coaxial.png")) + display(pp) + + return +end diff --git a/examples/coaxial/coaxial_matched.json b/examples/coaxial/coaxial_matched.json index a50c9724d7..79ba34d435 100644 --- a/examples/coaxial/coaxial_matched.json +++ b/examples/coaxial/coaxial_matched.json @@ -1,70 +1,71 @@ -{ - "Problem": - { - "Type": "Transient", - "Verbose": 2, - "Output": "D:/WelSimLLC/executable28/_palace_examples/coaxial/postpro/matched" - }, - "Model": - { - "Mesh": "D:/WelSimLLC/executable28/_palace_examples/coaxial/mesh/coaxial.msh", - "L0": 1.0e-3 // mm - }, - "Domains": - { - "Materials": - [ - { - "Attributes": [1], - "Permeability": 1.0, - "Permittivity": 2.08, - "Conductivity": 4.629e-2 // 100 x tan(δ) = 0.0004 @ 10 GHz - } - ] - }, - "Boundaries": - { - "PEC": - { - "Attributes": [2] - }, - "LumpedPort": - [ - { - "Index": 1, - "Attributes": [3], - "R": 50.0, // Ω - "Direction": "+R", // Coaxial lumped port - "Excitation": true - }, - { - "Index": 2, - "Attributes": [4], - "R": 50.0, // Ω, perfect matching - "Direction": "+R" // Coaxial lumped port - } - ] - }, - "Solver": - { - "Order": 3, - "Transient": - { - "Type": "GeneralizedAlpha", - "Excitation": "ModulatedGaussian", - "ExcitationFreq": 10.0, // GHz - "ExcitationWidth": 0.05, // ns, BW ∼ 1/τ - "MaxTime": 1.0, // ns - "TimeStep": 0.005, // ns - "SaveStep": 10 - }, - "Linear": - { - "Type": "AMS", - "KSPType": "CG", - "Tol": 1.0e-8, - "MaxIts": 100 - } - } -} - +{ + "Problem": + { + "Type": "Transient", + "Verbose": 2, + "Output": "postpro/matched" + }, + "Model": + { + "Mesh": "mesh/coaxial.msh", + "L0": 1.0e-3 // mm + }, + "Domains": + { + "Materials": + [ + { + "Attributes": [1], + "Permeability": 1.0, + "Permittivity": 2.08, + "Conductivity": 4.629e-2 // 100 x tan(δ) = 0.0004 @ 10 GHz + } + ] + }, + "Boundaries": + { + "PEC": + { + "Attributes": [2] + }, + "LumpedPort": + [ + { + "Index": 1, + "Attributes": [3], + "R": 50.0, // Ω + "Direction": "+R", // Coaxial lumped port + "Excitation": true + }, + { + "Index": 2, + "Attributes": [4], + "R": 50.0, // Ω, perfect matching + "Direction": "+R" // Coaxial 
lumped port + } + ] + }, + "Solver": + { + "Order": 3, + "Device": "CPU", + "Transient": + { + "Type": "GeneralizedAlpha", + "Excitation": "ModulatedGaussian", + "ExcitationFreq": 10.0, // GHz + "ExcitationWidth": 0.05, // ns, BW ∼ 1/τ + "MaxTime": 1.0, // ns + "TimeStep": 0.005, // ns + "SaveStep": 10 + }, + "Linear": + { + "Type": "AMS", + "KSPType": "CG", + "Tol": 1.0e-8, + "MaxIts": 100 + } + } +} + diff --git a/examples/coaxial/coaxial_open.json b/examples/coaxial/coaxial_open.json index 7fd67ca214..e47c42e43f 100644 --- a/examples/coaxial/coaxial_open.json +++ b/examples/coaxial/coaxial_open.json @@ -1,68 +1,69 @@ -{ - "Problem": - { - "Type": "Transient", - "Verbose": 2, - "Output": "D:/WelSimLLC/executable28/_palace_examples/coaxial/postpro/open" - }, - "Model": - { - "Mesh": "D:/WelSimLLC/executable28/_palace_examples/coaxial/mesh/coaxial.msh", - "L0": 1.0e-3 // mm - }, - "Domains": - { - "Materials": - [ - { - "Attributes": [1], - "Permeability": 1.0, - "Permittivity": 2.08, - "Conductivity": 4.629e-2 // 100 x tan(δ) = 0.0004 @ 10 GHz - } - ] - }, - "Boundaries": - { - "PEC": - { - "Attributes": [2] - }, - "PMC": - { - "Attributes": [4] - }, - "LumpedPort": - [ - { - "Index": 1, - "Attributes": [3], - "R": 50.0, // Ω - "Direction": "+R", // Coaxial lumped port - "Excitation": true - } - ] - }, - "Solver": - { - "Order": 3, - "Transient": - { - "Type": "GeneralizedAlpha", - "Excitation": "ModulatedGaussian", - "ExcitationFreq": 10.0, // GHz - "ExcitationWidth": 0.05, // ns, BW ∼ 1/τ - "MaxTime": 1.0, // ns - "TimeStep": 0.005, // ns - "SaveStep": 10 - }, - "Linear": - { - "Type": "AMS", - "KSPType": "CG", - "Tol": 1.0e-8, - "MaxIts": 100 - } - } -} - +{ + "Problem": + { + "Type": "Transient", + "Verbose": 2, + "Output": "postpro/open" + }, + "Model": + { + "Mesh": "mesh/coaxial.msh", + "L0": 1.0e-3 // mm + }, + "Domains": + { + "Materials": + [ + { + "Attributes": [1], + "Permeability": 1.0, + "Permittivity": 2.08, + "Conductivity": 4.629e-2 // 100 x tan(δ) = 0.0004 @ 10 GHz + } + ] + }, + "Boundaries": + { + "PEC": + { + "Attributes": [2] + }, + "PMC": + { + "Attributes": [4] + }, + "LumpedPort": + [ + { + "Index": 1, + "Attributes": [3], + "R": 50.0, // Ω + "Direction": "+R", // Coaxial lumped port + "Excitation": true + } + ] + }, + "Solver": + { + "Order": 3, + "Device": "CPU", + "Transient": + { + "Type": "GeneralizedAlpha", + "Excitation": "ModulatedGaussian", + "ExcitationFreq": 10.0, // GHz + "ExcitationWidth": 0.05, // ns, BW ∼ 1/τ + "MaxTime": 1.0, // ns + "TimeStep": 0.005, // ns + "SaveStep": 10 + }, + "Linear": + { + "Type": "AMS", + "KSPType": "CG", + "Tol": 1.0e-8, + "MaxIts": 100 + } + } +} + diff --git a/examples/coaxial/coaxial_short.json b/examples/coaxial/coaxial_short.json index 4f02b6958f..c1e34be80c 100644 --- a/examples/coaxial/coaxial_short.json +++ b/examples/coaxial/coaxial_short.json @@ -1,64 +1,65 @@ -{ - "Problem": - { - "Type": "Transient", - "Verbose": 2, - "Output": "D:/WelSimLLC/executable28/_palace_examples/coaxial/postpro/short" - }, - "Model": - { - "Mesh": "D:/WelSimLLC/executable28/_palace_examples/coaxial/mesh/coaxial.msh", - "L0": 1.0e-3 // mm - }, - "Domains": - { - "Materials": - [ - { - "Attributes": [1], - "Permeability": 1.0, - "Permittivity": 2.08, - "Conductivity": 4.629e-2 // 100 x tan(δ) = 0.0004 @ 10 GHz - } - ] - }, - "Boundaries": - { - "PEC": - { - "Attributes": [2, 4] - }, - "LumpedPort": - [ - { - "Index": 1, - "Attributes": [3], - "R": 50.0, // Ω - "Direction": "+R", // Coaxial lumped port - "Excitation": true - } - ] - }, - 
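// The "Conductivity" shared by all three coaxial configs appears to follow
// σ = 2π·f·ε0·εr·tan(δ) together with the factor of 100 noted in its comment:
// 2π · 10e9 Hz · 8.854e-12 F/m · 2.08 · 0.0004 · 100 ≈ 4.63e-2 S/m,
// matching the 4.629e-2 entry above.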
"Solver": - { - "Order": 3, - "Transient": - { - "Type": "GeneralizedAlpha", - "Excitation": "ModulatedGaussian", - "ExcitationFreq": 10.0, // GHz - "ExcitationWidth": 0.05, // ns, BW ∼ 1/τ - "MaxTime": 1.0, // ns - "TimeStep": 0.005, // ns - "SaveStep": 10 - }, - "Linear": - { - "Type": "AMS", - "KSPType": "CG", - "Tol": 1.0e-8, - "MaxIts": 100 - } - } -} - +{ + "Problem": + { + "Type": "Transient", + "Verbose": 2, + "Output": "postpro/short" + }, + "Model": + { + "Mesh": "mesh/coaxial.msh", + "L0": 1.0e-3 // mm + }, + "Domains": + { + "Materials": + [ + { + "Attributes": [1], + "Permeability": 1.0, + "Permittivity": 2.08, + "Conductivity": 4.629e-2 // 100 x tan(δ) = 0.0004 @ 10 GHz + } + ] + }, + "Boundaries": + { + "PEC": + { + "Attributes": [2, 4] + }, + "LumpedPort": + [ + { + "Index": 1, + "Attributes": [3], + "R": 50.0, // Ω + "Direction": "+R", // Coaxial lumped port + "Excitation": true + } + ] + }, + "Solver": + { + "Order": 3, + "Device": "CPU", + "Transient": + { + "Type": "GeneralizedAlpha", + "Excitation": "ModulatedGaussian", + "ExcitationFreq": 10.0, // GHz + "ExcitationWidth": 0.05, // ns, BW ∼ 1/τ + "MaxTime": 1.0, // ns + "TimeStep": 0.005, // ns + "SaveStep": 10 + }, + "Linear": + { + "Type": "AMS", + "KSPType": "CG", + "Tol": 1.0e-8, + "MaxIts": 100 + } + } +} + diff --git a/examples/coaxial/mesh/coaxial.msh b/examples/coaxial/mesh/coaxial.msh index b43461a694..d407e21f29 100644 Binary files a/examples/coaxial/mesh/coaxial.msh and b/examples/coaxial/mesh/coaxial.msh differ diff --git a/examples/coaxial/mesh/mesh.jl b/examples/coaxial/mesh/mesh.jl index 24bfea70cd..e2b81df964 100644 --- a/examples/coaxial/mesh/mesh.jl +++ b/examples/coaxial/mesh/mesh.jl @@ -1,170 +1,179 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -using Gmsh: gmsh - -""" - generate_coaxial_mesh(; - refinement::Integer = 0, - order::Integer = 1, - inner_diameter_mm::Real = 1.6383, - outer_diameter_mm::Real = 5.461, - length_mm::Real = 40.0, - filename::AbstractString, - verbose::Integer=1 - ) - -Generate a mesh for the coaxial cable example using Gmsh - -# Arguments - - - refinement - measure of how many elements to include, 0 is least - - order - the polynomial order of the approximation, minimum 1 - - inner_diameter_mm - the inner diameter of the cable, in millimeters - - outer_diameter_mm - the outer diameter of the cable, in millimeters - - length_mm - the length of the cable, in millimeters - - filename - the filename to use for the generated mesh - - verbose - flag to dictate the level of print to REPL, passed to Gmsh -""" -function generate_coaxial_mesh(; - refinement::Integer = 0, - order::Integer = 1, - inner_diameter_mm::Real = 1.6383, - outer_diameter_mm::Real = 5.461, - length_mm::Real = 40.0, - filename::AbstractString, - verbose::Integer=1 -) - @assert outer_diameter_mm > inner_diameter_mm > 0 - @assert length_mm > 0 - @assert refinement >= 0 - @assert order > 1 - - kernel = gmsh.model.occ - - gmsh.initialize() - gmsh.option.setNumber("General.Verbosity", verbose) - - # Add model - if "coaxial" in gmsh.model.list() - gmsh.model.setCurrent("coaxial") - gmsh.model.remove() - end - gmsh.model.add("coaxial") - - # Geometry parameters (in mm) - ri = inner_diameter_mm / 2 - ro = outer_diameter_mm / 2 - - # Mesh parameters - n_circum = 4 * 2^refinement # min 4 elements in round - n_length = 4 * 2^refinement # min 4 elements on length - - # Geometry - p0 = kernel.addPoint(ri, 0.0, 0.0) - p1 = kernel.addPoint(ro, 0.0, 0.0) - - l0 = kernel.addLine(p0, p1) - - base_face_0 = kernel.revolve( - (1, l0), - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 1.0, - pi, - [n_circum ÷ 2], - [1.0], - true - ) - base_face_1 = kernel.revolve( - (1, l0), - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 1.0, - -pi, - [n_circum ÷ 2], - [1.0], - true - ) - filter!(x -> x[1] == 2, base_face_0) - filter!(x -> x[1] == 2, base_face_1) - @assert length(base_face_0) == 1 && length(base_face_1) == 1 - - cylinder_0 = kernel.extrude(base_face_0, 0.0, 0.0, length_mm, [n_length], [1.0], true) - cylinder_1 = kernel.extrude(base_face_1, 0.0, 0.0, length_mm, [n_length], [1.0], true) - base_face_0 = last(first(base_face_0)) - base_face_1 = last(first(base_face_1)) - far_face_0 = last(cylinder_0[1]) - cylinder_0 = last(cylinder_0[2]) - far_face_1 = last(cylinder_1[1]) - cylinder_1 = last(cylinder_1[2]) - - # Remove duplicates but preserves tags for non-removed objects - kernel.fragment(kernel.getEntities(), []) - kernel.synchronize() - - boundaries = [] - for cylinder in [cylinder_0, cylinder_1] - _, local_boundaries = gmsh.model.getAdjacencies(3, cylinder) - local_idx = indexin(local_boundaries, boundaries) - local_delete = [] - for (idx, boundary) in zip(local_idx, local_boundaries) - if isnothing(idx) - push!(boundaries, boundary) - else - push!(local_delete, idx) - end - end - deleteat!(boundaries, sort(local_delete)) - end - deleteat!( - boundaries, - sort(indexin([base_face_0, base_face_1, far_face_0, far_face_1], boundaries)) - ) - - # Add physical groups - cylinder_group = - gmsh.model.addPhysicalGroup(3, [cylinder_0, cylinder_1], -1, "cylinder") - boundary_group = gmsh.model.addPhysicalGroup(2, boundaries, -1, "boundaries") - - port1_group = gmsh.model.addPhysicalGroup(2, [base_face_0, base_face_1], -1, "port1") - port2_group = 
gmsh.model.addPhysicalGroup(2, [far_face_0, far_face_1], -1, "port2") - - # Generate mesh - gmsh.option.setNumber("Mesh.MinimumCurveNodes", 2) - gmsh.option.setNumber("Mesh.MinimumCircleNodes", 0) - gmsh.option.setNumber("Mesh.MeshSizeFromPoints", 0) - gmsh.option.setNumber("Mesh.MeshSizeFromCurvature", 0) - gmsh.option.setNumber("Mesh.MeshSizeExtendFromBoundary", 0) - - gmsh.model.mesh.generate(3) - gmsh.model.mesh.setOrder(order) - - # Save mesh - gmsh.option.setNumber("Mesh.MshFileVersion", 2.2) - gmsh.option.setNumber("Mesh.Binary", 0) - gmsh.write(joinpath(@__DIR__, filename)) - - # Print some information - if verbose > 0 - println("\nFinished generating mesh. Physical group tags:") - println("Cylinder: ", cylinder_group) - println("Boundaries: ", boundary_group) - println("Port 1: ", port1_group) - println("Port 2: ", port2_group) - println() - end - - # Optionally launch GUI - if "gui" in lowercase.(ARGS) - gmsh.fltk.run() - end - - return gmsh.finalize() -end +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Generated with: +# julia -e 'include("mesh/mesh.jl"); generate_coaxial_mesh(filename="coaxial.msh")' + +using Gmsh: gmsh + +""" + generate_coaxial_mesh(; + filename::AbstractString, + refinement::Integer = 2, + order::Integer = 2, + inner_diameter_mm::Real = 1.6383, + outer_diameter_mm::Real = 5.461, + length_mm::Real = 40.0, + verbose::Integer = 5, + gui::Bool = false + ) + +Generate a mesh for the coaxial cable example using Gmsh + +# Arguments + + - filename - the filename to use for the generated mesh + - refinement - measure of how many elements to include, 0 is least + - order - the polynomial order of the approximation, minimum 1 + - inner_diameter_mm - the inner diameter of the cable, in millimeters + - outer_diameter_mm - the outer diameter of the cable, in millimeters + - length_mm - the length of the cable, in millimeters + - verbose - flag to dictate the level of print to REPL, passed to Gmsh + - gui - whether to launch the Gmsh GUI on mesh generation +""" +function generate_coaxial_mesh(; + filename::AbstractString, + refinement::Integer = 2, + order::Integer = 2, + inner_diameter_mm::Real = 1.6383, + outer_diameter_mm::Real = 5.461, + length_mm::Real = 40.0, + verbose::Integer = 5, + gui::Bool = false +) + @assert outer_diameter_mm > inner_diameter_mm > 0 + @assert length_mm > 0 + @assert refinement >= 0 + @assert order > 0 + + kernel = gmsh.model.occ + + gmsh.initialize() + gmsh.option.setNumber("General.Verbosity", verbose) + + # Add model + if "coaxial" in gmsh.model.list() + gmsh.model.setCurrent("coaxial") + gmsh.model.remove() + end + gmsh.model.add("coaxial") + + # Geometry parameters (in mm) + ri = inner_diameter_mm / 2 + ro = outer_diameter_mm / 2 + + # Mesh parameters + n_circum = 4 * 2^refinement # min 4 elements in round + n_length = 4 * 2^refinement # min 4 elements on length + + # Geometry + p0 = kernel.addPoint(ri, 0.0, 0.0) + p1 = kernel.addPoint(ro, 0.0, 0.0) + + l0 = kernel.addLine(p0, p1) + + base_face_0 = kernel.revolve( + (1, l0), + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 1.0, + pi, + [n_circum ÷ 2], + [1.0], + true + ) + base_face_1 = kernel.revolve( + (1, l0), + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 1.0, + -pi, + [n_circum ÷ 2], + [1.0], + true + ) + filter!(x -> x[1] == 2, base_face_0) + filter!(x -> x[1] == 2, base_face_1) + @assert length(base_face_0) == 1 && length(base_face_1) == 1 + + cylinder_0 = kernel.extrude(base_face_0, 0.0, 0.0, length_mm, [n_length], [1.0], true) + 
cylinder_1 = kernel.extrude(base_face_1, 0.0, 0.0, length_mm, [n_length], [1.0], true) + base_face_0 = last(first(base_face_0)) + base_face_1 = last(first(base_face_1)) + far_face_0 = last(cylinder_0[1]) + cylinder_0 = last(cylinder_0[2]) + far_face_1 = last(cylinder_1[1]) + cylinder_1 = last(cylinder_1[2]) + + # Remove duplicates but preserves tags for non-removed objects + kernel.fragment(kernel.getEntities(), []) + kernel.synchronize() + + boundaries = [] + for cylinder in [cylinder_0, cylinder_1] + _, local_boundaries = gmsh.model.getAdjacencies(3, cylinder) + local_idx = indexin(local_boundaries, boundaries) + local_delete = [] + for (idx, boundary) in zip(local_idx, local_boundaries) + if isnothing(idx) + push!(boundaries, boundary) + else + push!(local_delete, idx) + end + end + deleteat!(boundaries, sort(local_delete)) + end + deleteat!( + boundaries, + sort(indexin([base_face_0, base_face_1, far_face_0, far_face_1], boundaries)) + ) + + # Add physical groups + cylinder_group = + gmsh.model.addPhysicalGroup(3, [cylinder_0, cylinder_1], -1, "cylinder") + boundary_group = gmsh.model.addPhysicalGroup(2, boundaries, -1, "boundaries") + + port1_group = gmsh.model.addPhysicalGroup(2, [base_face_0, base_face_1], -1, "port1") + port2_group = gmsh.model.addPhysicalGroup(2, [far_face_0, far_face_1], -1, "port2") + + # Generate mesh + gmsh.option.setNumber("Mesh.MinimumCurveNodes", 2) + gmsh.option.setNumber("Mesh.MinimumCircleNodes", 0) + gmsh.option.setNumber("Mesh.MeshSizeFromPoints", 0) + gmsh.option.setNumber("Mesh.MeshSizeFromCurvature", 0) + gmsh.option.setNumber("Mesh.MeshSizeExtendFromBoundary", 0) + + gmsh.option.setNumber("Mesh.Algorithm", 6) + gmsh.option.setNumber("Mesh.Algorithm3D", 1) + + gmsh.model.mesh.generate(3) + gmsh.model.mesh.setOrder(order) + + # Save mesh + gmsh.option.setNumber("Mesh.MshFileVersion", 2.2) + gmsh.option.setNumber("Mesh.Binary", 1) + gmsh.write(joinpath(@__DIR__, filename)) + + # Print some information + if verbose > 0 + println("\nFinished generating mesh. Physical group tags:") + println("Cylinder: ", cylinder_group) + println("Boundaries: ", boundary_group) + println("Port 1: ", port1_group) + println("Port 2: ", port2_group) + println() + end + + # Optionally launch GUI + if gui + gmsh.fltk.run() + end + + return gmsh.finalize() +end diff --git a/examples/cpw/cpw.jl b/examples/cpw/cpw.jl index 1176166451..be340d008f 100644 --- a/examples/cpw/cpw.jl +++ b/examples/cpw/cpw.jl @@ -1,343 +1,217 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
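For reference, the coaxial example above can be exercised end to end from Julia. This is a minimal sketch only; the process count and repository-root paths are illustrative, and the mesh call mirrors the "Generated with" comment at the top of mesh.jl:

    # Regenerate the binary Gmsh mesh, then run both transient configurations with Palace.
    # palace is assumed to be on PATH; adjust -np to the available cores.
    include(joinpath("examples", "coaxial", "mesh", "mesh.jl"))
    generate_coaxial_mesh(filename = "coaxial.msh")
    for cfg in ["coaxial_open.json", "coaxial_short.json"]
        run(`palace -np 4 -wdir examples/coaxial $cfg`)
    end
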
-# SPDX-License-Identifier: Apache-2.0 - -using CSV -using DataFrames -using Measures -using Plots -using PyPlot: matplotlib - -""" - generate_coplanar_waveguide_data(; num_processors::Integer=1) - -Generate the data for the coplanar wave guide example - -# Arguments - - - num_processors - number of processors to use for the simulation -""" -function generate_coplanar_waveguide_data(; num_processors::Integer=1) - # Call the solver, discarding the terminal output - cpw_dir = @__DIR__ - for sim ∈ ["lumped", "wave"] - for mode ∈ ["adaptive", "uniform"] - call_command = `palace -np $num_processors -wdir $cpw_dir cpw_$sim\_$mode.json` - run(call_command) - end - end - - """ - Helper function for generating plots - """ - function helper(pp, f1, data1, lbl1, f2, data2, lbl2, f3, data3, lbl3, f4, data4, lbl4) - mkrsz = 8 - mkr1 = (:circle, mkrsz, stroke(0)) - mkr2 = (:utriangle, mkrsz, stroke(0)) - plot!(pp, f1, data1, label=lbl1) - plot!(pp, f3, data3, label=lbl3) - plot!(pp, f2, data2, label=lbl2, marker=mkr1, linewidth=0) - return plot!(pp, f4, data4, label=lbl4, marker=mkr2, linewidth=0) - end - - # Parse simulation data - postpro_dir = joinpath(@__DIR__, "postpro") - - file = joinpath(postpro_dir, "lumped_adaptive", "port-S.csv") - data_lumped_adaptive = CSV.File(file, header=1) |> DataFrame |> Matrix - f_adaptive = data_lumped_adaptive[:, 1] - data_lumped_adaptive = data_lumped_adaptive[:, 2:end] - n_p = size(data_lumped_adaptive, 2) ÷ 2 - - file = joinpath(postpro_dir, "wave_adaptive", "port-S.csv") - data_wave_adaptive = CSV.File(file, header=1) |> DataFrame |> Matrix - data_wave_adaptive = data_wave_adaptive[:, 2:end] - - file = joinpath(postpro_dir, "lumped_uniform", "port-S.csv") - data_lumped_uniform = CSV.File(file, header=1) |> DataFrame |> Matrix - f_uniform = data_lumped_uniform[:, 1] - data_lumped_uniform = data_lumped_uniform[:, 2:end] - - file = joinpath(postpro_dir, "wave_uniform", "port-S.csv") - data_wave_uniform = CSV.File(file, header=1) |> DataFrame |> Matrix - data_wave_uniform = data_wave_uniform[:, 2:end] - - # Wrap phases - for p = 1:n_p - idx = (data_lumped_adaptive[:, 2 * p] .< 0.0) - data_lumped_adaptive[idx, 2 * p] = data_lumped_adaptive[idx, 2 * p] .+ 180.0 - - idx = (data_wave_adaptive[:, 2 * p] .< 0.0) - data_wave_adaptive[idx, 2 * p] = data_wave_adaptive[idx, 2 * p] .+ 180.0 - - idx = (data_lumped_uniform[:, 2 * p] .< 0.0) - data_lumped_uniform[idx, 2 * p] = data_lumped_uniform[idx, 2 * p] .+ 180.0 - - idx = (data_wave_uniform[:, 2 * p] .< 0.0) - data_wave_uniform[idx, 2 * p] = data_wave_uniform[idx, 2 * p] .+ 180.0 - end - - # Plot settings - pyplot() - rcParams = PyPlot.PyDict(matplotlib["rcParams"]) - plotsz = (800, 400) - fntsz = 12 - fnt = font(fntsz) - rcParams["mathtext.fontset"] = "stix" - default( - size=plotsz, - palette=:Set1_9, - dpi=300, - tickfont=fnt, - guidefont=fnt, - legendfontsize=fntsz - 2, - margin=10mm - ) - - # Make plots - xlim = (minimum(f_uniform) - 1.0, maximum(f_uniform) + 1.0) - xlbl = "Frequency (GHz)" - - ## Reflection - p = 1 - - # Magnitude - ylbl = string("Reflection: abs(\$S_{11}\$) (dB)") - p1a = plot(xlims=xlim, xlabel=xlbl, ylabel=ylbl, legend=:bottomright) - - helper( - p1a, - f_adaptive, - data_lumped_adaptive[:, 2 * p - 1], - string("Adaptive, Lumped Port ", string(p)), - f_uniform, - data_lumped_uniform[:, 2 * p - 1], - string("Uniform, Lumped Port ", string(p)), - f_adaptive, - data_wave_adaptive[:, 2 * p - 1], - string("Adaptive, Wave Port ", string(p)), - f_uniform, - data_wave_uniform[:, 2 * p - 1], - 
string("Uniform, Wave Port ", string(p)) - ) - - plot!(p1a, ylims=(first(ylims(p1a)) - 20, 0)) - savefig(p1a, joinpath(postpro_dir, "figure1a.png")) - display(p1a) - - # Phase - ylbl = string("Reflection: arg(\$S_{11}\$) (deg.)") - p1b = plot(xlims=xlim, xlabel=xlbl, ylabel=ylbl, legend=:bottomright) - - helper( - p1b, - f_adaptive, - data_lumped_adaptive[:, 2 * p], - string("Adaptive, Lumped Port ", string(p)), - f_uniform, - data_lumped_uniform[:, 2 * p], - string("Uniform, Lumped Port ", string(p)), - f_adaptive, - data_wave_adaptive[:, 2 * p], - string("Adaptive, Wave Port ", string(p)), - f_uniform, - data_wave_uniform[:, 2 * p], - string("Uniform, Wave Port ", string(p)) - ) - - plot!(p1b, ylims=(first(ylims(p1b)) - 100, last(ylims(p1b)) + 0)) - savefig(p1b, joinpath(postpro_dir, "figure1b.png")) - display(p1b) - - ## Transmission - p = 2 - - # Magnitude - ylbl = string("Transmission: abs(\$S_{21}\$) (dB)") - p2a = plot(xlims=xlim, xlabel=xlbl, ylabel=ylbl, legend=:bottomleft) - - helper( - p2a, - f_adaptive, - data_lumped_adaptive[:, 2 * p - 1], - string("Adaptive, Lumped Port ", string(p)), - f_uniform, - data_lumped_uniform[:, 2 * p - 1], - string("Uniform, Lumped Port ", string(p)), - f_adaptive, - data_wave_adaptive[:, 2 * p - 1], - string("Adaptive, Wave Port ", string(p)), - f_uniform, - data_wave_uniform[:, 2 * p - 1], - string("Uniform, Wave Port ", string(p)) - ) - - plot!(p2a, ylims=(first(ylims(p2a)) - 20, 2)) - savefig(p2a, joinpath(postpro_dir, "figure2a.png")) - display(p2a) - - # Phase - ylbl = string("Transmission: arg(\$S_{21}\$) (deg.)") - p2b = plot(xlims=xlim, xlabel=xlbl, ylabel=ylbl, legend=:bottomleft) - - helper( - p2b, - f_adaptive, - data_lumped_adaptive[:, 2 * p], - string("Adaptive, Lumped Port ", string(p)), - f_uniform, - data_lumped_uniform[:, 2 * p], - string("Uniform, Lumped Port ", string(p)), - f_adaptive, - data_wave_adaptive[:, 2 * p], - string("Adaptive, Wave Port ", string(p)), - f_uniform, - data_wave_uniform[:, 2 * p], - string("Uniform, Wave Port ", string(p)) - ) - - plot!(p2b, ylims=(first(ylims(p2b)) - 60, last(ylims(p2b)) + 0)) - savefig(p2b, joinpath(postpro_dir, "figure2b.png")) - display(p2b) - - ## NEXT - p = 3 - - # Magnitude - ylbl = string("NEXT: abs(\$S_{31}\$) (dB)") - p3a = plot(xlims=xlim, xlabel=xlbl, ylabel=ylbl, legend=:bottomleft) - - helper( - p3a, - f_adaptive, - data_lumped_adaptive[:, 2 * p - 1], - string("Adaptive, Lumped Port ", string(p)), - f_uniform, - data_lumped_uniform[:, 2 * p - 1], - string("Uniform, Lumped Port ", string(p)), - f_adaptive, - data_wave_adaptive[:, 2 * p - 1], - string("Adaptive, Wave Port ", string(p)), - f_uniform, - data_wave_uniform[:, 2 * p - 1], - string("Uniform, Wave Port ", string(p)) - ) - - plot!(p3a, ylims=(first(ylims(p3a)) - 30, 0)) - savefig(p3a, joinpath(postpro_dir, "figure3a.png")) - display(p3a) - - # Phase - ylbl = string("NEXT: arg(\$S_{31}\$) (deg.)") - p3b = plot(xlims=xlim, xlabel=xlbl, ylabel=ylbl, legend=:bottomright) - - helper( - p3b, - f_adaptive, - data_lumped_adaptive[:, 2 * p], - string("Adaptive, Lumped Port ", string(p)), - f_uniform, - data_lumped_uniform[:, 2 * p], - string("Uniform, Lumped Port ", string(p)), - f_adaptive, - data_wave_adaptive[:, 2 * p], - string("Adaptive, Wave Port ", string(p)), - f_uniform, - data_wave_uniform[:, 2 * p], - string("Uniform, Wave Port ", string(p)) - ) - - plot!(p3b, ylims=(first(ylims(p3b)) - 100, last(ylims(p3b)) + 0)) - savefig(p3b, joinpath(postpro_dir, "figure3b.png")) - display(p3b) - - ## FEXT - p = 4 - - # 
Magnitude - ylbl = string("FEXT: abs(\$S_{41}\$) (dB)") - p4a = plot(xlims=xlim, xlabel=xlbl, ylabel=ylbl, legend=:bottomright) - - helper( - p4a, - f_adaptive, - data_lumped_adaptive[:, 2 * p - 1], - string("Adaptive, Lumped Port ", string(p)), - f_uniform, - data_lumped_uniform[:, 2 * p - 1], - string("Uniform, Lumped Port ", string(p)), - f_adaptive, - data_wave_adaptive[:, 2 * p - 1], - string("Adaptive, Wave Port ", string(p)), - f_uniform, - data_wave_uniform[:, 2 * p - 1], - string("Uniform, Wave Port ", string(p)) - ) - - plot!(p4a, ylims=(first(ylims(p4a)) - 40, 0)) - savefig(p4a, joinpath(postpro_dir, "figure4a.png")) - display(p4a) - - # Phase - ylbl = string("FEXT: arg(\$S_{41}\$) (deg.)") - p4b = plot(xlims=xlim, xlabel=xlbl, ylabel=ylbl, legend=:bottomright) - - helper( - p4b, - f_adaptive, - data_lumped_adaptive[:, 2 * p], - string("Adaptive, Lumped Port ", string(p)), - f_uniform, - data_lumped_uniform[:, 2 * p], - string("Uniform, Lumped Port ", string(p)), - f_adaptive, - data_wave_adaptive[:, 2 * p], - string("Adaptive, Wave Port ", string(p)), - f_uniform, - data_wave_uniform[:, 2 * p], - string("Uniform, Wave Port ", string(p)) - ) - - plot!(p4b, ylims=(first(ylims(p4b)) - 120, last(ylims(p4b)) + 0)) - savefig(p4b, joinpath(postpro_dir, "figure4b.png")) - return display(p4b) -end - -""" - cpw_impedance(;w,s,h,ϵᵣ) - -Compute the characteristic impedance of a coplanar wave guide. -See p259 of -H. J. Visser, Antenna Theory and Applications, Wiley, Hoboken, NJ, 2012 - -# Arguments - - - w width of trace [μm] - - s separation of trace [μm] - - h height of substrate [μm] - - ϵᵣ relative permittivity in surface normal direction -""" -function cpw_impedance(; w=30, s=18, h=500, ϵᵣ) - k = w / (w + 2 * s) - k₁ = sinh(π * w / (4 * h)) / sinh(π * (w + 2 * s) / (4 * h)) - - k′ = sqrt(1 - k^2) - k₁′ = sqrt(1 - k₁^2) - - function KoverK′(t, t′) - s = - x -> - log(2 * (sqrt(1 + x) + (4 * x)^(1 // 4)) / (sqrt(1 + x) - (4 * x)^(1 // 4))) - if t >= 1.0 / sqrt(2) - return s(t) / (2 * π) - else - return 2 * π / s(t′) - end - end - - koverk′ = KoverK′(k, k′) - k₁overk₁′ = KoverK′(k₁, k₁′) - - ϵ_eff = 1 + ((ϵᵣ - 1) / 2) * k₁overk₁′ / koverk′ - - return Z₀ = 30 * π / (koverk′ * sqrt(ϵ_eff)) -end +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + +using CSV, DataFrames, Measures, CairoMakie, ColorSchemes + +""" + cpw_impedance(;w,s,h,ϵᵣ) + +Compute the characteristic impedance of a coplanar wave guide + +See p. 259 of H. J. Visser, _Antenna Theory and Applications_, Wiley, Hoboken, NJ, 2012 + +# Arguments + + - w width of trace [μm] + - s separation of trace [μm] + - h height of substrate [μm] + - ϵᵣ relative permittivity in surface normal direction +""" +function cpw_impedance(; w=30, s=18, h=500, ϵᵣ) + """ + Helper function for K(k)/K(k') + """ + function KoverK′(k, k′) + s = + x -> + log(2 * (sqrt(1 + x) + (4 * x)^(1 // 4)) / (sqrt(1 + x) - (4 * x)^(1 // 4))) + if k >= 1.0 / sqrt(2) + return s(k) / (2 * π) + else + return 2 * π / s(k′) + end + end + + k = w / (w + 2 * s) + k₁ = sinh(π * w / (4 * h)) / sinh(π * (w + 2 * s) / (4 * h)) + + k′ = sqrt(1 - k^2) + k₁′ = sqrt(1 - k₁^2) + + koverk′ = KoverK′(k, k′) + k₁overk₁′ = KoverK′(k₁, k₁′) + + ϵ_eff = 1 + ((ϵᵣ - 1) / 2) * k₁overk₁′ / koverk′ + + return Z₀ = 30 * π / (koverk′ * sqrt(ϵ_eff)) +end + +""" + extract_data(base_path::String, cols::Vector{Int}=[1,2,4,6,8]) + +Extract data from CSV files located in specific subfolders of `base_path`. 
+ +# Arguments + + - `base_path::String`: Path to the parent directory containing the subfolders + - `cols::Vector{Int}`: Indices of columns to extract (default: [1,2,4,6,8]) + +# Returns + +A nested dictionary structure where: + + - Outer key: folder name (e.g., "wave_uniform", "lumped_uniform", etc.) + - Inner keys: column names from CSV headers + - Values: Vector{Float64} of data from each column + +# Example + +```julia +data = extract_data("postpro", [1, 2, 3]) +freq = data["lumped_uniform"]["f (GHz)"] +s11_mag = data["lumped_uniform"]["|S[1][1]| (dB)"] # Define folder structure +``` +""" +function extract_data(base_path::String, cols::Vector{Int}=[1, 2, 4, 6, 8]) + # Define folder structure + folders = ["wave_uniform", "lumped_uniform", "wave_adaptive", "lumped_adaptive"] + + # Initialize result dictionary + result = Dict{String, Dict{String, Vector{Float64}}}() + + for folder in folders + try + # Construct full file path + file_path = joinpath(base_path, folder, "port-S.csv") + + # Verify file exists + if !isfile(file_path) + @warn "File not found: $file_path" + continue + end + + # Read CSV file + df = CSV.read(file_path, DataFrame) + + # Get column names and strip leading whitespace + col_names = [lstrip(String(name)) for name in names(df)] + + # Validate column indices + valid_cols = filter(c -> c ≤ length(col_names), cols) + + if isempty(valid_cols) + @warn "No valid columns found for $folder" + continue + end + + # Initialize inner dictionary + result[folder] = Dict{String, Vector{Float64}}() + + # Extract specified columns + for col_idx in valid_cols + if col_idx ≤ length(col_names) + col_name = col_names[col_idx] + result[folder][col_name] = Vector{Float64}(df[!, col_idx]) + end + end + catch e + @error "Error processing $folder" exception=(e, catch_backtrace()) + end + end + + return result +end + +""" + plot_s_parameters(path::String, prefix::String="") + +Create and save individual S-parameter plots from data in specified path. +Plots S11, S21, S31, and S41 using Dark2_4 colorscheme, with circles for uniform +and lines for adaptive solutions. 
+ +# Arguments + + - `path::String`: Path containing the data folders and where plots will be saved + - `prefix::String`: Optional prefix for saved files (default="") + +# Saves + +Four PNG files in the specified path: + + - cpw_[prefix_]11.png + - cpw_[prefix_]21.png + - cpw_[prefix_]31.png + - cpw_[prefix_]41.png +""" +function plot_s_parameters(path::String, prefix::String="") + data = extract_data(path) + + s_params = ["S[1][1]", "S[2][1]", "S[3][1]", "S[4][1]"] + unicode_labels = ["|S₁₁| dB", "|S₂₁| dB", "|S₃₁| dB", "|S₄₁| dB"] + colors = ColorSchemes.Dark2_4 + + for (s_param, label) in zip(s_params, unicode_labels) + fig = Figure(size=(600, 400)) + ax = Axis(fig[1, 1]) + + scatter!( + ax, + data["wave_uniform"]["f (GHz)"], + data["wave_uniform"]["|$(s_param)| (dB)"], + color=colors[1], + markersize=10, + label="Wave Uniform" + ) + + scatter!( + ax, + data["lumped_uniform"]["f (GHz)"], + data["lumped_uniform"]["|$(s_param)| (dB)"], + color=colors[2], + markersize=10, + label="Lumped Uniform" + ) + + lines!( + ax, + data["wave_adaptive"]["f (GHz)"], + data["wave_adaptive"]["|$(s_param)| (dB)"], + color=colors[1], + linewidth=2, + label="Wave Adaptive" + ) + + lines!( + ax, + data["lumped_adaptive"]["f (GHz)"], + data["lumped_adaptive"]["|$(s_param)| (dB)"], + color=colors[2], + linewidth=2, + label="Lumped Adaptive" + ) + + ax.xlabel = "Frequency (GHz)" + ax.ylabel = label + + ax.xlabelsize = 18 + ax.ylabelsize = 18 + ax.xticklabelsize = 18 + ax.yticklabelsize = 18 + + data_min = + minimum(minimum(data[folder]["|$(s_param)| (dB)"]) for folder in keys(data)) + data_max = + maximum(maximum(data[folder]["|$(s_param)| (dB)"]) for folder in keys(data)) + y_min = floor(data_min / 10) * 10 + y_max = ceil(data_max / 10) * 10 + y_max == 0 ? y_max = 1 : y_max = y_max + ax.limits = (nothing, (y_min, y_max)) + + axislegend(ax, position=:lb, textsize=18) + + suffix = s_param[[3, 6]] + filename = isempty(prefix) ? 
"cpw-$(suffix).png" : "cpw-$(prefix)-$(suffix).png" + + save(joinpath(path, filename), fig) + end +end diff --git a/examples/cpw/cpw_lumped_adaptive.json b/examples/cpw/cpw_lumped_adaptive.json index dd3c2765fb..96f6701f86 100644 --- a/examples/cpw/cpw_lumped_adaptive.json +++ b/examples/cpw/cpw_lumped_adaptive.json @@ -1,171 +1,205 @@ -{ - "Problem": - { - "Type": "Driven", - "Verbose": 2, - "Output": "D:/WelSimLLC/executable28/_palace_examples/cpw/postpro/lumped_adaptive" - }, - "Model": - { - "Mesh": "D:/WelSimLLC/executable28/_palace_examples/cpw/mesh/cpw_lumped.msh", - "L0": 1.0e-6, // μm - "Refinement": - { - "UniformLevels": 1 - } - }, - "Domains": - { - "Materials": - [ - { - "Attributes": [2], // Air - "Permeability": 1.0, - "Permittivity": 1.0, - "LossTan": 0.0 - }, - { - "Attributes": [1], // Sapphire - "Permeability": [0.99999975, 0.99999975, 0.99999979], - "Permittivity": [9.3, 9.3, 11.5], - "LossTan": [3.0e-5, 3.0e-5, 8.6e-5], - "MaterialAxes": [[0.8, 0.6, 0.0], [-0.6, 0.8, 0.0], [0.0, 0.0, 1.0]] - } - ], - "Postprocessing": - { - "Dielectric": - [ - { - "Index": 1, - "Attributes": [1] - } - ], - "Probe": - [ - { - "Index": 1, - "X": 2000, - "Y": 833, - "Z": 30 - }, - { - "Index": 2, - "X": 2000, - "Y": 833, - "Z": -30 - } - ] - } - }, - "Boundaries": - { - "PEC": - { - "Attributes": [3] // Metal trace - }, - "Absorbing": - { - "Attributes": [13], - "Order": 1 - }, - "LumpedPort": - [ - { - "Index": 1, - "R": 56.02, // Ω, 2-element uniform - "Excitation": true, - "Elements": - [ - { - "Attributes": [4], - "Direction": "+Y" - }, - { - "Attributes": [8], - "Direction": "-Y" - } - ] - }, - { - "Index": 2, - "R": 56.02, - "Elements": - [ - { - "Attributes": [5], - "Direction": "+Y" - }, - { - "Attributes": [9], - "Direction": "-Y" - } - ] - }, - { - "Index": 3, - "R": 56.02, - "Elements": - [ - { - "Attributes": [6], - "Direction": "+Y" - }, - { - "Attributes": [10], - "Direction": "-Y" - } - ] - }, - { - "Index": 4, - "R": 56.02, - "Elements": - [ - { - "Attributes": [7], - "Direction": "+Y" - }, - { - "Attributes": [11], - "Direction": "-Y" - } - ] - } - ], - "Postprocessing": - { - "Dielectric": - [ - { - "Index": 1, - "Attributes": [12], - "Side": "+Z", - "Thickness": 2e-3, // μm - "PermittivitySA": 4.0, - "LossTan": 1.0 - } - ] - } - }, - "Solver": - { - "Order": 1, - "Driven": - { - "MinFreq": 2.0, // GHz - "MaxFreq": 30.0, // GHz - "FreqStep": 0.1, // GHz - "SaveStep": 40, - "AdaptiveTol": 1.0e-3 - }, - "Linear": - { - "Type": "Default", - "KSPType": "GMRES", - "Tol": 1.0e-8, - "MaxIts": 100 - } - } -} - +{ + "Problem": + { + "Type": "Driven", + "Verbose": 2, + "Output": "postpro/lumped_adaptive" + }, + "Model": + { + "Mesh": "mesh/cpw_lumped_0.msh", + "L0": 1.0e-6, // μm + "Refinement": {} + }, + "Domains": + { + "Materials": + [ + { + "Attributes": [1], // Air + "Permeability": 1.0, + "Permittivity": 1.0, + "LossTan": 0.0 + }, + { + "Attributes": [2], // Sapphire + "Permeability": [0.99999975, 0.99999975, 0.99999979], + "Permittivity": [9.3, 9.3, 11.5], + "LossTan": [3.0e-5, 3.0e-5, 8.6e-5], + "MaterialAxes": [[0.8, 0.6, 0.0], [-0.6, 0.8, 0.0], [0.0, 0.0, 1.0]] + } + ], + "Postprocessing": + { + "Energy": + [ + { + "Index": 1, + "Attributes": [2] + } + ], + "Probe": + [ + { + "Index": 1, + "Center": [2000, 833, 30] + }, + { + "Index": 2, + "Center": [2000, 833, -30] + } + ] + } + }, + "Boundaries": + { + "PEC": + { + "Attributes": [13] // Metal trace + }, + "Absorbing": + { + "Attributes": [4], + "Order": 1 + }, + "LumpedPort": + [ + { + "Index": 1, + "R": 56.02, // Ω, 
2-element uniform + "Excitation": true, + "Elements": + [ + { + "Attributes": [5], + "Direction": "+Y" + }, + { + "Attributes": [9], + "Direction": "-Y" + } + ] + }, + { + "Index": 2, + "R": 56.02, + "Elements": + [ + { + "Attributes": [6], + "Direction": "+Y" + }, + { + "Attributes": [10], + "Direction": "-Y" + } + ] + }, + { + "Index": 3, + "R": 56.02, + "Elements": + [ + { + "Attributes": [7], + "Direction": "+Y" + }, + { + "Attributes": [11], + "Direction": "-Y" + } + ] + }, + { + "Index": 4, + "R": 56.02, + "Elements": + [ + { + "Attributes": [8], + "Direction": "+Y" + }, + { + "Attributes": [12], + "Direction": "-Y" + } + ] + } + ], + "Postprocessing": + { + "SurfaceFlux": + [ + { + "Index": 1, + "Attributes": [13], + "Type": "Electric", + "TwoSided": true + }, + { + "Index": 2, + "Attributes": [4], + "Type": "Power" + } + ], + "Dielectric": + [ + { + "Index": 1, + "Attributes": [14], + "Type": "SA", + "Thickness": 2.0e-3, // μm + "Permittivity": 10.0, + "LossTan": 1.0 + }, + { + "Index": 2, + "Attributes": [13], + "Type": "MS", + "Thickness": 2.0e-3, // μm + "Permittivity": 10.0, + "LossTan": 1.0 + }, + { + "Index": 3, + "Attributes": [13], + "Type": "MA", + "Thickness": 2.0e-3, // μm + "Permittivity": 10.0, + "LossTan": 1.0 + } + ] + } + }, + "Solver": + { + "Order": 2, + "Device": "CPU", + "Driven": + { + "Samples": + [ + { + "Type": "Linear", + "MinFreq": 2.0, // GHz + "MaxFreq": 32.0, // GHz + "FreqStep": 0.1 // GHz + }, + { + "Type": "Point", + "Freq": [17.0], // GHz + "SaveStep": 1 + } + ], + "Save": [2.0, 32.0], + "AdaptiveTol": 1.0e-3 + }, + "Linear": + { + "Type": "Default", + "KSPType": "GMRES", + "Tol": 1.0e-8, + "MaxIts": 200 + } + } +} diff --git a/examples/cpw/cpw_lumped_uniform.json b/examples/cpw/cpw_lumped_uniform.json index e2bb5c34cf..76fca588ab 100644 --- a/examples/cpw/cpw_lumped_uniform.json +++ b/examples/cpw/cpw_lumped_uniform.json @@ -1,170 +1,206 @@ -{ - "Problem": - { - "Type": "Driven", - "Verbose": 2, - "Output": "D:/WelSimLLC/executable28/_palace_examples/cpw/postpro/lumped_uniform" - }, - "Model": - { - "Mesh": "D:/WelSimLLC/executable28/_palace_examples/cpw/mesh/cpw_lumped.msh", - "L0": 1.0e-6, // μm - "Refinement": - { - "UniformLevels": 1 - } - }, - "Domains": - { - "Materials": - [ - { - "Attributes": [2], // Air - "Permeability": 1.0, - "Permittivity": 1.0, - "LossTan": 0.0 - }, - { - "Attributes": [1], // Sapphire - "Permeability": [0.99999975, 0.99999975, 0.99999979], - "Permittivity": [9.3, 9.3, 11.5], - "LossTan": [3.0e-5, 3.0e-5, 8.6e-5], - "MaterialAxes": [[0.8, 0.6, 0.0], [-0.6, 0.8, 0.0], [0.0, 0.0, 1.0]] - } - ], - "Postprocessing": - { - "Dielectric": - [ - { - "Index": 1, - "Attributes": [1] - } - ], - "Probe": - [ - { - "Index": 1, - "X": 2000, - "Y": 833, - "Z": 30 - }, - { - "Index": 2, - "X": 2000, - "Y": 833, - "Z": -30 - } - ] - } - }, - "Boundaries": - { - "PEC": - { - "Attributes": [3] // Metal trace - }, - "Absorbing": - { - "Attributes": [13], - "Order": 1 - }, - "LumpedPort": - [ - { - "Index": 1, - "R": 56.02, // Ω, 2-element uniform - "Excitation": true, - "Elements": - [ - { - "Attributes": [4], - "Direction": "+Y" - }, - { - "Attributes": [8], - "Direction": "-Y" - } - ] - }, - { - "Index": 2, - "R": 56.02, - "Elements": - [ - { - "Attributes": [5], - "Direction": "+Y" - }, - { - "Attributes": [9], - "Direction": "-Y" - } - ] - }, - { - "Index": 3, - "R": 56.02, - "Elements": - [ - { - "Attributes": [6], - "Direction": "+Y" - }, - { - "Attributes": [10], - "Direction": "-Y" - } - ] - }, - { - "Index": 4, - "R": 56.02, - 
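The 56.02 Ω resistance assigned to these lumped ports is consistent with the analytic cpw_impedance helper defined in cpw.jl above. A quick check, using the default CPW dimensions of the mesh scripts (30 μm trace, 18 μm gap, 500 μm substrate); the ϵᵣ = 9.3 value is an assumption, taken from the in-plane sapphire permittivity in the material definition:

    include(joinpath("examples", "cpw", "cpw.jl"))
    # w = 30 μm trace, s = 18 μm gap, h = 500 μm substrate height
    Z0 = cpw_impedance(w = 30, s = 18, h = 500, ϵᵣ = 9.3)  # ≈ 56.02 Ω
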
"Elements": - [ - { - "Attributes": [7], - "Direction": "+Y" - }, - { - "Attributes": [11], - "Direction": "-Y" - } - ] - } - ], - "Postprocessing": - { - "Dielectric": - [ - { - "Index": 1, - "Attributes": [12], - "Side": "+Z", - "Thickness": 2e-3, // μm - "PermittivitySA": 4.0, - "LossTan": 1.0 - } - ] - } - }, - "Solver": - { - "Order": 1, - "Driven": - { - "MinFreq": 2.0, // GHz - "MaxFreq": 30.0, // GHz - "FreqStep": 2.0, // GHz - "SaveStep": 2 - }, - "Linear": - { - "Type": "Default", - "KSPType": "GMRES", - "Tol": 1.0e-8, - "MaxIts": 100 - } - } -} - +{ + "Problem": + { + "Type": "Driven", + "Verbose": 2, + "Output": "postpro/lumped_uniform" + }, + "Model": + { + "Mesh": "mesh/cpw_lumped_0.msh", + "L0": 1.0e-6, // μm + "Refinement": {} + }, + "Domains": + { + "Materials": + [ + { + "Attributes": [1], // Air + "Permeability": 1.0, + "Permittivity": 1.0, + "LossTan": 0.0 + }, + { + "Attributes": [2], // Sapphire + "Permeability": [0.99999975, 0.99999975, 0.99999979], + "Permittivity": [9.3, 9.3, 11.5], + "LossTan": [3.0e-5, 3.0e-5, 8.6e-5], + "MaterialAxes": [[0.8, 0.6, 0.0], [-0.6, 0.8, 0.0], [0.0, 0.0, 1.0]] + } + ], + "Postprocessing": + { + "Energy": + [ + { + "Index": 1, + "Attributes": [2] + } + ], + "Probe": + [ + { + "Index": 1, + "Center": [2000, 833, 30] + }, + { + "Index": 2, + "Center": [2000, 833, -30] + } + ] + } + }, + "Boundaries": + { + "PEC": + { + "Attributes": [13] // Metal trace + }, + "Absorbing": + { + "Attributes": [4], + "Order": 1 + }, + "LumpedPort": + [ + { + "Index": 1, + "R": 56.02, // Ω, 2-element uniform + "Excitation": 1, + "Elements": + [ + { + "Attributes": [5], + "Direction": "+Y" + }, + { + "Attributes": [9], + "Direction": "-Y" + } + ] + }, + { + "Index": 2, + "R": 56.02, + "Excitation": 2, + "Elements": + [ + { + "Attributes": [6], + "Direction": "+Y" + }, + { + "Attributes": [10], + "Direction": "-Y" + } + ] + }, + { + "Index": 3, + "R": 56.02, + "Elements": + [ + { + "Attributes": [7], + "Direction": "+Y" + }, + { + "Attributes": [11], + "Direction": "-Y" + } + ] + }, + { + "Index": 4, + "R": 56.02, + "Elements": + [ + { + "Attributes": [8], + "Direction": "+Y" + }, + { + "Attributes": [12], + "Direction": "-Y" + } + ] + } + ], + "Postprocessing": + { + "SurfaceFlux": + [ + { + "Index": 1, + "Attributes": [13], + "Type": "Electric", + "TwoSided": true + }, + { + "Index": 2, + "Attributes": [4], + "Type": "Power" + } + ], + "Dielectric": + [ + { + "Index": 1, + "Attributes": [14], + "Type": "SA", + "Thickness": 2.0e-3, // μm + "Permittivity": 10.0, + "LossTan": 1.0 + }, + { + "Index": 2, + "Attributes": [13], + "Type": "MS", + "Thickness": 2.0e-3, // μm + "Permittivity": 10.0, + "LossTan": 1.0 + }, + { + "Index": 3, + "Attributes": [13], + "Type": "MA", + "Thickness": 2.0e-3, // μm + "Permittivity": 10.0, + "LossTan": 1.0 + } + ] + } + }, + "Solver": + { + "Order": 2, + "Device": "CPU", + "Driven": + { + "Samples": + [ + { + "Type": "Linear", + "MinFreq": 2.0, // GHz + "MaxFreq": 32.0, // GHz + "FreqStep": 6.0 // GHz + }, + { + "Type": "Point", + "Freq": [17.0], // GHz + "SaveStep": 1 + } + ], + "Save": [2.0, 32.0] + }, + "Linear": + { + "Type": "Default", + "KSPType": "GMRES", + "Tol": 1.0e-8, + "MaxIts": 200 + } + } +} + diff --git a/examples/cpw/cpw_wave_adaptive.json b/examples/cpw/cpw_wave_adaptive.json index 9b73c046a1..36c44c5e50 100644 --- a/examples/cpw/cpw_wave_adaptive.json +++ b/examples/cpw/cpw_wave_adaptive.json @@ -1,135 +1,182 @@ -{ - "Problem": - { - "Type": "Driven", - "Verbose": 2, - "Output": 
"D:/WelSimLLC/executable28/_palace_examples/cpw/postpro/wave_adaptive" - }, - "Model": - { - "Mesh": "D:/WelSimLLC/executable28/_palace_examples/cpw/mesh/cpw_wave.msh", - "L0": 1.0e-6, // μm - "Refinement": - { - "UniformLevels": 1 - } - }, - "Domains": - { - "Materials": - [ - { - "Attributes": [2], // Air - "Permeability": 1.0, - "Permittivity": 1.0, - "LossTan": 0.0 - }, - { - "Attributes": [1], // Sapphire - "Permeability": [0.99999975, 0.99999975, 0.99999979], - "Permittivity": [9.3, 9.3, 11.5], - "LossTan": [3.0e-5, 3.0e-5, 8.6e-5], - "MaterialAxes": [[0.8, 0.6, 0.0], [-0.6, 0.8, 0.0], [0.0, 0.0, 1.0]] - } - ], - "Postprocessing": - { - "Dielectric": - [ - { - "Index": 1, - "Attributes": [1] - } - ], - "Probe": - [ - { - "Index": 1, - "X": 2000, - "Y": 833, - "Z": 30 - }, - { - "Index": 2, - "X": 2000, - "Y": 833, - "Z": -30 - } - ] - } - }, - "Boundaries": - { - "PEC": - { - "Attributes": [3, 9] // Metal trace + end boundaries - }, - "Absorbing": - { - "Attributes": [10], - "Order": 1 - }, - "WavePort": - [ - { - "Index": 1, - "Attributes": [5], - "Mode": 1, - "Offset": 0.0, - "Excitation": true - }, - { - "Index": 2, - "Attributes": [6], - "Mode": 1, - "Offset": 0.0 - }, - { - "Index": 3, - "Attributes": [7], - "Mode": 1, - "Offset": 0.0 - }, - { - "Index": 4, - "Attributes": [8], - "Mode": 1, - "Offset": 0.0 - } - ], - "Postprocessing": - { - "Dielectric": - [ - { - "Index": 1, - "Attributes": [3], - "Side": "+Z", - "Thickness": 2e-3, // μm - "PermittivitySA": 4.0, - "LossTan": 1.0 - } - ] - } - }, - "Solver": - { - "Order": 1, - "Driven": - { - "MinFreq": 2.0, // GHz - "MaxFreq": 30.0, // GHz - "FreqStep": 0.1, // GHz - "SaveStep": 40, - "AdaptiveTol": 1.0e-3 - }, - "Linear": - { - "Type": "Default", - "KSPType": "GMRES", - "Tol": 1.0e-8, - "MaxIts": 100 - } - } -} - +{ + "Problem": + { + "Type": "Driven", + "Verbose": 2, + "Output": "postpro/wave_adaptive" + }, + "Model": + { + "Mesh": "mesh/cpw_wave_0.msh", + "L0": 1.0e-6, // μm + "Refinement": {} + }, + "Domains": + { + "Materials": + [ + { + "Attributes": [1], // Air + "Permeability": 1.0, + "Permittivity": 1.0, + "LossTan": 0.0 + }, + { + "Attributes": [2], // Sapphire + "Permeability": [0.99999975, 0.99999975, 0.99999979], + "Permittivity": [9.3, 9.3, 11.5], + "LossTan": [3.0e-5, 3.0e-5, 8.6e-5], + "MaterialAxes": [[0.8, 0.6, 0.0], [-0.6, 0.8, 0.0], [0.0, 0.0, 1.0]] + } + ], + "Postprocessing": + { + "Energy": + [ + { + "Index": 1, + "Attributes": [2] + } + ], + "Probe": + [ + { + "Index": 1, + "Center": [2000, 833, 30] + }, + { + "Index": 2, + "Center": [2000, 833, -30] + } + ] + } + }, + "Boundaries": + { + "PEC": + { + "Attributes": [8, 9, 11] // Metal trace + end boundaries + }, + "Absorbing": + { + "Attributes": [10], + "Order": 1 + }, + "WavePort": + [ + { + "Index": 1, + "Attributes": [4], + "Mode": 1, + "Offset": 0.0, + "Excitation": true + }, + { + "Index": 2, + "Attributes": [5], + "Mode": 1, + "Offset": 0.0 + }, + { + "Index": 3, + "Attributes": [6], + "Mode": 1, + "Offset": 0.0 + }, + { + "Index": 4, + "Attributes": [7], + "Mode": 1, + "Offset": 0.0 + } + ], + "Postprocessing": + { + "SurfaceFlux": + [ + { + "Index": 1, + "Attributes": [11], + "Type": "Electric", + "TwoSided": true + }, + { + "Index": 2, + "Attributes": [10], + "Type": "Power" + }, + { + "Index": 3, + "Attributes": [4, 6, 8], + "Type": "Power", + "Center": [-2000, 0, 0] + }, + { + "Index": 4, + "Attributes": [5, 7, 9], + "Type": "Power", + "Center": [2000, 0, 0] + } + ], + "Dielectric": + [ + { + "Index": 1, + "Attributes": [12], + "Type": "SA", 
+ "Thickness": 2.0e-3, // μm + "Permittivity": 10.0, + "LossTan": 1.0 + }, + { + "Index": 2, + "Attributes": [11], + "Type": "MS", + "Thickness": 2.0e-3, // μm + "Permittivity": 10.0, + "LossTan": 1.0 + }, + { + "Index": 3, + "Attributes": [11], + "Type": "MA", + "Thickness": 2.0e-3, // μm + "Permittivity": 10.0, + "LossTan": 1.0 + } + ] + } + }, + "Solver": + { + "Order": 2, + "Device": "CPU", + "Driven": + { + "Samples": + [ + { + "Type": "Linear", + "MinFreq": 2.0, // GHz + "MaxFreq": 32.0, // GHz + "FreqStep": 0.1 // GHz + }, + { + "Type": "Point", + "Freq": [17.0], // GHz + "SaveStep": 1 + } + ], + "Save": [2.0, 32.0], + "AdaptiveTol": 1.0e-3 + }, + "Linear": + { + "Type": "Default", + "KSPType": "GMRES", + "Tol": 1.0e-8, + "MaxIts": 200 + } + } +} + diff --git a/examples/cpw/cpw_wave_uniform.json b/examples/cpw/cpw_wave_uniform.json index 61de49e892..2a6c3fc46d 100644 --- a/examples/cpw/cpw_wave_uniform.json +++ b/examples/cpw/cpw_wave_uniform.json @@ -1,134 +1,198 @@ -{ - "Problem": - { - "Type": "Driven", - "Verbose": 2, - "Output": "D:/WelSimLLC/executable28/_palace_examples/cpw/postpro/wave_uniform" - }, - "Model": - { - "Mesh": "D:/WelSimLLC/executable28/_palace_examples/cpw/mesh/cpw_wave.msh", - "L0": 1.0e-6, // μm - "Refinement": - { - "UniformLevels": 1 - } - }, - "Domains": - { - "Materials": - [ - { - "Attributes": [2], // Air - "Permeability": 1.0, - "Permittivity": 1.0, - "LossTan": 0.0 - }, - { - "Attributes": [1], // Sapphire - "Permeability": [0.99999975, 0.99999975, 0.99999979], - "Permittivity": [9.3, 9.3, 11.5], - "LossTan": [3.0e-5, 3.0e-5, 8.6e-5], - "MaterialAxes": [[0.8, 0.6, 0.0], [-0.6, 0.8, 0.0], [0.0, 0.0, 1.0]] - } - ], - "Postprocessing": - { - "Dielectric": - [ - { - "Index": 1, - "Attributes": [1] - } - ], - "Probe": - [ - { - "Index": 1, - "X": 2000, - "Y": 833, - "Z": 30 - }, - { - "Index": 2, - "X": 2000, - "Y": 833, - "Z": -30 - } - ] - } - }, - "Boundaries": - { - "PEC": - { - "Attributes": [3, 9] // Metal trace + end boundaries - }, - "Absorbing": - { - "Attributes": [10], - "Order": 1 - }, - "WavePort": - [ - { - "Index": 1, - "Attributes": [5], - "Mode": 1, - "Offset": 0.0, - "Excitation": true - }, - { - "Index": 2, - "Attributes": [6], - "Mode": 1, - "Offset": 0.0 - }, - { - "Index": 3, - "Attributes": [7], - "Mode": 1, - "Offset": 0.0 - }, - { - "Index": 4, - "Attributes": [8], - "Mode": 1, - "Offset": 0.0 - } - ], - "Postprocessing": - { - "Dielectric": - [ - { - "Index": 1, - "Attributes": [3], - "Side": "+Z", - "Thickness": 2e-3, // μm - "PermittivitySA": 4.0, - "LossTan": 1.0 - } - ] - } - }, - "Solver": - { - "Order": 1, - "Driven": - { - "MinFreq": 2.0, // GHz - "MaxFreq": 30.0, // GHz - "FreqStep": 2.0, // GHz - "SaveStep": 2 - }, - "Linear": - { - "Type": "Default", - "KSPType": "GMRES", - "Tol": 1.0e-8, - "MaxIts": 100 - } - } -} - +{ + "Problem": + { + "Type": "Driven", + "Verbose": 2, + "Output": "postpro/wave_uniform" + }, + "Model": + { + "Mesh": "mesh/cpw_wave_0.msh", + "L0": 1.0e-6, // μm + "Refinement": {} + }, + "Domains": + { + "Materials": + [ + { + "Attributes": [1], // Air + "Permeability": 1.0, + "Permittivity": 1.0, + "LossTan": 0.0 + }, + { + "Attributes": [2], // Sapphire + "Permeability": [0.99999975, 0.99999975, 0.99999979], + "Permittivity": [9.3, 9.3, 11.5], + "LossTan": [3.0e-5, 3.0e-5, 8.6e-5], + "MaterialAxes": [[0.8, 0.6, 0.0], [-0.6, 0.8, 0.0], [0.0, 0.0, 1.0]] + } + ], + "Postprocessing": + { + "Energy": + [ + { + "Index": 1, + "Attributes": [2] + } + ], + "Probe": + [ + { + "Index": 1, + "Center": [2000, 
833, 30] + }, + { + "Index": 2, + "Center": [2000, 833, -30] + } + ] + } + }, + "Boundaries": + { + "PEC": + { + "Attributes": [8, 9, 11] // Metal trace + end boundaries + }, + "Absorbing": + { + "Attributes": [10], + "Order": 1 + }, + "WavePort": + [ + { + "Index": 1, + "Attributes": [4], + "Mode": 1, + "Excitation": 1, + "Offset": 0.0, + "MaxIts": 30, + "KSPTol": 1e-8, + "EigenTol": 1e-6, + "Verbose": 0 + }, + { + "Index": 2, + "Attributes": [5], + "Mode": 1, + "Excitation": 2, + "Offset": 0.0, + "MaxIts": 30, + "KSPTol": 1e-8, + "EigenTol": 1e-6, + "Verbose": 0 + }, + { + "Index": 3, + "Attributes": [6], + "Mode": 1, + "Offset": 0.0, + "MaxIts": 30, + "KSPTol": 1e-8, + "EigenTol": 1e-6, + "Verbose": 0 + }, + { + "Index": 4, + "Attributes": [7], + "Mode": 1, + "Offset": 0.0, + "MaxIts": 30, + "KSPTol": 1e-8, + "EigenTol": 1e-6, + "Verbose": 0 + } + ], + "Postprocessing": + { + "SurfaceFlux": + [ + { + "Index": 1, + "Attributes": [11], + "Type": "Electric", + "TwoSided": true + }, + { + "Index": 2, + "Attributes": [10], + "Type": "Power" + }, + { + "Index": 3, + "Attributes": [4, 6, 8], + "Type": "Power", + "Center": [-2000, 0, 0] + }, + { + "Index": 4, + "Attributes": [5, 7, 9], + "Type": "Power", + "Center": [2000, 0, 0] + } + ], + "Dielectric": + [ + { + "Index": 1, + "Attributes": [12], + "Type": "SA", + "Thickness": 2.0e-3, // μm + "Permittivity": 10.0, + "LossTan": 1.0 + }, + { + "Index": 2, + "Attributes": [11], + "Type": "MS", + "Thickness": 2.0e-3, // μm + "Permittivity": 10.0, + "LossTan": 1.0 + }, + { + "Index": 3, + "Attributes": [11], + "Type": "MA", + "Thickness": 2.0e-3, // μm + "Permittivity": 10.0, + "LossTan": 1.0 + } + ] + } + }, + "Solver": + { + "Order": 2, + "Device": "CPU", + "Driven": + { + "Samples": + [ + { + "Type": "Linear", + "MinFreq": 2.0, // GHz + "MaxFreq": 32.0, // GHz + "FreqStep": 6.0 // GHz + }, + { + "Type": "Point", + "Freq": [17.0], // GHz + "SaveStep": 1 + } + ], + "Save": [2.0, 32.0] + }, + "Linear": + { + "Type": "Default", + "KSPType": "GMRES", + "Tol": 1.0e-8, + "MaxIts": 200 + } + } +} + diff --git a/examples/cpw/mesh/cpw_lumped.msh b/examples/cpw/mesh/cpw_lumped.msh index 52eeca3dd4..caf719b259 100644 Binary files a/examples/cpw/mesh/cpw_lumped.msh and b/examples/cpw/mesh/cpw_lumped.msh differ diff --git a/examples/cpw/mesh/cpw_wave.msh b/examples/cpw/mesh/cpw_wave.msh index caae2702ea..4f43f9a17c 100644 Binary files a/examples/cpw/mesh/cpw_wave.msh and b/examples/cpw/mesh/cpw_wave.msh differ diff --git a/examples/cpw/mesh/mesh_lumped.jl b/examples/cpw/mesh/mesh_lumped.jl index e377796863..59e7a0c661 100644 --- a/examples/cpw/mesh/mesh_lumped.jl +++ b/examples/cpw/mesh/mesh_lumped.jl @@ -1,292 +1,292 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
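Once the four CPW configurations above have been run, the comparison figures can be rebuilt with the plotting helper from cpw.jl. A sketch, assuming the postpro/ layout produced by the "Output" paths above:

    include(joinpath("examples", "cpw", "cpw.jl"))
    # Reads port-S.csv from the {lumped,wave}_{uniform,adaptive} subfolders
    # and writes cpw-11.png through cpw-41.png into the same directory.
    plot_s_parameters(joinpath("examples", "cpw", "postpro"))
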
-# SPDX-License-Identifier: Apache-2.0 - -using Gmsh: gmsh - -""" - function generate_coplanar_waveguide_lumped_mesh(; - refinement::Integer = 0, - trace_width_μm::Real = 30.0, - gap_width_μm::Real = 18.0, - separation_width_μm::Real = 200.0, - ground_width_μm::Real = 800.0, - substrate_height_μm::Real = 500.0, - length_μm::Real = 4000.0, - filename::AbstractString, - verbose::Integer = 1, - ) - -Generate a mesh for the coplanar waveguide with lumped ports using Gmsh - -# Arguments - - - refinement - measure of how many elements to include, 0 is least - - trace_width_μm - width of the coplanar waveguide trace, in μm - - gap_width_μm - width of the coplanar waveguide gap, in μm - - separation_width_μm - separation distance between the two waveguides, in μm - - ground_width_μm - width of the ground plane, in μm - - substrate_height_μm - height of the substrate, in μm - - length_μm - length of the waveguides, in μm - - filename - the filename to use for the generated mesh - - verbose - flag to dictate the level of print to REPL, passed to Gmsh -""" -function generate_coplanar_waveguide_lumped_mesh(; - refinement::Integer = 0, - trace_width_μm::Real = 30.0, - gap_width_μm::Real = 18.0, - separation_width_μm::Real = 200.0, - ground_width_μm::Real = 800.0, - substrate_height_μm::Real = 500.0, - length_μm::Real = 4000.0, - filename::AbstractString, - verbose::Integer=1 -) - @assert refinement >= 0 - @assert trace_width_μm > 0 - @assert gap_width_μm > 0 - @assert separation_width_μm > 0 - @assert ground_width_μm > 0 - @assert substrate_height_μm > 0 - @assert length_μm > 0 - - kernel = gmsh.model.occ - - gmsh.initialize() - gmsh.option.setNumber("General.Verbosity", verbose) - - # Add model - if "cpw" in gmsh.model.list() - gmsh.model.setCurrent("cpw") - gmsh.model.remove() - end - gmsh.model.add("cpw") - - sep_dz = 1000.0 - sep_dy = 0.5 * sep_dz - - # Mesh parameters - l_trace = 1.5 * trace_width_μm * 2^-refinement - l_farfield = 1.0 * substrate_height_μm * 2^-refinement - - # Chip pattern - dy = 0.0 - g1 = kernel.addRectangle(0.0, dy, 0.0, length_μm, ground_width_μm) - dy += ground_width_μm - n1 = kernel.addRectangle(0.0, dy, 0.0, length_μm, gap_width_μm) - dy += gap_width_μm - t1 = kernel.addRectangle(0.0, dy, 0.0, length_μm, trace_width_μm) - dy += trace_width_μm - n2 = kernel.addRectangle(0.0, dy, 0.0, length_μm, gap_width_μm) - dy += gap_width_μm - g2 = kernel.addRectangle(0.0, dy, 0.0, length_μm, separation_width_μm) - dy += separation_width_μm - n3 = kernel.addRectangle(0.0, dy, 0.0, length_μm, gap_width_μm) - dy += gap_width_μm - t2 = kernel.addRectangle(0.0, dy, 0.0, length_μm, trace_width_μm) - dy += trace_width_μm - n4 = kernel.addRectangle(0.0, dy, 0.0, length_μm, gap_width_μm) - dy += gap_width_μm - g3 = kernel.addRectangle(0.0, dy, 0.0, length_μm, ground_width_μm) - dy += ground_width_μm - - # Substrate - substrate = - kernel.addBox(0.0, 0.0, -substrate_height_μm, length_μm, dy, substrate_height_μm) - - # Exterior box - domain = kernel.addBox( - -0.5 * sep_dy, - -sep_dy, - -sep_dz, - length_μm + sep_dy, - dy + 2.0 * sep_dy, - 2.0 * sep_dz - ) - _, domain_boundary = kernel.getSurfaceLoops(domain) - @assert length(domain_boundary) == 1 - domain_boundary = first(domain_boundary) - - # Ports - dy = ground_width_μm - p1a = kernel.addRectangle(0.0, dy, 0.0, gap_width_μm, gap_width_μm) - p2a = kernel.addRectangle(length_μm - gap_width_μm, dy, 0.0, gap_width_μm, gap_width_μm) - dy += gap_width_μm + trace_width_μm - p1b = kernel.addRectangle(0.0, dy, 0.0, gap_width_μm, gap_width_μm) - p2b = 
kernel.addRectangle(length_μm - gap_width_μm, dy, 0.0, gap_width_μm, gap_width_μm) - dy += gap_width_μm + separation_width_μm - p3a = kernel.addRectangle(0.0, dy, 0.0, gap_width_μm, gap_width_μm) - p4a = kernel.addRectangle(length_μm - gap_width_μm, dy, 0.0, gap_width_μm, gap_width_μm) - dy += gap_width_μm + trace_width_μm - p3b = kernel.addRectangle(0.0, dy, 0.0, gap_width_μm, gap_width_μm) - p4b = kernel.addRectangle(length_μm - gap_width_μm, dy, 0.0, gap_width_μm, gap_width_μm) - - # Embedding - geom_dimtags = filter(x -> x[1] == 2 || x[1] == 3, kernel.getEntities()) - _, geom_map = kernel.fragment(geom_dimtags, []) - - kernel.synchronize() - - # Add physical groups - si_domain = geom_map[findfirst(x -> x == (3, substrate), geom_dimtags)] - @assert length(si_domain) == 1 - si_domain = last(first(si_domain)) - - air_domain = filter( - x -> x != (3, si_domain), - geom_map[findfirst(x -> x == (3, domain), geom_dimtags)] - ) - @assert length(air_domain) == 1 - air_domain = last(first(air_domain)) - - si_domain_group = gmsh.model.addPhysicalGroup(3, [si_domain], -1, "si") - air_domain_group = gmsh.model.addPhysicalGroup(3, [air_domain], -1, "air") - - metal = - last.( - collect( - Iterators.flatten( - geom_map[findall( - x -> x in [(2, g1), (2, g2), (2, g3), (2, t1), (2, t2)], - geom_dimtags - )] - ) - ) - ) - - metal_group = gmsh.model.addPhysicalGroup(2, metal, -1, "metal") - - port1a = last.(geom_map[findfirst(x -> x == (2, p1a), geom_dimtags)]) - port2a = last.(geom_map[findfirst(x -> x == (2, p2a), geom_dimtags)]) - port3a = last.(geom_map[findfirst(x -> x == (2, p3a), geom_dimtags)]) - port4a = last.(geom_map[findfirst(x -> x == (2, p4a), geom_dimtags)]) - port1b = last.(geom_map[findfirst(x -> x == (2, p1b), geom_dimtags)]) - port2b = last.(geom_map[findfirst(x -> x == (2, p2b), geom_dimtags)]) - port3b = last.(geom_map[findfirst(x -> x == (2, p3b), geom_dimtags)]) - port4b = last.(geom_map[findfirst(x -> x == (2, p4b), geom_dimtags)]) - - port1a_group = gmsh.model.addPhysicalGroup(2, port1a, -1, "port1a") - port2a_group = gmsh.model.addPhysicalGroup(2, port2a, -1, "port2a") - port3a_group = gmsh.model.addPhysicalGroup(2, port3a, -1, "port3a") - port4a_group = gmsh.model.addPhysicalGroup(2, port4a, -1, "port4a") - port1b_group = gmsh.model.addPhysicalGroup(2, port1b, -1, "port1b") - port2b_group = gmsh.model.addPhysicalGroup(2, port2b, -1, "port2b") - port3b_group = gmsh.model.addPhysicalGroup(2, port3b, -1, "port3b") - port4b_group = gmsh.model.addPhysicalGroup(2, port4b, -1, "port4b") - - gap = - last.( - collect( - Iterators.flatten( - geom_map[findall( - x -> x in [(2, n1), (2, n2), (2, n3), (2, n4)], - geom_dimtags - )] - ) - ) - ) - filter!( - x -> !( - x in port1a || - x in port2a || - x in port3a || - x in port4a || - x in port1b || - x in port2b || - x in port3b || - x in port4b - ), - gap - ) - - gap_group = gmsh.model.addPhysicalGroup(2, gap, -1, "gap") - - farfield = - last.( - collect( - Iterators.flatten( - geom_map[findall( - x -> x[1] == 2 && x[2] in domain_boundary, - geom_dimtags - )] - ) - ) - ) - - farfield_group = gmsh.model.addPhysicalGroup(2, farfield, -1, "farfield") - - # Generate mesh - gmsh.option.setNumber("Mesh.MeshSizeMin", 0.0) - gmsh.option.setNumber("Mesh.MeshSizeMax", l_farfield) - gmsh.option.setNumber("Mesh.MeshSizeFromCurvature", 0) - gmsh.option.setNumber("Mesh.MeshSizeExtendFromBoundary", 0) - - gap_points = filter( - x -> x[1] == 0, - gmsh.model.getBoundary([(2, z) for z in gap], false, true, true) - ) - gap_curves = - last.( - filter( - x -> 
x[1] == 1, - gmsh.model.getBoundary([(2, z) for z in gap], false, false, false) - ) - ) - gmsh.model.mesh.setSize(gap_points, l_trace) - - gmsh.model.mesh.field.add("Extend", 1) - gmsh.model.mesh.field.setNumbers(1, "CurvesList", gap_curves) - gmsh.model.mesh.field.setNumbers(1, "SurfacesList", gap) - gmsh.model.mesh.field.setNumber(1, "Power", 1.0) - gmsh.model.mesh.field.setNumber(1, "DistMax", sep_dz) - gmsh.model.mesh.field.setNumber(1, "SizeMax", l_farfield) - - gmsh.model.mesh.field.add("Distance", 2) - gmsh.model.mesh.field.setNumbers(2, "CurvesList", gap_curves) - gmsh.model.mesh.field.setNumber(2, "Sampling", 10) - - gmsh.model.mesh.field.add("Threshold", 3) - gmsh.model.mesh.field.setNumber(3, "InField", 2) - gmsh.model.mesh.field.setNumber(3, "SizeMin", l_trace) - gmsh.model.mesh.field.setNumber(3, "SizeMax", l_farfield) - gmsh.model.mesh.field.setNumber(3, "DistMin", 0.0) - gmsh.model.mesh.field.setNumber(3, "DistMax", sep_dz) - - gmsh.model.mesh.field.add("Min", 101) - gmsh.model.mesh.field.setNumbers(101, "FieldsList", [1, 3]) - gmsh.model.mesh.field.setAsBackgroundMesh(101) - - gmsh.option.setNumber("Mesh.Algorithm", 8) - gmsh.option.setNumber("Mesh.Algorithm3D", 10) - - gmsh.model.mesh.generate(3) - gmsh.model.mesh.setOrder(1) - - # Save mesh - gmsh.option.setNumber("Mesh.MshFileVersion", 2.2) - gmsh.option.setNumber("Mesh.Binary", 0) - gmsh.write(joinpath(@__DIR__, filename)) - - # Print some information - if verbose > 0 - println("\nFinished generating mesh. Physical group tags:") - println("Si domain: ", si_domain_group) - println("Air domain: ", air_domain_group) - println("Farfield boundaries: ", farfield_group) - println("Metal boundaries: ", metal_group) - println("Trace negative boundaries: ", gap_group) - - println("\nMultielement lumped ports:") - println("Port 1: ", port1a_group, ", ", port1b_group) - println("Port 2: ", port2a_group, ", ", port2b_group) - println("Port 3: ", port3a_group, ", ", port3b_group) - println("Port 4: ", port4a_group, ", ", port4b_group) - println() - end - - # Optionally launch GUI - if "gui" in lowercase.(ARGS) - gmsh.fltk.run() - end - - return gmsh.finalize() -end +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +using Gmsh: gmsh + +""" + function generate_coplanar_waveguide_lumped_mesh(; + refinement::Integer = 0, + trace_width_μm::Real = 30.0, + gap_width_μm::Real = 18.0, + separation_width_μm::Real = 200.0, + ground_width_μm::Real = 800.0, + substrate_height_μm::Real = 500.0, + length_μm::Real = 4000.0, + filename::AbstractString, + verbose::Integer = 1, + ) + +Generate a mesh for the coplanar waveguide with lumped ports using Gmsh + +# Arguments + + - refinement - measure of how many elements to include, 0 is least + - trace_width_μm - width of the coplanar waveguide trace, in μm + - gap_width_μm - width of the coplanar waveguide gap, in μm + - separation_width_μm - separation distance between the two waveguides, in μm + - ground_width_μm - width of the ground plane, in μm + - substrate_height_μm - height of the substrate, in μm + - length_μm - length of the waveguides, in μm + - filename - the filename to use for the generated mesh + - verbose - flag to dictate the level of print to REPL, passed to Gmsh +""" +function generate_coplanar_waveguide_lumped_mesh(; + refinement::Integer = 0, + trace_width_μm::Real = 30.0, + gap_width_μm::Real = 18.0, + separation_width_μm::Real = 200.0, + ground_width_μm::Real = 800.0, + substrate_height_μm::Real = 500.0, + length_μm::Real = 4000.0, + filename::AbstractString, + verbose::Integer=1 +) + @assert refinement >= 0 + @assert trace_width_μm > 0 + @assert gap_width_μm > 0 + @assert separation_width_μm > 0 + @assert ground_width_μm > 0 + @assert substrate_height_μm > 0 + @assert length_μm > 0 + + kernel = gmsh.model.occ + + gmsh.initialize() + gmsh.option.setNumber("General.Verbosity", verbose) + + # Add model + if "cpw" in gmsh.model.list() + gmsh.model.setCurrent("cpw") + gmsh.model.remove() + end + gmsh.model.add("cpw") + + sep_dz = 1000.0 + sep_dy = 0.5 * sep_dz + + # Mesh parameters + l_trace = 1.5 * trace_width_μm * 2^-refinement + l_farfield = 1.0 * substrate_height_μm * 2^-refinement + + # Chip pattern + dy = 0.0 + g1 = kernel.addRectangle(0.0, dy, 0.0, length_μm, ground_width_μm) + dy += ground_width_μm + n1 = kernel.addRectangle(0.0, dy, 0.0, length_μm, gap_width_μm) + dy += gap_width_μm + t1 = kernel.addRectangle(0.0, dy, 0.0, length_μm, trace_width_μm) + dy += trace_width_μm + n2 = kernel.addRectangle(0.0, dy, 0.0, length_μm, gap_width_μm) + dy += gap_width_μm + g2 = kernel.addRectangle(0.0, dy, 0.0, length_μm, separation_width_μm) + dy += separation_width_μm + n3 = kernel.addRectangle(0.0, dy, 0.0, length_μm, gap_width_μm) + dy += gap_width_μm + t2 = kernel.addRectangle(0.0, dy, 0.0, length_μm, trace_width_μm) + dy += trace_width_μm + n4 = kernel.addRectangle(0.0, dy, 0.0, length_μm, gap_width_μm) + dy += gap_width_μm + g3 = kernel.addRectangle(0.0, dy, 0.0, length_μm, ground_width_μm) + dy += ground_width_μm + + # Substrate + substrate = + kernel.addBox(0.0, 0.0, -substrate_height_μm, length_μm, dy, substrate_height_μm) + + # Exterior box + domain = kernel.addBox( + -0.5 * sep_dy, + -sep_dy, + -sep_dz, + length_μm + sep_dy, + dy + 2.0 * sep_dy, + 2.0 * sep_dz + ) + _, domain_boundary = kernel.getSurfaceLoops(domain) + @assert length(domain_boundary) == 1 + domain_boundary = first(domain_boundary) + + # Ports + dy = ground_width_μm + p1a = kernel.addRectangle(0.0, dy, 0.0, gap_width_μm, gap_width_μm) + p2a = kernel.addRectangle(length_μm - gap_width_μm, dy, 0.0, gap_width_μm, gap_width_μm) + dy += gap_width_μm + trace_width_μm + p1b = kernel.addRectangle(0.0, dy, 0.0, gap_width_μm, gap_width_μm) + p2b = 
kernel.addRectangle(length_μm - gap_width_μm, dy, 0.0, gap_width_μm, gap_width_μm) + dy += gap_width_μm + separation_width_μm + p3a = kernel.addRectangle(0.0, dy, 0.0, gap_width_μm, gap_width_μm) + p4a = kernel.addRectangle(length_μm - gap_width_μm, dy, 0.0, gap_width_μm, gap_width_μm) + dy += gap_width_μm + trace_width_μm + p3b = kernel.addRectangle(0.0, dy, 0.0, gap_width_μm, gap_width_μm) + p4b = kernel.addRectangle(length_μm - gap_width_μm, dy, 0.0, gap_width_μm, gap_width_μm) + + # Embedding + geom_dimtags = filter(x -> x[1] == 2 || x[1] == 3, kernel.getEntities()) + _, geom_map = kernel.fragment(geom_dimtags, []) + + kernel.synchronize() + + # Add physical groups + si_domain = geom_map[findfirst(x -> x == (3, substrate), geom_dimtags)] + @assert length(si_domain) == 1 + si_domain = last(first(si_domain)) + + air_domain = filter( + x -> x != (3, si_domain), + geom_map[findfirst(x -> x == (3, domain), geom_dimtags)] + ) + @assert length(air_domain) == 1 + air_domain = last(first(air_domain)) + + si_domain_group = gmsh.model.addPhysicalGroup(3, [si_domain], -1, "si") + air_domain_group = gmsh.model.addPhysicalGroup(3, [air_domain], -1, "air") + + metal = + last.( + collect( + Iterators.flatten( + geom_map[findall( + x -> x in [(2, g1), (2, g2), (2, g3), (2, t1), (2, t2)], + geom_dimtags + )] + ) + ) + ) + + metal_group = gmsh.model.addPhysicalGroup(2, metal, -1, "metal") + + port1a = last.(geom_map[findfirst(x -> x == (2, p1a), geom_dimtags)]) + port2a = last.(geom_map[findfirst(x -> x == (2, p2a), geom_dimtags)]) + port3a = last.(geom_map[findfirst(x -> x == (2, p3a), geom_dimtags)]) + port4a = last.(geom_map[findfirst(x -> x == (2, p4a), geom_dimtags)]) + port1b = last.(geom_map[findfirst(x -> x == (2, p1b), geom_dimtags)]) + port2b = last.(geom_map[findfirst(x -> x == (2, p2b), geom_dimtags)]) + port3b = last.(geom_map[findfirst(x -> x == (2, p3b), geom_dimtags)]) + port4b = last.(geom_map[findfirst(x -> x == (2, p4b), geom_dimtags)]) + + port1a_group = gmsh.model.addPhysicalGroup(2, port1a, -1, "port1a") + port2a_group = gmsh.model.addPhysicalGroup(2, port2a, -1, "port2a") + port3a_group = gmsh.model.addPhysicalGroup(2, port3a, -1, "port3a") + port4a_group = gmsh.model.addPhysicalGroup(2, port4a, -1, "port4a") + port1b_group = gmsh.model.addPhysicalGroup(2, port1b, -1, "port1b") + port2b_group = gmsh.model.addPhysicalGroup(2, port2b, -1, "port2b") + port3b_group = gmsh.model.addPhysicalGroup(2, port3b, -1, "port3b") + port4b_group = gmsh.model.addPhysicalGroup(2, port4b, -1, "port4b") + + gap = + last.( + collect( + Iterators.flatten( + geom_map[findall( + x -> x in [(2, n1), (2, n2), (2, n3), (2, n4)], + geom_dimtags + )] + ) + ) + ) + filter!( + x -> !( + x in port1a || + x in port2a || + x in port3a || + x in port4a || + x in port1b || + x in port2b || + x in port3b || + x in port4b + ), + gap + ) + + gap_group = gmsh.model.addPhysicalGroup(2, gap, -1, "gap") + + farfield = + last.( + collect( + Iterators.flatten( + geom_map[findall( + x -> x[1] == 2 && x[2] in domain_boundary, + geom_dimtags + )] + ) + ) + ) + + farfield_group = gmsh.model.addPhysicalGroup(2, farfield, -1, "farfield") + + # Generate mesh + gmsh.option.setNumber("Mesh.MeshSizeMin", 0.0) + gmsh.option.setNumber("Mesh.MeshSizeMax", l_farfield) + gmsh.option.setNumber("Mesh.MeshSizeFromCurvature", 0) + gmsh.option.setNumber("Mesh.MeshSizeExtendFromBoundary", 0) + + gap_points = filter( + x -> x[1] == 0, + gmsh.model.getBoundary([(2, z) for z in gap], false, true, true) + ) + gap_curves = + last.( + filter( + x -> 
x[1] == 1, + gmsh.model.getBoundary([(2, z) for z in gap], false, false, false) + ) + ) + gmsh.model.mesh.setSize(gap_points, l_trace) + + gmsh.model.mesh.field.add("Extend", 1) + gmsh.model.mesh.field.setNumbers(1, "CurvesList", gap_curves) + gmsh.model.mesh.field.setNumbers(1, "SurfacesList", gap) + gmsh.model.mesh.field.setNumber(1, "Power", 1.0) + gmsh.model.mesh.field.setNumber(1, "DistMax", sep_dz) + gmsh.model.mesh.field.setNumber(1, "SizeMax", l_farfield) + + gmsh.model.mesh.field.add("Distance", 2) + gmsh.model.mesh.field.setNumbers(2, "CurvesList", gap_curves) + gmsh.model.mesh.field.setNumber(2, "Sampling", 10) + + gmsh.model.mesh.field.add("Threshold", 3) + gmsh.model.mesh.field.setNumber(3, "InField", 2) + gmsh.model.mesh.field.setNumber(3, "SizeMin", l_trace) + gmsh.model.mesh.field.setNumber(3, "SizeMax", l_farfield) + gmsh.model.mesh.field.setNumber(3, "DistMin", 0.0) + gmsh.model.mesh.field.setNumber(3, "DistMax", sep_dz) + + gmsh.model.mesh.field.add("Min", 101) + gmsh.model.mesh.field.setNumbers(101, "FieldsList", [1, 3]) + gmsh.model.mesh.field.setAsBackgroundMesh(101) + + gmsh.option.setNumber("Mesh.Algorithm", 8) + gmsh.option.setNumber("Mesh.Algorithm3D", 10) + + gmsh.model.mesh.generate(3) + gmsh.model.mesh.setOrder(1) + + # Save mesh + gmsh.option.setNumber("Mesh.MshFileVersion", 2.2) + gmsh.option.setNumber("Mesh.Binary", 0) + gmsh.write(joinpath(@__DIR__, filename)) + + # Print some information + if verbose > 0 + println("\nFinished generating mesh. Physical group tags:") + println("Si domain: ", si_domain_group) + println("Air domain: ", air_domain_group) + println("Farfield boundaries: ", farfield_group) + println("Metal boundaries: ", metal_group) + println("Trace negative boundaries: ", gap_group) + + println("\nMultielement lumped ports:") + println("Port 1: ", port1a_group, ", ", port1b_group) + println("Port 2: ", port2a_group, ", ", port2b_group) + println("Port 3: ", port3a_group, ", ", port3b_group) + println("Port 4: ", port4a_group, ", ", port4b_group) + println() + end + + # Optionally launch GUI + if "gui" in lowercase.(ARGS) + gmsh.fltk.run() + end + + return gmsh.finalize() +end diff --git a/examples/cpw/mesh/mesh_wave.jl b/examples/cpw/mesh/mesh_wave.jl index cbbc5f33f5..b0edcfb67d 100644 --- a/examples/cpw/mesh/mesh_wave.jl +++ b/examples/cpw/mesh/mesh_wave.jl @@ -1,311 +1,311 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
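Before the wave-port variant, a usage note for the lumped-port generator just defined: it takes only keyword arguments, and `filename` is the one keyword without a default, so the smallest call supplies just the output filename. A minimal sketch, assuming the script has already been `include`-d; the filenames and the refinement level are illustrative choices, not values fixed by the example:

```julia
# Coarsest mesh with the default CPW dimensions (all lengths in μm); the output
# filename here is an arbitrary placeholder.
generate_coplanar_waveguide_lumped_mesh(; filename = "cpw_lumped.msh")

# One additional refinement level halves the target element sizes near the gaps
# and at the farfield boundary.
generate_coplanar_waveguide_lumped_mesh(; refinement = 1, filename = "cpw_lumped_fine.msh")
```

Note that the Gmsh GUI is requested from the command line rather than through a keyword: after writing the mesh, the generator checks for a literal `gui` argument in `ARGS`.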
-# SPDX-License-Identifier: Apache-2.0 - -using Gmsh: gmsh - -""" - function generate_coplanar_waveguide_wave_mesh(; - refinement::Integer = 0, - trace_width_μm::Real = 30.0, - gap_width_μm::Real = 18.0, - separation_width_μm::Real = 200.0, - ground_width_μm::Real = 800.0, - substrate_height_μm::Real = 500.0, - length_μm::Real = 4000.0, - filename::AbstractString, - verbose::Integer=1 - ) - -Generate a mesh for the coplanar waveguide with wave ports using Gmsh - -# Arguments - - - refinement - measure of how many elements to include, 0 is least - - trace_width_μm - width of the coplanar waveguide trace, in μm - - gap_width_μm - width of the coplanar waveguide gap, in μm - - separation_width_μm - separation distance between the two waveguides, in μm - - ground_width_μm - width of the ground plane, in μm - - substrate_height_μm - height of the substrate, in μm - - length_μm - length of the waveguides, in μm - - filename - the filename to use for the generated mesh - - verbose - flag to dictate the level of print to REPL, passed to Gmsh -""" -function generate_coplanar_waveguide_wave_mesh(; - refinement::Integer = 0, - trace_width_μm::Real = 30.0, - gap_width_μm::Real = 18.0, - separation_width_μm::Real = 200.0, - ground_width_μm::Real = 800.0, - substrate_height_μm::Real = 500.0, - length_μm::Real = 4000.0, - filename::AbstractString, - verbose::Integer=1 -) - @assert refinement >= 0 - @assert trace_width_μm > 0 - @assert gap_width_μm > 0 - @assert separation_width_μm > 0 - @assert ground_width_μm > 0 - @assert substrate_height_μm > 0 - @assert length_μm > 0 - - kernel = gmsh.model.occ - - gmsh.initialize() - gmsh.option.setNumber("General.Verbosity", verbose) - - # Add model - if "cpw" in gmsh.model.list() - gmsh.model.setCurrent("cpw") - gmsh.model.remove() - end - gmsh.model.add("cpw") - - sep_dz = 1000.0 - sep_dy = 0.5 * sep_dz - - # Mesh parameters - l_trace = 1.5 * trace_width_μm * 2^-refinement - l_farfield = 1.0 * substrate_height_μm * 2^-refinement - - # Chip pattern - dy = 0.0 - g1 = kernel.addRectangle(0.0, dy, 0.0, length_μm, ground_width_μm) - dy += ground_width_μm - n1 = kernel.addRectangle(0.0, dy, 0.0, length_μm, gap_width_μm) - dy += gap_width_μm - t1 = kernel.addRectangle(0.0, dy, 0.0, length_μm, trace_width_μm) - dy += trace_width_μm - n2 = kernel.addRectangle(0.0, dy, 0.0, length_μm, gap_width_μm) - dy += gap_width_μm - g2 = kernel.addRectangle(0.0, dy, 0.0, length_μm, separation_width_μm) - dy += separation_width_μm - n3 = kernel.addRectangle(0.0, dy, 0.0, length_μm, gap_width_μm) - dy += gap_width_μm - t2 = kernel.addRectangle(0.0, dy, 0.0, length_μm, trace_width_μm) - dy += trace_width_μm - n4 = kernel.addRectangle(0.0, dy, 0.0, length_μm, gap_width_μm) - dy += gap_width_μm - g3 = kernel.addRectangle(0.0, dy, 0.0, length_μm, ground_width_μm) - dy += ground_width_μm - - # Substrate - substrate = - kernel.addBox(0.0, 0.0, -substrate_height_μm, length_μm, dy, substrate_height_μm) - - # Exterior box - domain = - kernel.addBox(0.0, -sep_dy, -sep_dz, length_μm, dy + 2.0 * sep_dy, 2.0 * sep_dz) - _, domain_boundary = kernel.getSurfaceLoops(domain) - @assert length(domain_boundary) == 1 - domain_boundary = first(domain_boundary) - - # Ports - cy1 = ground_width_μm + gap_width_μm + 0.5 * trace_width_μm - cy2 = - cy1 + - 0.5 * trace_width_μm + - gap_width_μm + - separation_width_μm + - gap_width_μm + - 0.5 * trace_width_μm - dzp = trace_width_μm + 2.0 * gap_width_μm - dyp = 2.0 * dzp - let pa, pb, l - pa = kernel.addPoint(0.0, cy1 - 0.5 * dyp, -0.5 * dzp) - pb = 
kernel.addPoint(0.0, cy1 + 0.5 * dyp, -0.5 * dzp) - l = kernel.addLine(pa, pb) - global p1 = first(filter(x -> x[1] == 2, kernel.extrude([1, l], 0.0, 0.0, dzp)))[2] - end - let pa, pb, l - pa = kernel.addPoint(0.0, cy2 - 0.5 * dyp, -0.5 * dzp) - pb = kernel.addPoint(0.0, cy2 + 0.5 * dyp, -0.5 * dzp) - l = kernel.addLine(pa, pb) - global p3 = first(filter(x -> x[1] == 2, kernel.extrude([1, l], 0.0, 0.0, dzp)))[2] - end - let pa, pb, l - pa = kernel.addPoint(length_μm, cy1 - 0.5 * dyp, -0.5 * dzp) - pb = kernel.addPoint(length_μm, cy1 + 0.5 * dyp, -0.5 * dzp) - l = kernel.addLine(pa, pb) - global p2 = first(filter(x -> x[1] == 2, kernel.extrude([1, l], 0.0, 0.0, dzp)))[2] - end - let pa, pb, l - pa = kernel.addPoint(length_μm, cy2 - 0.5 * dyp, -0.5 * dzp) - pb = kernel.addPoint(length_μm, cy2 + 0.5 * dyp, -0.5 * dzp) - l = kernel.addLine(pa, pb) - global p4 = first(filter(x -> x[1] == 2, kernel.extrude([1, l], 0.0, 0.0, dzp)))[2] - end - let pa, pb, l - pa = kernel.addPoint(0.0, -sep_dy, -sep_dz) - pb = kernel.addPoint(0.0, dy + sep_dy, -sep_dz) - l = kernel.addLine(pa, pb) - global p5 = - first(filter(x -> x[1] == 2, kernel.extrude([1, l], 0.0, 0.0, 2.0 * sep_dz)))[2] - end - let pa, pb, l - pa = kernel.addPoint(length_μm, -sep_dy, -sep_dz) - pb = kernel.addPoint(length_μm, dy + sep_dy, -sep_dz) - l = kernel.addLine(pa, pb) - global p6 = - first(filter(x -> x[1] == 2, kernel.extrude([1, l], 0.0, 0.0, 2.0 * sep_dz)))[2] - end - - # Embedding - geom_dimtags = filter(x -> x[1] == 2 || x[1] == 3, kernel.getEntities()) - _, geom_map = kernel.fragment(geom_dimtags, []) - - kernel.synchronize() - - # Add physical groups - si_domain = geom_map[findfirst(x -> x == (3, substrate), geom_dimtags)] - @assert length(si_domain) == 1 - si_domain = last(first(si_domain)) - - air_domain = filter( - x -> x != (3, si_domain), - geom_map[findfirst(x -> x == (3, domain), geom_dimtags)] - ) - @assert length(air_domain) == 1 - air_domain = last(first(air_domain)) - - si_domain_group = gmsh.model.addPhysicalGroup(3, [si_domain], -1, "si") - air_domain_group = gmsh.model.addPhysicalGroup(3, [air_domain], -1, "air") - - metal = - last.( - collect( - Iterators.flatten( - geom_map[findall( - x -> x in [(2, g1), (2, g2), (2, g3), (2, t1), (2, t2)], - geom_dimtags - )] - ) - ) - ) - gap = - last.( - collect( - Iterators.flatten( - geom_map[findall( - x -> x in [(2, n1), (2, n2), (2, n3), (2, n4)], - geom_dimtags - )] - ) - ) - ) - - metal_group = gmsh.model.addPhysicalGroup(2, metal, -1, "metal") - gap_group = gmsh.model.addPhysicalGroup(2, gap, -1, "gap") - - port1 = last.(geom_map[findfirst(x -> x == (2, p1), geom_dimtags)]) - port2 = last.(geom_map[findfirst(x -> x == (2, p2), geom_dimtags)]) - port3 = last.(geom_map[findfirst(x -> x == (2, p3), geom_dimtags)]) - port4 = last.(geom_map[findfirst(x -> x == (2, p4), geom_dimtags)]) - - port1_group = gmsh.model.addPhysicalGroup(2, port1, -1, "port1") - port2_group = gmsh.model.addPhysicalGroup(2, port2, -1, "port2") - port3_group = gmsh.model.addPhysicalGroup(2, port3, -1, "port3") - port4_group = gmsh.model.addPhysicalGroup(2, port4, -1, "port4") - - port5 = last.(geom_map[findfirst(x -> x == (2, p5), geom_dimtags)]) - port6 = last.(geom_map[findfirst(x -> x == (2, p6), geom_dimtags)]) - ends = vcat(port5, port6) - filter!(x -> !(x in port1 || x in port2 || x in port3 || x in port4), ends) - - ends_group = gmsh.model.addPhysicalGroup(2, ends, -1, "ends") - - farfield = - last.( - collect( - Iterators.flatten( - geom_map[findall( - x -> x[1] == 2 && x[2] in 
domain_boundary, - geom_dimtags - )] - ) - ) - ) - filter!( - x -> !(x in port1 || x in port2 || x in port3 || x in port4 || x in ends), - farfield - ) - - farfield_group = gmsh.model.addPhysicalGroup(2, farfield, -1, "farfield") - - # Generate mesh - gmsh.option.setNumber("Mesh.MeshSizeMin", 0.0) - gmsh.option.setNumber("Mesh.MeshSizeMax", l_farfield) - gmsh.option.setNumber("Mesh.MeshSizeFromCurvature", 0) - gmsh.option.setNumber("Mesh.MeshSizeExtendFromBoundary", 0) - - gap_points = filter( - x -> x[1] == 0, - gmsh.model.getBoundary([(2, z) for z in gap], false, true, true) - ) - gap_curves = - last.( - filter( - x -> x[1] == 1, - gmsh.model.getBoundary([(2, z) for z in gap], false, false, false) - ) - ) - gmsh.model.mesh.setSize(gap_points, l_trace) - - gmsh.model.mesh.field.add("Extend", 1) - gmsh.model.mesh.field.setNumbers(1, "CurvesList", gap_curves) - gmsh.model.mesh.field.setNumbers(1, "SurfacesList", gap) - gmsh.model.mesh.field.setNumber(1, "Power", 1.0) - gmsh.model.mesh.field.setNumber(1, "DistMax", sep_dz) - gmsh.model.mesh.field.setNumber(1, "SizeMax", l_farfield) - - gmsh.model.mesh.field.add("Distance", 2) - gmsh.model.mesh.field.setNumbers(2, "CurvesList", gap_curves) - gmsh.model.mesh.field.setNumber(2, "Sampling", 10) - - gmsh.model.mesh.field.add("Threshold", 3) - gmsh.model.mesh.field.setNumber(3, "InField", 2) - gmsh.model.mesh.field.setNumber(3, "SizeMin", l_trace) - gmsh.model.mesh.field.setNumber(3, "SizeMax", l_farfield) - gmsh.model.mesh.field.setNumber(3, "DistMin", 0.0) - gmsh.model.mesh.field.setNumber(3, "DistMax", sep_dz) - - gmsh.model.mesh.field.add("Min", 101) - gmsh.model.mesh.field.setNumbers(101, "FieldsList", [1, 3]) - gmsh.model.mesh.field.setAsBackgroundMesh(101) - - gmsh.option.setNumber("Mesh.Algorithm", 8) - gmsh.option.setNumber("Mesh.Algorithm3D", 10) - - gmsh.model.mesh.generate(3) - gmsh.model.mesh.setOrder(1) - - # Save mesh - gmsh.option.setNumber("Mesh.MshFileVersion", 2.2) - gmsh.option.setNumber("Mesh.Binary", 0) - gmsh.write(joinpath(@__DIR__, filename)) - - # Print some information - if verbose > 0 - println("\nFinished generating mesh. Physical group tags:") - println("Si domain: ", si_domain_group) - println("Air domain: ", air_domain_group) - println("Farfield boundaries: ", farfield_group) - println("End boundaries: ", ends_group) - println("Metal boundaries: ", metal_group) - println("Trace negative boundaries: ", gap_group) - - println("\nPorts:") - println("Port 1: ", port1_group) - println("Port 2: ", port2_group) - println("Port 3: ", port3_group) - println("Port 4: ", port4_group) - println() - end - - # Optionally launch GUI - if "gui" in lowercase.(ARGS) - gmsh.fltk.run() - end - - return gmsh.finalize() -end +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +using Gmsh: gmsh + +""" + function generate_coplanar_waveguide_wave_mesh(; + refinement::Integer = 0, + trace_width_μm::Real = 30.0, + gap_width_μm::Real = 18.0, + separation_width_μm::Real = 200.0, + ground_width_μm::Real = 800.0, + substrate_height_μm::Real = 500.0, + length_μm::Real = 4000.0, + filename::AbstractString, + verbose::Integer=1 + ) + +Generate a mesh for the coplanar waveguide with wave ports using Gmsh + +# Arguments + + - refinement - measure of how many elements to include, 0 is least + - trace_width_μm - width of the coplanar waveguide trace, in μm + - gap_width_μm - width of the coplanar waveguide gap, in μm + - separation_width_μm - separation distance between the two waveguides, in μm + - ground_width_μm - width of the ground plane, in μm + - substrate_height_μm - height of the substrate, in μm + - length_μm - length of the waveguides, in μm + - filename - the filename to use for the generated mesh + - verbose - flag to dictate the level of print to REPL, passed to Gmsh +""" +function generate_coplanar_waveguide_wave_mesh(; + refinement::Integer = 0, + trace_width_μm::Real = 30.0, + gap_width_μm::Real = 18.0, + separation_width_μm::Real = 200.0, + ground_width_μm::Real = 800.0, + substrate_height_μm::Real = 500.0, + length_μm::Real = 4000.0, + filename::AbstractString, + verbose::Integer=1 +) + @assert refinement >= 0 + @assert trace_width_μm > 0 + @assert gap_width_μm > 0 + @assert separation_width_μm > 0 + @assert ground_width_μm > 0 + @assert substrate_height_μm > 0 + @assert length_μm > 0 + + kernel = gmsh.model.occ + + gmsh.initialize() + gmsh.option.setNumber("General.Verbosity", verbose) + + # Add model + if "cpw" in gmsh.model.list() + gmsh.model.setCurrent("cpw") + gmsh.model.remove() + end + gmsh.model.add("cpw") + + sep_dz = 1000.0 + sep_dy = 0.5 * sep_dz + + # Mesh parameters + l_trace = 1.5 * trace_width_μm * 2^-refinement + l_farfield = 1.0 * substrate_height_μm * 2^-refinement + + # Chip pattern + dy = 0.0 + g1 = kernel.addRectangle(0.0, dy, 0.0, length_μm, ground_width_μm) + dy += ground_width_μm + n1 = kernel.addRectangle(0.0, dy, 0.0, length_μm, gap_width_μm) + dy += gap_width_μm + t1 = kernel.addRectangle(0.0, dy, 0.0, length_μm, trace_width_μm) + dy += trace_width_μm + n2 = kernel.addRectangle(0.0, dy, 0.0, length_μm, gap_width_μm) + dy += gap_width_μm + g2 = kernel.addRectangle(0.0, dy, 0.0, length_μm, separation_width_μm) + dy += separation_width_μm + n3 = kernel.addRectangle(0.0, dy, 0.0, length_μm, gap_width_μm) + dy += gap_width_μm + t2 = kernel.addRectangle(0.0, dy, 0.0, length_μm, trace_width_μm) + dy += trace_width_μm + n4 = kernel.addRectangle(0.0, dy, 0.0, length_μm, gap_width_μm) + dy += gap_width_μm + g3 = kernel.addRectangle(0.0, dy, 0.0, length_μm, ground_width_μm) + dy += ground_width_μm + + # Substrate + substrate = + kernel.addBox(0.0, 0.0, -substrate_height_μm, length_μm, dy, substrate_height_μm) + + # Exterior box + domain = + kernel.addBox(0.0, -sep_dy, -sep_dz, length_μm, dy + 2.0 * sep_dy, 2.0 * sep_dz) + _, domain_boundary = kernel.getSurfaceLoops(domain) + @assert length(domain_boundary) == 1 + domain_boundary = first(domain_boundary) + + # Ports + cy1 = ground_width_μm + gap_width_μm + 0.5 * trace_width_μm + cy2 = + cy1 + + 0.5 * trace_width_μm + + gap_width_μm + + separation_width_μm + + gap_width_μm + + 0.5 * trace_width_μm + dzp = trace_width_μm + 2.0 * gap_width_μm + dyp = 2.0 * dzp + let pa, pb, l + pa = kernel.addPoint(0.0, cy1 - 0.5 * dyp, -0.5 * dzp) + pb = 
kernel.addPoint(0.0, cy1 + 0.5 * dyp, -0.5 * dzp) + l = kernel.addLine(pa, pb) + global p1 = first(filter(x -> x[1] == 2, kernel.extrude([1, l], 0.0, 0.0, dzp)))[2] + end + let pa, pb, l + pa = kernel.addPoint(0.0, cy2 - 0.5 * dyp, -0.5 * dzp) + pb = kernel.addPoint(0.0, cy2 + 0.5 * dyp, -0.5 * dzp) + l = kernel.addLine(pa, pb) + global p3 = first(filter(x -> x[1] == 2, kernel.extrude([1, l], 0.0, 0.0, dzp)))[2] + end + let pa, pb, l + pa = kernel.addPoint(length_μm, cy1 - 0.5 * dyp, -0.5 * dzp) + pb = kernel.addPoint(length_μm, cy1 + 0.5 * dyp, -0.5 * dzp) + l = kernel.addLine(pa, pb) + global p2 = first(filter(x -> x[1] == 2, kernel.extrude([1, l], 0.0, 0.0, dzp)))[2] + end + let pa, pb, l + pa = kernel.addPoint(length_μm, cy2 - 0.5 * dyp, -0.5 * dzp) + pb = kernel.addPoint(length_μm, cy2 + 0.5 * dyp, -0.5 * dzp) + l = kernel.addLine(pa, pb) + global p4 = first(filter(x -> x[1] == 2, kernel.extrude([1, l], 0.0, 0.0, dzp)))[2] + end + let pa, pb, l + pa = kernel.addPoint(0.0, -sep_dy, -sep_dz) + pb = kernel.addPoint(0.0, dy + sep_dy, -sep_dz) + l = kernel.addLine(pa, pb) + global p5 = + first(filter(x -> x[1] == 2, kernel.extrude([1, l], 0.0, 0.0, 2.0 * sep_dz)))[2] + end + let pa, pb, l + pa = kernel.addPoint(length_μm, -sep_dy, -sep_dz) + pb = kernel.addPoint(length_μm, dy + sep_dy, -sep_dz) + l = kernel.addLine(pa, pb) + global p6 = + first(filter(x -> x[1] == 2, kernel.extrude([1, l], 0.0, 0.0, 2.0 * sep_dz)))[2] + end + + # Embedding + geom_dimtags = filter(x -> x[1] == 2 || x[1] == 3, kernel.getEntities()) + _, geom_map = kernel.fragment(geom_dimtags, []) + + kernel.synchronize() + + # Add physical groups + si_domain = geom_map[findfirst(x -> x == (3, substrate), geom_dimtags)] + @assert length(si_domain) == 1 + si_domain = last(first(si_domain)) + + air_domain = filter( + x -> x != (3, si_domain), + geom_map[findfirst(x -> x == (3, domain), geom_dimtags)] + ) + @assert length(air_domain) == 1 + air_domain = last(first(air_domain)) + + si_domain_group = gmsh.model.addPhysicalGroup(3, [si_domain], -1, "si") + air_domain_group = gmsh.model.addPhysicalGroup(3, [air_domain], -1, "air") + + metal = + last.( + collect( + Iterators.flatten( + geom_map[findall( + x -> x in [(2, g1), (2, g2), (2, g3), (2, t1), (2, t2)], + geom_dimtags + )] + ) + ) + ) + gap = + last.( + collect( + Iterators.flatten( + geom_map[findall( + x -> x in [(2, n1), (2, n2), (2, n3), (2, n4)], + geom_dimtags + )] + ) + ) + ) + + metal_group = gmsh.model.addPhysicalGroup(2, metal, -1, "metal") + gap_group = gmsh.model.addPhysicalGroup(2, gap, -1, "gap") + + port1 = last.(geom_map[findfirst(x -> x == (2, p1), geom_dimtags)]) + port2 = last.(geom_map[findfirst(x -> x == (2, p2), geom_dimtags)]) + port3 = last.(geom_map[findfirst(x -> x == (2, p3), geom_dimtags)]) + port4 = last.(geom_map[findfirst(x -> x == (2, p4), geom_dimtags)]) + + port1_group = gmsh.model.addPhysicalGroup(2, port1, -1, "port1") + port2_group = gmsh.model.addPhysicalGroup(2, port2, -1, "port2") + port3_group = gmsh.model.addPhysicalGroup(2, port3, -1, "port3") + port4_group = gmsh.model.addPhysicalGroup(2, port4, -1, "port4") + + port5 = last.(geom_map[findfirst(x -> x == (2, p5), geom_dimtags)]) + port6 = last.(geom_map[findfirst(x -> x == (2, p6), geom_dimtags)]) + ends = vcat(port5, port6) + filter!(x -> !(x in port1 || x in port2 || x in port3 || x in port4), ends) + + ends_group = gmsh.model.addPhysicalGroup(2, ends, -1, "ends") + + farfield = + last.( + collect( + Iterators.flatten( + geom_map[findall( + x -> x[1] == 2 && x[2] in 
domain_boundary, + geom_dimtags + )] + ) + ) + ) + filter!( + x -> !(x in port1 || x in port2 || x in port3 || x in port4 || x in ends), + farfield + ) + + farfield_group = gmsh.model.addPhysicalGroup(2, farfield, -1, "farfield") + + # Generate mesh + gmsh.option.setNumber("Mesh.MeshSizeMin", 0.0) + gmsh.option.setNumber("Mesh.MeshSizeMax", l_farfield) + gmsh.option.setNumber("Mesh.MeshSizeFromCurvature", 0) + gmsh.option.setNumber("Mesh.MeshSizeExtendFromBoundary", 0) + + gap_points = filter( + x -> x[1] == 0, + gmsh.model.getBoundary([(2, z) for z in gap], false, true, true) + ) + gap_curves = + last.( + filter( + x -> x[1] == 1, + gmsh.model.getBoundary([(2, z) for z in gap], false, false, false) + ) + ) + gmsh.model.mesh.setSize(gap_points, l_trace) + + gmsh.model.mesh.field.add("Extend", 1) + gmsh.model.mesh.field.setNumbers(1, "CurvesList", gap_curves) + gmsh.model.mesh.field.setNumbers(1, "SurfacesList", gap) + gmsh.model.mesh.field.setNumber(1, "Power", 1.0) + gmsh.model.mesh.field.setNumber(1, "DistMax", sep_dz) + gmsh.model.mesh.field.setNumber(1, "SizeMax", l_farfield) + + gmsh.model.mesh.field.add("Distance", 2) + gmsh.model.mesh.field.setNumbers(2, "CurvesList", gap_curves) + gmsh.model.mesh.field.setNumber(2, "Sampling", 10) + + gmsh.model.mesh.field.add("Threshold", 3) + gmsh.model.mesh.field.setNumber(3, "InField", 2) + gmsh.model.mesh.field.setNumber(3, "SizeMin", l_trace) + gmsh.model.mesh.field.setNumber(3, "SizeMax", l_farfield) + gmsh.model.mesh.field.setNumber(3, "DistMin", 0.0) + gmsh.model.mesh.field.setNumber(3, "DistMax", sep_dz) + + gmsh.model.mesh.field.add("Min", 101) + gmsh.model.mesh.field.setNumbers(101, "FieldsList", [1, 3]) + gmsh.model.mesh.field.setAsBackgroundMesh(101) + + gmsh.option.setNumber("Mesh.Algorithm", 8) + gmsh.option.setNumber("Mesh.Algorithm3D", 10) + + gmsh.model.mesh.generate(3) + gmsh.model.mesh.setOrder(1) + + # Save mesh + gmsh.option.setNumber("Mesh.MshFileVersion", 2.2) + gmsh.option.setNumber("Mesh.Binary", 0) + gmsh.write(joinpath(@__DIR__, filename)) + + # Print some information + if verbose > 0 + println("\nFinished generating mesh. Physical group tags:") + println("Si domain: ", si_domain_group) + println("Air domain: ", air_domain_group) + println("Farfield boundaries: ", farfield_group) + println("End boundaries: ", ends_group) + println("Metal boundaries: ", metal_group) + println("Trace negative boundaries: ", gap_group) + + println("\nPorts:") + println("Port 1: ", port1_group) + println("Port 2: ", port2_group) + println("Port 3: ", port3_group) + println("Port 4: ", port4_group) + println() + end + + # Optionally launch GUI + if "gui" in lowercase.(ARGS) + gmsh.fltk.run() + end + + return gmsh.finalize() +end diff --git a/examples/rings/mesh/mesh.jl b/examples/rings/mesh/mesh.jl index 3f71d25c15..4ec047a1f8 100644 --- a/examples/rings/mesh/mesh.jl +++ b/examples/rings/mesh/mesh.jl @@ -1,238 +1,259 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
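Before the rings script, a note on the sizing parameters shared by both CPW generators above: the near-gap target `l_trace = 1.5 * trace_width_μm * 2^-refinement` and the farfield target `l_farfield = substrate_height_μm * 2^-refinement` both halve with every refinement level. A small sketch evaluating them for the default dimensions (the scaling is written here as a division by `2^refinement`, which is equivalent for the non-negative refinement levels the generators accept):

```julia
# Element-size targets of the CPW generators above, evaluated for the default
# trace width (30 μm) and substrate height (500 μm).
trace_width_μm = 30.0
substrate_height_μm = 500.0

for refinement in 0:2
    l_trace = 1.5 * trace_width_μm / 2^refinement
    l_farfield = 1.0 * substrate_height_μm / 2^refinement
    println("refinement = $refinement: l_trace = $l_trace μm, l_farfield = $l_farfield μm")
end
# refinement = 0: l_trace = 45.0 μm, l_farfield = 500.0 μm
# refinement = 1: l_trace = 22.5 μm, l_farfield = 250.0 μm
# refinement = 2: l_trace = 11.25 μm, l_farfield = 125.0 μm
```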
-# SPDX-License-Identifier: Apache-2.0 - -using Gmsh: gmsh -using LinearAlgebra - -""" - generate_ring_mesh(; - gui::Bool = false, - rc::AbstractVector{<:Real} = [0.0, 0.0, 0.0], - ra::AbstractVector{<:Real} = [0.0, 0.0, 1.0], - θ::Real = π / 2, - verbose::Bool = false - ) - -Generate a mesh for the rings example using Gmsh - -# Arguments - - - gui - whether to launch the gmsh gui on mesh generation - - rc - center of rotation - - ra - axis of rotation - - θ - angle of rotation about ra, originating at rc - - verbose - flag to dictate the level of print to REPL, passed to Gmsh -""" -function generate_ring_mesh(; - gui::Bool = false, - rc::AbstractVector{<:Real} = [0.0, 0.0, 0.0], - ra::AbstractVector{<:Real} = [0.0, 0.0, 1.0], - θ::Real = π / 2, - verbose::Bool = false -) - kernel = gmsh.model.occ - - gmsh.initialize() - gmsh.option.setNumber("General.Verbosity", verbose) - - # Add model - if "rings" in gmsh.model.list() - gmsh.model.setCurrent("rings") - gmsh.model.remove() - end - gmsh.model.add("rings") - - # Geometry parameters (in um) - wire_width = 1.0 - inner_radius = 10.0 - outer_radius = 100.0 - farfield_radius = 10.0 * outer_radius - - # Mesh parameters - l_ring = 2.0 - l_farfield = 200.0 - - # Origin - p0 = kernel.addPoint(0.0, 0.0, 0.0) - - # Inner ring - h0 = 0.5 * wire_width - r1 = inner_radius - h0 - r2 = inner_radius + h0 - x1 = sqrt(r1^2 - h0^2) - x2 = sqrt(r2^2 - h0^2) - - pi1 = kernel.addPoint(x1, -h0, 0.0, l_ring) - pi2 = kernel.addPoint(x1, h0, 0.0, l_ring) - pi3 = kernel.addPoint(x2, h0, 0.0, l_ring) - pi4 = kernel.addPoint(x2, -h0, 0.0, l_ring) - - li1 = kernel.addLine(pi1, pi2) - li2 = kernel.addLine(pi2, pi3) - li3 = kernel.addLine(pi3, pi4) - li4 = kernel.addLine(pi4, pi1) - - inner_terminal_loop = kernel.addCurveLoop([li1, li2, li3, li4]) - inner_terminal = kernel.addPlaneSurface([inner_terminal_loop]) - - pi5 = kernel.addPoint(-r1, 0.0, 0.0, l_ring) - pi6 = kernel.addPoint(-r2, 0.0, 0.0, l_ring) - - ai1 = kernel.addCircleArc(pi2, p0, pi5) - ai2 = kernel.addCircleArc(pi5, p0, pi1) - ai3 = kernel.addCircleArc(pi3, p0, pi6) - ai4 = kernel.addCircleArc(pi6, p0, pi4) - - inner_ring_loop = kernel.addCurveLoop([ai1, ai2, -li4, -ai4, -ai3, -li2]) - inner_ring = kernel.addPlaneSurface([inner_ring_loop]) - - # Outer ring - r1 = outer_radius - h0 - r2 = outer_radius + h0 - x1 = sqrt(r1^2 - h0^2) - x2 = sqrt(r2^2 - h0^2) - - po1 = kernel.addPoint(x1, -h0, 0.0, l_ring) - po2 = kernel.addPoint(x1, h0, 0.0, l_ring) - po3 = kernel.addPoint(x2, h0, 0.0, l_ring) - po4 = kernel.addPoint(x2, -h0, 0.0, l_ring) - - lo1 = kernel.addLine(po1, po2) - lo2 = kernel.addLine(po2, po3) - lo3 = kernel.addLine(po3, po4) - lo4 = kernel.addLine(po4, po1) - - outer_terminal_loop = kernel.addCurveLoop([lo1, lo2, lo3, lo4]) - outer_terminal = kernel.addPlaneSurface([outer_terminal_loop]) - - po5 = kernel.addPoint(-r1, 0.0, 0.0, l_ring) - po6 = kernel.addPoint(-r2, 0.0, 0.0, l_ring) - - ao1 = kernel.addCircleArc(po2, p0, po5) - ao2 = kernel.addCircleArc(po5, p0, po1) - ao3 = kernel.addCircleArc(po3, p0, po6) - ao4 = kernel.addCircleArc(po6, p0, po4) - - outer_ring_loop = kernel.addCurveLoop([ao1, ao2, -lo4, -ao4, -ao3, -lo2]) - outer_ring = kernel.addPlaneSurface([outer_ring_loop]) - - # Auxiliary surfaces - inner_gap_loop = kernel.addCurveLoop([ai1, ai2, li1]) - inner_gap = kernel.addPlaneSurface([inner_gap_loop]) - - outer_gap_loop_in = kernel.addCurveLoop([ai3, ai4, -li3]) - outer_gap_in = kernel.addPlaneSurface([outer_gap_loop_in]) - outer_gap_loop_out = kernel.addCurveLoop([ao1, ao2, lo1]) - 
outer_gap_out = kernel.addPlaneSurface([outer_gap_loop_out]) - outer_gap, _ = kernel.cut([(2, outer_gap_out)], [(2, outer_gap_in)]) - @assert length(outer_gap) == 1 - outer_gap = first(outer_gap)[2] - - # Add external box - domain = kernel.addBox( - -farfield_radius, - -farfield_radius, - -farfield_radius, - 2.0 * farfield_radius, - 2.0 * farfield_radius, - 2.0 * farfield_radius - ) - - # Apply a rotation transformation to all entities in the model - ra ./= norm(ra) - kernel.rotate(kernel.getEntities(), rc[1], rc[2], rc[3], ra[1], ra[2], ra[3], θ) - - kernel.synchronize() - - # Add physical groups - domain_group = gmsh.model.addPhysicalGroup(3, [domain], -1, "domain") - - _, farfield_boundaries = gmsh.model.getAdjacencies(3, domain) - farfield_group = gmsh.model.addPhysicalGroup(2, farfield_boundaries, -1, "farfield") - - rings_group = gmsh.model.addPhysicalGroup(2, [inner_ring, outer_ring], -1, "rings") - - inner_terminal_group = - gmsh.model.addPhysicalGroup(2, [inner_terminal], -1, "terminal_inner") - outer_terminal_group = - gmsh.model.addPhysicalGroup(2, [outer_terminal], -1, "terminal_outer") - - inner_gap_group = gmsh.model.addPhysicalGroup(2, [inner_gap], -1, "hole_inner") - outer_gap_group = gmsh.model.addPhysicalGroup(2, [outer_gap], -1, "hole_outer") - - # Generate mesh - gmsh.option.setNumber("Mesh.MeshSizeMin", l_ring) - gmsh.option.setNumber("Mesh.MeshSizeMax", l_farfield) - gmsh.option.setNumber("Mesh.MeshSizeFromCurvature", 0) - gmsh.option.setNumber("Mesh.MeshSizeExtendFromBoundary", 0) - - gmsh.model.mesh.field.add("Extend", 1) - gmsh.model.mesh.field.setNumbers( - 1, - "SurfacesList", - [inner_ring, outer_ring, inner_terminal, outer_terminal] - ) - gmsh.model.mesh.field.setNumber(1, "Power", 1.0) - gmsh.model.mesh.field.setNumber(1, "DistMax", 6.0 * outer_radius) - gmsh.model.mesh.field.setNumber(1, "SizeMax", l_farfield) - - mesh_curves = - last.( - gmsh.model.getBoundary( - [(2, x) for x in [inner_ring, outer_ring, inner_terminal, outer_terminal]], - true, - false, - false - ) - ) - - gmsh.model.mesh.field.add("Distance", 2) - gmsh.model.mesh.field.setNumbers(2, "CurvesList", mesh_curves) - gmsh.model.mesh.field.setNumber(2, "Sampling", 30) - - gmsh.model.mesh.field.add("Threshold", 3) - gmsh.model.mesh.field.setNumber(3, "InField", 2) - gmsh.model.mesh.field.setNumber(3, "SizeMin", l_ring) - gmsh.model.mesh.field.setNumber(3, "SizeMax", l_farfield) - gmsh.model.mesh.field.setNumber(3, "DistMin", 0.0) - gmsh.model.mesh.field.setNumber(3, "DistMax", 6.0 * outer_radius) - - gmsh.model.mesh.field.add("Min", 101) - gmsh.model.mesh.field.setNumbers(101, "FieldsList", [1, 3]) - gmsh.model.mesh.field.setAsBackgroundMesh(101) - - gmsh.model.mesh.embed( - 2, - [inner_terminal, inner_ring, outer_terminal, outer_ring, inner_gap, outer_gap], - 3, - domain - ) - - gmsh.option.setNumber("Mesh.Algorithm3D", 10) - - gmsh.model.mesh.generate(3) - gmsh.model.mesh.setOrder(2) - - # Save mesh - gmsh.option.setNumber("Mesh.MshFileVersion", 2.2) - gmsh.option.setNumber("Mesh.Binary", 0) - gmsh.write(joinpath(@__DIR__, "rings.msh")) - - # Print some information - println("\nFinished generating mesh. 
Physical group tags:") - println("Domain: ", domain_group) - println("Farfield boundaries: ", farfield_group) - println("Ring boundaries: ", rings_group) - println("Inner terminal: ", inner_terminal_group) - println("Outer terminal: ", outer_terminal_group) - println("Inner hole: ", inner_gap_group) - println("Outer hole: ", outer_gap_group) - println() - - # Optionally launch GUI - if gui - gmsh.fltk.run() - end - - return gmsh.finalize() -end +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Generate example mesh with: +# julia -e 'include("mesh/mesh.jl"); generate_ring_mesh(filename="rings.msh")' + +using Gmsh: gmsh +using LinearAlgebra + +""" + generate_ring_mesh(; + filename::AbstractString, + wire_width = 1.0, + inner_radius = 10.0, + outer_radius = 100.0, + rot_center::AbstractVector{<:Real} = [0.0, 0.0, 0.0], + rot_axis::AbstractVector{<:Real} = [0.0, 0.0, 1.0], + rot_θ::Real = π / 2, + verbose::Integer = 5, + gui::Bool = false + ) + +Generate a mesh for the rings example using Gmsh + +# Arguments + + - filename - the filename to use for the generated mesh + - wire_width - width of the rings + - inner_radius - radius of the inner ring + - outer_radius - radius of the outer ring + - rot_center - center of rotation + - rot_axis - axis of rotation + - rot_θ - angle of rotation about rot_axis, originating at rot_center + - verbose - flag to dictate the level of print to REPL, passed to Gmsh + - gui - whether to launch the Gmsh GUI on mesh generation +""" +function generate_ring_mesh(; + filename::AbstractString, + wire_width = 1.0, + inner_radius = 10.0, + outer_radius = 100.0, + rot_center::AbstractVector{<:Real} = [0.0, 0.0, 0.0], + rot_axis::AbstractVector{<:Real} = [0.0, 0.0, 1.0], + rot_θ::Real = π / 6, + verbose::Integer = 5, + gui::Bool = false +) + kernel = gmsh.model.occ + + gmsh.initialize() + gmsh.option.setNumber("General.Verbosity", verbose) + + # Add model + if "rings" in gmsh.model.list() + gmsh.model.setCurrent("rings") + gmsh.model.remove() + end + gmsh.model.add("rings") + + # Geometry parameters (in μm) + farfield_radius = 10.0 * outer_radius + + # Mesh parameters + l_ring = 2.0 + l_farfield = 200.0 + + # Origin + p0 = kernel.addPoint(0.0, 0.0, 0.0) + + # Inner ring + h0 = 0.5 * wire_width + r1 = inner_radius - h0 + r2 = inner_radius + h0 + x1 = sqrt(r1^2 - h0^2) + x2 = sqrt(r2^2 - h0^2) + + pi1 = kernel.addPoint(x1, -h0, 0.0, l_ring) + pi2 = kernel.addPoint(x1, h0, 0.0, l_ring) + pi3 = kernel.addPoint(x2, h0, 0.0, l_ring) + pi4 = kernel.addPoint(x2, -h0, 0.0, l_ring) + + li1 = kernel.addLine(pi1, pi2) + li2 = kernel.addLine(pi2, pi3) + li3 = kernel.addLine(pi3, pi4) + li4 = kernel.addLine(pi4, pi1) + + inner_terminal_loop = kernel.addCurveLoop([li1, li2, li3, li4]) + inner_terminal = kernel.addPlaneSurface([inner_terminal_loop]) + + pi5 = kernel.addPoint(-r1, 0.0, 0.0, l_ring) + pi6 = kernel.addPoint(-r2, 0.0, 0.0, l_ring) + + ai1 = kernel.addCircleArc(pi2, p0, pi5) + ai2 = kernel.addCircleArc(pi5, p0, pi1) + ai3 = kernel.addCircleArc(pi3, p0, pi6) + ai4 = kernel.addCircleArc(pi6, p0, pi4) + + inner_ring_loop = kernel.addCurveLoop([ai1, ai2, -li4, -ai4, -ai3, -li2]) + inner_ring = kernel.addPlaneSurface([inner_ring_loop]) + + # Outer ring + r1 = outer_radius - h0 + r2 = outer_radius + h0 + x1 = sqrt(r1^2 - h0^2) + x2 = sqrt(r2^2 - h0^2) + + po1 = kernel.addPoint(x1, -h0, 0.0, l_ring) + po2 = kernel.addPoint(x1, h0, 0.0, l_ring) + po3 = kernel.addPoint(x2, h0, 0.0, l_ring) + po4 = kernel.addPoint(x2, 
-h0, 0.0, l_ring) + + lo1 = kernel.addLine(po1, po2) + lo2 = kernel.addLine(po2, po3) + lo3 = kernel.addLine(po3, po4) + lo4 = kernel.addLine(po4, po1) + + outer_terminal_loop = kernel.addCurveLoop([lo1, lo2, lo3, lo4]) + outer_terminal = kernel.addPlaneSurface([outer_terminal_loop]) + + po5 = kernel.addPoint(-r1, 0.0, 0.0, l_ring) + po6 = kernel.addPoint(-r2, 0.0, 0.0, l_ring) + + ao1 = kernel.addCircleArc(po2, p0, po5) + ao2 = kernel.addCircleArc(po5, p0, po1) + ao3 = kernel.addCircleArc(po3, p0, po6) + ao4 = kernel.addCircleArc(po6, p0, po4) + + outer_ring_loop = kernel.addCurveLoop([ao1, ao2, -lo4, -ao4, -ao3, -lo2]) + outer_ring = kernel.addPlaneSurface([outer_ring_loop]) + + # Auxiliary surfaces + inner_gap_loop = kernel.addCurveLoop([ai1, ai2, li1]) + inner_gap = kernel.addPlaneSurface([inner_gap_loop]) + + outer_gap_loop_in = kernel.addCurveLoop([ai3, ai4, -li3]) + outer_gap_in = kernel.addPlaneSurface([outer_gap_loop_in]) + outer_gap_loop_out = kernel.addCurveLoop([ao1, ao2, lo1]) + outer_gap_out = kernel.addPlaneSurface([outer_gap_loop_out]) + outer_gap, _ = kernel.cut([(2, outer_gap_out)], [(2, outer_gap_in)]) + @assert length(outer_gap) == 1 + outer_gap = first(outer_gap)[2] + + # Add external box + domain = kernel.addBox( + -farfield_radius, + -farfield_radius, + -farfield_radius, + 2.0 * farfield_radius, + 2.0 * farfield_radius, + 2.0 * farfield_radius + ) + + # Apply a rotation transformation to all entities in the model + rot_axis ./= norm(rot_axis) + kernel.rotate( + kernel.getEntities(), + rot_center[1], + rot_center[2], + rot_center[3], + rot_axis[1], + rot_axis[2], + rot_axis[3], + rot_θ + ) + + kernel.synchronize() + + # Add physical groups + domain_group = gmsh.model.addPhysicalGroup(3, [domain], -1, "domain") + + _, farfield_boundaries = gmsh.model.getAdjacencies(3, domain) + farfield_group = gmsh.model.addPhysicalGroup(2, farfield_boundaries, -1, "farfield") + + rings_group = gmsh.model.addPhysicalGroup(2, [inner_ring, outer_ring], -1, "rings") + + inner_terminal_group = + gmsh.model.addPhysicalGroup(2, [inner_terminal], -1, "terminal_inner") + outer_terminal_group = + gmsh.model.addPhysicalGroup(2, [outer_terminal], -1, "terminal_outer") + + inner_gap_group = gmsh.model.addPhysicalGroup(2, [inner_gap], -1, "hole_inner") + outer_gap_group = gmsh.model.addPhysicalGroup(2, [outer_gap], -1, "hole_outer") + + # Generate mesh + gmsh.option.setNumber("Mesh.MeshSizeMin", l_ring) + gmsh.option.setNumber("Mesh.MeshSizeMax", l_farfield) + gmsh.option.setNumber("Mesh.MeshSizeFromCurvature", 0) + gmsh.option.setNumber("Mesh.MeshSizeExtendFromBoundary", 0) + + gmsh.model.mesh.field.add("Extend", 1) + gmsh.model.mesh.field.setNumbers( + 1, + "SurfacesList", + [inner_ring, outer_ring, inner_terminal, outer_terminal] + ) + gmsh.model.mesh.field.setNumber(1, "Power", 1.0) + gmsh.model.mesh.field.setNumber(1, "DistMax", 6.0 * outer_radius) + gmsh.model.mesh.field.setNumber(1, "SizeMax", l_farfield) + + mesh_curves = last.( + gmsh.model.getBoundary( + [(2, x) for x in [inner_ring, outer_ring, inner_terminal, outer_terminal]], + true, + false, + false + ) + ) + + gmsh.model.mesh.field.add("Distance", 2) + gmsh.model.mesh.field.setNumbers(2, "CurvesList", mesh_curves) + gmsh.model.mesh.field.setNumber(2, "Sampling", 30) + + gmsh.model.mesh.field.add("Threshold", 3) + gmsh.model.mesh.field.setNumber(3, "InField", 2) + gmsh.model.mesh.field.setNumber(3, "SizeMin", l_ring) + gmsh.model.mesh.field.setNumber(3, "SizeMax", l_farfield) + gmsh.model.mesh.field.setNumber(3, "DistMin", 0.0) + 
gmsh.model.mesh.field.setNumber(3, "DistMax", 6.0 * outer_radius) + + gmsh.model.mesh.field.add("Min", 101) + gmsh.model.mesh.field.setNumbers(101, "FieldsList", [1, 3]) + gmsh.model.mesh.field.setAsBackgroundMesh(101) + + gmsh.model.mesh.embed( + 2, + [inner_terminal, inner_ring, outer_terminal, outer_ring, inner_gap, outer_gap], + 3, + domain + ) + + gmsh.option.setNumber("Mesh.Algorithm", 6) + gmsh.option.setNumber("Mesh.Algorithm3D", 1) + + gmsh.model.mesh.generate(3) + gmsh.model.mesh.setOrder(2) + + # Save mesh + gmsh.option.setNumber("Mesh.MshFileVersion", 2.2) + gmsh.option.setNumber("Mesh.Binary", 1) + gmsh.write(joinpath(@__DIR__, filename)) + + # Print some information + println("\nFinished generating mesh. Physical group tags:") + println("Domain: ", domain_group) + println("Farfield boundaries: ", farfield_group) + println("Ring boundaries: ", rings_group) + println("Inner terminal: ", inner_terminal_group) + println("Outer terminal: ", outer_terminal_group) + println("Inner hole: ", inner_gap_group) + println("Outer hole: ", outer_gap_group) + println() + + # Optionally launch GUI + if gui + gmsh.fltk.run() + end + + return gmsh.finalize() +end diff --git a/examples/rings/mesh/rings.msh b/examples/rings/mesh/rings.msh index b3c1110787..8d51a97c56 100644 Binary files a/examples/rings/mesh/rings.msh and b/examples/rings/mesh/rings.msh differ diff --git a/examples/rings/rings.json b/examples/rings/rings.json index e1e4f7bde0..528d9bc6cb 100644 --- a/examples/rings/rings.json +++ b/examples/rings/rings.json @@ -1,87 +1,95 @@ -{ - "Problem": - { - "Type": "Magnetostatic", - "Verbose": 2, - "Output": "D:/WelSimLLC/executable28/_palace_examples/rings/postpro" - }, - "Model": - { - "Mesh": "D:/WelSimLLC/executable28/_palace_examples/rings/mesh/rings.msh", - "L0": 1.0e-6 // um - }, - "Domains": - { - "Materials": - [ - { - "Attributes": [1], - "Permeability": 1.0 - } - ], - "Postprocessing": - { - "Probe": - [ - { - "Index": 1, // Center of rings - "X": 0.0, - "Y": 0.0, - "Z": 0.0 - } - ] - } - }, - "Boundaries": - { - "PEC": - { - "Attributes": [2, 3] - }, - "SurfaceCurrent": - [ - { - "Index": 1, - "Attributes": [4], // Inner ring - "Direction": [-0.5, 0.8660254037844386, 0.0] // "+Y" rotated π/6 around (0,0,1) - }, - { - "Index": 2, - "Attributes": [5], // Outer ring - "Direction": [-0.5, 0.8660254037844386, 0.0] // "+Y" rotated π/6 around (0,0,1) - } - ], - "Postprocessing": // Inductance from flux instead of energy - { - "Inductance": - [ - { - "Index": 1, - "Attributes": [6], // Inner hole - "Direction": [0, 0, 1] - }, - { - "Index": 2, - "Attributes": [6, 7], // Outer (total) hole - "Direction": [0, 0, 1] - } - ] - } - }, - "Solver": - { - "Order": 2, - "Magnetostatic": - { - "Save": 2 - }, - "Linear": - { - "Type": "AMS", - "KSPType": "CG", - "Tol": 1.0e-8, - "MaxIts": 100 - } - } -} - +{ + "Problem": + { + "Type": "Magnetostatic", + "Verbose": 2, + "Output": "postpro" + }, + "Model": + { + "Mesh": "mesh/rings.msh", + "L0": 1.0e-6 // μm + }, + "Domains": + { + "Materials": + [ + { + "Attributes": [1], + "Permeability": 1.0 + } + ], + "Postprocessing": + { + "Probe": + [ + { + "Index": 1, // Center of rings + "Center": [0.0, 0.0, 0.0] + } + ], + "Energy": + [ + { + "Index": 1, + "Attributes": [1] + } + ] + } + }, + "Boundaries": + { + "PEC": + { + "Attributes": [2, 3] + }, + "SurfaceCurrent": + [ + { + "Index": 1, + "Attributes": [4], // Inner ring + "Direction": [-0.5, 0.8660254037844386, 0.0] // "+Y" rotated π/6 around (0,0,1) + }, + { + "Index": 2, + "Attributes": [5], // 
Outer ring + "Direction": [-0.5, 0.8660254037844386, 0.0] // "+Y" rotated π/6 around (0,0,1) + } + ], + "Postprocessing": // Inductance from flux instead of energy + { + "SurfaceFlux": + [ + { + "Index": 1, + "Attributes": [6], // Inner hole + "Type": "Magnetic", + "Center": [0, 0, -1] // Positive in +Z + }, + { + "Index": 2, + "Attributes": [6, 7], // Outer (total) hole + "Type": "Magnetic", + "Center": [0, 0, -1] // Positive in +Z + } + ] + } + }, + "Solver": + { + "Order": 2, + "Device": "CPU", + "Magnetostatic": + { + "Save": 2 + }, + "Linear": + { + "Type": "AMS", + "KSPType": "CG", + "Tol": 1.0e-8, + "MaxIts": 100 + } + } +} + diff --git a/examples/spheres/mesh/mesh.jl b/examples/spheres/mesh/mesh.jl index 1875b23f42..a935807cf7 100644 --- a/examples/spheres/mesh/mesh.jl +++ b/examples/spheres/mesh/mesh.jl @@ -1,117 +1,219 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -using Gmsh: gmsh - -kernel = gmsh.model.occ - -gmsh.initialize() - -# Add model -if "spheres" in gmsh.model.list() - gmsh.model.setCurrent("spheres") - gmsh.model.remove() -end -gmsh.model.add("spheres") - -# Geometry parameters (in cm) -radius_a = 1.0 -radius_b = 2.0 -center_d = 5.0 -radius_farfield = 15.0 * center_d - -# Mesh parameters -n_sphere = 16 -l_farfield = 20.0 - -# Geometry -sphere_a = kernel.addSphere(-0.5 * center_d, 0.0, 0.0, radius_a) -sphere_b = kernel.addSphere(0.5 * center_d, 0.0, 0.0, radius_b) -sphere_farfield = kernel.add_sphere(0.0, 0.0, 0.0, radius_farfield) - -cut_dimtags, cut_dimtags_map = - kernel.cut([(3, sphere_farfield)], [(3, sphere_a), (3, sphere_b)]) -@assert length(cut_dimtags) == 1 && first(cut_dimtags)[1] == 3 -domain = first(cut_dimtags)[2] - -eps = 1.0e-3 -sphere_a = kernel.getEntitiesInBoundingBox( - -0.5 * center_d - radius_a - eps, - -radius_a - eps, - -radius_a - eps, - -0.5 * center_d + radius_a + eps, - radius_a + eps, - radius_a + eps, - 2 -) -sphere_b = kernel.getEntitiesInBoundingBox( - 0.5 * center_d - radius_b - eps, - -radius_b - eps, - -radius_b - eps, - 0.5 * center_d + radius_b + eps, - radius_b + eps, - radius_b + eps, - 2 -) -@assert length(sphere_a) == 1 && length(sphere_b) == 1 -sphere_a = first(sphere_a)[2] -sphere_b = first(sphere_b)[2] - -sphere_farfield = - filter(x -> x != (2, sphere_a) && x != (2, sphere_b), kernel.getEntities(2)) -@assert length(sphere_farfield) == 1 -sphere_farfield = first(sphere_farfield)[2] - -kernel.synchronize() - -# Add physical groups -domain_group = gmsh.model.addPhysicalGroup(3, [domain], -1, "domain") - -farfield_group = gmsh.model.addPhysicalGroup(2, [sphere_farfield], -1, "farfield") - -sphere_a_group = gmsh.model.addPhysicalGroup(2, [sphere_a], -1, "sphere_a") -sphere_b_group = gmsh.model.addPhysicalGroup(2, [sphere_b], -1, "sphere_b") - -# Generate mesh -gmsh.option.setNumber("Mesh.MinimumCurveNodes", 2) -gmsh.option.setNumber("Mesh.MinimumCircleNodes", 0) - -gmsh.option.setNumber("Mesh.MeshSizeMin", 2.0 * pi * radius_a / n_sphere / 2.0) -gmsh.option.setNumber("Mesh.MeshSizeMax", l_farfield) -gmsh.option.setNumber("Mesh.MeshSizeFromCurvature", n_sphere) -gmsh.option.setNumber("Mesh.MeshSizeExtendFromBoundary", 0) - -gmsh.model.mesh.field.add("Extend", 1) -gmsh.model.mesh.field.setNumbers(1, "SurfacesList", [sphere_a, sphere_b]) -gmsh.model.mesh.field.setNumber(1, "Power", 1.0) -gmsh.model.mesh.field.setNumber(1, "DistMax", radius_farfield) -gmsh.model.mesh.field.setNumber(1, "SizeMax", l_farfield) - -gmsh.model.mesh.field.add("Min", 101) 
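Returning briefly to the `rings.json` changes above: the `// Inductance from flux instead of energy` comment refers to dividing the magnetic flux reported by each `SurfaceFlux` entry by the current of the corresponding surface-current source. A minimal sketch of that bookkeeping, assuming unit drive currents; the flux values are placeholders, not outputs of the example:

```julia
# Φ[i, j] is the magnetic flux through hole surface i when surface-current
# source j is driven with current I[j]; the flux-based inductance entries are
# then L[i, j] = Φ[i, j] / I[j]. All numbers below are placeholders.
Φ = [1.0e-10  3.0e-11;
     3.0e-11  5.0e-10]   # flux in Wb (placeholder values)
I = [1.0, 1.0]           # drive currents in A (assumed unit drives)
L = Φ ./ transpose(I)    # inductance matrix in H
```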
-gmsh.model.mesh.field.setNumbers(101, "FieldsList", [1]) -gmsh.model.mesh.field.setAsBackgroundMesh(101) - -gmsh.option.setNumber("Mesh.Algorithm3D", 10) - -gmsh.model.mesh.generate(3) -gmsh.model.mesh.setOrder(3) - -# Save mesh -gmsh.option.setNumber("Mesh.MshFileVersion", 2.2) -gmsh.option.setNumber("Mesh.Binary", 0) -gmsh.write(joinpath(@__DIR__, "spheres.msh")) - -# Print some information -println("\nFinished generating mesh. Physical group tags:") -println("Domain: ", domain_group) -println("Farfield boundary: ", farfield_group) -println("Sphere A: ", sphere_a_group) -println("Sphere B: ", sphere_b_group) -println() - -# Optionally launch GUI -if "gui" in lowercase.(ARGS) - gmsh.fltk.run() -end - -gmsh.finalize() +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + +#= +# README + +This Julia script uses Gmsh to create a mesh for two conducting spheres enclosed within a +larger boundary sphere. + +The generated mesh contains four distinct regions: +1. A 3D volume region (the space between spheres) +2. A large outer spherical boundary (typically set to ground potential) +3. Two inner spherical conductors (typically set as terminals) + +## Prerequisites + +This script requires the Gmsh Julia package. If you don't already have it installed, you can +install it with + +```bash +julia -e 'using Pkg; Pkg.add("Gmsh")' +``` + +## How to run + +From this directory, run: +```bash +julia -e 'include("mesh.jl"); generate_spheres_mesh(; filename="spheres.msh")' +``` +This generates the mesh used in the example. + +To visualize the mesh in Gmsh's graphical interface, add the `gui=true` parameter: +```bash +julia -e 'include("mesh.jl"); generate_spheres_mesh(; filename="spheres.msh", gui=true)' +``` + +The script will generate a mesh file and print the "attribute" numbers for each region. +These attributes are needed when configuring Palace simulations. +=# + +using Gmsh: gmsh + +# Convenience function to extract the tag (ID) from the return values of several Gmsh +# functions +extract_tag(entity) = first(entity)[2] + +""" + generate_spheres_mesh(; + filename::AbstractString, + radius_a::Real = 1.0, + radius_b::Real = 2.0, + center_d::Real = 5.0, + radius_farfield::Real = 15.0 * center_d, + verbose::Integer = 5, + gui::Bool = false + ) + +Generate a mesh for the two spheres example using Gmsh. + +# Arguments + + - filename - the filename to use for the generated mesh + - radius_a - radius of the first sphere + - radius_b - radius of the second sphere + - center_d - distance between sphere centers + - radius_farfield - Radius of the outer boundary sphere + - verbose - flag to dictate the level of print to REPL, passed to Gmsh + (0-5, higher = more verbose) + - gui - whether to launch the Gmsh GUI on mesh generation +""" +function generate_spheres_mesh(; + filename::AbstractString, + radius_a::Real=1.0, + radius_b::Real=2.0, + center_d::Real=5.0, + radius_farfield::Real=15.0 * center_d, + verbose::Integer=5, + gui::Bool=false +) + # Boilerplate + kernel = gmsh.model.occ + gmsh.initialize() + gmsh.option.setNumber("General.Verbosity", verbose) + + # Create a new model. The name spheres is not important. 
If a model was already added, + # remove it first (this is useful when interactively the body of this function in the + # REPL) + if "spheres" in gmsh.model.list() + gmsh.model.setCurrent("spheres") + gmsh.model.remove() + end + gmsh.model.add("spheres") + + # n_sphere: Controls how many elements around sphere's circumference, + # higher values = finer mesh on spheres + # l_farfield: Maximum element size at the outer boundary, + # larger values = coarser mesh at boundary + n_sphere = 16 + l_farfield = 20.0 + + # Create three spheres: two inner conductors along the x axis and one outer boundary + # with centers along the x axis + sphere_a = kernel.addSphere(-0.5 * center_d, 0.0, 0.0, radius_a) + sphere_b = kernel.addSphere(0.5 * center_d, 0.0, 0.0, radius_b) + sphere_farfield = kernel.addSphere(0.0, 0.0, 0.0, radius_farfield) + + # We want to mesh the volume between the spheres, so we start with the large outer + # sphere and subtract the two inner spheres + cut_dimtags, cut_dimtags_map = + kernel.cut([(3, sphere_farfield)], [(3, sphere_a), (3, sphere_b)]) + + # Verify we got exactly one 3D region as expected + only_one_region = length(cut_dimtags) == 1 + cut_region_is_3d = first(cut_dimtags)[1] == 3 + @assert only_one_region && cut_region_is_3d + + domain = extract_tag(cut_dimtags) + + # After boolean operations, we need to find the surfaces again + eps = 1.0e-3 + + # Find sphere_a's boundary surface + sphere_a = kernel.getEntitiesInBoundingBox( + -0.5 * center_d - radius_a - eps, + -radius_a - eps, + -radius_a - eps, + -0.5 * center_d + radius_a + eps, + radius_a + eps, + radius_a + eps, + 2 # 2 means we're looking for 2D surfaces + ) + + # Find sphere_b's boundary surface + sphere_b = kernel.getEntitiesInBoundingBox( + 0.5 * center_d - radius_b - eps, + -radius_b - eps, + -radius_b - eps, + 0.5 * center_d + radius_b + eps, + radius_b + eps, + radius_b + eps, + 2 + ) + @assert length(sphere_a) == 1 && length(sphere_b) == 1 + + sphere_a = extract_tag(sphere_a) + sphere_b = extract_tag(sphere_b) + + # Find the outer boundary by getting all 2D entities except the inner spheres + sphere_farfield = + filter(x -> x != (2, sphere_a) && x != (2, sphere_b), kernel.getEntities(2)) + @assert length(sphere_farfield) == 1 + sphere_farfield = extract_tag(sphere_farfield) + + # Commit all geometric operations from the CAD representation to the Gmsh model + kernel.synchronize() + + # Create physical groups (these become attributes in Palace) + # The provided names are primarily for human consumption, e.g., in the Gmsh GUI + # The -1 means "assign an attribute number automatically" + domain_group = gmsh.model.addPhysicalGroup(3, [domain], -1, "domain") + farfield_group = gmsh.model.addPhysicalGroup(2, [sphere_farfield], -1, "farfield") + sphere_a_group = gmsh.model.addPhysicalGroup(2, [sphere_a], -1, "sphere_a") + sphere_b_group = gmsh.model.addPhysicalGroup(2, [sphere_b], -1, "sphere_b") + + # Set minimum nodes per curve for better curved element quality + gmsh.option.setNumber("Mesh.MinimumCurveNodes", 2) + gmsh.option.setNumber("Mesh.MinimumCircleNodes", 0) + + # Set smallest and largest element size + gmsh.option.setNumber("Mesh.MeshSizeMin", 2.0 * pi * radius_a / n_sphere / 2.0) + gmsh.option.setNumber("Mesh.MeshSizeMax", l_farfield) + # Set minimum number of elements per 2π radians of curvature + gmsh.option.setNumber("Mesh.MeshSizeFromCurvature", n_sphere) + # Don't extend mesh size constraints from boundaries into the volume + # This option is typically activated when working with mesh size fields 
+ gmsh.option.setNumber("Mesh.MeshSizeExtendFromBoundary", 0) + + # Create mesh size fields to manually control size and distribution of elements + # throughout the mesh + + # First, create a mesh size field (with id 1) that extends from the inner surfaces to + # the volume toward the outer boundary + gmsh.model.mesh.field.add("Extend", 1) + gmsh.model.mesh.field.setNumbers(1, "SurfacesList", [sphere_a, sphere_b]) + gmsh.model.mesh.field.setNumber(1, "DistMax", radius_farfield) + gmsh.model.mesh.field.setNumber(1, "SizeMax", l_farfield) + + # Finally, use this Extend field to determine element sizes + gmsh.model.mesh.field.setAsBackgroundMesh(1) + + # Choose meshing algorithm. Typically, we would choose HXT, 10, because it + # is parallel and high-performance, but it is not reproducible, so best to + # stick with something more stable for this example. + gmsh.option.setNumber("Mesh.Algorithm", 6) + gmsh.option.setNumber("Mesh.Algorithm3D", 1) + + gmsh.model.mesh.generate(3) # 3 means generate a 3D volume mesh + gmsh.model.mesh.setOrder(3) # 3 means cubically curved elements + + # Set mesh format version as required by Palace + gmsh.option.setNumber("Mesh.MshFileVersion", 2.2) + gmsh.option.setNumber("Mesh.Binary", 1) + gmsh.write(joinpath(@__DIR__, filename)) + + println("\nFinished generating mesh. Physical group tags:") + println("Domain: ", domain_group) + println("Farfield boundary: ", farfield_group) + println("Sphere A: ", sphere_a_group) + println("Sphere B: ", sphere_b_group) + println() + + # Optionally launch the Gmsh GUI + if gui + gmsh.fltk.run() + end + + # Clean up Gmsh resources + return gmsh.finalize() +end diff --git a/examples/spheres/mesh/spheres.msh b/examples/spheres/mesh/spheres.msh index 63678489f2..0b536fb2a3 100644 Binary files a/examples/spheres/mesh/spheres.msh and b/examples/spheres/mesh/spheres.msh differ diff --git a/examples/spheres/spheres.json b/examples/spheres/spheres.json index 8289a4cd5b..2d03e51238 100644 --- a/examples/spheres/spheres.json +++ b/examples/spheres/spheres.json @@ -1,83 +1,95 @@ -{ - "Problem": - { - "Type": "Electrostatic", - "Verbose": 2, - "Output": "D:/WelSimLLC/executable28/_palace_examples/spheres/postpro" - }, - "Model": - { - "Mesh": "D:/WelSimLLC/executable28/_palace_examples/spheres/mesh/spheres.msh", - "L0": 1.0e-2 // cm - }, - "Domains": - { - "Materials": - [ - { - "Attributes": [1], - "Permittivity": 1.0 - } - ], - "Postprocessing": - { - "Probe": - [ - { - "Index": 1, // On surface of smaller sphere - "X": -1.5, - "Y": 0.0, - "Z": 0.0 - } - ] - } - }, - "Boundaries": - { - "Ground": - { - "Attributes": [2] - }, - "Terminal": - [ - { - "Index": 1, - "Attributes": [3] // Sphere A - }, - { - "Index": 2, - "Attributes": [4] // Sphere B - } - ], - "Postprocessing": // Capacitance from charge instead of energy - { - "Capacitance": - [ - { - "Index": 1, - "Attributes": [3] // Sphere A - }, - { - "Index": 2, - "Attributes": [4] // Sphere B - } - ] - } - }, - "Solver": - { - "Order": 3, - "Electrostatic": - { - "Save": 2 - }, - "Linear": - { - "Type": "BoomerAMG", - "KSPType": "CG", - "Tol": 1.0e-8, - "MaxIts": 100 - } - } -} - +{ + "Problem": + { + "Type": "Electrostatic", + "Verbose": 2, + "Output": "postpro", + "OutputFormats": + { + "Paraview": true, + "GridFunction": true + } + }, + "Model": + { + "Mesh": "mesh/spheres.msh", + "L0": 1.0e-2 // cm + }, + "Domains": + { + "Materials": + [ + { + "Attributes": [1], + "Permittivity": 1.0 + } + ], + "Postprocessing": + { + "Probe": + [ + { + "Index": 1, // On surface of smaller 
sphere + "Center": [-1.5, 0.0, 0.0] + } + ], + "Energy": + [ + { + "Index": 1, + "Attributes": [1] + } + ] + } + }, + "Boundaries": + { + "Ground": + { + "Attributes": [2] + }, + "Terminal": + [ + { + "Index": 1, + "Attributes": [3] // Sphere A + }, + { + "Index": 2, + "Attributes": [4] // Sphere B + } + ], + "Postprocessing": // Capacitance from charge instead of energy + { + "SurfaceFlux": + [ + { + "Index": 1, + "Attributes": [3], // Sphere A + "Type": "Electric" + }, + { + "Index": 2, + "Attributes": [4], // Sphere B + "Type": "Electric" + } + ] + } + }, + "Solver": + { + "Order": 3, + "Device": "CPU", + "Electrostatic": + { + "Save": 2 + }, + "Linear": + { + "Type": "BoomerAMG", + "KSPType": "CG", + "Tol": 1.0e-8, + "MaxIts": 100 + } + } +} diff --git a/extern/CMakeLists.txt b/extern/CMakeLists.txt index 5942ef2044..4cb4fca8aa 100644 --- a/extern/CMakeLists.txt +++ b/extern/CMakeLists.txt @@ -1,80 +1,95 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -# -# Download and build third-party dependencies -# - -# Add METIS/ParMETIS libraries -message(STATUS "================ Configuring METIS/ParMETIS dependencies ===============") -include(ExternalMETIS) - -# Add SuperLU_DIST -if(PALACE_WITH_SUPERLU) - message(STATUS "================= Configuring SuperLU_DIST dependency ==================") - include(ExternalSuperLU_DIST) -endif() - -# Add ScaLAPACK -if(PALACE_WITH_STRUMPACK OR PALACE_WITH_MUMPS) - message(STATUS "=================== Configuring ScaLAPACK dependency ===================") - include(ExternalScaLAPACK) -endif() - -# Add SLATE (for STRUMPACK with GPU support) -if(PALACE_WITH_STRUMPACK AND (PALACE_WITH_CUDA OR PALACE_WITH_HIP)) - message(STATUS "===================== Configuring SLATE dependency =====================") - include(ExternalSLATE) -endif() - -# Add MAGMA (for STRUMPACK with GPU support and for libCEED) -if(PALACE_WITH_MAGMA) - message(STATUS "===================== Configuring MAGMA dependency =====================") - include(ExternalMAGMA) -endif() - -# Add STRUMPACK -if(PALACE_WITH_STRUMPACK) - message(STATUS "=================== Configuring STRUMPACK dependency ===================") - include(ExternalSTRUMPACK) -endif() - -# Add MUMPS -if(PALACE_WITH_MUMPS) - message(STATUS "===================== Configuring MUMPS dependency =====================") - include(ExternalMUMPS) -endif() - -# Add PETSc and SLEPc -if(PALACE_WITH_SLEPC) - message(STATUS "=============== Configuring PETSc and SLEPc dependencies ===============") - include(ExternalSLEPc) -endif() - -# Add ARPACK -if(PALACE_WITH_ARPACK) - message(STATUS "==================== Configuring ARPACK dependency =====================") - include(ExternalARPACK) -endif() - -# Add LIBXSMM (for libCEED) -if(PALACE_WITH_LIBXSMM) - message(STATUS "==================== Configuring LIBXSMM dependency ====================") - include(ExternalLIBXSMM) -endif() - -# Add HYPRE -message(STATUS "===================== Configuring HYPRE dependency =====================") -include(ExternalHYPRE) - -# Add JSON -message(STATUS "================= Configuring nlohmann/json dependency =================") -include(ExternalJSON) - -# Add fmt -message(STATUS "====================== Configuring fmt dependency ======================") -include(ExternalFmt) - -# Add Eigen -message(STATUS "===================== Configuring Eigen dependency =====================") -include(ExternalEigen) +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +# +# Download and build third-party dependencies +# + +# Add METIS/ParMETIS libraries +message(STATUS "================ Configuring METIS/ParMETIS dependencies ===============") +include(ExternalMETIS) + +# Add SuperLU_DIST +if(PALACE_WITH_SUPERLU) + message(STATUS "================= Configuring SuperLU_DIST dependency ==================") + include(ExternalSuperLU_DIST) +endif() + +# Add ScaLAPACK +if(PALACE_WITH_STRUMPACK OR PALACE_WITH_MUMPS) + message(STATUS "=================== Configuring ScaLAPACK dependency ===================") + include(ExternalScaLAPACK) +endif() + +# Add MAGMA (for STRUMPACK with GPU support and for libCEED) +if(PALACE_WITH_MAGMA) + message(STATUS "===================== Configuring MAGMA dependency =====================") + include(ExternalMAGMA) +endif() + +# Add STRUMPACK +if(PALACE_WITH_STRUMPACK) + message(STATUS "=================== Configuring STRUMPACK dependency ===================") + include(ExternalSTRUMPACK) +endif() + +# Add MUMPS +if(PALACE_WITH_MUMPS) + message(STATUS "===================== Configuring MUMPS dependency =====================") + include(ExternalMUMPS) +endif() + +# Add PETSc and SLEPc +if(PALACE_WITH_SLEPC) + message(STATUS "=============== Configuring PETSc and SLEPc dependencies ===============") + include(ExternalSLEPc) +endif() + +# Add ARPACK +if(PALACE_WITH_ARPACK) + message(STATUS "==================== Configuring ARPACK dependency =====================") + include(ExternalARPACK) +endif() + +# Add LIBXSMM (for libCEED) +if(PALACE_WITH_LIBXSMM) + message(STATUS "==================== Configuring LIBXSMM dependency ====================") + include(ExternalLIBXSMM) +endif() + +# Add HYPRE +message(STATUS "===================== Configuring HYPRE dependency =====================") +include(ExternalHYPRE) + +# Add JSON +message(STATUS "================= Configuring nlohmann/json dependency =================") +include(ExternalJSON) + +# Add fmt +message(STATUS "====================== Configuring fmt dependency ======================") +include(ExternalFmt) + +# Add scn +message(STATUS "====================== Configuring scn dependency ======================") +include(ExternalScn) + +# Add Eigen +message(STATUS "===================== Configuring Eigen dependency =====================") +include(ExternalEigen) + +# Add SUNDIALS +if(PALACE_WITH_SUNDIALS) + message(STATUS "===================== Configuring SUNDIALS dependency =====================") + include(ExternalSUNDIALS) +endif() + +# Add GSLIB +if(PALACE_WITH_GSLIB) + message(STATUS "===================== Configuring GSLIB dependency =====================") + include(ExternalGSLIB) +endif() + +# Add libCEED +message(STATUS "==================== Configuring libCEED dependency ====================") +include(ExternalLibCEED) + diff --git a/extern/patch/ButterflyPACK/patch_build.diff b/extern/patch/ButterflyPACK/patch_build.diff index 6978864aed..2ad06c22b1 100644 --- a/extern/patch/ButterflyPACK/patch_build.diff +++ b/extern/patch/ButterflyPACK/patch_build.diff @@ -1,13 +1,13 @@ -diff --git a/CMakeLists.txt b/CMakeLists.txt -index 0468258..dc27a91 100755 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -419,7 +419,7 @@ add_subdirectory(SRC_DOUBLE) - add_subdirectory(SRC_DOUBLECOMPLEX) - add_subdirectory(SRC_SINGLE) - add_subdirectory(SRC_COMPLEX) --add_subdirectory(EXAMPLE) -+# add_subdirectory(EXAMPLE) - - # Documentation - if(enable_doc) +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 0468258..dc27a91 100755 +--- a/CMakeLists.txt 
++++ b/CMakeLists.txt +@@ -419,7 +419,7 @@ add_subdirectory(SRC_DOUBLE) + add_subdirectory(SRC_DOUBLECOMPLEX) + add_subdirectory(SRC_SINGLE) + add_subdirectory(SRC_COMPLEX) +-add_subdirectory(EXAMPLE) ++# add_subdirectory(EXAMPLE) + + # Documentation + if(enable_doc) diff --git a/extern/patch/GKlib/patch_build.diff b/extern/patch/GKlib/patch_build.diff index 92d6dbf1b4..596ef99ea7 100644 --- a/extern/patch/GKlib/patch_build.diff +++ b/extern/patch/GKlib/patch_build.diff @@ -1,636 +1,636 @@ -diff --git a/CMakeLists.txt b/CMakeLists.txt -index 9cd1b4b..8bac52d 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -1,31 +1,185 @@ --cmake_minimum_required(VERSION 2.8) --project(GKlib C) -+cmake_minimum_required(VERSION 3.13) - --option(BUILD_SHARED_LIBS "Build shared libraries (.dll/.so) instead of static ones (.lib/.a)" OFF) -+project(GKlib VERSION 0.0.1 LANGUAGES C) - --get_filename_component(abs "." ABSOLUTE) --set(GKLIB_PATH ${abs}) --unset(abs) --include(GKlibSystem.cmake) -+set(CMAKE_EXPORT_COMPILE_COMMANDS ON) - --include_directories(".") --if(MSVC) -- include_directories("win32") -- file(GLOB win32_sources RELATIVE "win32" "*.c") --else(MSVC) -- set(win32_sources, "") --endif(MSVC) -+if(NOT CMAKE_BUILD_TYPE) -+ set(CMAKE_BUILD_TYPE Release) -+endif() - --add_library(GKlib ${GKlib_sources} ${win32_sources}) -+# ----------------------------------------------------------------------------- -+include(CheckCCompilerFlag) -+include(CheckCSourceCompiles) -+include(CheckFunctionExists) -+include(CheckIncludeFile) -+include(CMakePackageConfigHelpers) -+include(GNUInstallDirs) - --if(UNIX) -- target_link_libraries(GKlib m) --endif(UNIX) -+# ----------------------------------------------------------------------------- -+# User Options -+option(ASSERT "turn asserts on" OFF) -+option(ASSERT2 "additional assertions" OFF) -+option(DEBUG "add debugging support" OFF) -+option(GPROF "add gprof support" OFF) -+option(OPENMP "enable OpenMP support" OFF) -+option(PCRE "enable PCRE support" OFF) -+option(GKREGEX "enable GKREGEX support" OFF) -+option(GKRAND "enable GKRAND support" OFF) -+ -+if(NOT MSVC) -+ find_library(CMATH_LIB m REQUIRED) -+endif() -+ -+# ----------------------------------------------------------------------------- -+# This section from unmerged issue: https://github.com/KarypisLab/GKlib/pull/8 -+check_include_file(execinfo.h HAVE_EXECINFO_H) -+check_function_exists(getline HAVE_GETLINE) -+ -+if(PCRE) -+ check_include_file(pcreposix.h HAVE_PCREPOSIX_H) -+ if(NOT HAVE_PCREPOSIX_H) -+ message(WARNING "PCRE was requested, but is not available") -+ endif() -+endif() -+ -+if(NOT HAVE_PCREPOSIX_H) -+ check_include_file(regex.h HAVE_REGEX_H) -+ if(NOT HAVE_REGEX_H) -+ set(USE_GKREGEX ON) -+ endif() -+endif() -+ -+# Windows (Is this needed?) 
-+if(MSVC OR MINGW) -+ set(USE_GKREGEX ON) -+endif() -+ -+if(GPROF) -+ check_c_compiler_flag("-pg" HAVE_GPROF_SUPPORT) -+ if(NOT HAVE_GPROF_SUPPORT) -+ message(WARNING "GPROF support was requested, but is not available") -+ endif() -+endif() -+ -+if(OPENMP) -+ find_package(OpenMP) -+ if(NOT OpenMP_C_FOUND) -+ message(WARNING "OpenMP was requested, but is not available") -+ endif() -+endif() -+ -+# thread local storage -+if(NOT DEFINED HAVE_TLS) -+ set(TLS_NAME -+ "" -+ CACHE INTERNAL "Thread local keyword") -+ foreach(tls_name "__thread" "__declspec(thread)") -+ unset(HAVE_TLS CACHE) -+ check_c_source_compiles("${tls_name} int x; int main(void) { return 0; }" HAVE_TLS) -+ if(HAVE_TLS) -+ set(TLS_NAME -+ ${tls_name} -+ CACHE INTERNAL "Thread local keyword") -+ break() -+ else() -+ -+ endif() -+ endforeach() -+endif() -+ -+ -+add_library(${PROJECT_NAME}_compile_definitions INTERFACE) -+ -+target_compile_definitions( -+ ${PROJECT_NAME}_compile_definitions -+ INTERFACE $<$:LINUX> -+ $<$>:NDEBUG> -+ $<$>:NDEBUG2> -+ $<$,$>:DEBUG> -+ $<$:GKRAND> -+ $<$:HAVE_EXEC_INFO_H> -+ $<$:USE_PCRE> -+ $<$,$>:HAVE_PCREPOSIX_H> -+ $<$:HAVE_REGEX_H> -+ $<$:USE_GKREGEX> -+ $<$:HAVE_GETLINE> -+ __thread=${TLS_NAME} -+ $<$>:_FILE_OFFSET_BITS=64> -+ $<$:WIN32> -+ $<$:MSC> -+ $<$:_CRT_SECURE_NO_DEPRECATE>) -+ -+# ----------------------------------------------------------------------------- -+ -+add_library(${PROJECT_NAME}) -+add_library(${PROJECT_NAME}::${PROJECT_NAME} ALIAS ${PROJECT_NAME}) -+ -+target_compile_features(${PROJECT_NAME} PUBLIC c_std_99) -+set_property(TARGET ${PROJECT_NAME} PROPERTY POSITION_INDEPENDENT_CODE True) -+ -+file(GLOB GKlib_sources ${PROJECT_SOURCE_DIR}/*.c) -+file(GLOB GKlib_headers ${PROJECT_SOURCE_DIR}/*.h) -+ -+target_sources(${PROJECT_NAME} PRIVATE ${GKlib_sources} ${GKlib_headers} -+ $<$:win32/adapt.c win32/adapt.h>) -+ -+target_include_directories(${PROJECT_NAME} PUBLIC $ -+ $) -+ -+target_link_libraries( -+ ${PROJECT_NAME} -+ PUBLIC $<$>:${CMATH_LIB}> -+ PRIVATE ${PROJECT_NAME}_compile_definitions) -+ -+target_compile_options( -+ ${PROJECT_NAME} PRIVATE $<$:-fno-strict-aliasing> -+ ) -+ -+target_compile_options(${PROJECT_NAME} PUBLIC $<$,$>:-pg>) -+ -+ -+if(OpenMP_C_FOUND) -+ target_link_libraries(${PROJECT_NAME} PUBLIC OpenMP::OpenMP_C) -+endif() -+# ----------------------------------------------------------------------------- - - include_directories("test") - add_subdirectory("test") - --install(TARGETS GKlib -- ARCHIVE DESTINATION lib/${LINSTALL_PATH} -- LIBRARY DESTINATION lib/${LINSTALL_PATH}) --install(FILES ${GKlib_includes} DESTINATION include/${HINSTALL_PATH}) -+# ----------------------------------------------------------------------------- -+ -+configure_package_config_file(GKlibConfig.cmake.in cmake/GKlibConfig.cmake -+ INSTALL_DESTINATION lib/cmake/GKlib) -+ -+write_basic_package_version_file( -+ cmake/GKlibConfigVersion.cmake -+ VERSION ${PROJECT_VERSION} -+ COMPATIBILITY SameMajorVersion) -+ -+configure_file(GKlibConfig.cmake.in GKlibConfig.cmake @ONLY) -+ -+# ----------------------------------------------------------------------------- -+ -+# install library -+install( -+ TARGETS ${PROJECT_NAME} -+ EXPORT GKlibTargets -+ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} -+ LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} -+ ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} -+ INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) -+ -+install( -+ FILES ${GKlib_headers} -+ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) -+ -+install(TARGETS ${PROJECT_NAME}_compile_definitions EXPORT 
GKlibTargets) -+ -+install(EXPORT GKlibTargets FILE GKlibTargets.cmake NAMESPACE GKlib:: DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/GKlib) -+ -+ -+install( -+ FILES ${CMAKE_CURRENT_BINARY_DIR}/cmake/GKlibConfig.cmake -+ ${CMAKE_CURRENT_BINARY_DIR}/cmake/GKlibConfigVersion.cmake -+ DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/GKlib) -\ No newline at end of file -diff --git a/GKlib.h b/GKlib.h -index 9278fe4..061e4cb 100644 ---- a/GKlib.h -+++ b/GKlib.h -@@ -1,6 +1,6 @@ - /* - * GKlib.h -- * -+ * - * George's library of most frequently used routines - * - * $Id: GKlib.h 14866 2013-08-03 16:40:04Z karypis $ -@@ -55,7 +55,7 @@ - - - --#if defined(__OPENMP__) -+#if defined(_OPENMP) - #include - #endif - -diff --git a/GKlibConfig.cmake.in b/GKlibConfig.cmake.in -new file mode 100644 -index 0000000..8c3f1c9 ---- /dev/null -+++ b/GKlibConfig.cmake.in -@@ -0,0 +1,16 @@ -+if(NOT GKlib_FOUND) -+set(GKlib_FOUND True) -+ -+@PACKAGE_INIT@ -+ -+set(GKlib_OpenMP_C_FOUND @OpenMP_C_FOUND@) -+if(GKlib_OpenMP_C_FOUND) -+ include(CMakeFindDependencyMacro) -+ find_dependency(OpenMP) -+endif() -+ -+include(${CMAKE_CURRENT_LIST_DIR}/GKlibTargets.cmake) -+ -+check_required_components(GKlib) -+ -+endif() -\ No newline at end of file -diff --git a/GKlibSystem.cmake b/GKlibSystem.cmake -deleted file mode 100644 -index 31a1cf1..0000000 ---- a/GKlibSystem.cmake -+++ /dev/null -@@ -1,152 +0,0 @@ --# Helper modules. --include(CheckFunctionExists) --include(CheckIncludeFile) -- --# Setup options. --option(GDB "enable use of GDB" OFF) --option(ASSERT "turn asserts on" OFF) --option(ASSERT2 "additional assertions" OFF) --option(DEBUG "add debugging support" OFF) --option(GPROF "add gprof support" OFF) --option(VALGRIND "add valgrind support" OFF) --option(OPENMP "enable OpenMP support" OFF) --option(PCRE "enable PCRE support" OFF) --option(GKREGEX "enable GKREGEX support" OFF) --option(GKRAND "enable GKRAND support" OFF) --option(NO_X86 "enable NO_X86 support" OFF) -- -- --# Add compiler flags. --if(MSVC) -- set(GKlib_COPTS "/Ox") -- set(GKlib_COPTIONS "-DWIN32 -DMSC -D_CRT_SECURE_NO_DEPRECATE -DUSE_GKREGEX") --elseif(MINGW) -- set(GKlib_COPTS "-DUSE_GKREGEX") --else() -- set(GKlib_COPTIONS "-DLINUX -D_FILE_OFFSET_BITS=64") --endif(MSVC) --if(CYGWIN) -- set(GKlib_COPTIONS "${GKlib_COPTIONS} -DCYGWIN") --endif(CYGWIN) --if(CMAKE_COMPILER_IS_GNUCC) --# GCC opts. -- set(GKlib_COPTIONS "${GKlib_COPTIONS} -std=c99 -fno-strict-aliasing") --if(VALGRIND) -- set(GKlib_COPTIONS "${GK_COPTIONS} -march=x86-64 -mtune=generic") --else() --# -march=native is not a valid flag on PPC: --if(CMAKE_SYSTEM_PROCESSOR MATCHES "power|ppc|powerpc|ppc64|powerpc64" OR (APPLE AND CMAKE_OSX_ARCHITECTURES MATCHES "ppc|ppc64")) -- set(GKlib_COPTIONS "${GKlib_COPTIONS} -mtune=native") --else() -- set(GKlib_COPTIONS "${GKlib_COPTIONS} -march=native") --endif() --endif(VALGRIND) -- if(NOT MINGW) -- set(GKlib_COPTIONS "${GKlib_COPTIONS} -fPIC") -- endif(NOT MINGW) --# GCC warnings. -- set(GKlib_COPTIONS "${GKlib_COPTIONS} -Werror -Wall -pedantic -Wno-unused-function -Wno-unused-but-set-variable -Wno-unused-variable -Wno-unknown-pragmas -Wno-unused-label") --elseif(${CMAKE_C_COMPILER_ID} MATCHES "Sun") --# Sun insists on -xc99. -- set(GKlib_COPTIONS "${GKlib_COPTIONS} -xc99") --endif(CMAKE_COMPILER_IS_GNUCC) -- --# Intel compiler --if(${CMAKE_C_COMPILER_ID} MATCHES "Intel") -- set(GKlib_COPTIONS "${GKlib_COPTIONS} -xHost -std=c99") --endif() -- --# Find OpenMP if it is requested. 
--if(OPENMP) -- include(FindOpenMP) -- if(OPENMP_FOUND) -- set(GKlib_COPTIONS "${GKlib_COPTIONS} -D__OPENMP__ ${OpenMP_C_FLAGS}") -- else() -- message(WARNING "OpenMP was requested but support was not found") -- endif(OPENMP_FOUND) --endif(OPENMP) -- --# Set the CPU type --if(NO_X86) -- set(GKlib_COPTIONS "${GKlib_COPTIONS} -DNO_X86=${NO_X86}") --endif(NO_X86) -- --# Add various definitions. --if(GDB) -- set(GKlib_COPTS "${GKlib_COPTS} -g") -- set(GKlib_COPTIONS "${GKlib_COPTIONS} -Werror") --else() -- set(GKlib_COPTS "-O3") --endif(GDB) -- -- --if(DEBUG) -- set(GKlib_COPTS "-g") -- set(GKlib_COPTIONS "${GKlib_COPTIONS} -DDEBUG") --endif(DEBUG) -- --if(GPROF) -- set(GKlib_COPTS "-pg") --endif(GPROF) -- --if(NOT ASSERT) -- set(GKlib_COPTIONS "${GKlib_COPTIONS} -DNDEBUG") --endif(NOT ASSERT) -- --if(NOT ASSERT2) -- set(GKlib_COPTIONS "${GKlib_COPTIONS} -DNDEBUG2") --endif(NOT ASSERT2) -- -- --# Add various options --if(PCRE) -- set(GKlib_COPTIONS "${GKlib_COPTIONS} -D__WITHPCRE__") --endif(PCRE) -- --if(GKREGEX) -- set(GKlib_COPTIONS "${GKlib_COPTIONS} -DUSE_GKREGEX") --endif(GKREGEX) -- --if(GKRAND) -- set(GKlib_COPTIONS "${GKlib_COPTIONS} -DUSE_GKRAND") --endif(GKRAND) -- -- --# Check for features. --check_include_file(execinfo.h HAVE_EXECINFO_H) --if(HAVE_EXECINFO_H) -- set(GKlib_COPTIONS "${GKlib_COPTIONS} -DHAVE_EXECINFO_H") --endif(HAVE_EXECINFO_H) -- --check_function_exists(getline HAVE_GETLINE) --if(HAVE_GETLINE) -- set(GKlib_COPTIONS "${GKlib_COPTIONS} -DHAVE_GETLINE") --endif(HAVE_GETLINE) -- -- --# Custom check for TLS. --if(MSVC) -- set(GKlib_COPTIONS "${GKlib_COPTIONS} -D__thread=__declspec(thread)") -- -- # This if checks if that value is cached or not. -- if("${HAVE_THREADLOCALSTORAGE}" MATCHES "^${HAVE_THREADLOCALSTORAGE}$") -- try_compile(HAVE_THREADLOCALSTORAGE -- ${CMAKE_BINARY_DIR} -- ${GKLIB_PATH}/conf/check_thread_storage.c) -- if(HAVE_THREADLOCALSTORAGE) -- message(STATUS "checking for thread-local storage - found") -- else() -- message(STATUS "checking for thread-local storage - not found") -- endif() -- endif() -- if(NOT HAVE_THREADLOCALSTORAGE) -- set(GKlib_COPTIONS "${GKlib_COPTIONS} -D__thread=") -- endif() --endif() -- --# Finally set the official C flags. --set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${GKlib_COPTIONS} ${GKlib_COPTS}") -- --# Find GKlib sources. --file(GLOB GKlib_sources ${GKLIB_PATH}/*.c) --file(GLOB GKlib_includes ${GKLIB_PATH}/*.h) -diff --git a/Makefile b/Makefile -deleted file mode 100644 -index 6ac97b9..0000000 ---- a/Makefile -+++ /dev/null -@@ -1,87 +0,0 @@ --# Configuration options. --cc = gcc --prefix = ~/local --openmp = not-set --gdb = not-set --assert = not-set --assert2 = not-set --debug = not-set --gprof = not-set --valgrind = not-set --pcre = not-set --gkregex = not-set --gkrand = not-set -- -- --# Basically proxies everything to the builddir cmake. --cputype = $(shell uname -m | sed "s/\\ /_/g") --systype = $(shell uname -s) -- --BUILDDIR = build/$(systype)-$(cputype) -- --# Process configuration options. 
--CONFIG_FLAGS = -DCMAKE_VERBOSE_MAKEFILE=1 --ifneq ($(gdb), not-set) -- CONFIG_FLAGS += -DGDB=$(gdb) --endif --ifneq ($(assert), not-set) -- CONFIG_FLAGS += -DASSERT=$(assert) --endif --ifneq ($(assert2), not-set) -- CONFIG_FLAGS += -DASSERT2=$(assert2) --endif --ifneq ($(debug), not-set) -- CONFIG_FLAGS += -DDEBUG=$(debug) --endif --ifneq ($(gprof), not-set) -- CONFIG_FLAGS += -DGPROF=$(gprof) --endif --ifneq ($(valgrind), not-set) -- CONFIG_FLAGS += -DVALGRIND=$(valgrind) --endif --ifneq ($(openmp), not-set) -- CONFIG_FLAGS += -DOPENMP=$(openmp) --endif --ifneq ($(pcre), not-set) -- CONFIG_FLAGS += -DPCRE=$(pcre) --endif --ifneq ($(gkregex), not-set) -- CONFIG_FLAGS += -DGKREGEX=$(pcre) --endif --ifneq ($(gkrand), not-set) -- CONFIG_FLAGS += -DGKRAND=$(pcre) --endif --ifneq ($(prefix), not-set) -- CONFIG_FLAGS += -DCMAKE_INSTALL_PREFIX=$(prefix) --endif --ifneq ($(cc), not-set) -- CONFIG_FLAGS += -DCMAKE_C_COMPILER=$(cc) --endif --ifneq ($(cputype), x86_64) -- CONFIG_FLAGS += -DNO_X86=$(cputype) --endif -- --define run-config --mkdir -p $(BUILDDIR) --cd $(BUILDDIR) && cmake $(CURDIR) $(CONFIG_FLAGS) --endef -- --all clean install: $(BUILDDIR) -- make -C $(BUILDDIR) $@ -- --uninstall: -- xargs rm < $(BUILDDIR)/install_manifest.txt -- --$(BUILDDIR): -- $(run-config) -- --config: distclean -- $(run-config) -- --distclean: -- rm -rf $(BUILDDIR) -- --remake: -- find . -name CMakeLists.txt -exec touch {} ';' -- --.PHONY: config distclean all clean install uninstall remake -diff --git a/README.md b/README.md -index f94eeea..93a7d76 100644 ---- a/README.md -+++ b/README.md -@@ -1,54 +1,6 @@ - # GKlib - A library of various helper routines and frameworks used by many of the lab's software - --## Build requirements -- - CMake 2.8, found at http://www.cmake.org/, as well as GNU make. - --Assuming that the above are available, two commands should suffice to --build the software: --``` --make config --make --``` -- --## Configuring the build --It is primarily configured by passing options to make config. For example: --``` --make config cc=icc --``` -- --would configure it to be built using icc. -- --Configuration options are: --``` --cc=[compiler] - The C compiler to use [default: gcc] --prefix=[PATH] - Set the installation prefix [default: ~/local] --openmp=set - To build a version with OpenMP support --``` -- -- --## Building and installing --To build and install, run the following --``` --make --make install --``` -- --By default, the library file, header file, and binaries will be installed in --``` --~/local/lib --~/local/include --~/local/bin --``` -- --## Other make commands -- make uninstall -- Removes all files installed by 'make install'. -- -- make clean -- Removes all object files but retains the configuration options. -- -- make distclean -- Performs clean and completely removes the build directory. 
- - -diff --git a/conf/check_thread_storage.c b/conf/check_thread_storage.c -deleted file mode 100644 -index e6e1e98..0000000 ---- a/conf/check_thread_storage.c -+++ /dev/null -@@ -1,5 +0,0 @@ --extern __thread int x; -- --int main(int argc, char **argv) { -- return 0; --} -diff --git a/gk_proto.h b/gk_proto.h -index 6fd6bd4..aa943a5 100644 ---- a/gk_proto.h -+++ b/gk_proto.h -@@ -292,7 +292,7 @@ uint32_t gk_randint32(void); - /*------------------------------------------------------------- - * OpenMP fake functions - *-------------------------------------------------------------*/ --#if !defined(__OPENMP__) -+#if !defined(_OPENMP) - void omp_set_num_threads(int num_threads); - int omp_get_num_threads(void); - int omp_get_max_threads(void); -@@ -303,7 +303,7 @@ void omp_set_dynamic(int num_threads); - int omp_get_dynamic(void); - void omp_set_nested(int nested); - int omp_get_nested(void); --#endif /* __OPENMP__ */ -+#endif /* _OPENMP */ - - - /*------------------------------------------------------------- -diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt -index 8584820..04ecac1 100644 ---- a/test/CMakeLists.txt -+++ b/test/CMakeLists.txt -@@ -9,11 +9,15 @@ add_executable(grKx grKx.c) - add_executable(m2mnbrs m2mnbrs.c) - add_executable(cmpnbrs cmpnbrs.c) - add_executable(splatt2svd splatt2svd.c) --add_executable(gkuniq gkuniq.c) - --foreach(prog strings gksort fis gkrw gkgraph csrcnv grKx m2mnbrs cmpnbrs splatt2svd gkuniq) -+foreach(prog strings gksort fis gkrw gkgraph csrcnv grKx m2mnbrs cmpnbrs splatt2svd) - target_link_libraries(${prog} GKlib) - endforeach(prog) - -+if(NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64|arm*") -+ add_executable(gkuniq gkuniq.c) -+ target_link_libraries(gkuniq GKlib) -+endif() -+ - # Install a subset of them - install(TARGETS csrcnv RUNTIME DESTINATION bin) -diff --git a/timers.c b/timers.c -index bb8f296..b004e23 100644 ---- a/timers.c -+++ b/timers.c -@@ -35,7 +35,7 @@ double gk_WClockSeconds(void) - **************************************************************************/ - double gk_CPUSeconds(void) - { --//#ifdef __OPENMP__ -+//#ifdef _OPENMP - #ifdef __OPENMPXXXX__ - return omp_get_wtime(); - #else - +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 9cd1b4b..8bac52d 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -1,31 +1,185 @@ +-cmake_minimum_required(VERSION 2.8) +-project(GKlib C) ++cmake_minimum_required(VERSION 3.13) + +-option(BUILD_SHARED_LIBS "Build shared libraries (.dll/.so) instead of static ones (.lib/.a)" OFF) ++project(GKlib VERSION 0.0.1 LANGUAGES C) + +-get_filename_component(abs "." 
ABSOLUTE) +-set(GKLIB_PATH ${abs}) +-unset(abs) +-include(GKlibSystem.cmake) ++set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +-include_directories(".") +-if(MSVC) +- include_directories("win32") +- file(GLOB win32_sources RELATIVE "win32" "*.c") +-else(MSVC) +- set(win32_sources, "") +-endif(MSVC) ++if(NOT CMAKE_BUILD_TYPE) ++ set(CMAKE_BUILD_TYPE Release) ++endif() + +-add_library(GKlib ${GKlib_sources} ${win32_sources}) ++# ----------------------------------------------------------------------------- ++include(CheckCCompilerFlag) ++include(CheckCSourceCompiles) ++include(CheckFunctionExists) ++include(CheckIncludeFile) ++include(CMakePackageConfigHelpers) ++include(GNUInstallDirs) + +-if(UNIX) +- target_link_libraries(GKlib m) +-endif(UNIX) ++# ----------------------------------------------------------------------------- ++# User Options ++option(ASSERT "turn asserts on" OFF) ++option(ASSERT2 "additional assertions" OFF) ++option(DEBUG "add debugging support" OFF) ++option(GPROF "add gprof support" OFF) ++option(OPENMP "enable OpenMP support" OFF) ++option(PCRE "enable PCRE support" OFF) ++option(GKREGEX "enable GKREGEX support" OFF) ++option(GKRAND "enable GKRAND support" OFF) ++ ++if(NOT MSVC) ++ find_library(CMATH_LIB m REQUIRED) ++endif() ++ ++# ----------------------------------------------------------------------------- ++# This section from unmerged issue: https://github.com/KarypisLab/GKlib/pull/8 ++check_include_file(execinfo.h HAVE_EXECINFO_H) ++check_function_exists(getline HAVE_GETLINE) ++ ++if(PCRE) ++ check_include_file(pcreposix.h HAVE_PCREPOSIX_H) ++ if(NOT HAVE_PCREPOSIX_H) ++ message(WARNING "PCRE was requested, but is not available") ++ endif() ++endif() ++ ++if(NOT HAVE_PCREPOSIX_H) ++ check_include_file(regex.h HAVE_REGEX_H) ++ if(NOT HAVE_REGEX_H) ++ set(USE_GKREGEX ON) ++ endif() ++endif() ++ ++# Windows (Is this needed?) 
++if(MSVC OR MINGW) ++ set(USE_GKREGEX ON) ++endif() ++ ++if(GPROF) ++ check_c_compiler_flag("-pg" HAVE_GPROF_SUPPORT) ++ if(NOT HAVE_GPROF_SUPPORT) ++ message(WARNING "GPROF support was requested, but is not available") ++ endif() ++endif() ++ ++if(OPENMP) ++ find_package(OpenMP) ++ if(NOT OpenMP_C_FOUND) ++ message(WARNING "OpenMP was requested, but is not available") ++ endif() ++endif() ++ ++# thread local storage ++if(NOT DEFINED HAVE_TLS) ++ set(TLS_NAME ++ "" ++ CACHE INTERNAL "Thread local keyword") ++ foreach(tls_name "__thread" "__declspec(thread)") ++ unset(HAVE_TLS CACHE) ++ check_c_source_compiles("${tls_name} int x; int main(void) { return 0; }" HAVE_TLS) ++ if(HAVE_TLS) ++ set(TLS_NAME ++ ${tls_name} ++ CACHE INTERNAL "Thread local keyword") ++ break() ++ else() ++ ++ endif() ++ endforeach() ++endif() ++ ++ ++add_library(${PROJECT_NAME}_compile_definitions INTERFACE) ++ ++target_compile_definitions( ++ ${PROJECT_NAME}_compile_definitions ++ INTERFACE $<$:LINUX> ++ $<$>:NDEBUG> ++ $<$>:NDEBUG2> ++ $<$,$>:DEBUG> ++ $<$:GKRAND> ++ $<$:HAVE_EXEC_INFO_H> ++ $<$:USE_PCRE> ++ $<$,$>:HAVE_PCREPOSIX_H> ++ $<$:HAVE_REGEX_H> ++ $<$:USE_GKREGEX> ++ $<$:HAVE_GETLINE> ++ __thread=${TLS_NAME} ++ $<$>:_FILE_OFFSET_BITS=64> ++ $<$:WIN32> ++ $<$:MSC> ++ $<$:_CRT_SECURE_NO_DEPRECATE>) ++ ++# ----------------------------------------------------------------------------- ++ ++add_library(${PROJECT_NAME}) ++add_library(${PROJECT_NAME}::${PROJECT_NAME} ALIAS ${PROJECT_NAME}) ++ ++target_compile_features(${PROJECT_NAME} PUBLIC c_std_99) ++set_property(TARGET ${PROJECT_NAME} PROPERTY POSITION_INDEPENDENT_CODE True) ++ ++file(GLOB GKlib_sources ${PROJECT_SOURCE_DIR}/*.c) ++file(GLOB GKlib_headers ${PROJECT_SOURCE_DIR}/*.h) ++ ++target_sources(${PROJECT_NAME} PRIVATE ${GKlib_sources} ${GKlib_headers} ++ $<$:win32/adapt.c win32/adapt.h>) ++ ++target_include_directories(${PROJECT_NAME} PUBLIC $ ++ $) ++ ++target_link_libraries( ++ ${PROJECT_NAME} ++ PUBLIC $<$>:${CMATH_LIB}> ++ PRIVATE ${PROJECT_NAME}_compile_definitions) ++ ++target_compile_options( ++ ${PROJECT_NAME} PRIVATE $<$:-fno-strict-aliasing> ++ ) ++ ++target_compile_options(${PROJECT_NAME} PUBLIC $<$,$>:-pg>) ++ ++ ++if(OpenMP_C_FOUND) ++ target_link_libraries(${PROJECT_NAME} PUBLIC OpenMP::OpenMP_C) ++endif() ++# ----------------------------------------------------------------------------- + + include_directories("test") + add_subdirectory("test") + +-install(TARGETS GKlib +- ARCHIVE DESTINATION lib/${LINSTALL_PATH} +- LIBRARY DESTINATION lib/${LINSTALL_PATH}) +-install(FILES ${GKlib_includes} DESTINATION include/${HINSTALL_PATH}) ++# ----------------------------------------------------------------------------- ++ ++configure_package_config_file(GKlibConfig.cmake.in cmake/GKlibConfig.cmake ++ INSTALL_DESTINATION lib/cmake/GKlib) ++ ++write_basic_package_version_file( ++ cmake/GKlibConfigVersion.cmake ++ VERSION ${PROJECT_VERSION} ++ COMPATIBILITY SameMajorVersion) ++ ++configure_file(GKlibConfig.cmake.in GKlibConfig.cmake @ONLY) ++ ++# ----------------------------------------------------------------------------- ++ ++# install library ++install( ++ TARGETS ${PROJECT_NAME} ++ EXPORT GKlibTargets ++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} ++ LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ++ ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} ++ INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) ++ ++install( ++ FILES ${GKlib_headers} ++ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) ++ ++install(TARGETS ${PROJECT_NAME}_compile_definitions EXPORT 
GKlibTargets) ++ ++install(EXPORT GKlibTargets FILE GKlibTargets.cmake NAMESPACE GKlib:: DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/GKlib) ++ ++ ++install( ++ FILES ${CMAKE_CURRENT_BINARY_DIR}/cmake/GKlibConfig.cmake ++ ${CMAKE_CURRENT_BINARY_DIR}/cmake/GKlibConfigVersion.cmake ++ DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/GKlib) +\ No newline at end of file +diff --git a/GKlib.h b/GKlib.h +index 9278fe4..061e4cb 100644 +--- a/GKlib.h ++++ b/GKlib.h +@@ -1,6 +1,6 @@ + /* + * GKlib.h +- * ++ * + * George's library of most frequently used routines + * + * $Id: GKlib.h 14866 2013-08-03 16:40:04Z karypis $ +@@ -55,7 +55,7 @@ + + + +-#if defined(__OPENMP__) ++#if defined(_OPENMP) + #include + #endif + +diff --git a/GKlibConfig.cmake.in b/GKlibConfig.cmake.in +new file mode 100644 +index 0000000..8c3f1c9 +--- /dev/null ++++ b/GKlibConfig.cmake.in +@@ -0,0 +1,16 @@ ++if(NOT GKlib_FOUND) ++set(GKlib_FOUND True) ++ ++@PACKAGE_INIT@ ++ ++set(GKlib_OpenMP_C_FOUND @OpenMP_C_FOUND@) ++if(GKlib_OpenMP_C_FOUND) ++ include(CMakeFindDependencyMacro) ++ find_dependency(OpenMP) ++endif() ++ ++include(${CMAKE_CURRENT_LIST_DIR}/GKlibTargets.cmake) ++ ++check_required_components(GKlib) ++ ++endif() +\ No newline at end of file +diff --git a/GKlibSystem.cmake b/GKlibSystem.cmake +deleted file mode 100644 +index 31a1cf1..0000000 +--- a/GKlibSystem.cmake ++++ /dev/null +@@ -1,152 +0,0 @@ +-# Helper modules. +-include(CheckFunctionExists) +-include(CheckIncludeFile) +- +-# Setup options. +-option(GDB "enable use of GDB" OFF) +-option(ASSERT "turn asserts on" OFF) +-option(ASSERT2 "additional assertions" OFF) +-option(DEBUG "add debugging support" OFF) +-option(GPROF "add gprof support" OFF) +-option(VALGRIND "add valgrind support" OFF) +-option(OPENMP "enable OpenMP support" OFF) +-option(PCRE "enable PCRE support" OFF) +-option(GKREGEX "enable GKREGEX support" OFF) +-option(GKRAND "enable GKRAND support" OFF) +-option(NO_X86 "enable NO_X86 support" OFF) +- +- +-# Add compiler flags. +-if(MSVC) +- set(GKlib_COPTS "/Ox") +- set(GKlib_COPTIONS "-DWIN32 -DMSC -D_CRT_SECURE_NO_DEPRECATE -DUSE_GKREGEX") +-elseif(MINGW) +- set(GKlib_COPTS "-DUSE_GKREGEX") +-else() +- set(GKlib_COPTIONS "-DLINUX -D_FILE_OFFSET_BITS=64") +-endif(MSVC) +-if(CYGWIN) +- set(GKlib_COPTIONS "${GKlib_COPTIONS} -DCYGWIN") +-endif(CYGWIN) +-if(CMAKE_COMPILER_IS_GNUCC) +-# GCC opts. +- set(GKlib_COPTIONS "${GKlib_COPTIONS} -std=c99 -fno-strict-aliasing") +-if(VALGRIND) +- set(GKlib_COPTIONS "${GK_COPTIONS} -march=x86-64 -mtune=generic") +-else() +-# -march=native is not a valid flag on PPC: +-if(CMAKE_SYSTEM_PROCESSOR MATCHES "power|ppc|powerpc|ppc64|powerpc64" OR (APPLE AND CMAKE_OSX_ARCHITECTURES MATCHES "ppc|ppc64")) +- set(GKlib_COPTIONS "${GKlib_COPTIONS} -mtune=native") +-else() +- set(GKlib_COPTIONS "${GKlib_COPTIONS} -march=native") +-endif() +-endif(VALGRIND) +- if(NOT MINGW) +- set(GKlib_COPTIONS "${GKlib_COPTIONS} -fPIC") +- endif(NOT MINGW) +-# GCC warnings. +- set(GKlib_COPTIONS "${GKlib_COPTIONS} -Werror -Wall -pedantic -Wno-unused-function -Wno-unused-but-set-variable -Wno-unused-variable -Wno-unknown-pragmas -Wno-unused-label") +-elseif(${CMAKE_C_COMPILER_ID} MATCHES "Sun") +-# Sun insists on -xc99. +- set(GKlib_COPTIONS "${GKlib_COPTIONS} -xc99") +-endif(CMAKE_COMPILER_IS_GNUCC) +- +-# Intel compiler +-if(${CMAKE_C_COMPILER_ID} MATCHES "Intel") +- set(GKlib_COPTIONS "${GKlib_COPTIONS} -xHost -std=c99") +-endif() +- +-# Find OpenMP if it is requested. 
+-if(OPENMP) +- include(FindOpenMP) +- if(OPENMP_FOUND) +- set(GKlib_COPTIONS "${GKlib_COPTIONS} -D__OPENMP__ ${OpenMP_C_FLAGS}") +- else() +- message(WARNING "OpenMP was requested but support was not found") +- endif(OPENMP_FOUND) +-endif(OPENMP) +- +-# Set the CPU type +-if(NO_X86) +- set(GKlib_COPTIONS "${GKlib_COPTIONS} -DNO_X86=${NO_X86}") +-endif(NO_X86) +- +-# Add various definitions. +-if(GDB) +- set(GKlib_COPTS "${GKlib_COPTS} -g") +- set(GKlib_COPTIONS "${GKlib_COPTIONS} -Werror") +-else() +- set(GKlib_COPTS "-O3") +-endif(GDB) +- +- +-if(DEBUG) +- set(GKlib_COPTS "-g") +- set(GKlib_COPTIONS "${GKlib_COPTIONS} -DDEBUG") +-endif(DEBUG) +- +-if(GPROF) +- set(GKlib_COPTS "-pg") +-endif(GPROF) +- +-if(NOT ASSERT) +- set(GKlib_COPTIONS "${GKlib_COPTIONS} -DNDEBUG") +-endif(NOT ASSERT) +- +-if(NOT ASSERT2) +- set(GKlib_COPTIONS "${GKlib_COPTIONS} -DNDEBUG2") +-endif(NOT ASSERT2) +- +- +-# Add various options +-if(PCRE) +- set(GKlib_COPTIONS "${GKlib_COPTIONS} -D__WITHPCRE__") +-endif(PCRE) +- +-if(GKREGEX) +- set(GKlib_COPTIONS "${GKlib_COPTIONS} -DUSE_GKREGEX") +-endif(GKREGEX) +- +-if(GKRAND) +- set(GKlib_COPTIONS "${GKlib_COPTIONS} -DUSE_GKRAND") +-endif(GKRAND) +- +- +-# Check for features. +-check_include_file(execinfo.h HAVE_EXECINFO_H) +-if(HAVE_EXECINFO_H) +- set(GKlib_COPTIONS "${GKlib_COPTIONS} -DHAVE_EXECINFO_H") +-endif(HAVE_EXECINFO_H) +- +-check_function_exists(getline HAVE_GETLINE) +-if(HAVE_GETLINE) +- set(GKlib_COPTIONS "${GKlib_COPTIONS} -DHAVE_GETLINE") +-endif(HAVE_GETLINE) +- +- +-# Custom check for TLS. +-if(MSVC) +- set(GKlib_COPTIONS "${GKlib_COPTIONS} -D__thread=__declspec(thread)") +- +- # This if checks if that value is cached or not. +- if("${HAVE_THREADLOCALSTORAGE}" MATCHES "^${HAVE_THREADLOCALSTORAGE}$") +- try_compile(HAVE_THREADLOCALSTORAGE +- ${CMAKE_BINARY_DIR} +- ${GKLIB_PATH}/conf/check_thread_storage.c) +- if(HAVE_THREADLOCALSTORAGE) +- message(STATUS "checking for thread-local storage - found") +- else() +- message(STATUS "checking for thread-local storage - not found") +- endif() +- endif() +- if(NOT HAVE_THREADLOCALSTORAGE) +- set(GKlib_COPTIONS "${GKlib_COPTIONS} -D__thread=") +- endif() +-endif() +- +-# Finally set the official C flags. +-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${GKlib_COPTIONS} ${GKlib_COPTS}") +- +-# Find GKlib sources. +-file(GLOB GKlib_sources ${GKLIB_PATH}/*.c) +-file(GLOB GKlib_includes ${GKLIB_PATH}/*.h) +diff --git a/Makefile b/Makefile +deleted file mode 100644 +index 6ac97b9..0000000 +--- a/Makefile ++++ /dev/null +@@ -1,87 +0,0 @@ +-# Configuration options. +-cc = gcc +-prefix = ~/local +-openmp = not-set +-gdb = not-set +-assert = not-set +-assert2 = not-set +-debug = not-set +-gprof = not-set +-valgrind = not-set +-pcre = not-set +-gkregex = not-set +-gkrand = not-set +- +- +-# Basically proxies everything to the builddir cmake. +-cputype = $(shell uname -m | sed "s/\\ /_/g") +-systype = $(shell uname -s) +- +-BUILDDIR = build/$(systype)-$(cputype) +- +-# Process configuration options. 
+-CONFIG_FLAGS = -DCMAKE_VERBOSE_MAKEFILE=1 +-ifneq ($(gdb), not-set) +- CONFIG_FLAGS += -DGDB=$(gdb) +-endif +-ifneq ($(assert), not-set) +- CONFIG_FLAGS += -DASSERT=$(assert) +-endif +-ifneq ($(assert2), not-set) +- CONFIG_FLAGS += -DASSERT2=$(assert2) +-endif +-ifneq ($(debug), not-set) +- CONFIG_FLAGS += -DDEBUG=$(debug) +-endif +-ifneq ($(gprof), not-set) +- CONFIG_FLAGS += -DGPROF=$(gprof) +-endif +-ifneq ($(valgrind), not-set) +- CONFIG_FLAGS += -DVALGRIND=$(valgrind) +-endif +-ifneq ($(openmp), not-set) +- CONFIG_FLAGS += -DOPENMP=$(openmp) +-endif +-ifneq ($(pcre), not-set) +- CONFIG_FLAGS += -DPCRE=$(pcre) +-endif +-ifneq ($(gkregex), not-set) +- CONFIG_FLAGS += -DGKREGEX=$(pcre) +-endif +-ifneq ($(gkrand), not-set) +- CONFIG_FLAGS += -DGKRAND=$(pcre) +-endif +-ifneq ($(prefix), not-set) +- CONFIG_FLAGS += -DCMAKE_INSTALL_PREFIX=$(prefix) +-endif +-ifneq ($(cc), not-set) +- CONFIG_FLAGS += -DCMAKE_C_COMPILER=$(cc) +-endif +-ifneq ($(cputype), x86_64) +- CONFIG_FLAGS += -DNO_X86=$(cputype) +-endif +- +-define run-config +-mkdir -p $(BUILDDIR) +-cd $(BUILDDIR) && cmake $(CURDIR) $(CONFIG_FLAGS) +-endef +- +-all clean install: $(BUILDDIR) +- make -C $(BUILDDIR) $@ +- +-uninstall: +- xargs rm < $(BUILDDIR)/install_manifest.txt +- +-$(BUILDDIR): +- $(run-config) +- +-config: distclean +- $(run-config) +- +-distclean: +- rm -rf $(BUILDDIR) +- +-remake: +- find . -name CMakeLists.txt -exec touch {} ';' +- +-.PHONY: config distclean all clean install uninstall remake +diff --git a/README.md b/README.md +index f94eeea..93a7d76 100644 +--- a/README.md ++++ b/README.md +@@ -1,54 +1,6 @@ + # GKlib + A library of various helper routines and frameworks used by many of the lab's software + +-## Build requirements +- - CMake 2.8, found at http://www.cmake.org/, as well as GNU make. + +-Assuming that the above are available, two commands should suffice to +-build the software: +-``` +-make config +-make +-``` +- +-## Configuring the build +-It is primarily configured by passing options to make config. For example: +-``` +-make config cc=icc +-``` +- +-would configure it to be built using icc. +- +-Configuration options are: +-``` +-cc=[compiler] - The C compiler to use [default: gcc] +-prefix=[PATH] - Set the installation prefix [default: ~/local] +-openmp=set - To build a version with OpenMP support +-``` +- +- +-## Building and installing +-To build and install, run the following +-``` +-make +-make install +-``` +- +-By default, the library file, header file, and binaries will be installed in +-``` +-~/local/lib +-~/local/include +-~/local/bin +-``` +- +-## Other make commands +- make uninstall +- Removes all files installed by 'make install'. +- +- make clean +- Removes all object files but retains the configuration options. +- +- make distclean +- Performs clean and completely removes the build directory. 
+ + +diff --git a/conf/check_thread_storage.c b/conf/check_thread_storage.c +deleted file mode 100644 +index e6e1e98..0000000 +--- a/conf/check_thread_storage.c ++++ /dev/null +@@ -1,5 +0,0 @@ +-extern __thread int x; +- +-int main(int argc, char **argv) { +- return 0; +-} +diff --git a/gk_proto.h b/gk_proto.h +index 6fd6bd4..aa943a5 100644 +--- a/gk_proto.h ++++ b/gk_proto.h +@@ -292,7 +292,7 @@ uint32_t gk_randint32(void); + /*------------------------------------------------------------- + * OpenMP fake functions + *-------------------------------------------------------------*/ +-#if !defined(__OPENMP__) ++#if !defined(_OPENMP) + void omp_set_num_threads(int num_threads); + int omp_get_num_threads(void); + int omp_get_max_threads(void); +@@ -303,7 +303,7 @@ void omp_set_dynamic(int num_threads); + int omp_get_dynamic(void); + void omp_set_nested(int nested); + int omp_get_nested(void); +-#endif /* __OPENMP__ */ ++#endif /* _OPENMP */ + + + /*------------------------------------------------------------- +diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt +index 8584820..04ecac1 100644 +--- a/test/CMakeLists.txt ++++ b/test/CMakeLists.txt +@@ -9,11 +9,15 @@ add_executable(grKx grKx.c) + add_executable(m2mnbrs m2mnbrs.c) + add_executable(cmpnbrs cmpnbrs.c) + add_executable(splatt2svd splatt2svd.c) +-add_executable(gkuniq gkuniq.c) + +-foreach(prog strings gksort fis gkrw gkgraph csrcnv grKx m2mnbrs cmpnbrs splatt2svd gkuniq) ++foreach(prog strings gksort fis gkrw gkgraph csrcnv grKx m2mnbrs cmpnbrs splatt2svd) + target_link_libraries(${prog} GKlib) + endforeach(prog) + ++if(NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64|arm*") ++ add_executable(gkuniq gkuniq.c) ++ target_link_libraries(gkuniq GKlib) ++endif() ++ + # Install a subset of them + install(TARGETS csrcnv RUNTIME DESTINATION bin) +diff --git a/timers.c b/timers.c +index bb8f296..b004e23 100644 +--- a/timers.c ++++ b/timers.c +@@ -35,7 +35,7 @@ double gk_WClockSeconds(void) + **************************************************************************/ + double gk_CPUSeconds(void) + { +-//#ifdef __OPENMP__ ++//#ifdef _OPENMP + #ifdef __OPENMPXXXX__ + return omp_get_wtime(); + #else + diff --git a/extern/patch/GKlib/patch_install.diff b/extern/patch/GKlib/patch_install.diff index bd406a1620..35aa3acc22 100644 --- a/extern/patch/GKlib/patch_install.diff +++ b/extern/patch/GKlib/patch_install.diff @@ -1,13 +1,13 @@ -diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt -index 04ecac1..3fafee7 100644 ---- a/test/CMakeLists.txt -+++ b/test/CMakeLists.txt -@@ -19,5 +19,5 @@ if(NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64|arm*") - target_link_libraries(gkuniq GKlib) - endif() - --# Install a subset of them --install(TARGETS csrcnv RUNTIME DESTINATION bin) -+# # Install a subset of them -+# install(TARGETS csrcnv RUNTIME DESTINATION bin) - +diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt +index 04ecac1..3fafee7 100644 +--- a/test/CMakeLists.txt ++++ b/test/CMakeLists.txt +@@ -19,5 +19,5 @@ if(NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64|arm*") + target_link_libraries(gkuniq GKlib) + endif() + +-# Install a subset of them +-install(TARGETS csrcnv RUNTIME DESTINATION bin) ++# # Install a subset of them ++# install(TARGETS csrcnv RUNTIME DESTINATION bin) + diff --git a/extern/patch/METIS/patch_build.diff b/extern/patch/METIS/patch_build.diff index ac0404550f..f4178648f7 100644 --- a/extern/patch/METIS/patch_build.diff +++ b/extern/patch/METIS/patch_build.diff @@ -1,416 +1,416 @@ -diff --git a/CMakeLists.txt b/CMakeLists.txt 
-index a15d19a..e1f1731 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -1,22 +1,26 @@ --cmake_minimum_required(VERSION 2.8) --project(METIS C) -+cmake_minimum_required(VERSION 3.13) - --set(SHARED FALSE CACHE BOOL "build a shared library") -+project(metis VERSION 0.0.1 LANGUAGES C) - --if(MSVC) -- set(METIS_INSTALL FALSE) --else() -- set(METIS_INSTALL TRUE) -+set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -+ -+if(NOT CMAKE_BUILD_TYPE) -+ set(CMAKE_BUILD_TYPE Release) - endif() - --# Configure libmetis library. --if(SHARED) -- set(METIS_LIBRARY_TYPE SHARED) --else() -- set(METIS_LIBRARY_TYPE STATIC) --endif(SHARED) -+# ----------------------------------------------------------------------------- -+include(GNUInstallDirs) -+ -+find_package(GKlib REQUIRED) -+ -+if(GPROF) -+ check_c_compiler_flag("-pg" HAVE_GPROF_SUPPORT) -+ if(NOT HAVE_GPROF_SUPPORT) -+ message(WARNING "GPROF support was requested, but is not available") -+ endif() -+endif() - --include(./conf/gkbuild.cmake) -+# ----------------------------------------------------------------------------- - - # METIS' custom options - #option(IDX64 "enable 64 bit ints" OFF) -@@ -34,19 +38,5 @@ include(./conf/gkbuild.cmake) - # - #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${METIS_COPTIONS}") - -- --# Add include directories. --# i.e., the -I equivalent --include_directories(build/xinclude) --include_directories(${GKLIB_PATH}/include) --include_directories(${CMAKE_INSTALL_PREFIX}/include) -- --# List of paths that the compiler will search for library files. --# i.e., the -L equivalent --link_directories(${GKLIB_PATH}/lib) --link_directories(${CMAKE_INSTALL_PREFIX}/lib) -- --# Recursively look for CMakeLists.txt in subdirs. --add_subdirectory("build/xinclude") --add_subdirectory("libmetis") --add_subdirectory("programs") -+add_subdirectory(libmetis) -+add_subdirectory(programs) -diff --git a/conf/check_thread_storage.c b/conf/check_thread_storage.c -deleted file mode 100644 -index e6e1e98..0000000 ---- a/conf/check_thread_storage.c -+++ /dev/null -@@ -1,5 +0,0 @@ --extern __thread int x; -- --int main(int argc, char **argv) { -- return 0; --} -diff --git a/conf/gkbuild.cmake b/conf/gkbuild.cmake -deleted file mode 100644 -index 591ece4..0000000 ---- a/conf/gkbuild.cmake -+++ /dev/null -@@ -1,148 +0,0 @@ --# Helper modules. --include(CheckFunctionExists) --include(CheckIncludeFile) -- --# Setup options. --option(GDB "enable use of GDB" OFF) --option(ASSERT "turn asserts on" OFF) --option(ASSERT2 "additional assertions" OFF) --option(DEBUG "add debugging support" OFF) --option(GPROF "add gprof support" OFF) --option(VALGRIND "add valgrind support" OFF) --option(OPENMP "enable OpenMP support" OFF) --option(PCRE "enable PCRE support" OFF) --option(GKREGEX "enable GKREGEX support" OFF) --option(GKRAND "enable GKRAND support" OFF) -- --# Add compiler flags. --if(MSVC) -- set(GK_COPTS "/Ox") -- set(GK_COPTIONS "-DWIN32 -DMSC -D_CRT_SECURE_NO_DEPRECATE -DUSE_GKREGEX") --elseif(MINGW) -- set(GK_COPTS "-DUSE_GKREGEX") --else() -- set(GK_COPTIONS "-DLINUX -D_FILE_OFFSET_BITS=64") --endif(MSVC) --if(CYGWIN) -- set(GK_COPTIONS "${GK_COPTIONS} -DCYGWIN") --endif(CYGWIN) --if(CMAKE_COMPILER_IS_GNUCC) --# GCC opts. 
-- set(GK_COPTIONS "${GK_COPTIONS} -std=c99 -fno-strict-aliasing") --if(VALGRIND) -- set(GK_COPTIONS "${GK_COPTIONS} -march=x86-64 -mtune=generic") --else() --# -march=native is not a valid flag on PPC: --if(CMAKE_SYSTEM_PROCESSOR MATCHES "power|ppc|powerpc|ppc64|powerpc64" OR (APPLE AND CMAKE_OSX_ARCHITECTURES MATCHES "ppc|ppc64")) -- set(GK_COPTIONS "${GK_COPTIONS} -mtune=native") --else() -- set(GK_COPTIONS "${GK_COPTIONS} -march=native") --endif() --endif(VALGRIND) -- if(NOT MINGW) -- set(GK_COPTIONS "${GK_COPTIONS} -fPIC") -- endif(NOT MINGW) --# GCC warnings. -- set(GK_COPTIONS "${GK_COPTIONS} -Werror -Wall -pedantic -Wno-unused-function -Wno-unused-but-set-variable -Wno-unused-variable -Wno-unknown-pragmas -Wno-unused-label") --elseif(${CMAKE_C_COMPILER_ID} MATCHES "Sun") --# Sun insists on -xc99. -- set(GK_COPTIONS "${GK_COPTIONS} -xc99") --endif(CMAKE_COMPILER_IS_GNUCC) -- --if(${CMAKE_C_COMPILER_ID} STREQUAL "Intel") -- set(GK_COPTIONS "${GK_COPTIONS} -xHost") -- # set(GK_COPTIONS "${GK_COPTIONS} -fast") --endif() -- --# Add support for MacOS items --if(APPLE) -- set(GK_COPTIONS "${GK_COPTIONS} -DMACOS") --endif(APPLE) -- --# Find OpenMP if it is requested. --if(OPENMP) -- include(FindOpenMP) -- if(OPENMP_FOUND) -- set(GK_COPTIONS "${GK_COPTIONS} -D__OPENMP__ ${OpenMP_C_FLAGS}") -- else() -- message(WARNING "OpenMP was requested but support was not found") -- endif(OPENMP_FOUND) --endif(OPENMP) -- -- --# Add various definitions. --if(GDB) -- set(GK_COPTS "${GK_COPTS} -g") -- set(GK_COPTIONS "${GK_COPTIONS} -Werror") --else() -- set(GK_COPTS "-O3") --endif(GDB) -- -- --if(DEBUG) -- set(GK_COPTS "-Og") -- set(GK_COPTIONS "${GK_COPTIONS} -DDEBUG") --endif(DEBUG) -- --if(GPROF) -- set(GK_COPTS "-pg") --endif(GPROF) -- --if(NOT ASSERT) -- set(GK_COPTIONS "${GK_COPTIONS} -DNDEBUG") --endif(NOT ASSERT) -- --if(NOT ASSERT2) -- set(GK_COPTIONS "${GK_COPTIONS} -DNDEBUG2") --endif(NOT ASSERT2) -- -- --# Add various options --if(PCRE) -- set(GK_COPTIONS "${GK_COPTIONS} -D__WITHPCRE__") --endif(PCRE) -- --if(GKREGEX) -- set(GK_COPTIONS "${GK_COPTIONS} -DUSE_GKREGEX") --endif(GKREGEX) -- --if(GKRAND) -- set(GK_COPTIONS "${GK_COPTIONS} -DUSE_GKRAND") --endif(GKRAND) -- -- --# Check for features. --check_include_file(execinfo.h HAVE_EXECINFO_H) --if(HAVE_EXECINFO_H) -- set(GK_COPTIONS "${GK_COPTIONS} -DHAVE_EXECINFO_H") --endif(HAVE_EXECINFO_H) -- --check_function_exists(getline HAVE_GETLINE) --if(HAVE_GETLINE) -- set(GK_COPTIONS "${GK_COPTIONS} -DHAVE_GETLINE") --endif(HAVE_GETLINE) -- -- --# Custom check for TLS. --if(MSVC) -- set(GK_COPTIONS "${GK_COPTIONS} -D__thread=__declspec(thread)") -- -- # This if checks if that value is cached or not. -- if("${HAVE_THREADLOCALSTORAGE}" MATCHES "^${HAVE_THREADLOCALSTORAGE}$") -- try_compile(HAVE_THREADLOCALSTORAGE -- ${CMAKE_BINARY_DIR} -- ${CMAKE_SOURCE_DIR}/conf/check_thread_storage.c) -- if(HAVE_THREADLOCALSTORAGE) -- message(STATUS "checking for thread-local storage - found") -- else() -- message(STATUS "checking for thread-local storage - not found") -- endif() -- endif() -- if(NOT HAVE_THREADLOCALSTORAGE) -- set(GK_COPTIONS "${GK_COPTIONS} -D__thread=") -- endif() --endif() -- --# Finally set the official C flags. 
--set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${GK_COPTIONS} ${GK_COPTS}") -- -diff --git a/conf/metisConfig.cmake.in b/conf/metisConfig.cmake.in -new file mode 100644 -index 0000000..3caad46 ---- /dev/null -+++ b/conf/metisConfig.cmake.in -@@ -0,0 +1,13 @@ -+if(NOT metis_FOUND) -+set(metis_FOUND True) -+ -+@PACKAGE_INIT@ -+ -+include(CMakeFindDependencyMacro) -+find_dependency(GKlib) -+ -+include(${CMAKE_CURRENT_LIST_DIR}/metisTargets.cmake) -+ -+check_required_components(metis) -+ -+endif() -\ No newline at end of file -diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt -deleted file mode 100644 -index 9515a51..0000000 ---- a/include/CMakeLists.txt -+++ /dev/null -@@ -1,3 +0,0 @@ --if(METIS_INSTALL) -- install(FILES metis.h DESTINATION include) --endif() -diff --git a/libmetis/CMakeLists.txt b/libmetis/CMakeLists.txt -index fc6cec6..0d0e7b4 100644 ---- a/libmetis/CMakeLists.txt -+++ b/libmetis/CMakeLists.txt -@@ -1,15 +1,52 @@ --# Add this directory for internal users. --include_directories(.) -- - # Find sources. --file(GLOB metis_sources *.c) -+file(GLOB metis_sources ${CMAKE_CURRENT_SOURCE_DIR}/*.c) -+file(GLOB metis_headers ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -+ -+add_library(${PROJECT_NAME}) -+add_library(${PROJECT_NAME}::${PROJECT_NAME} ALIAS ${PROJECT_NAME}) -+ -+target_compile_features(${PROJECT_NAME} PUBLIC c_std_99) -+set_property(TARGET ${PROJECT_NAME} PROPERTY POSITION_INDEPENDENT_CODE True) -+ -+target_sources(${PROJECT_NAME} PRIVATE ${metis_sources} ${metis_headers} ${PROJECT_SOURCE_DIR}/include/metis.h) -+ -+target_include_directories(${PROJECT_NAME} PUBLIC $ $ -+ $) -+ -+target_link_libraries(${PROJECT_NAME} -+ PUBLIC GKlib::GKlib -+ PRIVATE GKlib::GKlib_compile_definitions) -+ -+target_compile_options(${PROJECT_NAME} PUBLIC $<$,$>:-pg>) -+ -+# ----------------------------------------------------------------------------- -+# Configure and Install -+include(CMakePackageConfigHelpers) -+ -+configure_package_config_file(${PROJECT_SOURCE_DIR}/conf/metisConfig.cmake.in cmake/metisConfig.cmake -+ INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/metis) -+ -+write_basic_package_version_file( -+ cmake/metisConfigVersion.cmake -+ VERSION ${PROJECT_VERSION} -+ COMPATIBILITY SameMajorVersion) -+ -+# --------------------------- -+install( -+ TARGETS ${PROJECT_NAME} -+ EXPORT metisTargets -+ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} -+ LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} -+ ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} -+) -+ -+install( -+ FILES ${PROJECT_SOURCE_DIR}/include/metis.h -+ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) - --# Build libmetis. --add_library(metis ${METIS_LIBRARY_TYPE} ${metis_sources}) -+install(EXPORT metisTargets FILE metisTargets.cmake NAMESPACE metis:: DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/metis) - --if(METIS_INSTALL) -- install(TARGETS metis -- LIBRARY DESTINATION lib -- RUNTIME DESTINATION lib -- ARCHIVE DESTINATION lib) --endif() -+install( -+ FILES ${CMAKE_CURRENT_BINARY_DIR}/cmake/metisConfig.cmake -+ ${CMAKE_CURRENT_BINARY_DIR}/cmake/metisConfigVersion.cmake -+ DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/metis) -\ No newline at end of file -diff --git a/programs/CMakeLists.txt b/programs/CMakeLists.txt -index baf40ad..cab3078 100644 ---- a/programs/CMakeLists.txt -+++ b/programs/CMakeLists.txt -@@ -1,6 +1,3 @@ --# These programs use internal metis data structures. --include_directories(../libmetis) -- - # Build program. 
- add_executable(gpmetis gpmetis.c cmdline_gpmetis.c io.c stat.c) - add_executable(ndmetis ndmetis.c cmdline_ndmetis.c io.c smbfactor.c) -@@ -11,10 +8,8 @@ add_executable(cmpfillin cmpfillin.c io.c smbfactor.c) - - # Link with the required libraries - foreach(prog gpmetis ndmetis mpmetis m2gmetis graphchk cmpfillin) -- target_link_libraries(${prog} metis GKlib m) -+ target_link_libraries(${prog} PRIVATE metis::metis) - endforeach(prog) - --if(METIS_INSTALL) -- install(TARGETS gpmetis ndmetis mpmetis m2gmetis graphchk cmpfillin -- RUNTIME DESTINATION bin) --endif() -+install(TARGETS gpmetis ndmetis mpmetis m2gmetis graphchk cmpfillin -+ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) -diff --git a/programs/gpmetis.c b/programs/gpmetis.c -index 4b7e0b5..b57914f 100644 ---- a/programs/gpmetis.c -+++ b/programs/gpmetis.c -@@ -239,14 +239,5 @@ void GPReportResults(params_t *params, graph_t *graph, idx_t *part, idx_t objval - printf("\nMemory Information ----------------------------------------------------------\n"); - printf(" Max memory used:\t\t %7.3"PRREAL" MB\n", (real_t)(params->maxmemory/(1024.0*1024.0))); - --#ifndef MACOS -- { -- struct rusage usage; -- getrusage(RUSAGE_SELF, &usage); -- printf(" rusage.ru_maxrss:\t\t %7.3"PRREAL" MB\n", (real_t)(usage.ru_maxrss/(1024.0))); -- } -- printf(" proc/self/stat/VmPeak:\t %7.3"PRREAL" MB\n", (real_t)gk_GetProcVmPeak()/(1024.0*1024.0)); --#endif -- - printf("******************************************************************************\n"); - } -diff --git a/programs/mpmetis.c b/programs/mpmetis.c -index 04a6a1b..a604864 100644 ---- a/programs/mpmetis.c -+++ b/programs/mpmetis.c -@@ -188,15 +188,6 @@ void MPReportResults(params_t *params, mesh_t *mesh, idx_t *epart, idx_t *npart, - printf("\nMemory Information ----------------------------------------------------------\n"); - printf(" Max memory used:\t\t %7.3"PRREAL" MB\n", (real_t)(params->maxmemory/(1024.0*1024.0))); - --#ifndef MACOS -- { -- struct rusage usage; -- getrusage(RUSAGE_SELF, &usage); -- printf(" rusage.ru_maxrss:\t\t %7.3"PRREAL" MB\n", (real_t)(usage.ru_maxrss/(1024.0))); -- } -- printf(" proc/self/stat/VmPeak:\t %7.3"PRREAL" MB\n", (real_t)gk_GetProcVmPeak()/(1024.0*1024.0)); --#endif -- - printf("******************************************************************************\n"); - - } -diff --git a/programs/ndmetis.c b/programs/ndmetis.c -index 5991f0c..c6ed1b5 100644 ---- a/programs/ndmetis.c -+++ b/programs/ndmetis.c -@@ -172,15 +172,6 @@ void NDReportResults(params_t *params, graph_t *graph, idx_t *perm, - printf("\nMemory Information ----------------------------------------------------------\n"); - printf(" Max memory used:\t\t %7.3"PRREAL" MB\n", (real_t)(params->maxmemory/(1024.0*1024.0))); - --#ifndef MACOS -- { -- struct rusage usage; -- getrusage(RUSAGE_SELF, &usage); -- printf(" rusage.ru_maxrss:\t\t %7.3"PRREAL" MB\n", (real_t)(usage.ru_maxrss/(1024.0))); -- } -- printf(" proc/self/stat/VmPeak:\t %7.3"PRREAL" MB\n", (real_t)gk_GetProcVmPeak()/(1024.0*1024.0)); --#endif -- - printf("******************************************************************************\n"); - - } - +diff --git a/CMakeLists.txt b/CMakeLists.txt +index a15d19a..e1f1731 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -1,22 +1,26 @@ +-cmake_minimum_required(VERSION 2.8) +-project(METIS C) ++cmake_minimum_required(VERSION 3.13) + +-set(SHARED FALSE CACHE BOOL "build a shared library") ++project(metis VERSION 0.0.1 LANGUAGES C) + +-if(MSVC) +- set(METIS_INSTALL FALSE) +-else() +- 
set(METIS_INSTALL TRUE) ++set(CMAKE_EXPORT_COMPILE_COMMANDS ON) ++ ++if(NOT CMAKE_BUILD_TYPE) ++ set(CMAKE_BUILD_TYPE Release) + endif() + +-# Configure libmetis library. +-if(SHARED) +- set(METIS_LIBRARY_TYPE SHARED) +-else() +- set(METIS_LIBRARY_TYPE STATIC) +-endif(SHARED) ++# ----------------------------------------------------------------------------- ++include(GNUInstallDirs) ++ ++find_package(GKlib REQUIRED) ++ ++if(GPROF) ++ check_c_compiler_flag("-pg" HAVE_GPROF_SUPPORT) ++ if(NOT HAVE_GPROF_SUPPORT) ++ message(WARNING "GPROF support was requested, but is not available") ++ endif() ++endif() + +-include(./conf/gkbuild.cmake) ++# ----------------------------------------------------------------------------- + + # METIS' custom options + #option(IDX64 "enable 64 bit ints" OFF) +@@ -34,19 +38,5 @@ include(./conf/gkbuild.cmake) + # + #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${METIS_COPTIONS}") + +- +-# Add include directories. +-# i.e., the -I equivalent +-include_directories(build/xinclude) +-include_directories(${GKLIB_PATH}/include) +-include_directories(${CMAKE_INSTALL_PREFIX}/include) +- +-# List of paths that the compiler will search for library files. +-# i.e., the -L equivalent +-link_directories(${GKLIB_PATH}/lib) +-link_directories(${CMAKE_INSTALL_PREFIX}/lib) +- +-# Recursively look for CMakeLists.txt in subdirs. +-add_subdirectory("build/xinclude") +-add_subdirectory("libmetis") +-add_subdirectory("programs") ++add_subdirectory(libmetis) ++add_subdirectory(programs) +diff --git a/conf/check_thread_storage.c b/conf/check_thread_storage.c +deleted file mode 100644 +index e6e1e98..0000000 +--- a/conf/check_thread_storage.c ++++ /dev/null +@@ -1,5 +0,0 @@ +-extern __thread int x; +- +-int main(int argc, char **argv) { +- return 0; +-} +diff --git a/conf/gkbuild.cmake b/conf/gkbuild.cmake +deleted file mode 100644 +index 591ece4..0000000 +--- a/conf/gkbuild.cmake ++++ /dev/null +@@ -1,148 +0,0 @@ +-# Helper modules. +-include(CheckFunctionExists) +-include(CheckIncludeFile) +- +-# Setup options. +-option(GDB "enable use of GDB" OFF) +-option(ASSERT "turn asserts on" OFF) +-option(ASSERT2 "additional assertions" OFF) +-option(DEBUG "add debugging support" OFF) +-option(GPROF "add gprof support" OFF) +-option(VALGRIND "add valgrind support" OFF) +-option(OPENMP "enable OpenMP support" OFF) +-option(PCRE "enable PCRE support" OFF) +-option(GKREGEX "enable GKREGEX support" OFF) +-option(GKRAND "enable GKRAND support" OFF) +- +-# Add compiler flags. +-if(MSVC) +- set(GK_COPTS "/Ox") +- set(GK_COPTIONS "-DWIN32 -DMSC -D_CRT_SECURE_NO_DEPRECATE -DUSE_GKREGEX") +-elseif(MINGW) +- set(GK_COPTS "-DUSE_GKREGEX") +-else() +- set(GK_COPTIONS "-DLINUX -D_FILE_OFFSET_BITS=64") +-endif(MSVC) +-if(CYGWIN) +- set(GK_COPTIONS "${GK_COPTIONS} -DCYGWIN") +-endif(CYGWIN) +-if(CMAKE_COMPILER_IS_GNUCC) +-# GCC opts. +- set(GK_COPTIONS "${GK_COPTIONS} -std=c99 -fno-strict-aliasing") +-if(VALGRIND) +- set(GK_COPTIONS "${GK_COPTIONS} -march=x86-64 -mtune=generic") +-else() +-# -march=native is not a valid flag on PPC: +-if(CMAKE_SYSTEM_PROCESSOR MATCHES "power|ppc|powerpc|ppc64|powerpc64" OR (APPLE AND CMAKE_OSX_ARCHITECTURES MATCHES "ppc|ppc64")) +- set(GK_COPTIONS "${GK_COPTIONS} -mtune=native") +-else() +- set(GK_COPTIONS "${GK_COPTIONS} -march=native") +-endif() +-endif(VALGRIND) +- if(NOT MINGW) +- set(GK_COPTIONS "${GK_COPTIONS} -fPIC") +- endif(NOT MINGW) +-# GCC warnings. 
+- set(GK_COPTIONS "${GK_COPTIONS} -Werror -Wall -pedantic -Wno-unused-function -Wno-unused-but-set-variable -Wno-unused-variable -Wno-unknown-pragmas -Wno-unused-label") +-elseif(${CMAKE_C_COMPILER_ID} MATCHES "Sun") +-# Sun insists on -xc99. +- set(GK_COPTIONS "${GK_COPTIONS} -xc99") +-endif(CMAKE_COMPILER_IS_GNUCC) +- +-if(${CMAKE_C_COMPILER_ID} STREQUAL "Intel") +- set(GK_COPTIONS "${GK_COPTIONS} -xHost") +- # set(GK_COPTIONS "${GK_COPTIONS} -fast") +-endif() +- +-# Add support for MacOS items +-if(APPLE) +- set(GK_COPTIONS "${GK_COPTIONS} -DMACOS") +-endif(APPLE) +- +-# Find OpenMP if it is requested. +-if(OPENMP) +- include(FindOpenMP) +- if(OPENMP_FOUND) +- set(GK_COPTIONS "${GK_COPTIONS} -D__OPENMP__ ${OpenMP_C_FLAGS}") +- else() +- message(WARNING "OpenMP was requested but support was not found") +- endif(OPENMP_FOUND) +-endif(OPENMP) +- +- +-# Add various definitions. +-if(GDB) +- set(GK_COPTS "${GK_COPTS} -g") +- set(GK_COPTIONS "${GK_COPTIONS} -Werror") +-else() +- set(GK_COPTS "-O3") +-endif(GDB) +- +- +-if(DEBUG) +- set(GK_COPTS "-Og") +- set(GK_COPTIONS "${GK_COPTIONS} -DDEBUG") +-endif(DEBUG) +- +-if(GPROF) +- set(GK_COPTS "-pg") +-endif(GPROF) +- +-if(NOT ASSERT) +- set(GK_COPTIONS "${GK_COPTIONS} -DNDEBUG") +-endif(NOT ASSERT) +- +-if(NOT ASSERT2) +- set(GK_COPTIONS "${GK_COPTIONS} -DNDEBUG2") +-endif(NOT ASSERT2) +- +- +-# Add various options +-if(PCRE) +- set(GK_COPTIONS "${GK_COPTIONS} -D__WITHPCRE__") +-endif(PCRE) +- +-if(GKREGEX) +- set(GK_COPTIONS "${GK_COPTIONS} -DUSE_GKREGEX") +-endif(GKREGEX) +- +-if(GKRAND) +- set(GK_COPTIONS "${GK_COPTIONS} -DUSE_GKRAND") +-endif(GKRAND) +- +- +-# Check for features. +-check_include_file(execinfo.h HAVE_EXECINFO_H) +-if(HAVE_EXECINFO_H) +- set(GK_COPTIONS "${GK_COPTIONS} -DHAVE_EXECINFO_H") +-endif(HAVE_EXECINFO_H) +- +-check_function_exists(getline HAVE_GETLINE) +-if(HAVE_GETLINE) +- set(GK_COPTIONS "${GK_COPTIONS} -DHAVE_GETLINE") +-endif(HAVE_GETLINE) +- +- +-# Custom check for TLS. +-if(MSVC) +- set(GK_COPTIONS "${GK_COPTIONS} -D__thread=__declspec(thread)") +- +- # This if checks if that value is cached or not. +- if("${HAVE_THREADLOCALSTORAGE}" MATCHES "^${HAVE_THREADLOCALSTORAGE}$") +- try_compile(HAVE_THREADLOCALSTORAGE +- ${CMAKE_BINARY_DIR} +- ${CMAKE_SOURCE_DIR}/conf/check_thread_storage.c) +- if(HAVE_THREADLOCALSTORAGE) +- message(STATUS "checking for thread-local storage - found") +- else() +- message(STATUS "checking for thread-local storage - not found") +- endif() +- endif() +- if(NOT HAVE_THREADLOCALSTORAGE) +- set(GK_COPTIONS "${GK_COPTIONS} -D__thread=") +- endif() +-endif() +- +-# Finally set the official C flags. 
+-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${GK_COPTIONS} ${GK_COPTS}") +- +diff --git a/conf/metisConfig.cmake.in b/conf/metisConfig.cmake.in +new file mode 100644 +index 0000000..3caad46 +--- /dev/null ++++ b/conf/metisConfig.cmake.in +@@ -0,0 +1,13 @@ ++if(NOT metis_FOUND) ++set(metis_FOUND True) ++ ++@PACKAGE_INIT@ ++ ++include(CMakeFindDependencyMacro) ++find_dependency(GKlib) ++ ++include(${CMAKE_CURRENT_LIST_DIR}/metisTargets.cmake) ++ ++check_required_components(metis) ++ ++endif() +\ No newline at end of file +diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt +deleted file mode 100644 +index 9515a51..0000000 +--- a/include/CMakeLists.txt ++++ /dev/null +@@ -1,3 +0,0 @@ +-if(METIS_INSTALL) +- install(FILES metis.h DESTINATION include) +-endif() +diff --git a/libmetis/CMakeLists.txt b/libmetis/CMakeLists.txt +index fc6cec6..0d0e7b4 100644 +--- a/libmetis/CMakeLists.txt ++++ b/libmetis/CMakeLists.txt +@@ -1,15 +1,52 @@ +-# Add this directory for internal users. +-include_directories(.) +- + # Find sources. +-file(GLOB metis_sources *.c) ++file(GLOB metis_sources ${CMAKE_CURRENT_SOURCE_DIR}/*.c) ++file(GLOB metis_headers ${CMAKE_CURRENT_SOURCE_DIR}/*.h) ++ ++add_library(${PROJECT_NAME}) ++add_library(${PROJECT_NAME}::${PROJECT_NAME} ALIAS ${PROJECT_NAME}) ++ ++target_compile_features(${PROJECT_NAME} PUBLIC c_std_99) ++set_property(TARGET ${PROJECT_NAME} PROPERTY POSITION_INDEPENDENT_CODE True) ++ ++target_sources(${PROJECT_NAME} PRIVATE ${metis_sources} ${metis_headers} ${PROJECT_SOURCE_DIR}/include/metis.h) ++ ++target_include_directories(${PROJECT_NAME} PUBLIC $ $ ++ $) ++ ++target_link_libraries(${PROJECT_NAME} ++ PUBLIC GKlib::GKlib ++ PRIVATE GKlib::GKlib_compile_definitions) ++ ++target_compile_options(${PROJECT_NAME} PUBLIC $<$,$>:-pg>) ++ ++# ----------------------------------------------------------------------------- ++# Configure and Install ++include(CMakePackageConfigHelpers) ++ ++configure_package_config_file(${PROJECT_SOURCE_DIR}/conf/metisConfig.cmake.in cmake/metisConfig.cmake ++ INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/metis) ++ ++write_basic_package_version_file( ++ cmake/metisConfigVersion.cmake ++ VERSION ${PROJECT_VERSION} ++ COMPATIBILITY SameMajorVersion) ++ ++# --------------------------- ++install( ++ TARGETS ${PROJECT_NAME} ++ EXPORT metisTargets ++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} ++ LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ++ ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} ++) ++ ++install( ++ FILES ${PROJECT_SOURCE_DIR}/include/metis.h ++ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + +-# Build libmetis. +-add_library(metis ${METIS_LIBRARY_TYPE} ${metis_sources}) ++install(EXPORT metisTargets FILE metisTargets.cmake NAMESPACE metis:: DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/metis) + +-if(METIS_INSTALL) +- install(TARGETS metis +- LIBRARY DESTINATION lib +- RUNTIME DESTINATION lib +- ARCHIVE DESTINATION lib) +-endif() ++install( ++ FILES ${CMAKE_CURRENT_BINARY_DIR}/cmake/metisConfig.cmake ++ ${CMAKE_CURRENT_BINARY_DIR}/cmake/metisConfigVersion.cmake ++ DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/metis) +\ No newline at end of file +diff --git a/programs/CMakeLists.txt b/programs/CMakeLists.txt +index baf40ad..cab3078 100644 +--- a/programs/CMakeLists.txt ++++ b/programs/CMakeLists.txt +@@ -1,6 +1,3 @@ +-# These programs use internal metis data structures. +-include_directories(../libmetis) +- + # Build program. 
+ add_executable(gpmetis gpmetis.c cmdline_gpmetis.c io.c stat.c) + add_executable(ndmetis ndmetis.c cmdline_ndmetis.c io.c smbfactor.c) +@@ -11,10 +8,8 @@ add_executable(cmpfillin cmpfillin.c io.c smbfactor.c) + + # Link with the required libraries + foreach(prog gpmetis ndmetis mpmetis m2gmetis graphchk cmpfillin) +- target_link_libraries(${prog} metis GKlib m) ++ target_link_libraries(${prog} PRIVATE metis::metis) + endforeach(prog) + +-if(METIS_INSTALL) +- install(TARGETS gpmetis ndmetis mpmetis m2gmetis graphchk cmpfillin +- RUNTIME DESTINATION bin) +-endif() ++install(TARGETS gpmetis ndmetis mpmetis m2gmetis graphchk cmpfillin ++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) +diff --git a/programs/gpmetis.c b/programs/gpmetis.c +index 4b7e0b5..b57914f 100644 +--- a/programs/gpmetis.c ++++ b/programs/gpmetis.c +@@ -239,14 +239,5 @@ void GPReportResults(params_t *params, graph_t *graph, idx_t *part, idx_t objval + printf("\nMemory Information ----------------------------------------------------------\n"); + printf(" Max memory used:\t\t %7.3"PRREAL" MB\n", (real_t)(params->maxmemory/(1024.0*1024.0))); + +-#ifndef MACOS +- { +- struct rusage usage; +- getrusage(RUSAGE_SELF, &usage); +- printf(" rusage.ru_maxrss:\t\t %7.3"PRREAL" MB\n", (real_t)(usage.ru_maxrss/(1024.0))); +- } +- printf(" proc/self/stat/VmPeak:\t %7.3"PRREAL" MB\n", (real_t)gk_GetProcVmPeak()/(1024.0*1024.0)); +-#endif +- + printf("******************************************************************************\n"); + } +diff --git a/programs/mpmetis.c b/programs/mpmetis.c +index 04a6a1b..a604864 100644 +--- a/programs/mpmetis.c ++++ b/programs/mpmetis.c +@@ -188,15 +188,6 @@ void MPReportResults(params_t *params, mesh_t *mesh, idx_t *epart, idx_t *npart, + printf("\nMemory Information ----------------------------------------------------------\n"); + printf(" Max memory used:\t\t %7.3"PRREAL" MB\n", (real_t)(params->maxmemory/(1024.0*1024.0))); + +-#ifndef MACOS +- { +- struct rusage usage; +- getrusage(RUSAGE_SELF, &usage); +- printf(" rusage.ru_maxrss:\t\t %7.3"PRREAL" MB\n", (real_t)(usage.ru_maxrss/(1024.0))); +- } +- printf(" proc/self/stat/VmPeak:\t %7.3"PRREAL" MB\n", (real_t)gk_GetProcVmPeak()/(1024.0*1024.0)); +-#endif +- + printf("******************************************************************************\n"); + + } +diff --git a/programs/ndmetis.c b/programs/ndmetis.c +index 5991f0c..c6ed1b5 100644 +--- a/programs/ndmetis.c ++++ b/programs/ndmetis.c +@@ -172,15 +172,6 @@ void NDReportResults(params_t *params, graph_t *graph, idx_t *perm, + printf("\nMemory Information ----------------------------------------------------------\n"); + printf(" Max memory used:\t\t %7.3"PRREAL" MB\n", (real_t)(params->maxmemory/(1024.0*1024.0))); + +-#ifndef MACOS +- { +- struct rusage usage; +- getrusage(RUSAGE_SELF, &usage); +- printf(" rusage.ru_maxrss:\t\t %7.3"PRREAL" MB\n", (real_t)(usage.ru_maxrss/(1024.0))); +- } +- printf(" proc/self/stat/VmPeak:\t %7.3"PRREAL" MB\n", (real_t)gk_GetProcVmPeak()/(1024.0*1024.0)); +-#endif +- + printf("******************************************************************************\n"); + + } + diff --git a/extern/patch/METIS/patch_idx32.diff b/extern/patch/METIS/patch_idx32.diff index 65fba223d7..ce4ef3d9f7 100644 --- a/extern/patch/METIS/patch_idx32.diff +++ b/extern/patch/METIS/patch_idx32.diff @@ -1,14 +1,14 @@ -diff --git a/include/metis.h b/include/metis.h -index 7fef0e7..85291cb 100644 ---- a/include/metis.h -+++ b/include/metis.h -@@ -30,7 +30,7 @@ - GCC does provides these 
definitions in stdint.h, but it may require some - modifications on other architectures. - --------------------------------------------------------------------------*/ --//#define IDXTYPEWIDTH 32 -+#define IDXTYPEWIDTH 32 - - - /*-------------------------------------------------------------------------- - +diff --git a/include/metis.h b/include/metis.h +index 7fef0e7..85291cb 100644 +--- a/include/metis.h ++++ b/include/metis.h +@@ -30,7 +30,7 @@ + GCC does provides these definitions in stdint.h, but it may require some + modifications on other architectures. + --------------------------------------------------------------------------*/ +-//#define IDXTYPEWIDTH 32 ++#define IDXTYPEWIDTH 32 + + + /*-------------------------------------------------------------------------- + diff --git a/extern/patch/METIS/patch_idx64.diff b/extern/patch/METIS/patch_idx64.diff index 8c6b63ddff..257748399d 100644 --- a/extern/patch/METIS/patch_idx64.diff +++ b/extern/patch/METIS/patch_idx64.diff @@ -1,14 +1,14 @@ -diff --git a/include/metis.h b/include/metis.h -index 7fef0e7..69e7241 100644 ---- a/include/metis.h -+++ b/include/metis.h -@@ -30,7 +30,7 @@ - GCC does provides these definitions in stdint.h, but it may require some - modifications on other architectures. - --------------------------------------------------------------------------*/ --//#define IDXTYPEWIDTH 32 -+#define IDXTYPEWIDTH 64 - - - /*-------------------------------------------------------------------------- - +diff --git a/include/metis.h b/include/metis.h +index 7fef0e7..69e7241 100644 +--- a/include/metis.h ++++ b/include/metis.h +@@ -30,7 +30,7 @@ + GCC does provides these definitions in stdint.h, but it may require some + modifications on other architectures. + --------------------------------------------------------------------------*/ +-//#define IDXTYPEWIDTH 32 ++#define IDXTYPEWIDTH 64 + + + /*-------------------------------------------------------------------------- + diff --git a/extern/patch/METIS/patch_install.diff b/extern/patch/METIS/patch_install.diff index 175b4fa525..a26a33982b 100644 --- a/extern/patch/METIS/patch_install.diff +++ b/extern/patch/METIS/patch_install.diff @@ -1,13 +1,13 @@ -diff --git a/programs/CMakeLists.txt b/programs/CMakeLists.txt -index e22ec29..22d4bd5 100644 ---- a/programs/CMakeLists.txt -+++ b/programs/CMakeLists.txt -@@ -11,5 +11,5 @@ foreach(prog gpmetis ndmetis mpmetis m2gmetis graphchk cmpfillin) - target_link_libraries(${prog} PRIVATE metis::metis) - endforeach(prog) - --install(TARGETS gpmetis ndmetis mpmetis m2gmetis graphchk cmpfillin -- RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) -+# install(TARGETS gpmetis ndmetis mpmetis m2gmetis graphchk cmpfillin -+# RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) - +diff --git a/programs/CMakeLists.txt b/programs/CMakeLists.txt +index e22ec29..22d4bd5 100644 +--- a/programs/CMakeLists.txt ++++ b/programs/CMakeLists.txt +@@ -11,5 +11,5 @@ foreach(prog gpmetis ndmetis mpmetis m2gmetis graphchk cmpfillin) + target_link_libraries(${prog} PRIVATE metis::metis) + endforeach(prog) + +-install(TARGETS gpmetis ndmetis mpmetis m2gmetis graphchk cmpfillin +- RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) ++# install(TARGETS gpmetis ndmetis mpmetis m2gmetis graphchk cmpfillin ++# RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) + diff --git a/extern/patch/METIS/patch_real32.diff b/extern/patch/METIS/patch_real32.diff index fedaec30bd..64c57b0865 100644 --- a/extern/patch/METIS/patch_real32.diff +++ b/extern/patch/METIS/patch_real32.diff @@ -1,14 +1,14 @@ -diff --git 
a/include/metis.h b/include/metis.h -index d5550ec..04ba74d 100644 ---- a/include/metis.h -+++ b/include/metis.h -@@ -40,7 +40,7 @@ - 32 : single precision floating point (float) - 64 : double precision floating point (double) - --------------------------------------------------------------------------*/ --//#define REALTYPEWIDTH 32 -+#define REALTYPEWIDTH 32 - - - - +diff --git a/include/metis.h b/include/metis.h +index d5550ec..04ba74d 100644 +--- a/include/metis.h ++++ b/include/metis.h +@@ -40,7 +40,7 @@ + 32 : single precision floating point (float) + 64 : double precision floating point (double) + --------------------------------------------------------------------------*/ +-//#define REALTYPEWIDTH 32 ++#define REALTYPEWIDTH 32 + + + + diff --git a/extern/patch/METIS/patch_real64.diff b/extern/patch/METIS/patch_real64.diff index e1671a0dd5..af7e4bb515 100644 --- a/extern/patch/METIS/patch_real64.diff +++ b/extern/patch/METIS/patch_real64.diff @@ -1,14 +1,14 @@ -diff --git a/include/metis.h b/include/metis.h -index d5550ec..04ba74d 100644 ---- a/include/metis.h -+++ b/include/metis.h -@@ -40,7 +40,7 @@ - 32 : single precision floating point (float) - 64 : double precision floating point (double) - --------------------------------------------------------------------------*/ --//#define REALTYPEWIDTH 32 -+#define REALTYPEWIDTH 64 - - - - +diff --git a/include/metis.h b/include/metis.h +index d5550ec..04ba74d 100644 +--- a/include/metis.h ++++ b/include/metis.h +@@ -40,7 +40,7 @@ + 32 : single precision floating point (float) + 64 : double precision floating point (double) + --------------------------------------------------------------------------*/ +-//#define REALTYPEWIDTH 32 ++#define REALTYPEWIDTH 64 + + + + diff --git a/extern/patch/ParMETIS/patch_build.diff b/extern/patch/ParMETIS/patch_build.diff index 9489077d02..c3a786ef84 100644 --- a/extern/patch/ParMETIS/patch_build.diff +++ b/extern/patch/ParMETIS/patch_build.diff @@ -1,307 +1,307 @@ -diff --git a/CMakeLists.txt b/CMakeLists.txt -index 146bc5f..89f3c13 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -1,43 +1,30 @@ --cmake_minimum_required(VERSION 2.8) --project(ParMETIS C) -+cmake_minimum_required(VERSION 3.13) -+project(parmetis VERSION 0.0.1 LANGUAGES C) - -+set(CMAKE_EXPORT_COMPILE_COMMANDS ON) - --# Search for MPI. --# GK commented this out as it seems to be creating problems --# include(FindMPI) --# if(NOT MPI_FOUND) --# message(FATAL_ERROR "mpi is not found") --# endif() --# set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${MPI_COMPILE_FLAGS}") -+if(NOT CMAKE_BUILD_TYPE) -+ set(CMAKE_BUILD_TYPE Release) -+endif() -+ -+# ----------------------------------------------------------------------------- -+include(GNUInstallDirs) -+ -+find_package(metis REQUIRED) - -+find_package(MPI COMPONENTS C REQUIRED) - --# Prepare libraries. --if(SHARED) -- set(ParMETIS_LIBRARY_TYPE SHARED) --else() -- set(ParMETIS_LIBRARY_TYPE STATIC) -+if(GPROF) -+ check_c_compiler_flag("-pg" HAVE_GPROF_SUPPORT) -+ if(NOT HAVE_GPROF_SUPPORT) -+ message(WARNING "GPROF support was requested, but is not available") -+ endif() - endif() - --include(./conf/gkbuild.cmake) -+# ----------------------------------------------------------------------------- -+ - --# List of paths that the compiler will search for header files. 
--# i.e., the -I equivalent --include_directories(include) --include_directories(${MPI_INCLUDE_PATH}) --include_directories(${GKLIB_PATH}/include) --include_directories(${METIS_PATH}/include) --include_directories(${CMAKE_INSTALL_PREFIX}/include) - --# List of paths that the compiler will search for library files. --# i.e., the -L equivalent --link_directories(${GKLIB_PATH}/lib) --link_directories(${METIS_PATH}/lib) --link_directories(${CMAKE_INSTALL_PREFIX}/lib) - --# List of directories that cmake will look for CMakeLists.txt --add_subdirectory(include) - add_subdirectory(libparmetis) - add_subdirectory(programs) -- --# This is for testing during development and is not being distributed --#add_subdirectory(test) -diff --git a/conf/check_thread_storage.c b/conf/check_thread_storage.c -deleted file mode 100644 -index e6e1e98..0000000 ---- a/conf/check_thread_storage.c -+++ /dev/null -@@ -1,5 +0,0 @@ --extern __thread int x; -- --int main(int argc, char **argv) { -- return 0; --} -diff --git a/conf/parmetisConfig.cmake.in b/conf/parmetisConfig.cmake.in -new file mode 100644 -index 0000000..3e948b1 ---- /dev/null -+++ b/conf/parmetisConfig.cmake.in -@@ -0,0 +1,14 @@ -+if(NOT parmetis_FOUND) -+set(parmetis_FOUND True) -+ -+@PACKAGE_INIT@ -+ -+include(CMakeFindDependencyMacro) -+find_dependency(metis) -+find_dependency(MPI COMPONENTS C) -+ -+include(${CMAKE_CURRENT_LIST_DIR}/parmetisTargets.cmake) -+ -+check_required_components(parmetis) -+ -+endif() -\ No newline at end of file -diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt -deleted file mode 100644 -index 6a27074..0000000 ---- a/include/CMakeLists.txt -+++ /dev/null -@@ -1 +0,0 @@ --install(FILES parmetis.h DESTINATION include) -\ No newline at end of file -diff --git a/libparmetis/CMakeLists.txt b/libparmetis/CMakeLists.txt -index b9d6d84..b8ddac5 100644 ---- a/libparmetis/CMakeLists.txt -+++ b/libparmetis/CMakeLists.txt -@@ -1,17 +1,52 @@ - # Include directories for library code. --include_directories(.) -- --# Find sources. 
--file(GLOB parmetis_sources *.c) -- --# Create libparmetis --add_library(parmetis ${ParMETIS_LIBRARY_TYPE} ${parmetis_sources}) -- --if(SHARED) -- target_link_libraries(parmetis metis GKlib) --endif() -- --install(TARGETS parmetis -- LIBRARY DESTINATION lib -- RUNTIME DESTINATION lib -- ARCHIVE DESTINATION lib) -+file(GLOB parmetis_sources ${CMAKE_CURRENT_SOURCE_DIR}/*.c) -+file(GLOB parmetis_headers ${CMAKE_CURRENT_SOURCE_DIR}/*.h) -+ -+add_library(${PROJECT_NAME}) -+add_library(${PROJECT_NAME}::${PROJECT_NAME} ALIAS ${PROJECT_NAME}) -+ -+target_compile_features(${PROJECT_NAME} PUBLIC c_std_99) -+set_property(TARGET ${PROJECT_NAME} PROPERTY POSITION_INDEPENDENT_CODE True) -+ -+target_sources(${PROJECT_NAME} PRIVATE ${parmetis_sources} ${parmetis_headers} ${PROJECT_SOURCE_DIR}/include/parmetis.h) -+ -+target_include_directories(${PROJECT_NAME} PUBLIC $ $ -+ $) -+ -+target_link_libraries(${PROJECT_NAME} -+ PUBLIC metis::metis GKlib::GKlib MPI::MPI_C -+ PRIVATE GKlib::GKlib_compile_definitions) -+ -+target_compile_options(${PROJECT_NAME} PUBLIC $<$,$>:-pg>) -+ -+# ----------------------------------------------------------------------------- -+# Configure and Install -+include(CMakePackageConfigHelpers) -+ -+configure_package_config_file(${PROJECT_SOURCE_DIR}/conf/parmetisConfig.cmake.in cmake/parmetisConfig.cmake -+ INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/parmetis) -+ -+write_basic_package_version_file( -+ cmake/parmetisConfigVersion.cmake -+ VERSION ${PROJECT_VERSION} -+ COMPATIBILITY SameMajorVersion) -+ -+# --------------------------- -+install( -+ TARGETS ${PROJECT_NAME} -+ EXPORT parmetisTargets -+ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} -+ LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} -+ ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} -+) -+ -+install( -+ FILES ${PROJECT_SOURCE_DIR}/include/parmetis.h -+ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) -+ -+install(EXPORT parmetisTargets FILE parmetisTargets.cmake NAMESPACE parmetis:: DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/parmetis) -+ -+install( -+ FILES ${CMAKE_CURRENT_BINARY_DIR}/cmake/parmetisConfig.cmake -+ ${CMAKE_CURRENT_BINARY_DIR}/cmake/parmetisConfigVersion.cmake -+ DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/parmetis) -\ No newline at end of file -diff --git a/programs/CMakeLists.txt b/programs/CMakeLists.txt -index 2f9341c..19285d7 100644 ---- a/programs/CMakeLists.txt -+++ b/programs/CMakeLists.txt -@@ -1,5 +1,3 @@ --include_directories(.) -- - # Create programs. 
- add_executable(pm_ptest ptest.c io.c adaptgraph.c)
- add_executable(pm_mtest mtest.c io.c)
-@@ -9,8 +7,8 @@ add_executable(pm_dglpart dglpart.c)
- 
- # Link with the required libraries
- foreach(prog pm_ptest pm_mtest pm_parmetis pm_pometis pm_dglpart)
-- target_link_libraries(${prog} parmetis metis GKlib m)
-+ target_link_libraries(${prog} PRIVATE parmetis::parmetis)
- endforeach(prog)
- 
- install(TARGETS pm_ptest pm_mtest pm_parmetis pm_pometis pm_dglpart
-- RUNTIME DESTINATION bin)
-+ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
-diff --git a/programs/adaptgraph.c b/programs/adaptgraph.c
-index 76ec425..92eeb03 100644
---- a/programs/adaptgraph.c
-+++ b/programs/adaptgraph.c
-@@ -12,7 +12,7 @@
- *
- */
- 
--#include <parmetisbin.h>
-+#include "parmetisbin.h"
- 
- 
- /*************************************************************************
-diff --git a/programs/dglpart.c b/programs/dglpart.c
-index 8d964cd..0f616af 100644
---- a/programs/dglpart.c
-+++ b/programs/dglpart.c
-@@ -8,7 +8,7 @@
- *
- */
- 
--#include <parmetisbin.h>
-+#include "parmetisbin.h"
- 
- #define CHUNKSIZE (1<<15)
- 
-diff --git a/programs/io.c b/programs/io.c
-index 9250659..eba63ac 100644
---- a/programs/io.c
-+++ b/programs/io.c
-@@ -12,7 +12,7 @@
- *
- */
- 
--#include <parmetisbin.h>
-+#include "parmetisbin.h"
- #define MAXLINE 64*1024*1024
- 
- /*************************************************************************
-diff --git a/programs/mtest.c b/programs/mtest.c
-index f6f892e..dc86ca3 100644
---- a/programs/mtest.c
-+++ b/programs/mtest.c
-@@ -12,7 +12,7 @@
- *
- */
- 
--#include <parmetisbin.h>
-+#include "parmetisbin.h"
- 
- 
- /*************************************************************************
-diff --git a/programs/otest.c b/programs/otest.c
-index aaee9c3..11cdc97 100644
---- a/programs/otest.c
-+++ b/programs/otest.c
-@@ -12,7 +12,7 @@
- *
- */
- 
--#include <parmetisbin.h>
-+#include "parmetisbin.h"
- 
- 
- /*************************************************************************/
-diff --git a/programs/parmetis.c b/programs/parmetis.c
-index e087eb5..aef7ddc 100644
---- a/programs/parmetis.c
-+++ b/programs/parmetis.c
-@@ -12,7 +12,7 @@
- *
- */
- 
--#include <parmetisbin.h>
-+#include "parmetisbin.h"
- 
- /*************************************************************************
- * Let the game begin
-diff --git a/programs/pometis.c b/programs/pometis.c
-index 2f18fbc..a306cfe 100644
---- a/programs/pometis.c
-+++ b/programs/pometis.c
-@@ -12,7 +12,7 @@
- *
- */
- 
--#include <parmetisbin.h>
-+#include "parmetisbin.h"
- 
- /*************************************************************************
- * Let the game begin
-diff --git a/programs/ptest.c b/programs/ptest.c
-index 7cfb00b..87083d0 100644
---- a/programs/ptest.c
-+++ b/programs/ptest.c
-@@ -12,7 +12,7 @@
- *
- */
- 
--#include <parmetisbin.h>
-+#include "parmetisbin.h"
- 
- #define NCON 5
- 
- 
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 146bc5f..89f3c13 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -1,43 +1,30 @@
+-cmake_minimum_required(VERSION 2.8)
+-project(ParMETIS C)
++cmake_minimum_required(VERSION 3.13)
++project(parmetis VERSION 0.0.1 LANGUAGES C)
+ 
++set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+ 
+-# Search for MPI.
+-# GK commented this out as it seems to be creating problems +-# include(FindMPI) +-# if(NOT MPI_FOUND) +-# message(FATAL_ERROR "mpi is not found") +-# endif() +-# set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${MPI_COMPILE_FLAGS}") ++if(NOT CMAKE_BUILD_TYPE) ++ set(CMAKE_BUILD_TYPE Release) ++endif() ++ ++# ----------------------------------------------------------------------------- ++include(GNUInstallDirs) ++ ++find_package(metis REQUIRED) + ++find_package(MPI COMPONENTS C REQUIRED) + +-# Prepare libraries. +-if(SHARED) +- set(ParMETIS_LIBRARY_TYPE SHARED) +-else() +- set(ParMETIS_LIBRARY_TYPE STATIC) ++if(GPROF) ++ check_c_compiler_flag("-pg" HAVE_GPROF_SUPPORT) ++ if(NOT HAVE_GPROF_SUPPORT) ++ message(WARNING "GPROF support was requested, but is not available") ++ endif() + endif() + +-include(./conf/gkbuild.cmake) ++# ----------------------------------------------------------------------------- ++ + +-# List of paths that the compiler will search for header files. +-# i.e., the -I equivalent +-include_directories(include) +-include_directories(${MPI_INCLUDE_PATH}) +-include_directories(${GKLIB_PATH}/include) +-include_directories(${METIS_PATH}/include) +-include_directories(${CMAKE_INSTALL_PREFIX}/include) + +-# List of paths that the compiler will search for library files. +-# i.e., the -L equivalent +-link_directories(${GKLIB_PATH}/lib) +-link_directories(${METIS_PATH}/lib) +-link_directories(${CMAKE_INSTALL_PREFIX}/lib) + +-# List of directories that cmake will look for CMakeLists.txt +-add_subdirectory(include) + add_subdirectory(libparmetis) + add_subdirectory(programs) +- +-# This is for testing during development and is not being distributed +-#add_subdirectory(test) +diff --git a/conf/check_thread_storage.c b/conf/check_thread_storage.c +deleted file mode 100644 +index e6e1e98..0000000 +--- a/conf/check_thread_storage.c ++++ /dev/null +@@ -1,5 +0,0 @@ +-extern __thread int x; +- +-int main(int argc, char **argv) { +- return 0; +-} +diff --git a/conf/parmetisConfig.cmake.in b/conf/parmetisConfig.cmake.in +new file mode 100644 +index 0000000..3e948b1 +--- /dev/null ++++ b/conf/parmetisConfig.cmake.in +@@ -0,0 +1,14 @@ ++if(NOT parmetis_FOUND) ++set(parmetis_FOUND True) ++ ++@PACKAGE_INIT@ ++ ++include(CMakeFindDependencyMacro) ++find_dependency(metis) ++find_dependency(MPI COMPONENTS C) ++ ++include(${CMAKE_CURRENT_LIST_DIR}/parmetisTargets.cmake) ++ ++check_required_components(parmetis) ++ ++endif() +\ No newline at end of file +diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt +deleted file mode 100644 +index 6a27074..0000000 +--- a/include/CMakeLists.txt ++++ /dev/null +@@ -1 +0,0 @@ +-install(FILES parmetis.h DESTINATION include) +\ No newline at end of file +diff --git a/libparmetis/CMakeLists.txt b/libparmetis/CMakeLists.txt +index b9d6d84..b8ddac5 100644 +--- a/libparmetis/CMakeLists.txt ++++ b/libparmetis/CMakeLists.txt +@@ -1,17 +1,52 @@ + # Include directories for library code. +-include_directories(.) +- +-# Find sources. 
+-file(GLOB parmetis_sources *.c) +- +-# Create libparmetis +-add_library(parmetis ${ParMETIS_LIBRARY_TYPE} ${parmetis_sources}) +- +-if(SHARED) +- target_link_libraries(parmetis metis GKlib) +-endif() +- +-install(TARGETS parmetis +- LIBRARY DESTINATION lib +- RUNTIME DESTINATION lib +- ARCHIVE DESTINATION lib) ++file(GLOB parmetis_sources ${CMAKE_CURRENT_SOURCE_DIR}/*.c) ++file(GLOB parmetis_headers ${CMAKE_CURRENT_SOURCE_DIR}/*.h) ++ ++add_library(${PROJECT_NAME}) ++add_library(${PROJECT_NAME}::${PROJECT_NAME} ALIAS ${PROJECT_NAME}) ++ ++target_compile_features(${PROJECT_NAME} PUBLIC c_std_99) ++set_property(TARGET ${PROJECT_NAME} PROPERTY POSITION_INDEPENDENT_CODE True) ++ ++target_sources(${PROJECT_NAME} PRIVATE ${parmetis_sources} ${parmetis_headers} ${PROJECT_SOURCE_DIR}/include/parmetis.h) ++ ++target_include_directories(${PROJECT_NAME} PUBLIC $ $ ++ $) ++ ++target_link_libraries(${PROJECT_NAME} ++ PUBLIC metis::metis GKlib::GKlib MPI::MPI_C ++ PRIVATE GKlib::GKlib_compile_definitions) ++ ++target_compile_options(${PROJECT_NAME} PUBLIC $<$,$>:-pg>) ++ ++# ----------------------------------------------------------------------------- ++# Configure and Install ++include(CMakePackageConfigHelpers) ++ ++configure_package_config_file(${PROJECT_SOURCE_DIR}/conf/parmetisConfig.cmake.in cmake/parmetisConfig.cmake ++ INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/parmetis) ++ ++write_basic_package_version_file( ++ cmake/parmetisConfigVersion.cmake ++ VERSION ${PROJECT_VERSION} ++ COMPATIBILITY SameMajorVersion) ++ ++# --------------------------- ++install( ++ TARGETS ${PROJECT_NAME} ++ EXPORT parmetisTargets ++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} ++ LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ++ ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} ++) ++ ++install( ++ FILES ${PROJECT_SOURCE_DIR}/include/parmetis.h ++ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) ++ ++install(EXPORT parmetisTargets FILE parmetisTargets.cmake NAMESPACE parmetis:: DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/parmetis) ++ ++install( ++ FILES ${CMAKE_CURRENT_BINARY_DIR}/cmake/parmetisConfig.cmake ++ ${CMAKE_CURRENT_BINARY_DIR}/cmake/parmetisConfigVersion.cmake ++ DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/parmetis) +\ No newline at end of file +diff --git a/programs/CMakeLists.txt b/programs/CMakeLists.txt +index 2f9341c..19285d7 100644 +--- a/programs/CMakeLists.txt ++++ b/programs/CMakeLists.txt +@@ -1,5 +1,3 @@ +-include_directories(.) +- + # Create programs. 
+ add_executable(pm_ptest ptest.c io.c adaptgraph.c)
+ add_executable(pm_mtest mtest.c io.c)
+@@ -9,8 +7,8 @@ add_executable(pm_dglpart dglpart.c)
+ 
+ # Link with the required libraries
+ foreach(prog pm_ptest pm_mtest pm_parmetis pm_pometis pm_dglpart)
+- target_link_libraries(${prog} parmetis metis GKlib m)
++ target_link_libraries(${prog} PRIVATE parmetis::parmetis)
+ endforeach(prog)
+ 
+ install(TARGETS pm_ptest pm_mtest pm_parmetis pm_pometis pm_dglpart
+- RUNTIME DESTINATION bin)
++ RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+diff --git a/programs/adaptgraph.c b/programs/adaptgraph.c
+index 76ec425..92eeb03 100644
+--- a/programs/adaptgraph.c
++++ b/programs/adaptgraph.c
+@@ -12,7 +12,7 @@
+ *
+ */
+ 
+-#include <parmetisbin.h>
++#include "parmetisbin.h"
+ 
+ 
+ /*************************************************************************
+diff --git a/programs/dglpart.c b/programs/dglpart.c
+index 8d964cd..0f616af 100644
+--- a/programs/dglpart.c
++++ b/programs/dglpart.c
+@@ -8,7 +8,7 @@
+ *
+ */
+ 
+-#include <parmetisbin.h>
++#include "parmetisbin.h"
+ 
+ #define CHUNKSIZE (1<<15)
+ 
+diff --git a/programs/io.c b/programs/io.c
+index 9250659..eba63ac 100644
+--- a/programs/io.c
++++ b/programs/io.c
+@@ -12,7 +12,7 @@
+ *
+ */
+ 
+-#include <parmetisbin.h>
++#include "parmetisbin.h"
+ #define MAXLINE 64*1024*1024
+ 
+ /*************************************************************************
+diff --git a/programs/mtest.c b/programs/mtest.c
+index f6f892e..dc86ca3 100644
+--- a/programs/mtest.c
++++ b/programs/mtest.c
+@@ -12,7 +12,7 @@
+ *
+ */
+ 
+-#include <parmetisbin.h>
++#include "parmetisbin.h"
+ 
+ 
+ /*************************************************************************
+diff --git a/programs/otest.c b/programs/otest.c
+index aaee9c3..11cdc97 100644
+--- a/programs/otest.c
++++ b/programs/otest.c
+@@ -12,7 +12,7 @@
+ *
+ */
+ 
+-#include <parmetisbin.h>
++#include "parmetisbin.h"
+ 
+ 
+ /*************************************************************************/
+diff --git a/programs/parmetis.c b/programs/parmetis.c
+index e087eb5..aef7ddc 100644
+--- a/programs/parmetis.c
++++ b/programs/parmetis.c
+@@ -12,7 +12,7 @@
+ *
+ */
+ 
+-#include <parmetisbin.h>
++#include "parmetisbin.h"
+ 
+ /*************************************************************************
+ * Let the game begin
+diff --git a/programs/pometis.c b/programs/pometis.c
+index 2f18fbc..a306cfe 100644
+--- a/programs/pometis.c
++++ b/programs/pometis.c
+@@ -12,7 +12,7 @@
+ *
+ */
+ 
+-#include <parmetisbin.h>
++#include "parmetisbin.h"
+ 
+ /*************************************************************************
+ * Let the game begin
+diff --git a/programs/ptest.c b/programs/ptest.c
+index 7cfb00b..87083d0 100644
+--- a/programs/ptest.c
++++ b/programs/ptest.c
+@@ -12,7 +12,7 @@
+ *
+ */
+ 
+-#include <parmetisbin.h>
++#include "parmetisbin.h"
+ 
+ #define NCON 5
+ 
+ 
diff --git a/extern/patch/ParMETIS/patch_install.diff b/extern/patch/ParMETIS/patch_install.diff
index 643a67c956..372e0fc828 100644
--- a/extern/patch/ParMETIS/patch_install.diff
+++ b/extern/patch/ParMETIS/patch_install.diff
@@ -1,13 +1,13 @@
-diff --git a/programs/CMakeLists.txt b/programs/CMakeLists.txt
-index 19285d7..e27a809 100644
---- a/programs/CMakeLists.txt
-+++ b/programs/CMakeLists.txt
-@@ -10,5 +10,5 @@ foreach(prog pm_ptest pm_mtest pm_parmetis pm_pometis pm_dglpart)
- target_link_libraries(${prog} PRIVATE parmetis::parmetis)
- endforeach(prog)
- 
--install(TARGETS pm_ptest pm_mtest pm_parmetis pm_pometis pm_dglpart
-- RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
-+# install(TARGETS pm_ptest pm_mtest pm_parmetis pm_pometis pm_dglpart
-+# RUNTIME DESTINATION 
${CMAKE_INSTALL_BINDIR}) - +diff --git a/programs/CMakeLists.txt b/programs/CMakeLists.txt +index 19285d7..e27a809 100644 +--- a/programs/CMakeLists.txt ++++ b/programs/CMakeLists.txt +@@ -10,5 +10,5 @@ foreach(prog pm_ptest pm_mtest pm_parmetis pm_pometis pm_dglpart) + target_link_libraries(${prog} PRIVATE parmetis::parmetis) + endforeach(prog) + +-install(TARGETS pm_ptest pm_mtest pm_parmetis pm_pometis pm_dglpart +- RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) ++# install(TARGETS pm_ptest pm_mtest pm_parmetis pm_pometis pm_dglpart ++# RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) + diff --git a/extern/patch/ParMETIS/patch_spack.diff b/extern/patch/ParMETIS/patch_spack.diff index 3313269f32..cc3afc4816 100644 --- a/extern/patch/ParMETIS/patch_spack.diff +++ b/extern/patch/ParMETIS/patch_spack.diff @@ -1,71 +1,71 @@ -diff --git a/libparmetis/xyzpart.c b/libparmetis/xyzpart.c -index 0f1d746..9cc18ac 100644 ---- a/libparmetis/xyzpart.c -+++ b/libparmetis/xyzpart.c -@@ -104,14 +104,14 @@ void IRBinCoordinates(ctrl_t *ctrl, graph_t *graph, idx_t ndims, real_t *xyz, - - for (i=0; i 1) { -- int flag, rank = 0; -- MPI_Initialized(&flag); -- if (flag) { -- MPIComm c; -- rank = c.rank(); -- } -- cudaSetDevice(rank % devs); -- } --#endif -+// #if defined(STRUMPACK_USE_MPI) -+// int devs; -+// cudaGetDeviceCount(&devs); -+// if (devs > 1) { -+// int flag, rank = 0; -+// MPI_Initialized(&flag); -+// if (flag) { -+// MPIComm c; -+// rank = c.rank(); -+// } -+// cudaSetDevice(rank % devs); -+// } -+// #endif - } - - void gemm(BLASHandle& handle, cublasOperation_t transa, -diff --git a/src/dense/HIPWrapper.cpp b/src/dense/HIPWrapper.cpp -index 3534131c..59e76434 100644 ---- a/src/dense/HIPWrapper.cpp -+++ b/src/dense/HIPWrapper.cpp -@@ -88,19 +88,19 @@ namespace strumpack { - } - - void init() { --#if defined(STRUMPACK_USE_MPI) -- int devs; -- gpu_check(hipGetDeviceCount(&devs)); -- if (devs > 1) { -- int flag, rank = 0; -- MPI_Initialized(&flag); -- if (flag) { -- MPIComm c; -- rank = c.rank(); -- } -- gpu_check(hipSetDevice(rank % devs)); -- } --#endif -+// #if defined(STRUMPACK_USE_MPI) -+// int devs; -+// gpu_check(hipGetDeviceCount(&devs)); -+// if (devs > 1) { -+// int flag, rank = 0; -+// MPI_Initialized(&flag); -+// if (flag) { -+// MPIComm c; -+// rank = c.rank(); -+// } -+// gpu_check(hipSetDevice(rank % devs)); -+// } -+// #endif - } - - void gemm(BLASHandle& handle, hipblasOperation_t transa, +diff --git a/src/dense/CUDAWrapper.cpp b/src/dense/CUDAWrapper.cpp +index 6236a723..532729f5 100644 +--- a/src/dense/CUDAWrapper.cpp ++++ b/src/dense/CUDAWrapper.cpp +@@ -317,23 +317,23 @@ namespace strumpack { + } + + void init() { +-#if defined(STRUMPACK_USE_MPI) +- int devs; +- gpu_check(cudaGetDeviceCount(&devs)); +- if (devs > 1) { +- int flag, rank = 0; +- MPI_Initialized(&flag); +- if (flag) { +- MPIComm c; +- rank = c.rank(); +- } +- gpu_check(cudaSetDevice(rank % devs)); +-#pragma omp parallel +- { +- gpu_check(cudaSetDevice(rank % devs)); +- } +- } +-#endif ++// #if defined(STRUMPACK_USE_MPI) ++// int devs; ++// gpu_check(cudaGetDeviceCount(&devs)); ++// if (devs > 1) { ++// int flag, rank = 0; ++// MPI_Initialized(&flag); ++// if (flag) { ++// MPIComm c; ++// rank = c.rank(); ++// } ++// gpu_check(cudaSetDevice(rank % devs)); ++// #pragma omp parallel ++// { ++// gpu_check(cudaSetDevice(rank % devs)); ++// } ++// } ++// #endif + // gpu_check(cudaFree(0)); + // #if defined(STRUMPACK_USE_MAGMA) + // magma_init(); +diff --git a/src/dense/HIPWrapper.cpp b/src/dense/HIPWrapper.cpp +index 
c68623cf..6fd8f425 100644 +--- a/src/dense/HIPWrapper.cpp ++++ b/src/dense/HIPWrapper.cpp +@@ -280,23 +280,23 @@ namespace strumpack { + } + + void init() { +-#if defined(STRUMPACK_USE_MPI) +- int devs; +- gpu_check(hipGetDeviceCount(&devs)); +- if (devs > 1) { +- int flag, rank = 0; +- MPI_Initialized(&flag); +- if (flag) { +- MPIComm c; +- rank = c.rank(); +- } +- gpu_check(hipSetDevice(rank % devs)); +-#pragma omp parallel +- { +- gpu_check(hipSetDevice(rank % devs)); +- } +- } +-#endif ++// #if defined(STRUMPACK_USE_MPI) ++// int devs; ++// gpu_check(hipGetDeviceCount(&devs)); ++// if (devs > 1) { ++// int flag, rank = 0; ++// MPI_Initialized(&flag); ++// if (flag) { ++// MPIComm c; ++// rank = c.rank(); ++// } ++// gpu_check(hipSetDevice(rank % devs)); ++// #pragma omp parallel ++// { ++// gpu_check(hipSetDevice(rank % devs)); ++// } ++// } ++// #endif + } + + void device_memset(void* dptr, int value, std::size_t count) { +@@ -563,7 +563,7 @@ namespace strumpack { + STRUMPACK_FLOPS(4*blas::trsm_flops(m, n, alpha, side)); + STRUMPACK_BYTES(2*8*blas::trsm_moves(m, n)); + gpu_check(hipblasZtrsm(handle, side, uplo, trans, diag, m, n, +- reinterpret_cast(alpha), ++ reinterpret_cast(alpha), + reinterpret_cast(A), lda, + reinterpret_cast(B), ldb)); + } diff --git a/extern/patch/arpack-ng/patch_build.diff b/extern/patch/arpack-ng/patch_build.diff index cb86b9b700..78238827bc 100644 --- a/extern/patch/arpack-ng/patch_build.diff +++ b/extern/patch/arpack-ng/patch_build.diff @@ -1,100 +1,100 @@ -diff --git a/CMakeLists.txt b/CMakeLists.txt -index 690cad4..e238b8f 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -71,7 +71,7 @@ function(pexamples list_name) - foreach(l ${${list_name}}) - get_filename_component(lwe ${l} NAME_WE) - add_executable(${lwe} ${parpackexample_DIR}/${l} ) -- target_link_libraries(${lwe} parpack arpack MPI::MPI_Fortran) -+ target_link_libraries(${lwe} parpack arpack BLAS::BLAS LAPACK::LAPACK ${EXTRA_LDFLAGS} MPI::MPI_Fortran) - add_test(NAME "${lwe}_ex" COMMAND mpiexec -n 2 ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${lwe}) - endforeach() - endfunction(pexamples) -@@ -390,13 +390,11 @@ endif() - add_library(arpack ${arpackutil_STAT_SRCS} ${arpacksrc_STAT_SRCS} ${arpacksrc_ICB}) - - target_link_libraries(arpack -- PUBLIC -- $,,LAPACK::LAPACK>> -- $,,BLAS::BLAS>> -+ PRIVATE - $ - $ - ) --target_link_options(arpack PUBLIC "${EXTRA_LDFLAGS}") -+target_link_options(arpack PRIVATE "${EXTRA_LDFLAGS}") - set_target_properties(arpack PROPERTIES OUTPUT_NAME arpack${LIBSUFFIX}${ITF64SUFFIX}) - set_target_properties(arpack PROPERTIES VERSION 2.1.0) - set_target_properties(arpack PROPERTIES SOVERSION 2) -@@ -415,9 +413,10 @@ if (MPI) - # use -DBUILD_SHARED_LIBS=ON|OFF to control static/shared - add_library(parpack ${parpacksrc_STAT_SRCS} ${parpackutil_STAT_SRCS} ${parpacksrc_ICB}) - target_link_libraries(parpack -- PUBLIC -+ PRIVATE - arpack -- $,,MPI::MPI_Fortran>> -+ $ -+ $ - $ - ) - set_target_properties(parpack PROPERTIES OUTPUT_NAME parpack${LIBSUFFIX}${ITF64SUFFIX}) -@@ -674,7 +673,7 @@ function(build_tests) - set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/PARPACK/TESTS/MPI) - - add_executable(issue46 PARPACK/TESTS/MPI/issue46.f) -- target_link_libraries(issue46 parpack arpack BLAS::BLAS LAPACK::LAPACK ${EXTRA_LDFLAGS}) -+ target_link_libraries(issue46 parpack arpack BLAS::BLAS LAPACK::LAPACK ${EXTRA_LDFLAGS} MPI::MPI_Fortran) - add_test(issue46_tst mpiexec -n 2 ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/issue46) - endif() - -diff --git a/cmake/arpackng-config.cmake.in 
b/cmake/arpackng-config.cmake.in -index bafa3fe..fe2fa53 100644 ---- a/cmake/arpackng-config.cmake.in -+++ b/cmake/arpackng-config.cmake.in -@@ -12,25 +12,25 @@ - # target_include_directories(main INTERFACE PARPACK::PARPACK) - # target_link_libraries(main PARPACK::PARPACK) - --if (NOT @BUILD_SHARED_LIBS@) -- include(CMakeFindDependencyMacro) -- # Find dependencies -- if (NOT TARGET BLAS::BLAS) -- find_dependency(BLAS REQUIRED) -- endif() -- if (NOT TARGET LAPACK::LAPACK) -- find_dependency(LAPACK REQUIRED) -- endif() -- if (@ICB@) -- enable_language(Fortran) -- endif() -- if (@MPI@) -- include(FindMPI) -- if (NOT TARGET MPI::Fortran) -- find_dependency(MPI REQUIRED COMPONENTS Fortran) -- endif() -- endif() --endif() -+#if (NOT @BUILD_SHARED_LIBS@) -+# include(CMakeFindDependencyMacro) -+# # Find dependencies -+# if (NOT TARGET BLAS::BLAS) -+# find_dependency(BLAS REQUIRED) -+# endif() -+# if (NOT TARGET LAPACK::LAPACK) -+# find_dependency(LAPACK REQUIRED) -+# endif() -+# if (@ICB@) -+# enable_language(Fortran) -+# endif() -+# if (@MPI@) -+# include(FindMPI) -+# if (NOT TARGET MPI::Fortran) -+# find_dependency(MPI REQUIRED COMPONENTS Fortran) -+# endif() -+# endif() -+#endif() - - include("${CMAKE_CURRENT_LIST_DIR}/arpackngTargets.cmake") - +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 690cad4..e238b8f 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -71,7 +71,7 @@ function(pexamples list_name) + foreach(l ${${list_name}}) + get_filename_component(lwe ${l} NAME_WE) + add_executable(${lwe} ${parpackexample_DIR}/${l} ) +- target_link_libraries(${lwe} parpack arpack MPI::MPI_Fortran) ++ target_link_libraries(${lwe} parpack arpack BLAS::BLAS LAPACK::LAPACK ${EXTRA_LDFLAGS} MPI::MPI_Fortran) + add_test(NAME "${lwe}_ex" COMMAND mpiexec -n 2 ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${lwe}) + endforeach() + endfunction(pexamples) +@@ -390,13 +390,11 @@ endif() + add_library(arpack ${arpackutil_STAT_SRCS} ${arpacksrc_STAT_SRCS} ${arpacksrc_ICB}) + + target_link_libraries(arpack +- PUBLIC +- $,,LAPACK::LAPACK>> +- $,,BLAS::BLAS>> ++ PRIVATE + $ + $ + ) +-target_link_options(arpack PUBLIC "${EXTRA_LDFLAGS}") ++target_link_options(arpack PRIVATE "${EXTRA_LDFLAGS}") + set_target_properties(arpack PROPERTIES OUTPUT_NAME arpack${LIBSUFFIX}${ITF64SUFFIX}) + set_target_properties(arpack PROPERTIES VERSION 2.1.0) + set_target_properties(arpack PROPERTIES SOVERSION 2) +@@ -415,9 +413,10 @@ if (MPI) + # use -DBUILD_SHARED_LIBS=ON|OFF to control static/shared + add_library(parpack ${parpacksrc_STAT_SRCS} ${parpackutil_STAT_SRCS} ${parpacksrc_ICB}) + target_link_libraries(parpack +- PUBLIC ++ PRIVATE + arpack +- $,,MPI::MPI_Fortran>> ++ $ ++ $ + $ + ) + set_target_properties(parpack PROPERTIES OUTPUT_NAME parpack${LIBSUFFIX}${ITF64SUFFIX}) +@@ -674,7 +673,7 @@ function(build_tests) + set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/PARPACK/TESTS/MPI) + + add_executable(issue46 PARPACK/TESTS/MPI/issue46.f) +- target_link_libraries(issue46 parpack arpack BLAS::BLAS LAPACK::LAPACK ${EXTRA_LDFLAGS}) ++ target_link_libraries(issue46 parpack arpack BLAS::BLAS LAPACK::LAPACK ${EXTRA_LDFLAGS} MPI::MPI_Fortran) + add_test(issue46_tst mpiexec -n 2 ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/issue46) + endif() + +diff --git a/cmake/arpackng-config.cmake.in b/cmake/arpackng-config.cmake.in +index bafa3fe..fe2fa53 100644 +--- a/cmake/arpackng-config.cmake.in ++++ b/cmake/arpackng-config.cmake.in +@@ -12,25 +12,25 @@ + # target_include_directories(main INTERFACE PARPACK::PARPACK) + # target_link_libraries(main 
PARPACK::PARPACK) + +-if (NOT @BUILD_SHARED_LIBS@) +- include(CMakeFindDependencyMacro) +- # Find dependencies +- if (NOT TARGET BLAS::BLAS) +- find_dependency(BLAS REQUIRED) +- endif() +- if (NOT TARGET LAPACK::LAPACK) +- find_dependency(LAPACK REQUIRED) +- endif() +- if (@ICB@) +- enable_language(Fortran) +- endif() +- if (@MPI@) +- include(FindMPI) +- if (NOT TARGET MPI::Fortran) +- find_dependency(MPI REQUIRED COMPONENTS Fortran) +- endif() +- endif() +-endif() ++#if (NOT @BUILD_SHARED_LIBS@) ++# include(CMakeFindDependencyMacro) ++# # Find dependencies ++# if (NOT TARGET BLAS::BLAS) ++# find_dependency(BLAS REQUIRED) ++# endif() ++# if (NOT TARGET LAPACK::LAPACK) ++# find_dependency(LAPACK REQUIRED) ++# endif() ++# if (@ICB@) ++# enable_language(Fortran) ++# endif() ++# if (@MPI@) ++# include(FindMPI) ++# if (NOT TARGET MPI::Fortran) ++# find_dependency(MPI REQUIRED COMPONENTS Fortran) ++# endif() ++# endif() ++#endif() + + include("${CMAKE_CURRENT_LIST_DIR}/arpackngTargets.cmake") + diff --git a/extern/patch/arpack-ng/patch_second.diff b/extern/patch/arpack-ng/patch_second.diff index 84733caa7d..a6d3ccb2ff 100644 --- a/extern/patch/arpack-ng/patch_second.diff +++ b/extern/patch/arpack-ng/patch_second.diff @@ -1,31 +1,31 @@ -diff --git a/CMakeLists.txt b/CMakeLists.txt -index a11e77c..a646634 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -329,7 +329,7 @@ set(arpackutil_STAT_SRCS - ${arpack_SOURCE_DIR}/UTIL/iset.f - ${arpack_SOURCE_DIR}/UTIL/iswap.f - ${arpack_SOURCE_DIR}/UTIL/ivout.f -- ${arpack_SOURCE_DIR}/UTIL/second_NONE.f -+ ${arpack_SOURCE_DIR}/UTIL/second.f - ${arpack_SOURCE_DIR}/UTIL/svout.f - ${arpack_SOURCE_DIR}/UTIL/smout.f - ${arpack_SOURCE_DIR}/UTIL/dvout.f -diff --git a/UTIL/Makefile.am b/UTIL/Makefile.am -index 0d0f6e3..65100eb 100644 ---- a/UTIL/Makefile.am -+++ b/UTIL/Makefile.am -@@ -1,11 +1,11 @@ --SRCS = icnteq.f icopy.f iset.f iswap.f ivout.f second_NONE.f -+SRCS = icnteq.f icopy.f iset.f iswap.f ivout.f second.f - - SSRC = svout.f smout.f - DSRC = dvout.f dmout.f - CSRC = cvout.f cmout.f - ZSRC = zvout.f zmout.f - --EXTRA_DIST = second.f second.t3d -+EXTRA_DIST = second_NONE.f second.t3d - - noinst_LTLIBRARIES = libarpackutil.la - libarpackutil_la_SOURCES = $(SRCS) $(SSRC) $(DSRC) $(CSRC) $(ZSRC) +diff --git a/CMakeLists.txt b/CMakeLists.txt +index a11e77c..a646634 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -329,7 +329,7 @@ set(arpackutil_STAT_SRCS + ${arpack_SOURCE_DIR}/UTIL/iset.f + ${arpack_SOURCE_DIR}/UTIL/iswap.f + ${arpack_SOURCE_DIR}/UTIL/ivout.f +- ${arpack_SOURCE_DIR}/UTIL/second_NONE.f ++ ${arpack_SOURCE_DIR}/UTIL/second.f + ${arpack_SOURCE_DIR}/UTIL/svout.f + ${arpack_SOURCE_DIR}/UTIL/smout.f + ${arpack_SOURCE_DIR}/UTIL/dvout.f +diff --git a/UTIL/Makefile.am b/UTIL/Makefile.am +index 0d0f6e3..65100eb 100644 +--- a/UTIL/Makefile.am ++++ b/UTIL/Makefile.am +@@ -1,11 +1,11 @@ +-SRCS = icnteq.f icopy.f iset.f iswap.f ivout.f second_NONE.f ++SRCS = icnteq.f icopy.f iset.f iswap.f ivout.f second.f + + SSRC = svout.f smout.f + DSRC = dvout.f dmout.f + CSRC = cvout.f cmout.f + ZSRC = zvout.f zmout.f + +-EXTRA_DIST = second.f second.t3d ++EXTRA_DIST = second_NONE.f second.t3d + + noinst_LTLIBRARIES = libarpackutil.la + libarpackutil_la_SOURCES = $(SRCS) $(SSRC) $(DSRC) $(CSRC) $(ZSRC) diff --git a/extern/patch/arpack-ng/patch_zdotc.diff b/extern/patch/arpack-ng/patch_zdotc.diff index c8a89cb7ac..04e84daaba 100644 --- a/extern/patch/arpack-ng/patch_zdotc.diff +++ b/extern/patch/arpack-ng/patch_zdotc.diff @@ -1,433 +1,433 @@ -diff --git 
a/PARPACK/SRC/BLACS/pzgetv0.f b/PARPACK/SRC/BLACS/pzgetv0.f -index 17354c1..bcbf552 100644 ---- a/PARPACK/SRC/BLACS/pzgetv0.f -+++ b/PARPACK/SRC/BLACS/pzgetv0.f -@@ -197,8 +197,8 @@ c - Double precision - & pdznorm2 , dlapy2 - Complex*16 -- & zzdotc -- external zzdotc , pdznorm2 , dlapy2 -+ & zzdotc10 -+ external zzdotc10 , pdznorm2 , dlapy2 - c - c %-----------------% - c | Data Statements | -@@ -335,7 +335,7 @@ c - c - first = .FALSE. - if (bmat .eq. 'G') then -- cnorm = zzdotc (n, resid, 1, workd, 1) -+ cnorm = zzdotc10 (n, resid, 1, workd, 1) - call zgsum2d ( comm, 'All', ' ', 1, 1, cnorm, 1, -1, -1 ) - rnorm0 = sqrt(dlapy2 (dble (cnorm),dimag (cnorm))) - else if (bmat .eq. 'I') then -@@ -394,7 +394,7 @@ c - end if - c - if (bmat .eq. 'G') then -- cnorm = zzdotc (n, resid, 1, workd, 1) -+ cnorm = zzdotc10 (n, resid, 1, workd, 1) - call zgsum2d ( comm, 'All', ' ', 1, 1, cnorm, 1, -1, -1 ) - rnorm = sqrt(dlapy2 (dble (cnorm),dimag (cnorm))) - else if (bmat .eq. 'I') then -diff --git a/PARPACK/SRC/BLACS/pznaitr.f b/PARPACK/SRC/BLACS/pznaitr.f -index cd1ba01..a6a3d29 100644 ---- a/PARPACK/SRC/BLACS/pznaitr.f -+++ b/PARPACK/SRC/BLACS/pznaitr.f -@@ -303,10 +303,10 @@ c | External Functions | - c %--------------------% - c - Complex*16 -- & zzdotc -+ & zzdotc10 - Double precision - & pdlamch, pdznorm2, zlanhs, dlapy2 -- external zzdotc, pdznorm2, zlanhs, pdlamch, dlapy2 -+ external zzdotc10, pdznorm2, zlanhs, pdlamch, dlapy2 - c - c %---------------------% - c | Intrinsic Functions | -@@ -573,7 +573,7 @@ c | Compute the B-norm of OP*v_{j}. | - c %-------------------------------------% - c - if (bmat .eq. 'G') then -- cnorm = zzdotc (n, resid, 1, workd(ipj), 1) -+ cnorm = zzdotc10 (n, resid, 1, workd(ipj), 1) - call zgsum2d( comm, 'All', ' ', 1, 1, cnorm, 1, -1, -1 ) - wnorm = sqrt( dlapy2(dble(cnorm),dimag(cnorm)) ) - else if (bmat .eq. 'I') then -@@ -647,7 +647,7 @@ c | Compute the B-norm of r_{j}. | - c %------------------------------% - c - if (bmat .eq. 'G') then -- cnorm = zzdotc (n, resid, 1, workd(ipj), 1) -+ cnorm = zzdotc10 (n, resid, 1, workd(ipj), 1) - call zgsum2d( comm, 'All', ' ', 1, 1, cnorm, 1, -1, -1 ) - rnorm = sqrt( dlapy2(dble(cnorm),dimag(cnorm)) ) - else if (bmat .eq. 'I') then -@@ -749,7 +749,7 @@ c | Compute the B-norm of the corrected residual r_{j}. | - c %-----------------------------------------------------% - c - if (bmat .eq. 'G') then -- cnorm = zzdotc (n, resid, 1, workd(ipj), 1) -+ cnorm = zzdotc10 (n, resid, 1, workd(ipj), 1) - call zgsum2d( comm, 'All', ' ', 1, 1, cnorm, 1, -1, -1 ) - rnorm1 = sqrt( dlapy2(dble(cnorm),dimag(cnorm)) ) - else if (bmat .eq. 'I') then -diff --git a/PARPACK/SRC/BLACS/pznaup2.f b/PARPACK/SRC/BLACS/pznaup2.f -index 8fff22f..a1af4c2 100644 ---- a/PARPACK/SRC/BLACS/pznaup2.f -+++ b/PARPACK/SRC/BLACS/pznaup2.f -@@ -254,10 +254,10 @@ c | External functions | - c %--------------------% - c - Complex*16 -- & zzdotc -+ & zzdotc10 - Double precision - & pdznorm2, pdlamch, dlapy2 -- external zzdotc, pdznorm2, pdlamch, dlapy2 -+ external zzdotc10, pdznorm2, pdlamch, dlapy2 - c - c %---------------------% - c | Intrinsic Functions | -@@ -767,7 +767,7 @@ c - end if - c - if (bmat .eq. 'G') then -- cmpnorm = zzdotc (n, resid, 1, workd, 1) -+ cmpnorm = zzdotc10 (n, resid, 1, workd, 1) - call zgsum2d( comm, 'All', ' ', 1, 1, cmpnorm, 1, -1, -1 ) - rnorm = sqrt(dlapy2(dble(cmpnorm),dimag(cmpnorm))) - else if (bmat .eq. 
'I') then -diff --git a/PARPACK/SRC/BLACS/pzneupd.f b/PARPACK/SRC/BLACS/pzneupd.f -index af76f06..4cc4388 100644 ---- a/PARPACK/SRC/BLACS/pzneupd.f -+++ b/PARPACK/SRC/BLACS/pzneupd.f -@@ -340,8 +340,8 @@ c - external dznrm2,pdlamch,dlapy2 - c - Complex*16 -- & zzdotc -- external zzdotc -+ & zzdotc10 -+ external zzdotc10 - c - c %---------------------% - c | Intrinsic Functions | -@@ -743,7 +743,7 @@ c | upper triangular, thus the length of the | - c | inner product can be set to j. | - c %------------------------------------------% - c -- workev(j) = zzdotc(j, workl(ihbds), 1, -+ workev(j) = zzdotc10(j, workl(ihbds), 1, - & workl(invsub+(j-1)*ldq), 1) - 40 continue - c -diff --git a/PARPACK/SRC/MPI/pzgetv0.f b/PARPACK/SRC/MPI/pzgetv0.f -index 94fb705..a22538d 100644 ---- a/PARPACK/SRC/MPI/pzgetv0.f -+++ b/PARPACK/SRC/MPI/pzgetv0.f -@@ -200,8 +200,8 @@ c - Double precision - & pdznorm2 , dlapy2 - Complex*16 -- & zzdotc -- external zzdotc , pdznorm2 , dlapy2 -+ & zzdotc10 -+ external zzdotc10 , pdznorm2 , dlapy2 - c - c %-----------------% - c | Data Statements | -@@ -331,7 +331,7 @@ c - c - first = .FALSE. - if (bmat .eq. 'G') then -- cnorm_buf = zzdotc (n, resid, 1, workd, 1) -+ cnorm_buf = zzdotc10 (n, resid, 1, workd, 1) - call MPI_ALLREDUCE( [cnorm_buf], buf2, 1, - & MPI_DOUBLE_COMPLEX , MPI_SUM, comm, ierr ) - cnorm = buf2(1) -@@ -393,7 +393,7 @@ c - end if - c - if (bmat .eq. 'G') then -- cnorm_buf = zzdotc (n, resid, 1, workd, 1) -+ cnorm_buf = zzdotc10 (n, resid, 1, workd, 1) - call MPI_ALLREDUCE( [cnorm_buf], buf2, 1, - & MPI_DOUBLE_COMPLEX , MPI_SUM, comm, ierr ) - cnorm = buf2(1) -diff --git a/PARPACK/SRC/MPI/pznaitr.f b/PARPACK/SRC/MPI/pznaitr.f -index 29a757f..d41407f 100644 ---- a/PARPACK/SRC/MPI/pznaitr.f -+++ b/PARPACK/SRC/MPI/pznaitr.f -@@ -307,10 +307,10 @@ c | External Functions | - c %--------------------% - c - Complex*16 -- & zzdotc -+ & zzdotc10 - Double precision - & pdlamch10, pdznorm2, zlanhs, dlapy2 -- external zzdotc, pdznorm2, zlanhs, pdlamch10, dlapy2 -+ external zzdotc10, pdznorm2, zlanhs, pdlamch10, dlapy2 - c - c %---------------------% - c | Intrinsic Functions | -@@ -576,7 +576,7 @@ c | Compute the B-norm of OP*v_{j}. | - c %-------------------------------------% - c - if (bmat .eq. 'G') then -- cnorm_buf = zzdotc (n, resid, 1, workd(ipj), 1) -+ cnorm_buf = zzdotc10 (n, resid, 1, workd(ipj), 1) - call MPI_ALLREDUCE( [cnorm_buf], buf2, 1, - & MPI_DOUBLE_COMPLEX, MPI_SUM, comm, ierr ) - cnorm = buf2(1) -@@ -653,7 +653,7 @@ c | Compute the B-norm of r_{j}. | - c %------------------------------% - c - if (bmat .eq. 'G') then -- cnorm_buf = zzdotc (n, resid, 1, workd(ipj), 1) -+ cnorm_buf = zzdotc10 (n, resid, 1, workd(ipj), 1) - call MPI_ALLREDUCE( [cnorm_buf], buf2, 1, - & MPI_DOUBLE_COMPLEX, MPI_SUM, comm, ierr ) - cnorm = buf2(1) -@@ -758,7 +758,7 @@ c | Compute the B-norm of the corrected residual r_{j}. | - c %-----------------------------------------------------% - c - if (bmat .eq. 
'G') then -- cnorm_buf = zzdotc (n, resid, 1, workd(ipj), 1) -+ cnorm_buf = zzdotc10 (n, resid, 1, workd(ipj), 1) - call MPI_ALLREDUCE( [cnorm_buf], buf2, 1, - & MPI_DOUBLE_COMPLEX, MPI_SUM, comm, ierr ) - cnorm = buf2(1) -diff --git a/PARPACK/SRC/MPI/pznaup2.f b/PARPACK/SRC/MPI/pznaup2.f -index 7ea1198..e46e31d 100644 ---- a/PARPACK/SRC/MPI/pznaup2.f -+++ b/PARPACK/SRC/MPI/pznaup2.f -@@ -257,10 +257,10 @@ c | External functions | - c %--------------------% - c - Complex*16 -- & zzdotc -+ & zzdotc10 - Double precision - & pdznorm2, pdlamch10, dlapy2 -- external zzdotc, pdznorm2, pdlamch10, dlapy2 -+ external zzdotc10, pdznorm2, pdlamch10, dlapy2 - c - c %---------------------% - c | Intrinsic Functions | -@@ -770,7 +770,7 @@ c - end if - c - if (bmat .eq. 'G') then -- cmpnorm_buf = zzdotc (n, resid, 1, workd, 1) -+ cmpnorm_buf = zzdotc10 (n, resid, 1, workd, 1) - call MPI_ALLREDUCE( [cmpnorm_buf], buf2, 1, - & MPI_DOUBLE_COMPLEX, MPI_SUM, comm, ierr ) - cmpnorm = buf2(1) -diff --git a/PARPACK/SRC/MPI/pzneupd.f b/PARPACK/SRC/MPI/pzneupd.f -index 395cfee..44b9978 100644 ---- a/PARPACK/SRC/MPI/pzneupd.f -+++ b/PARPACK/SRC/MPI/pzneupd.f -@@ -340,8 +340,8 @@ c - external dznrm2,pdlamch10,dlapy2 - c - Complex*16 -- & zzdotc -- external zzdotc -+ & zzdotc10 -+ external zzdotc10 - c - c %---------------------% - c | Intrinsic Functions | -@@ -743,7 +743,7 @@ c | upper triangular, thus the length of the | - c | inner product can be set to j. | - c %------------------------------------------% - c -- workev(j) = zzdotc(j, workl(ihbds), 1, -+ workev(j) = zzdotc10(j, workl(ihbds), 1, - & workl(invsub+(j-1)*ldq), 1) - 40 continue - c -diff --git a/SRC/Makefile.am b/SRC/Makefile.am -index afc48c0..d8ad395 100644 ---- a/SRC/Makefile.am -+++ b/SRC/Makefile.am -@@ -12,7 +12,7 @@ CSRC = cnaitr.f cnapps.f cnaup2.f cnaupd.f cneigh.f cneupd.f cngets.f cstatn.f \ - cgetv0.f csortc.f ccdotc.f - - ZSRC = znaitr.f znapps.f znaup2.f znaupd.f zneigh.f zneupd.f zngets.f zstatn.f \ -- zgetv0.f zsortc.f zzdotc.f -+ zgetv0.f zsortc.f zzdotc10.f - - if ICB - SSRC += icbass.F90 icbasn.F90 -diff --git a/SRC/zgetv0.f b/SRC/zgetv0.f -index 1fbd508..ee1bd41 100644 ---- a/SRC/zgetv0.f -+++ b/SRC/zgetv0.f -@@ -177,8 +177,8 @@ c - Double precision - & dznrm2, dlapy2 - Complex*16 -- & zzdotc -- external zzdotc, dznrm2, dlapy2 -+ & zzdotc10 -+ external zzdotc10, dznrm2, dlapy2 - c - c %-----------------% - c | Data Statements | -@@ -293,7 +293,7 @@ c - c - first = .FALSE. - if (bmat .eq. 'G') then -- cnorm = zzdotc (n, resid, 1, workd, 1) -+ cnorm = zzdotc10 (n, resid, 1, workd, 1) - rnorm0 = sqrt(dlapy2(dble(cnorm),aimag(cnorm))) - else if (bmat .eq. 'I') then - rnorm0 = dznrm2(n, resid, 1) -@@ -350,7 +350,7 @@ c - end if - c - if (bmat .eq. 'G') then -- cnorm = zzdotc (n, resid, 1, workd, 1) -+ cnorm = zzdotc10 (n, resid, 1, workd, 1) - rnorm = sqrt(dlapy2(dble(cnorm),aimag(cnorm))) - else if (bmat .eq. 'I') then - rnorm = dznrm2(n, resid, 1) -diff --git a/SRC/znaitr.f b/SRC/znaitr.f -index 240412c..55ee683 100644 ---- a/SRC/znaitr.f -+++ b/SRC/znaitr.f -@@ -280,10 +280,10 @@ c | External Functions | - c %--------------------% - c - Complex*16 -- & zzdotc -+ & zzdotc10 - Double precision - & dlamch, dznrm2, zlanhs, dlapy2 -- external zzdotc, dznrm2, zlanhs, dlamch, dlapy2 -+ external zzdotc10, dznrm2, zlanhs, dlamch, dlapy2 - c - c %---------------------% - c | Intrinsic Functions | -@@ -550,7 +550,7 @@ c | Compute the B-norm of OP*v_{j}. | - c %-------------------------------------% - c - if (bmat .eq. 
'G') then -- cnorm = zzdotc (n, resid, 1, workd(ipj), 1) -+ cnorm = zzdotc10 (n, resid, 1, workd(ipj), 1) - wnorm = sqrt( dlapy2(dble(cnorm),aimag(cnorm)) ) - else if (bmat .eq. 'I') then - wnorm = dznrm2(n, resid, 1) -@@ -622,7 +622,7 @@ c | Compute the B-norm of r_{j}. | - c %------------------------------% - c - if (bmat .eq. 'G') then -- cnorm = zzdotc (n, resid, 1, workd(ipj), 1) -+ cnorm = zzdotc10 (n, resid, 1, workd(ipj), 1) - rnorm = sqrt( dlapy2(dble(cnorm),aimag(cnorm)) ) - else if (bmat .eq. 'I') then - rnorm = dznrm2(n, resid, 1) -@@ -722,7 +722,7 @@ c | Compute the B-norm of the corrected residual r_{j}. | - c %-----------------------------------------------------% - c - if (bmat .eq. 'G') then -- cnorm = zzdotc (n, resid, 1, workd(ipj), 1) -+ cnorm = zzdotc10 (n, resid, 1, workd(ipj), 1) - rnorm1 = sqrt( dlapy2(dble(cnorm),aimag(cnorm)) ) - else if (bmat .eq. 'I') then - rnorm1 = dznrm2(n, resid, 1) -diff --git a/SRC/znaup2.f b/SRC/znaup2.f -index 0ab01dd..d3f7f2b 100644 ---- a/SRC/znaup2.f -+++ b/SRC/znaup2.f -@@ -247,10 +247,10 @@ c | External functions | - c %--------------------% - c - Complex*16 -- & zzdotc -+ & zzdotc10 - Double precision - & dznrm2 , dlamch , dlapy2 -- external zzdotc , dznrm2 , dlamch , dlapy2 -+ external zzdotc10 , dznrm2 , dlamch , dlapy2 - c - c %---------------------% - c | Intrinsic Functions | -@@ -754,7 +754,7 @@ c - end if - c - if (bmat .eq. 'G') then -- cmpnorm = zzdotc (n, resid, 1, workd, 1) -+ cmpnorm = zzdotc10 (n, resid, 1, workd, 1) - rnorm = sqrt(dlapy2 (dble (cmpnorm),aimag (cmpnorm))) - else if (bmat .eq. 'I') then - rnorm = dznrm2 (n, resid, 1) -diff --git a/SRC/zneupd.f b/SRC/zneupd.f -index 92e7dc9..fa78ab7 100644 ---- a/SRC/zneupd.f -+++ b/SRC/zneupd.f -@@ -325,8 +325,8 @@ c - external dznrm2, dlamch, dlapy2 - c - Complex*16 -- & zzdotc -- external zzdotc -+ & zzdotc10 -+ external zzdotc10 - c - c %-----------------------% - c | Executable Statements | -@@ -731,7 +731,7 @@ c | upper triangular, thus the length of the | - c | inner product can be set to j. | - c %------------------------------------------% - c -- workev(j) = zzdotc(j, workl(ihbds), 1, -+ workev(j) = zzdotc10(j, workl(ihbds), 1, - & workl(invsub+(j-1)*ldq), 1) - 40 continue - c -diff --git a/SRC/zzdotc.f b/SRC/zzdotc10.f -similarity index 85% -rename from SRC/zzdotc.f -rename to SRC/zzdotc10.f -index a98c342..fc2383b 100644 ---- a/SRC/zzdotc.f -+++ b/SRC/zzdotc10.f -@@ -1,4 +1,4 @@ -- double complex function zzdotc(n,zx,incx,zy,incy) -+ double complex function zzdotc10(n,zx,incx,zy,incy) - c - c forms the dot product of a vector. - c jack dongarra, 3/11/78. 
-@@ -7,7 +7,7 @@ c - double complex zx(*),zy(*),ztemp - integer i,incx,incy,ix,iy,n - ztemp = (0.0d0,0.0d0) -- zzdotc = (0.0d0,0.0d0) -+ zzdotc10 = (0.0d0,0.0d0) - if(n.le.0)return - if(incx.eq.1.and.incy.eq.1)go to 20 - c -@@ -23,7 +23,7 @@ c - ix = ix + incx - iy = iy + incy - 10 continue -- zzdotc = ztemp -+ zzdotc10 = ztemp - return - c - c code for both increments equal to 1 -@@ -31,6 +31,6 @@ c - 20 do 30 i = 1,n - ztemp = ztemp + conjg(zx(i))*zy(i) - 30 continue -- zzdotc = ztemp -+ zzdotc10 = ztemp - return - end +diff --git a/PARPACK/SRC/BLACS/pzgetv0.f b/PARPACK/SRC/BLACS/pzgetv0.f +index 17354c1..bcbf552 100644 +--- a/PARPACK/SRC/BLACS/pzgetv0.f ++++ b/PARPACK/SRC/BLACS/pzgetv0.f +@@ -197,8 +197,8 @@ c + Double precision + & pdznorm2 , dlapy2 + Complex*16 +- & zzdotc +- external zzdotc , pdznorm2 , dlapy2 ++ & zzdotc10 ++ external zzdotc10 , pdznorm2 , dlapy2 + c + c %-----------------% + c | Data Statements | +@@ -335,7 +335,7 @@ c + c + first = .FALSE. + if (bmat .eq. 'G') then +- cnorm = zzdotc (n, resid, 1, workd, 1) ++ cnorm = zzdotc10 (n, resid, 1, workd, 1) + call zgsum2d ( comm, 'All', ' ', 1, 1, cnorm, 1, -1, -1 ) + rnorm0 = sqrt(dlapy2 (dble (cnorm),dimag (cnorm))) + else if (bmat .eq. 'I') then +@@ -394,7 +394,7 @@ c + end if + c + if (bmat .eq. 'G') then +- cnorm = zzdotc (n, resid, 1, workd, 1) ++ cnorm = zzdotc10 (n, resid, 1, workd, 1) + call zgsum2d ( comm, 'All', ' ', 1, 1, cnorm, 1, -1, -1 ) + rnorm = sqrt(dlapy2 (dble (cnorm),dimag (cnorm))) + else if (bmat .eq. 'I') then +diff --git a/PARPACK/SRC/BLACS/pznaitr.f b/PARPACK/SRC/BLACS/pznaitr.f +index cd1ba01..a6a3d29 100644 +--- a/PARPACK/SRC/BLACS/pznaitr.f ++++ b/PARPACK/SRC/BLACS/pznaitr.f +@@ -303,10 +303,10 @@ c | External Functions | + c %--------------------% + c + Complex*16 +- & zzdotc ++ & zzdotc10 + Double precision + & pdlamch, pdznorm2, zlanhs, dlapy2 +- external zzdotc, pdznorm2, zlanhs, pdlamch, dlapy2 ++ external zzdotc10, pdznorm2, zlanhs, pdlamch, dlapy2 + c + c %---------------------% + c | Intrinsic Functions | +@@ -573,7 +573,7 @@ c | Compute the B-norm of OP*v_{j}. | + c %-------------------------------------% + c + if (bmat .eq. 'G') then +- cnorm = zzdotc (n, resid, 1, workd(ipj), 1) ++ cnorm = zzdotc10 (n, resid, 1, workd(ipj), 1) + call zgsum2d( comm, 'All', ' ', 1, 1, cnorm, 1, -1, -1 ) + wnorm = sqrt( dlapy2(dble(cnorm),dimag(cnorm)) ) + else if (bmat .eq. 'I') then +@@ -647,7 +647,7 @@ c | Compute the B-norm of r_{j}. | + c %------------------------------% + c + if (bmat .eq. 'G') then +- cnorm = zzdotc (n, resid, 1, workd(ipj), 1) ++ cnorm = zzdotc10 (n, resid, 1, workd(ipj), 1) + call zgsum2d( comm, 'All', ' ', 1, 1, cnorm, 1, -1, -1 ) + rnorm = sqrt( dlapy2(dble(cnorm),dimag(cnorm)) ) + else if (bmat .eq. 'I') then +@@ -749,7 +749,7 @@ c | Compute the B-norm of the corrected residual r_{j}. | + c %-----------------------------------------------------% + c + if (bmat .eq. 'G') then +- cnorm = zzdotc (n, resid, 1, workd(ipj), 1) ++ cnorm = zzdotc10 (n, resid, 1, workd(ipj), 1) + call zgsum2d( comm, 'All', ' ', 1, 1, cnorm, 1, -1, -1 ) + rnorm1 = sqrt( dlapy2(dble(cnorm),dimag(cnorm)) ) + else if (bmat .eq. 
'I') then +diff --git a/PARPACK/SRC/BLACS/pznaup2.f b/PARPACK/SRC/BLACS/pznaup2.f +index 8fff22f..a1af4c2 100644 +--- a/PARPACK/SRC/BLACS/pznaup2.f ++++ b/PARPACK/SRC/BLACS/pznaup2.f +@@ -254,10 +254,10 @@ c | External functions | + c %--------------------% + c + Complex*16 +- & zzdotc ++ & zzdotc10 + Double precision + & pdznorm2, pdlamch, dlapy2 +- external zzdotc, pdznorm2, pdlamch, dlapy2 ++ external zzdotc10, pdznorm2, pdlamch, dlapy2 + c + c %---------------------% + c | Intrinsic Functions | +@@ -767,7 +767,7 @@ c + end if + c + if (bmat .eq. 'G') then +- cmpnorm = zzdotc (n, resid, 1, workd, 1) ++ cmpnorm = zzdotc10 (n, resid, 1, workd, 1) + call zgsum2d( comm, 'All', ' ', 1, 1, cmpnorm, 1, -1, -1 ) + rnorm = sqrt(dlapy2(dble(cmpnorm),dimag(cmpnorm))) + else if (bmat .eq. 'I') then +diff --git a/PARPACK/SRC/BLACS/pzneupd.f b/PARPACK/SRC/BLACS/pzneupd.f +index af76f06..4cc4388 100644 +--- a/PARPACK/SRC/BLACS/pzneupd.f ++++ b/PARPACK/SRC/BLACS/pzneupd.f +@@ -340,8 +340,8 @@ c + external dznrm2,pdlamch,dlapy2 + c + Complex*16 +- & zzdotc +- external zzdotc ++ & zzdotc10 ++ external zzdotc10 + c + c %---------------------% + c | Intrinsic Functions | +@@ -743,7 +743,7 @@ c | upper triangular, thus the length of the | + c | inner product can be set to j. | + c %------------------------------------------% + c +- workev(j) = zzdotc(j, workl(ihbds), 1, ++ workev(j) = zzdotc10(j, workl(ihbds), 1, + & workl(invsub+(j-1)*ldq), 1) + 40 continue + c +diff --git a/PARPACK/SRC/MPI/pzgetv0.f b/PARPACK/SRC/MPI/pzgetv0.f +index 94fb705..a22538d 100644 +--- a/PARPACK/SRC/MPI/pzgetv0.f ++++ b/PARPACK/SRC/MPI/pzgetv0.f +@@ -200,8 +200,8 @@ c + Double precision + & pdznorm2 , dlapy2 + Complex*16 +- & zzdotc +- external zzdotc , pdznorm2 , dlapy2 ++ & zzdotc10 ++ external zzdotc10 , pdznorm2 , dlapy2 + c + c %-----------------% + c | Data Statements | +@@ -331,7 +331,7 @@ c + c + first = .FALSE. + if (bmat .eq. 'G') then +- cnorm_buf = zzdotc (n, resid, 1, workd, 1) ++ cnorm_buf = zzdotc10 (n, resid, 1, workd, 1) + call MPI_ALLREDUCE( [cnorm_buf], buf2, 1, + & MPI_DOUBLE_COMPLEX , MPI_SUM, comm, ierr ) + cnorm = buf2(1) +@@ -393,7 +393,7 @@ c + end if + c + if (bmat .eq. 'G') then +- cnorm_buf = zzdotc (n, resid, 1, workd, 1) ++ cnorm_buf = zzdotc10 (n, resid, 1, workd, 1) + call MPI_ALLREDUCE( [cnorm_buf], buf2, 1, + & MPI_DOUBLE_COMPLEX , MPI_SUM, comm, ierr ) + cnorm = buf2(1) +diff --git a/PARPACK/SRC/MPI/pznaitr.f b/PARPACK/SRC/MPI/pznaitr.f +index 29a757f..d41407f 100644 +--- a/PARPACK/SRC/MPI/pznaitr.f ++++ b/PARPACK/SRC/MPI/pznaitr.f +@@ -307,10 +307,10 @@ c | External Functions | + c %--------------------% + c + Complex*16 +- & zzdotc ++ & zzdotc10 + Double precision + & pdlamch10, pdznorm2, zlanhs, dlapy2 +- external zzdotc, pdznorm2, zlanhs, pdlamch10, dlapy2 ++ external zzdotc10, pdznorm2, zlanhs, pdlamch10, dlapy2 + c + c %---------------------% + c | Intrinsic Functions | +@@ -576,7 +576,7 @@ c | Compute the B-norm of OP*v_{j}. | + c %-------------------------------------% + c + if (bmat .eq. 'G') then +- cnorm_buf = zzdotc (n, resid, 1, workd(ipj), 1) ++ cnorm_buf = zzdotc10 (n, resid, 1, workd(ipj), 1) + call MPI_ALLREDUCE( [cnorm_buf], buf2, 1, + & MPI_DOUBLE_COMPLEX, MPI_SUM, comm, ierr ) + cnorm = buf2(1) +@@ -653,7 +653,7 @@ c | Compute the B-norm of r_{j}. | + c %------------------------------% + c + if (bmat .eq. 
'G') then +- cnorm_buf = zzdotc (n, resid, 1, workd(ipj), 1) ++ cnorm_buf = zzdotc10 (n, resid, 1, workd(ipj), 1) + call MPI_ALLREDUCE( [cnorm_buf], buf2, 1, + & MPI_DOUBLE_COMPLEX, MPI_SUM, comm, ierr ) + cnorm = buf2(1) +@@ -758,7 +758,7 @@ c | Compute the B-norm of the corrected residual r_{j}. | + c %-----------------------------------------------------% + c + if (bmat .eq. 'G') then +- cnorm_buf = zzdotc (n, resid, 1, workd(ipj), 1) ++ cnorm_buf = zzdotc10 (n, resid, 1, workd(ipj), 1) + call MPI_ALLREDUCE( [cnorm_buf], buf2, 1, + & MPI_DOUBLE_COMPLEX, MPI_SUM, comm, ierr ) + cnorm = buf2(1) +diff --git a/PARPACK/SRC/MPI/pznaup2.f b/PARPACK/SRC/MPI/pznaup2.f +index 7ea1198..e46e31d 100644 +--- a/PARPACK/SRC/MPI/pznaup2.f ++++ b/PARPACK/SRC/MPI/pznaup2.f +@@ -257,10 +257,10 @@ c | External functions | + c %--------------------% + c + Complex*16 +- & zzdotc ++ & zzdotc10 + Double precision + & pdznorm2, pdlamch10, dlapy2 +- external zzdotc, pdznorm2, pdlamch10, dlapy2 ++ external zzdotc10, pdznorm2, pdlamch10, dlapy2 + c + c %---------------------% + c | Intrinsic Functions | +@@ -770,7 +770,7 @@ c + end if + c + if (bmat .eq. 'G') then +- cmpnorm_buf = zzdotc (n, resid, 1, workd, 1) ++ cmpnorm_buf = zzdotc10 (n, resid, 1, workd, 1) + call MPI_ALLREDUCE( [cmpnorm_buf], buf2, 1, + & MPI_DOUBLE_COMPLEX, MPI_SUM, comm, ierr ) + cmpnorm = buf2(1) +diff --git a/PARPACK/SRC/MPI/pzneupd.f b/PARPACK/SRC/MPI/pzneupd.f +index 395cfee..44b9978 100644 +--- a/PARPACK/SRC/MPI/pzneupd.f ++++ b/PARPACK/SRC/MPI/pzneupd.f +@@ -340,8 +340,8 @@ c + external dznrm2,pdlamch10,dlapy2 + c + Complex*16 +- & zzdotc +- external zzdotc ++ & zzdotc10 ++ external zzdotc10 + c + c %---------------------% + c | Intrinsic Functions | +@@ -743,7 +743,7 @@ c | upper triangular, thus the length of the | + c | inner product can be set to j. | + c %------------------------------------------% + c +- workev(j) = zzdotc(j, workl(ihbds), 1, ++ workev(j) = zzdotc10(j, workl(ihbds), 1, + & workl(invsub+(j-1)*ldq), 1) + 40 continue + c +diff --git a/SRC/Makefile.am b/SRC/Makefile.am +index afc48c0..d8ad395 100644 +--- a/SRC/Makefile.am ++++ b/SRC/Makefile.am +@@ -12,7 +12,7 @@ CSRC = cnaitr.f cnapps.f cnaup2.f cnaupd.f cneigh.f cneupd.f cngets.f cstatn.f \ + cgetv0.f csortc.f ccdotc.f + + ZSRC = znaitr.f znapps.f znaup2.f znaupd.f zneigh.f zneupd.f zngets.f zstatn.f \ +- zgetv0.f zsortc.f zzdotc.f ++ zgetv0.f zsortc.f zzdotc10.f + + if ICB + SSRC += icbass.F90 icbasn.F90 +diff --git a/SRC/zgetv0.f b/SRC/zgetv0.f +index 1fbd508..ee1bd41 100644 +--- a/SRC/zgetv0.f ++++ b/SRC/zgetv0.f +@@ -177,8 +177,8 @@ c + Double precision + & dznrm2, dlapy2 + Complex*16 +- & zzdotc +- external zzdotc, dznrm2, dlapy2 ++ & zzdotc10 ++ external zzdotc10, dznrm2, dlapy2 + c + c %-----------------% + c | Data Statements | +@@ -293,7 +293,7 @@ c + c + first = .FALSE. + if (bmat .eq. 'G') then +- cnorm = zzdotc (n, resid, 1, workd, 1) ++ cnorm = zzdotc10 (n, resid, 1, workd, 1) + rnorm0 = sqrt(dlapy2(dble(cnorm),aimag(cnorm))) + else if (bmat .eq. 'I') then + rnorm0 = dznrm2(n, resid, 1) +@@ -350,7 +350,7 @@ c + end if + c + if (bmat .eq. 'G') then +- cnorm = zzdotc (n, resid, 1, workd, 1) ++ cnorm = zzdotc10 (n, resid, 1, workd, 1) + rnorm = sqrt(dlapy2(dble(cnorm),aimag(cnorm))) + else if (bmat .eq. 
'I') then + rnorm = dznrm2(n, resid, 1) +diff --git a/SRC/znaitr.f b/SRC/znaitr.f +index 240412c..55ee683 100644 +--- a/SRC/znaitr.f ++++ b/SRC/znaitr.f +@@ -280,10 +280,10 @@ c | External Functions | + c %--------------------% + c + Complex*16 +- & zzdotc ++ & zzdotc10 + Double precision + & dlamch, dznrm2, zlanhs, dlapy2 +- external zzdotc, dznrm2, zlanhs, dlamch, dlapy2 ++ external zzdotc10, dznrm2, zlanhs, dlamch, dlapy2 + c + c %---------------------% + c | Intrinsic Functions | +@@ -550,7 +550,7 @@ c | Compute the B-norm of OP*v_{j}. | + c %-------------------------------------% + c + if (bmat .eq. 'G') then +- cnorm = zzdotc (n, resid, 1, workd(ipj), 1) ++ cnorm = zzdotc10 (n, resid, 1, workd(ipj), 1) + wnorm = sqrt( dlapy2(dble(cnorm),aimag(cnorm)) ) + else if (bmat .eq. 'I') then + wnorm = dznrm2(n, resid, 1) +@@ -622,7 +622,7 @@ c | Compute the B-norm of r_{j}. | + c %------------------------------% + c + if (bmat .eq. 'G') then +- cnorm = zzdotc (n, resid, 1, workd(ipj), 1) ++ cnorm = zzdotc10 (n, resid, 1, workd(ipj), 1) + rnorm = sqrt( dlapy2(dble(cnorm),aimag(cnorm)) ) + else if (bmat .eq. 'I') then + rnorm = dznrm2(n, resid, 1) +@@ -722,7 +722,7 @@ c | Compute the B-norm of the corrected residual r_{j}. | + c %-----------------------------------------------------% + c + if (bmat .eq. 'G') then +- cnorm = zzdotc (n, resid, 1, workd(ipj), 1) ++ cnorm = zzdotc10 (n, resid, 1, workd(ipj), 1) + rnorm1 = sqrt( dlapy2(dble(cnorm),aimag(cnorm)) ) + else if (bmat .eq. 'I') then + rnorm1 = dznrm2(n, resid, 1) +diff --git a/SRC/znaup2.f b/SRC/znaup2.f +index 0ab01dd..d3f7f2b 100644 +--- a/SRC/znaup2.f ++++ b/SRC/znaup2.f +@@ -247,10 +247,10 @@ c | External functions | + c %--------------------% + c + Complex*16 +- & zzdotc ++ & zzdotc10 + Double precision + & dznrm2 , dlamch , dlapy2 +- external zzdotc , dznrm2 , dlamch , dlapy2 ++ external zzdotc10 , dznrm2 , dlamch , dlapy2 + c + c %---------------------% + c | Intrinsic Functions | +@@ -754,7 +754,7 @@ c + end if + c + if (bmat .eq. 'G') then +- cmpnorm = zzdotc (n, resid, 1, workd, 1) ++ cmpnorm = zzdotc10 (n, resid, 1, workd, 1) + rnorm = sqrt(dlapy2 (dble (cmpnorm),aimag (cmpnorm))) + else if (bmat .eq. 'I') then + rnorm = dznrm2 (n, resid, 1) +diff --git a/SRC/zneupd.f b/SRC/zneupd.f +index 92e7dc9..fa78ab7 100644 +--- a/SRC/zneupd.f ++++ b/SRC/zneupd.f +@@ -325,8 +325,8 @@ c + external dznrm2, dlamch, dlapy2 + c + Complex*16 +- & zzdotc +- external zzdotc ++ & zzdotc10 ++ external zzdotc10 + c + c %-----------------------% + c | Executable Statements | +@@ -731,7 +731,7 @@ c | upper triangular, thus the length of the | + c | inner product can be set to j. | + c %------------------------------------------% + c +- workev(j) = zzdotc(j, workl(ihbds), 1, ++ workev(j) = zzdotc10(j, workl(ihbds), 1, + & workl(invsub+(j-1)*ldq), 1) + 40 continue + c +diff --git a/SRC/zzdotc.f b/SRC/zzdotc10.f +similarity index 85% +rename from SRC/zzdotc.f +rename to SRC/zzdotc10.f +index a98c342..fc2383b 100644 +--- a/SRC/zzdotc.f ++++ b/SRC/zzdotc10.f +@@ -1,4 +1,4 @@ +- double complex function zzdotc(n,zx,incx,zy,incy) ++ double complex function zzdotc10(n,zx,incx,zy,incy) + c + c forms the dot product of a vector. + c jack dongarra, 3/11/78. 
+@@ -7,7 +7,7 @@ c + double complex zx(*),zy(*),ztemp + integer i,incx,incy,ix,iy,n + ztemp = (0.0d0,0.0d0) +- zzdotc = (0.0d0,0.0d0) ++ zzdotc10 = (0.0d0,0.0d0) + if(n.le.0)return + if(incx.eq.1.and.incy.eq.1)go to 20 + c +@@ -23,7 +23,7 @@ c + ix = ix + incx + iy = iy + incy + 10 continue +- zzdotc = ztemp ++ zzdotc10 = ztemp + return + c + c code for both increments equal to 1 +@@ -31,6 +31,6 @@ c + 20 do 30 i = 1,n + ztemp = ztemp + conjg(zx(i))*zy(i) + 30 continue +- zzdotc = ztemp ++ zzdotc10 = ztemp + return + end diff --git a/extern/patch/gslib/patch_build.diff b/extern/patch/gslib/patch_build.diff index 75a528553a..2e9245fadc 100644 --- a/extern/patch/gslib/patch_build.diff +++ b/extern/patch/gslib/patch_build.diff @@ -1,2107 +1,2107 @@ -diff --git a/Makefile b/Makefile -index 63364d8..7cc5683 100644 ---- a/Makefile -+++ b/Makefile -@@ -2,127 +2,163 @@ MPI ?= 1 - ADDUS ?= 1 - USREXIT ?= 0 - NBC ?= 0 --LIBNAME ?= gs - BLAS ?= 0 --DEBUG ?= 0 -+MKL ?= 0 -+ - CFLAGS ?= -O2 - FFLAGS ?= -O2 -+ARFLAGS ?= cr -+ -+LIBNAME ?= gs - CPREFIX ?= gslib_ - FPREFIX ?= fgslib_ - --SRCROOT=. --TESTDIR=$(SRCROOT)/tests --FTESTDIR=$(TESTDIR)/fortran --SRCDIR=$(SRCROOT)/src --INCDIR=$(SRCROOT)/src --LIBDIR=$(SRCROOT)/lib -+STATIC ?= 1 -+SHARED ?= 0 -+ -+SRCROOT = . -+TESTDIR = $(SRCROOT)/tests -+FTESTDIR = $(TESTDIR)/fortran -+SRCDIR = $(SRCROOT)/src -+INCDIR = $(SRCROOT)/src -+LIBDIR = $(SRCROOT)/lib -+ -+DARWIN := $(filter Darwin,$(shell uname -s)) -+SO_EXT := $(if $(DARWIN),dylib,so) - - ifneq (,$(strip $(DESTDIR))) --INSTALL_ROOT = $(DESTDIR) -+ INSTALL_ROOT = $(DESTDIR) - else --INSTALL_ROOT = $(SRCROOT)/build -+ INSTALL_ROOT = $(SRCROOT)/build -+endif -+ -+ifneq (0,$(SHARED)) -+ ifneq (0,$(STATIC)) -+ $(warning Cannot build with both STATIC=1 and SHARED=1, setting SHARED=0) -+ override SHARED = 0 -+ endif -+endif -+ -+ifneq (0,$(SHARED)) -+ ifeq ($(filter -fPIC,$(CFLAGS)),) -+ override CFLAGS += -fPIC -+ endif -+ ifneq ($(DARWIN),) -+ override LDFLAGS += -install_name @rpath/lib$(LIBNAME).$(SO_EXT) -+ endif - endif - - $(shell >config.h) - ifneq (0,$(MPI)) -- SN=MPI -- G:=$(G) -D$(SN) -+ SN = GS_MPI -+ G := $(G) -D$(SN) -+ $(shell printf "#ifndef ${SN}\n#define ${SN}\n#endif\n" >>config.h) - ifeq ($(origin CC),default) - CC = mpicc - endif - ifeq ($(origin FC),default) - FC = mpif77 - endif -- $(shell printf "#ifndef ${SN}\n#define ${SN}\n#endif\n" >>config.h) - endif - - ifneq (0,$(ADDUS)) -- SN=UNDERSCORE -- G:=$(G) -D$(SN) -+ SN = GS_UNDERSCORE -+ G := $(G) -D$(SN) - $(shell printf "#ifndef ${SN}\n#define ${SN}\n#endif\n" >>config.h) - endif - --SN=GLOBAL_LONG_LONG --G:=$(G) -D$(SN) --$(shell printf "#ifndef ${SN}\n#define ${SN}\n#endif\n" >>config.h) -- --SN=PREFIX --G:=$(G) -D$(SN)=$(CPREFIX) -+SN = GS_PREFIX -+G := $(G) -D$(SN)=$(CPREFIX) - $(shell printf "#ifndef ${SN}\n#define ${SN} ${CPREFIX}\n#endif\n" >>config.h) - --SN=FPREFIX --G:=$(G) -D$(SN)=$(FPREFIX) -+SN = GS_FPREFIX -+G := $(G) -D$(SN)=$(FPREFIX) - $(shell printf "#ifndef ${SN}\n#define ${SN} ${FPREFIX}\n#endif\n" >>config.h) - -+SN = GS_GLOBAL_LONG_LONG -+G := $(G) -D$(SN) -+$(shell printf "#ifndef ${SN}\n#define ${SN}\n#endif\n" >>config.h) -+ - ifneq (0,$(USREXIT)) -- G+=-DUSE_USR_EXIT -+ G += -DGS_USE_USR_EXIT - endif - - ifneq (0,$(NBC)) -- G+=-DUSE_NBC -+ G += -DGS_USE_NBC - endif - - ifeq (0,$(BLAS)) -- SN=USE_NAIVE_BLAS -- G:=$(G) -D$(SN) -+ SN = GS_USE_NAIVE_BLAS -+ G := $(G) -D$(SN) - $(shell printf "#ifndef ${SN}\n#define ${SN}\n#endif\n" >>config.h) - endif - - ifeq (1,$(BLAS)) -- G+=-DUSE_CBLAS -+ SN = GS_USE_CBLAS 
-+ G := $(G) -D$(SN) -+ $(shell printf "#ifndef ${SN}\n#define ${SN}\n#endif\n" >>config.h) -+ ifeq (1,$(MKL)) -+ SN = GS_USE_MKL -+ G := $(G) -D$(SN) -+ $(shell printf "#ifndef ${SN}\n#define ${SN}\n#endif\n" >>config.h) -+ endif - endif - --ifneq (0,$(DEBUG)) -- G+=-DGSLIB_DEBUG -- CFLAGS+=-g --endif -+CCCMD = $(CC) $(CFLAGS) -I$(INCDIR) $(G) -+FCCMD = $(FC) $(FFLAGS) -I$(INCDIR) $(G) - --CCCMD=$(CC) $(CFLAGS) -I$(INCDIR) $(G) --FCCMD=$(FC) $(FFLAGS) -I$(INCDIR) $(G) -+TESTS = $(TESTDIR)/sort_test $(TESTDIR)/sort_test2 $(TESTDIR)/sarray_sort_test \ -+ $(TESTDIR)/comm_test $(TESTDIR)/crystal_test \ -+ $(TESTDIR)/sarray_transfer_test $(TESTDIR)/gs_test \ -+ $(TESTDIR)/gs_test_gop_blocking $(TESTDIR)/gs_test_gop_nonblocking \ -+ $(TESTDIR)/gs_unique_test \ -+ $(TESTDIR)/findpts_el_2_test \ -+ $(TESTDIR)/findpts_el_2_test2 $(TESTDIR)/findpts_el_3_test \ -+ $(TESTDIR)/findpts_el_3_test2 $(TESTDIR)/findpts_local_test \ -+ $(TESTDIR)/findpts_test $(TESTDIR)/findpts_test_ms $(TESTDIR)/poly_test \ -+ $(TESTDIR)/lob_bnd_test $(TESTDIR)/obbox_test - --TESTS=$(TESTDIR)/sort_test $(TESTDIR)/sort_test2 $(TESTDIR)/sarray_sort_test \ -- $(TESTDIR)/comm_test $(TESTDIR)/crystal_test \ -- $(TESTDIR)/sarray_transfer_test $(TESTDIR)/gs_test \ -- $(TESTDIR)/gs_test_gop_blocking $(TESTDIR)/gs_test_gop_nonblocking \ -- $(TESTDIR)/gs_unique_test \ -- $(TESTDIR)/findpts_el_2_test \ -- $(TESTDIR)/findpts_el_2_test2 $(TESTDIR)/findpts_el_3_test \ -- $(TESTDIR)/findpts_el_3_test2 $(TESTDIR)/findpts_local_test \ -- $(TESTDIR)/findpts_test $(TESTDIR)/findpts_test_ms $(TESTDIR)/poly_test \ -- $(TESTDIR)/lob_bnd_test $(TESTDIR)/obbox_test -+FTESTS = $(FTESTDIR)/f-igs - --FTESTS=$(FTESTDIR)/f-igs -+GS = $(SRCDIR)/gs.o $(SRCDIR)/sort.o $(SRCDIR)/sarray_transfer.o \ -+ $(SRCDIR)/sarray_sort.o $(SRCDIR)/gs_local.o $(SRCDIR)/fail.o \ -+ $(SRCDIR)/crystal.o $(SRCDIR)/comm.o $(SRCDIR)/tensor.o - --GS=$(SRCDIR)/gs.o $(SRCDIR)/sort.o $(SRCDIR)/sarray_transfer.o \ -- $(SRCDIR)/sarray_sort.o $(SRCDIR)/gs_local.o $(SRCDIR)/fail.o \ -- $(SRCDIR)/crystal.o $(SRCDIR)/comm.o $(SRCDIR)/tensor.o -+FWRAPPER = $(SRCDIR)/fcrystal.o $(SRCDIR)/findpts.o - --FWRAPPER=$(SRCDIR)/fcrystal.o $(SRCDIR)/findpts.o --INTP=$(SRCDIR)/findpts_local.o $(SRCDIR)/obbox.o $(SRCDIR)/poly.o \ -- $(SRCDIR)/lob_bnd.o $(SRCDIR)/findpts_el_3.o $(SRCDIR)/findpts_el_2.o -+INTP = $(SRCDIR)/findpts_local.o $(SRCDIR)/obbox.o $(SRCDIR)/poly.o \ -+ $(SRCDIR)/lob_bnd.o $(SRCDIR)/findpts_el_3.o $(SRCDIR)/findpts_el_2.o - - .PHONY: all lib install tests clean objects - - all : lib install - --lib: $(GS) $(FWRAPPER) $(INTP) -- @$(AR) cr $(SRCDIR)/lib$(LIBNAME).a $? 
-- @ranlib $(SRCDIR)/lib$(LIBNAME).a -+lib: $(if $(filter-out 0,$(STATIC)),$(SRCDIR)/lib$(LIBNAME).a) $(if $(filter-out 0,$(SHARED)),$(SRCDIR)/lib$(LIBNAME).$(SO_EXT)) -+ -+$(SRCDIR)/lib$(LIBNAME).a: $(GS) $(FWRAPPER) $(INTP) -+ $(AR) $(ARFLAGS) $@ $^ -+ ranlib $@ -+ -+$(SRCDIR)/lib$(LIBNAME).$(SO_EXT): $(GS) $(FWRAPPER) $(INTP) -+ $(CC) $(CFLAGS) -shared -o $@ $^ $(LDFLAGS) - - install: lib - @mkdir -p $(INSTALL_ROOT)/lib 2>/dev/null -- @cp -v $(SRCDIR)/lib$(LIBNAME).a $(INSTALL_ROOT)/lib 2>/dev/null -- @mkdir -p $(INSTALL_ROOT)/include 2>/dev/null -- @cp $(SRCDIR)/*.h $(INSTALL_ROOT)/include 2>/dev/null -- @cp $(SRCDIR)/*.h $(INSTALL_ROOT)/include 2>/dev/null -- @mv config.h $(INSTALL_ROOT)/include 2>/dev/null -+ $(if $(filter-out 0,$(STATIC)),cp $(SRCDIR)/lib$(LIBNAME).a $(INSTALL_ROOT)/lib) -+ $(if $(filter-out 0,$(SHARED)),cp $(SRCDIR)/lib$(LIBNAME).$(SO_EXT) $(INSTALL_ROOT)/lib) -+ @mkdir -p $(INSTALL_ROOT)/include/gslib 2>/dev/null -+ cp $(SRCDIR)/*.h $(INSTALL_ROOT)/include/gslib -+ mv config.h $(INSTALL_ROOT)/include/gslib -+ @printf '// Automatically generated file\n#include "gslib/gslib.h"\n' \ -+ > $(INSTALL_ROOT)/include/gslib.h && chmod 644 $(INSTALL_ROOT)/include/gslib.h - - tests: $(TESTS) - --clean: ; @$(RM) config.h $(SRCDIR)/*.o $(SRCDIR)/*.s $(SRCDIR)/*.a $(TESTDIR)/*.o $(FTESTDIR)/*.o $(TESTS) -+clean: -+ $(RM) config.h $(SRCDIR)/*.o $(SRCDIR)/*.s $(SRCDIR)/*.a $(SRCDIR)/*.$(SO_EXT) $(TESTDIR)/*.o $(FTESTDIR)/*.o $(TESTS) - - $(TESTS): % : %.c | lib install -- $(CC) $(CFLAGS) -I$(INSTALL_ROOT)/include $< -o $@ -L$(INSTALL_ROOT)/lib -l$(LIBNAME) -lm $(LDFLAGS) -+ $(CC) $(CFLAGS) -I$(INSTALL_ROOT)/include $< -o $@ -L$(INSTALL_ROOT)/lib -l$(LIBNAME) -lm $(LDFLAGS) - - $(FTESTS): % : %.o | lib install - $(FCCMD) $^ -o $@ -L$(SRCDIR) -l$(LIBNAME) -diff --git a/src/c99.h b/src/c99.h -index a5a44e3..62c3ced 100644 ---- a/src/c99.h -+++ b/src/c99.h -@@ -1,16 +1,16 @@ --#ifndef C99_H --#define C99_H -+#ifndef GS_C99_H -+#define GS_C99_H - - #ifndef __STDC_VERSION__ --# define NO_C99 -+# define GS_NO_C99 - #elif __STDC_VERSION__ < 199901L --# define NO_C99 -+# define GS_NO_C99 - #endif - --#ifdef NO_C99 -+#ifdef GS_NO_C99 - # define restrict - # define inline --# undef NO_C99 -+# undef GS_NO_C99 - #endif - - #endif -diff --git a/src/comm.c b/src/comm.c -index 5e05739..225788c 100644 ---- a/src/comm.c -+++ b/src/comm.c -@@ -108,7 +108,7 @@ void comm_allreduce(const struct comm *com, gs_dom dom, gs_op op, - void *v, uint vn, void *buf) - { - if(vn==0) return; --#ifdef MPI -+#ifdef GS_MPI - { - MPI_Datatype mpitype; - MPI_Op mpiop; -@@ -117,8 +117,8 @@ void comm_allreduce(const struct comm *com, gs_dom dom, gs_op op, - case gs_float: mpitype=MPI_FLOAT; break; \ - case gs_int: mpitype=MPI_INT; break; \ - case gs_long: mpitype=MPI_LONG; break; \ -- WHEN_LONG_LONG(case gs_long_long: mpitype=MPI_LONG_LONG; break;) \ -- default: goto comm_allreduce_byhand; \ -+ GS_WHEN_LONG_LONG(case gs_long_long: mpitype=MPI_LONG_LONG; break;) \ -+ default: goto comm_allreduce_byhand; \ - } \ - } while(0) - DOMAIN_SWITCH(); -@@ -134,7 +134,7 @@ void comm_allreduce(const struct comm *com, gs_dom dom, gs_op op, - return; - } - #endif --#ifdef MPI -+#ifdef GS_MPI - comm_allreduce_byhand: - allreduce_imp(com,dom,op, v,vn, buf); - #endif -@@ -144,7 +144,7 @@ void comm_iallreduce(comm_req *req, const struct comm *com, gs_dom dom, gs_op op - void *v, uint vn, void *buf) - { - if(vn==0) return; --#ifdef MPI -+#ifdef GS_MPI - { - MPI_Datatype mpitype; - MPI_Op mpiop; -@@ -153,8 +153,8 @@ void 
comm_iallreduce(comm_req *req, const struct comm *com, gs_dom dom, gs_op op - case gs_float: mpitype=MPI_FLOAT; break; \ - case gs_int: mpitype=MPI_INT; break; \ - case gs_long: mpitype=MPI_LONG; break; \ -- WHEN_LONG_LONG(case gs_long_long: mpitype=MPI_LONG_LONG; break;) \ -- default: goto comm_allreduce_byhand; \ -+ GS_WHEN_LONG_LONG(case gs_long_long: mpitype=MPI_LONG_LONG; break;) \ -+ default: goto comm_allreduce_byhand; \ - } \ - } while(0) - DOMAIN_SWITCH(); -@@ -165,7 +165,7 @@ void comm_iallreduce(comm_req *req, const struct comm *com, gs_dom dom, gs_op op - case gs_max: mpiop=MPI_MAX; break; - default: goto comm_allreduce_byhand; - } --#ifdef USE_NBC -+#ifdef GS_USE_NBC - MPI_Iallreduce(v,buf,vn,mpitype,mpiop,com->c,req); - #else - fail(1,"comm_iallreduce",__LINE__,"Invalid call to MPI_Iallreduce!\n"); -@@ -174,7 +174,7 @@ void comm_iallreduce(comm_req *req, const struct comm *com, gs_dom dom, gs_op op - return; - } - #endif --#ifdef MPI -+#ifdef GS_MPI - comm_allreduce_byhand: - allreduce_imp(com,dom,op, v,vn, buf); - #endif -@@ -197,7 +197,7 @@ double comm_dot(const struct comm *comm, double *v, double *w, uint n) - do { T v = *in++; GS_DO_##OP(accum,v); } while(--n) - - #define DEFINE_REDUCE(T) \ --T PREFIXED_NAME(comm_reduce__##T)( \ -+T GS_PREFIXED_NAME(comm_reduce__##T)( \ - const struct comm *comm, gs_op op, const T *in, uint n) \ - { \ - T accum = gs_identity_##T[op], buf; \ -diff --git a/src/comm.h b/src/comm.h -index bb5b290..ac4c920 100644 ---- a/src/comm.h -+++ b/src/comm.h -@@ -1,5 +1,5 @@ --#ifndef COMM_H --#define COMM_H -+#ifndef GS_COMM_H -+#define GS_COMM_H - - /* requires: - for size_t -@@ -10,7 +10,7 @@ - - #include - #include --#if !defined(FAIL_H) || !defined(TYPES_H) -+#if !defined(GS_FAIL_H) || !defined(GS_TYPES_H) - #warning "comm.h" requires "fail.h" and "types.h" - #endif - -@@ -63,7 +63,7 @@ - - */ - --#ifdef MPI -+#ifdef GS_MPI - #include - typedef MPI_Comm comm_ext; - typedef MPI_Request comm_req; -@@ -73,15 +73,15 @@ typedef int comm_req; - typedef int MPI_Fint; - #endif - --#define comm_allreduce PREFIXED_NAME(comm_allreduce ) --#define comm_iallreduce PREFIXED_NAME(comm_iallreduce) --#define comm_scan PREFIXED_NAME(comm_scan ) --#define comm_dot PREFIXED_NAME(comm_dot ) -+#define comm_allreduce GS_PREFIXED_NAME(comm_allreduce ) -+#define comm_iallreduce GS_PREFIXED_NAME(comm_iallreduce) -+#define comm_scan GS_PREFIXED_NAME(comm_scan ) -+#define comm_dot GS_PREFIXED_NAME(comm_dot ) - - /* global id, np vars strictly for diagnostic messages (fail.c) */ - #ifndef comm_gbl_id --#define comm_gbl_id PREFIXED_NAME(comm_gbl_id) --#define comm_gbl_np PREFIXED_NAME(comm_gbl_np) -+#define comm_gbl_id GS_PREFIXED_NAME(comm_gbl_id) -+#define comm_gbl_np GS_PREFIXED_NAME(comm_gbl_np) - extern uint comm_gbl_id, comm_gbl_np; - #endif - -@@ -122,17 +122,17 @@ void comm_scan(void *scan, const struct comm *com, gs_dom dom, gs_op op, - const void *v, uint vn, void *buffer); - - #define DEFINE_REDUCE(T) \ --T PREFIXED_NAME(comm_reduce__##T)( \ -+T GS_PREFIXED_NAME(comm_reduce__##T)( \ - const struct comm *comm, gs_op op, const T *in, uint n); \ - static T comm_reduce_##T(const struct comm *c, gs_op op, const T *v, uint vn) \ --{ return PREFIXED_NAME(comm_reduce__##T)(c,op,v,vn); } -+{ return GS_PREFIXED_NAME(comm_reduce__##T)(c,op,v,vn); } - GS_FOR_EACH_DOMAIN(DEFINE_REDUCE) - #undef DEFINE_REDUCE - - #define comm_reduce_sint \ -- TYPE_LOCAL(comm_reduce_int,comm_reduce_long,comm_reduce_long_long) -+ GS_TYPE_LOCAL(comm_reduce_int,comm_reduce_long,comm_reduce_long_long) 
- #define comm_reduce_slong \ -- TYPE_GLOBAL(comm_reduce_int,comm_reduce_long,comm_reduce_long_long) -+ GS_TYPE_GLOBAL(comm_reduce_int,comm_reduce_long,comm_reduce_long_long) - - #endif - -@@ -142,7 +142,7 @@ GS_FOR_EACH_DOMAIN(DEFINE_REDUCE) - - static void comm_init(struct comm *c, comm_ext ce) - { --#ifdef MPI -+#ifdef GS_MPI - int i; - MPI_Comm_dup(ce, &c->c); - MPI_Comm_rank(c->c,&i), comm_gbl_id=c->id=i; -@@ -155,7 +155,7 @@ static void comm_init(struct comm *c, comm_ext ce) - static void comm_init_check_(struct comm *c, MPI_Fint ce, uint np, - const char *file, unsigned line) - { --#ifdef MPI -+#ifdef GS_MPI - comm_init(c,MPI_Comm_f2c(ce)); - if(c->np != np) - fail(1,file,line,"comm_init_check: passed P=%u, " -@@ -175,7 +175,7 @@ static void comm_dup_(struct comm *d, const struct comm *s, - const char *file, unsigned line) - { - d->id = s->id, d->np = s->np; --#ifdef MPI -+#ifdef GS_MPI - MPI_Comm_dup(s->c,&d->c); - #else - if(s->np!=1) fail(1,file,line,"%s not compiled with -DMPI\n",file); -@@ -185,7 +185,7 @@ static void comm_dup_(struct comm *d, const struct comm *s, - - static void comm_split_(const struct comm *s, int bin, int key, struct comm *d, - const char *file, unsigned line) { --#if defined(MPI) -+#if defined(GS_MPI) - MPI_Comm nc; - MPI_Comm_split(s->c, bin, key, &nc); - comm_init(d, nc); -@@ -198,14 +198,14 @@ static void comm_split_(const struct comm *s, int bin, int key, struct comm *d, - - static void comm_free(struct comm *c) - { --#ifdef MPI -+#ifdef GS_MPI - MPI_Comm_free(&c->c); - #endif - } - - static double comm_time(void) - { --#ifdef MPI -+#ifdef GS_MPI - return MPI_Wtime(); - #else - return 0; -@@ -214,7 +214,7 @@ static double comm_time(void) - - static void comm_barrier(const struct comm *c) - { --#ifdef MPI -+#ifdef GS_MPI - MPI_Barrier(c->c); - #endif - } -@@ -222,7 +222,7 @@ static void comm_barrier(const struct comm *c) - static void comm_recv(const struct comm *c, void *p, size_t n, - uint src, int tag) - { --#ifdef MPI -+#ifdef GS_MPI - # ifndef MPI_STATUS_IGNORE - MPI_Status stat; - MPI_Recv(p,n,MPI_UNSIGNED_CHAR,src,tag,c->c,&stat); -@@ -235,7 +235,7 @@ static void comm_recv(const struct comm *c, void *p, size_t n, - static void comm_send(const struct comm *c, void *p, size_t n, - uint dst, int tag) - { --#ifdef MPI -+#ifdef GS_MPI - MPI_Send(p,n,MPI_UNSIGNED_CHAR,dst,tag,c->c); - #endif - } -@@ -243,7 +243,7 @@ static void comm_send(const struct comm *c, void *p, size_t n, - static void comm_irecv(comm_req *req, const struct comm *c, - void *p, size_t n, uint src, int tag) - { --#ifdef MPI -+#ifdef GS_MPI - MPI_Irecv(p,n,MPI_UNSIGNED_CHAR,src,tag,c->c,req); - #endif - } -@@ -251,14 +251,14 @@ static void comm_irecv(comm_req *req, const struct comm *c, - static void comm_isend(comm_req *req, const struct comm *c, - void *p, size_t n, uint dst, int tag) - { --#ifdef MPI -+#ifdef GS_MPI - MPI_Isend(p,n,MPI_UNSIGNED_CHAR,dst,tag,c->c,req); - #endif - } - - static void comm_wait(comm_req *req, int n) - { --#ifdef MPI -+#ifdef GS_MPI - # ifndef MPI_STATUSES_IGNORE - MPI_Status status[8]; - while(n>=8) MPI_Waitall(8,req,status), req+=8, n-=8; -@@ -271,7 +271,7 @@ static void comm_wait(comm_req *req, int n) - - static void comm_bcast(const struct comm *c, void *p, size_t n, uint root) - { --#ifdef MPI -+#ifdef GS_MPI - MPI_Bcast(p,n,MPI_UNSIGNED_CHAR,root,c->c); - #endif - } -@@ -279,7 +279,7 @@ static void comm_bcast(const struct comm *c, void *p, size_t n, uint root) - static void comm_gather(const struct comm *c, void *out, size_t out_n, - void *in, 
size_t in_n, uint root) - { --#ifdef MPI -+#ifdef GS_MPI - MPI_Gather(out,out_n,MPI_UNSIGNED_CHAR,in,in_n,MPI_UNSIGNED_CHAR,root,c->c); - #else - assert(out_n == in_n); -diff --git a/src/crystal.c b/src/crystal.c -index a0e8135..c444ad8 100644 ---- a/src/crystal.c -+++ b/src/crystal.c -@@ -43,9 +43,9 @@ - #include "comm.h" - #include "mem.h" - --#define crystal_init PREFIXED_NAME(crystal_init ) --#define crystal_free PREFIXED_NAME(crystal_free ) --#define crystal_router PREFIXED_NAME(crystal_router) -+#define crystal_init GS_PREFIXED_NAME(crystal_init ) -+#define crystal_free GS_PREFIXED_NAME(crystal_free ) -+#define crystal_router GS_PREFIXED_NAME(crystal_router) - - struct crystal { - struct comm comm; -diff --git a/src/crystal.h b/src/crystal.h -index b6d4582..67d3c4e 100644 ---- a/src/crystal.h -+++ b/src/crystal.h -@@ -1,13 +1,13 @@ --#ifndef CRYSTAL_H --#define CRYSTAL_H -+#ifndef GS_CRYSTAL_H -+#define GS_CRYSTAL_H - --#if !defined(COMM_H) || !defined(MEM_H) -+#if !defined(GS_COMM_H) || !defined(GS_MEM_H) - #warning "crystal.h" requires "comm.h" and "mem.h" - #endif - --#define crystal_init PREFIXED_NAME(crystal_init ) --#define crystal_free PREFIXED_NAME(crystal_free ) --#define crystal_router PREFIXED_NAME(crystal_router) -+#define crystal_init GS_PREFIXED_NAME(crystal_init ) -+#define crystal_free GS_PREFIXED_NAME(crystal_free ) -+#define crystal_router GS_PREFIXED_NAME(crystal_router) - - struct crystal { - struct comm comm; -diff --git a/src/fail.c b/src/fail.c -index 9ac04bd..c12cd83 100644 ---- a/src/fail.c -+++ b/src/fail.c -@@ -7,8 +7,8 @@ - #include "types.h" - #include "comm.h" - --#ifdef USE_USR_EXIT --#define userExitHandler FORTRAN_NAME(userexithandler,USEREXITHANDLER) -+#ifdef GS_USE_USR_EXIT -+#define userExitHandler GS_FORTRAN_NAME(userexithandler,USEREXITHANDLER) - #define USEREXIT 1 - extern void userExitHandler(int status); - #else -diff --git a/src/fail.h b/src/fail.h -index 0185110..1ce10b2 100644 ---- a/src/fail.h -+++ b/src/fail.h -@@ -1,15 +1,15 @@ --#ifndef FAIL_H --#define FAIL_H -+#ifndef GS_FAIL_H -+#define GS_FAIL_H - --#if !defined(NAME_H) -+#if !defined(GS_NAME_H) - #warning "fail.h" requires "name.h" - #endif - --#define die PREFIXED_NAME( die ) --#define vdiagnostic PREFIXED_NAME(vdiagnostic) --#define diagnostic PREFIXED_NAME( diagnostic) --#define vfail PREFIXED_NAME(vfail ) --#define fail PREFIXED_NAME( fail ) -+#define die GS_PREFIXED_NAME( die ) -+#define vdiagnostic GS_PREFIXED_NAME(vdiagnostic) -+#define diagnostic GS_PREFIXED_NAME( diagnostic) -+#define vfail GS_PREFIXED_NAME(vfail ) -+#define fail GS_PREFIXED_NAME( fail ) - - #ifdef __GNUC__ - # define ATTRBD __attribute__ ((noreturn)) -diff --git a/src/fcrystal.c b/src/fcrystal.c -index 3fe4c9a..44f96f5 100644 ---- a/src/fcrystal.c -+++ b/src/fcrystal.c -@@ -65,20 +65,20 @@ - --------------------------------------------------------------------------*/ - - #undef crystal_free --#define ccrystal_free PREFIXED_NAME(crystal_free) -+#define ccrystal_free GS_PREFIXED_NAME(crystal_free) - - #define fcrystal_setup \ -- FORTRAN_NAME(crystal_setup ,CRYSTAL_SETUP ) -+ GS_FORTRAN_NAME(crystal_setup ,CRYSTAL_SETUP ) - #define fcrystal_ituple_sort \ -- FORTRAN_NAME(crystal_ituple_sort ,CRYSTAL_ITUPLE_SORT ) -+ GS_FORTRAN_NAME(crystal_ituple_sort ,CRYSTAL_ITUPLE_SORT ) - #define fcrystal_tuple_sort \ -- FORTRAN_NAME(crystal_tuple_sort ,CRYSTAL_TUPLE_SORT ) -+ GS_FORTRAN_NAME(crystal_tuple_sort ,CRYSTAL_TUPLE_SORT ) - #define fcrystal_ituple_transfer \ -- 
FORTRAN_NAME(crystal_ituple_transfer,CRYSTAL_ITUPLE_TRANSFER) -+ GS_FORTRAN_NAME(crystal_ituple_transfer,CRYSTAL_ITUPLE_TRANSFER) - #define fcrystal_tuple_transfer \ -- FORTRAN_NAME(crystal_tuple_transfer ,CRYSTAL_TUPLE_TRANSFER ) -+ GS_FORTRAN_NAME(crystal_tuple_transfer ,CRYSTAL_TUPLE_TRANSFER ) - #define fcrystal_free \ -- FORTRAN_NAME(crystal_free ,CRYSTAL_FREE ) -+ GS_FORTRAN_NAME(crystal_free ,CRYSTAL_FREE ) - - static struct crystal **handle_array = 0; - static int handle_max = 0; -diff --git a/src/findpts.c b/src/findpts.c -index 86ac948..c3a3e9b 100644 ---- a/src/findpts.c -+++ b/src/findpts.c -@@ -117,7 +117,7 @@ static uint count_bits(unsigned char *p, uint n) - approx number of cells per proc for the distributed - global geometric hash table - NOTE: gbl_hash_size*np needs to fit in a "global" integer -- (controlled by -DGLOBAL_LONG or -DGLOBAL_LONG_LONG; -+ (controlled by -DGS_GLOBAL_LONG or -DGS_GLOBAL_LONG_LONG; - see "types.h") - actual number of cells per proc will be greater by - ~ 3 gbl_hash_size^(2/3) / np^(1/3) -@@ -227,17 +227,17 @@ static uint count_bits(unsigned char *p, uint n) - - --------------------------------------------------------------------------*/ - --#define ffindptsms_setup FORTRAN_NAME(findptsms_setup ,FINDPTSMS_SETUP ) --#define ffindptsms_free FORTRAN_NAME(findptsms_free ,FINDPTSMS_FREE ) --#define ffindptsms FORTRAN_NAME(findptsms ,FINDPTSMS ) --#define ffindptsms_eval FORTRAN_NAME(findptsms_eval ,FINDPTSMS_EVAL ) --#define ffindptsms_eval_local FORTRAN_NAME(findptsms_eval_local,FINDPTSMS_EVAL_LOCAL) -+#define ffindptsms_setup GS_FORTRAN_NAME(findptsms_setup ,FINDPTSMS_SETUP ) -+#define ffindptsms_free GS_FORTRAN_NAME(findptsms_free ,FINDPTSMS_FREE ) -+#define ffindptsms GS_FORTRAN_NAME(findptsms ,FINDPTSMS ) -+#define ffindptsms_eval GS_FORTRAN_NAME(findptsms_eval ,FINDPTSMS_EVAL ) -+#define ffindptsms_eval_local GS_FORTRAN_NAME(findptsms_eval_local,FINDPTSMS_EVAL_LOCAL) - --#define ffindpts_setup FORTRAN_NAME(findpts_setup ,FINDPTS_SETUP ) --#define ffindpts_free FORTRAN_NAME(findpts_free ,FINDPTS_FREE ) --#define ffindpts FORTRAN_NAME(findpts ,FINDPTS ) --#define ffindpts_eval FORTRAN_NAME(findpts_eval ,FINDPTS_EVAL ) --#define ffindpts_eval_local FORTRAN_NAME(findpts_eval_local,FINDPTS_EVAL_LOCAL) -+#define ffindpts_setup GS_FORTRAN_NAME(findpts_setup ,FINDPTS_SETUP ) -+#define ffindpts_free GS_FORTRAN_NAME(findpts_free ,FINDPTS_FREE ) -+#define ffindpts GS_FORTRAN_NAME(findpts ,FINDPTS ) -+#define ffindpts_eval GS_FORTRAN_NAME(findpts_eval ,FINDPTS_EVAL ) -+#define ffindpts_eval_local GS_FORTRAN_NAME(findpts_eval_local,FINDPTS_EVAL_LOCAL) - - struct handle { void *data; unsigned ndim; }; - static struct handle *handle_array = 0; -@@ -363,9 +363,9 @@ void ffindptsms_free(const sint *const handle) - { - CHECK_HANDLE("findptsms_free"); - if(h->ndim==2) -- PREFIXED_NAME(findptsms_free_2)(h->data); -+ GS_PREFIXED_NAME(findptsms_free_2)(h->data); - else -- PREFIXED_NAME(findptsms_free_3)(h->data); -+ GS_PREFIXED_NAME(findptsms_free_3)(h->data); - h->data = 0; - } - -@@ -373,9 +373,9 @@ void ffindpts_free(const sint *const handle) - { - CHECK_HANDLE("findpts_free"); - if(h->ndim==2) -- PREFIXED_NAME(findpts_free_2)(h->data); -+ GS_PREFIXED_NAME(findpts_free_2)(h->data); - else -- PREFIXED_NAME(findpts_free_3)(h->data); -+ GS_PREFIXED_NAME(findpts_free_3)(h->data); - h->data = 0; - } - -@@ -405,7 +405,7 @@ void ffindptsms(const sint *const handle, - sess_stride = *session_id_stride*sizeof(uint); - sess_match = session_id_match; - -- 
PREFIXED_NAME(findptsms_2)( -+ GS_PREFIXED_NAME(findptsms_2)( - (uint*) code_base,(* code_stride)*sizeof(sint ), - (uint*) proc_base,(* proc_stride)*sizeof(sint ), - (uint*) el_base,(* el_stride)*sizeof(sint ), -@@ -428,7 +428,7 @@ void ffindptsms(const sint *const handle, - sess_stride = *session_id_stride*sizeof(uint); - sess_match = session_id_match; - -- PREFIXED_NAME(findptsms_3)( -+ GS_PREFIXED_NAME(findptsms_3)( - (uint*) code_base,(* code_stride)*sizeof(sint ), - (uint*) proc_base,(* proc_stride)*sizeof(sint ), - (uint*) el_base,(* el_stride)*sizeof(sint ), -@@ -459,7 +459,7 @@ void ffindpts(const sint *const handle, - xv_stride[0] = *x_stride*sizeof(double), - xv_stride[1] = *y_stride*sizeof(double); - -- PREFIXED_NAME(findpts_2)( -+ GS_PREFIXED_NAME(findpts_2)( - (uint*) code_base,(* code_stride)*sizeof(sint ), - (uint*) proc_base,(* proc_stride)*sizeof(sint ), - (uint*) el_base,(* el_stride)*sizeof(sint ), -@@ -475,7 +475,7 @@ void ffindpts(const sint *const handle, - xv_stride[1] = *y_stride*sizeof(double), - xv_stride[2] = *z_stride*sizeof(double); - -- PREFIXED_NAME(findpts_3)( -+ GS_PREFIXED_NAME(findpts_3)( - (uint*) code_base,(* code_stride)*sizeof(sint ), - (uint*) proc_base,(* proc_stride)*sizeof(sint ), - (uint*) el_base,(* el_stride)*sizeof(sint ), -@@ -497,7 +497,7 @@ void ffindptsms_eval(const sint *const handle, - { - CHECK_HANDLE("findptsms_eval"); - if(h->ndim==2) -- PREFIXED_NAME(findptsms_eval_2)( -+ GS_PREFIXED_NAME(findptsms_eval_2)( - out_base,(* out_stride)*sizeof(double), - (uint*)code_base,(*code_stride)*sizeof(sint ), - (uint*)proc_base,(*proc_stride)*sizeof(sint ), -@@ -505,7 +505,7 @@ void ffindptsms_eval(const sint *const handle, - r_base,(* r_stride)*sizeof(double), - *npt, in, h->data); - else -- PREFIXED_NAME(findptsms_eval_3)( -+ GS_PREFIXED_NAME(findptsms_eval_3)( - out_base,(* out_stride)*sizeof(double), - (uint*)code_base,(*code_stride)*sizeof(sint ), - (uint*)proc_base,(*proc_stride)*sizeof(sint ), -@@ -524,7 +524,7 @@ void ffindpts_eval(const sint *const handle, - { - CHECK_HANDLE("findpts_eval"); - if(h->ndim==2) -- PREFIXED_NAME(findpts_eval_2)( -+ GS_PREFIXED_NAME(findpts_eval_2)( - out_base,(* out_stride)*sizeof(double), - (uint*)code_base,(*code_stride)*sizeof(sint ), - (uint*)proc_base,(*proc_stride)*sizeof(sint ), -@@ -532,7 +532,7 @@ void ffindpts_eval(const sint *const handle, - r_base,(* r_stride)*sizeof(double), - *npt, in, h->data); - else -- PREFIXED_NAME(findpts_eval_3)( -+ GS_PREFIXED_NAME(findpts_eval_3)( - out_base,(* out_stride)*sizeof(double), - (uint*)code_base,(*code_stride)*sizeof(sint ), - (uint*)proc_base,(*proc_stride)*sizeof(sint ), -diff --git a/src/findpts.h b/src/findpts.h -index c8a1ab3..af2071d 100644 ---- a/src/findpts.h -+++ b/src/findpts.h -@@ -1,27 +1,27 @@ --#ifndef FINDPTSMS_H --#define FINDPTSMS_H -+#ifndef GS_FINDPTSMS_H -+#define GS_FINDPTSMS_H - --#if !defined(COMM_H) -+#if !defined(GS_COMM_H) - #warning "findpts.h" requires "comm.h" - #endif - --#define findptsms_setup_2 PREFIXED_NAME(findptsms_setup_2) --#define findptsms_free_2 PREFIXED_NAME(findptsms_free_2 ) --#define findptsms_2 PREFIXED_NAME(findptsms_2 ) --#define findptsms_eval_2 PREFIXED_NAME(findptsms_eval_2 ) --#define findptsms_setup_3 PREFIXED_NAME(findptsms_setup_3) --#define findptsms_free_3 PREFIXED_NAME(findptsms_free_3 ) --#define findptsms_3 PREFIXED_NAME(findptsms_3 ) --#define findptsms_eval_3 PREFIXED_NAME(findptsms_eval_3 ) -- --#define findpts_setup_2 PREFIXED_NAME(findpts_setup_2) --#define findpts_free_2 
PREFIXED_NAME(findpts_free_2 ) --#define findpts_2 PREFIXED_NAME(findpts_2 ) --#define findpts_eval_2 PREFIXED_NAME(findpts_eval_2 ) --#define findpts_setup_3 PREFIXED_NAME(findpts_setup_3) --#define findpts_free_3 PREFIXED_NAME(findpts_free_3 ) --#define findpts_3 PREFIXED_NAME(findpts_3 ) --#define findpts_eval_3 PREFIXED_NAME(findpts_eval_3 ) -+#define findptsms_setup_2 GS_PREFIXED_NAME(findptsms_setup_2) -+#define findptsms_free_2 GS_PREFIXED_NAME(findptsms_free_2 ) -+#define findptsms_2 GS_PREFIXED_NAME(findptsms_2 ) -+#define findptsms_eval_2 GS_PREFIXED_NAME(findptsms_eval_2 ) -+#define findptsms_setup_3 GS_PREFIXED_NAME(findptsms_setup_3) -+#define findptsms_free_3 GS_PREFIXED_NAME(findptsms_free_3 ) -+#define findptsms_3 GS_PREFIXED_NAME(findptsms_3 ) -+#define findptsms_eval_3 GS_PREFIXED_NAME(findptsms_eval_3 ) -+ -+#define findpts_setup_2 GS_PREFIXED_NAME(findpts_setup_2) -+#define findpts_free_2 GS_PREFIXED_NAME(findpts_free_2 ) -+#define findpts_2 GS_PREFIXED_NAME(findpts_2 ) -+#define findpts_eval_2 GS_PREFIXED_NAME(findpts_eval_2 ) -+#define findpts_setup_3 GS_PREFIXED_NAME(findpts_setup_3) -+#define findpts_free_3 GS_PREFIXED_NAME(findpts_free_3 ) -+#define findpts_3 GS_PREFIXED_NAME(findpts_3 ) -+#define findpts_eval_3 GS_PREFIXED_NAME(findpts_eval_3 ) - - struct findpts_data_2; - struct findpts_data_3; -diff --git a/src/findpts_el.h b/src/findpts_el.h -index 4ed119a..ba731cd 100644 ---- a/src/findpts_el.h -+++ b/src/findpts_el.h -@@ -1,14 +1,14 @@ --#ifndef FINDPTS_EL_H --#define FINDPTS_EL_H -+#ifndef GS_FINDPTS_EL_H -+#define GS_FINDPTS_EL_H - --#if !defined(NAME_H) || !defined(POLY_H) -+#if !defined(GS_NAME_H) || !defined(GS_POLY_H) - #warning "findpts_el.h" requires "name.h", "poly.h" - #endif - --#define findpts_el_setup_2 PREFIXED_NAME(findpts_el_setup_2) --#define findpts_el_free_2 PREFIXED_NAME(findpts_el_free_2 ) --#define findpts_el_2 PREFIXED_NAME(findpts_el_2 ) --#define findpts_el_eval_2 PREFIXED_NAME(findpts_el_eval_2 ) -+#define findpts_el_setup_2 GS_PREFIXED_NAME(findpts_el_setup_2) -+#define findpts_el_free_2 GS_PREFIXED_NAME(findpts_el_free_2 ) -+#define findpts_el_2 GS_PREFIXED_NAME(findpts_el_2 ) -+#define findpts_el_eval_2 GS_PREFIXED_NAME(findpts_el_eval_2 ) - - struct findpts_el_pt_2 { - double x[2],r[2],oldr[2],dist2,dist2p,tr; -@@ -60,10 +60,10 @@ static struct findpts_el_pt_2 *findpts_el_points_2( - return fd->p; - } - --#define findpts_el_setup_3 PREFIXED_NAME(findpts_el_setup_3) --#define findpts_el_free_3 PREFIXED_NAME(findpts_el_free_3 ) --#define findpts_el_3 PREFIXED_NAME(findpts_el_3 ) --#define findpts_el_eval_3 PREFIXED_NAME(findpts_el_eval_3 ) -+#define findpts_el_setup_3 GS_PREFIXED_NAME(findpts_el_setup_3) -+#define findpts_el_free_3 GS_PREFIXED_NAME(findpts_el_free_3 ) -+#define findpts_el_3 GS_PREFIXED_NAME(findpts_el_3 ) -+#define findpts_el_eval_3 GS_PREFIXED_NAME(findpts_el_eval_3 ) - - struct findpts_el_pt_3 { - double x[3],r[3],oldr[3],dist2,dist2p,tr; -diff --git a/src/findpts_el_2.c b/src/findpts_el_2.c -index b33f768..aad9aad 100644 ---- a/src/findpts_el_2.c -+++ b/src/findpts_el_2.c -@@ -13,10 +13,10 @@ - #include "tensor.h" - #include "poly.h" - --#define findpts_el_setup_2 PREFIXED_NAME(findpts_el_setup_2) --#define findpts_el_free_2 PREFIXED_NAME(findpts_el_free_2 ) --#define findpts_el_2 PREFIXED_NAME(findpts_el_2 ) --#define findpts_el_eval_2 PREFIXED_NAME(findpts_el_eval_2 ) -+#define findpts_el_setup_2 GS_PREFIXED_NAME(findpts_el_setup_2) -+#define findpts_el_free_2 GS_PREFIXED_NAME(findpts_el_free_2 ) -+#define 
findpts_el_2 GS_PREFIXED_NAME(findpts_el_2 ) -+#define findpts_el_eval_2 GS_PREFIXED_NAME(findpts_el_eval_2 ) - /* - #define DIAGNOSTICS_1 - #define DIAGNOSTICS_2 -diff --git a/src/findpts_el_3.c b/src/findpts_el_3.c -index 42c335c..62561ff 100644 ---- a/src/findpts_el_3.c -+++ b/src/findpts_el_3.c -@@ -11,10 +11,10 @@ - #include "tensor.h" - #include "poly.h" - --#define findpts_el_setup_3 PREFIXED_NAME(findpts_el_setup_3) --#define findpts_el_free_3 PREFIXED_NAME(findpts_el_free_3 ) --#define findpts_el_3 PREFIXED_NAME(findpts_el_3 ) --#define findpts_el_eval_3 PREFIXED_NAME(findpts_el_eval_3 ) -+#define findpts_el_setup_3 GS_PREFIXED_NAME(findpts_el_setup_3) -+#define findpts_el_free_3 GS_PREFIXED_NAME(findpts_el_free_3 ) -+#define findpts_el_3 GS_PREFIXED_NAME(findpts_el_3 ) -+#define findpts_el_eval_3 GS_PREFIXED_NAME(findpts_el_eval_3 ) - /* - #define DIAGNOSTICS_1 - #define DIAGNOSTICS_2 -diff --git a/src/findpts_imp.h b/src/findpts_imp.h -index b9759af..44b5c7d 100644 ---- a/src/findpts_imp.h -+++ b/src/findpts_imp.h -@@ -1,45 +1,45 @@ - #include - #include --#define obbox TOKEN_PASTE(obbox_,D) --#define local_hash_data TOKEN_PASTE(findpts_local_hash_data_,D) --#define hash_data TOKEN_PASTE(findpts_hash_data_,D) --#define hash_index TOKEN_PASTE(hash_index_ ,D) --#define hash_setfac TOKEN_PASTE(hash_setfac_ ,D) --#define hash_range TOKEN_PASTE(hash_range_ ,D) --#define hash_bb TOKEN_PASTE(hash_bb_ ,D) --#define set_local_mask TOKEN_PASTE(set_local_mask_ ,D) --#define fill_hash TOKEN_PASTE(fill_hash_ ,D) --#define table_from_hash TOKEN_PASTE(table_from_hash_ ,D) --#define hash_build TOKEN_PASTE(hash_build_ ,D) --#define hash_free TOKEN_PASTE(hash_free_ ,D) -- --#define findptsms_local_setup TOKEN_PASTE(PREFIXED_NAME(findptsms_local_setup_),D) --#define findptsms_local_free TOKEN_PASTE(PREFIXED_NAME(findptsms_local_free_ ),D) --#define findptsms_local TOKEN_PASTE(PREFIXED_NAME(findptsms_local_ ),D) --#define findptsms_local_eval TOKEN_PASTE(PREFIXED_NAME(findptsms_local_eval_ ),D) --#define findpts_dummy_ms_data TOKEN_PASTE(findpts_dummy_ms_data_,D) --#define findpts_data TOKEN_PASTE(findpts_data_,D) --#define src_pt TOKEN_PASTE(src_pt_ ,D) --#define out_pt TOKEN_PASTE(out_pt_ ,D) --#define eval_src_pt TOKEN_PASTE(eval_src_pt_ ,D) --#define eval_out_pt TOKEN_PASTE(eval_out_pt_ ,D) --#define setupms_aux TOKEN_PASTE(setupms_aux_,D) --#define findptsms_setup TOKEN_PASTE(PREFIXED_NAME(findptsms_setup_),D) --#define findptsms_free TOKEN_PASTE(PREFIXED_NAME(findptsms_free_ ),D) --#define findptsms TOKEN_PASTE(PREFIXED_NAME(findptsms_ ),D) --#define findptsms_eval TOKEN_PASTE(PREFIXED_NAME(findptsms_eval_ ),D) -- --#define findpts_local_data TOKEN_PASTE(findpts_local_data_,D) --#define findpts_local_setup TOKEN_PASTE(PREFIXED_NAME(findpts_local_setup_),D) --#define findpts_local_free TOKEN_PASTE(PREFIXED_NAME(findpts_local_free_ ),D) --#define findpts_local TOKEN_PASTE(PREFIXED_NAME(findpts_local_ ),D) --#define findpts_local_eval TOKEN_PASTE(PREFIXED_NAME(findpts_local_eval_ ),D) --#define findpts_setup TOKEN_PASTE(PREFIXED_NAME(findpts_setup_),D) --#define findpts_free TOKEN_PASTE(PREFIXED_NAME(findpts_free_ ),D) --#define findpts TOKEN_PASTE(PREFIXED_NAME(findpts_ ),D) --#define findpts_eval TOKEN_PASTE(PREFIXED_NAME(findpts_eval_ ),D) --#define findpts_local_eval TOKEN_PASTE(PREFIXED_NAME(findpts_local_eval_ ),D) --#define setup_fev_aux TOKEN_PASTE(setup_fev_aux_,D) -+#define obbox GS_TOKEN_PASTE(obbox_,D) -+#define local_hash_data GS_TOKEN_PASTE(findpts_local_hash_data_,D) -+#define 
hash_data GS_TOKEN_PASTE(findpts_hash_data_,D) -+#define hash_index GS_TOKEN_PASTE(hash_index_ ,D) -+#define hash_setfac GS_TOKEN_PASTE(hash_setfac_ ,D) -+#define hash_range GS_TOKEN_PASTE(hash_range_ ,D) -+#define hash_bb GS_TOKEN_PASTE(hash_bb_ ,D) -+#define set_local_mask GS_TOKEN_PASTE(set_local_mask_ ,D) -+#define fill_hash GS_TOKEN_PASTE(fill_hash_ ,D) -+#define table_from_hash GS_TOKEN_PASTE(table_from_hash_ ,D) -+#define hash_build GS_TOKEN_PASTE(hash_build_ ,D) -+#define hash_free GS_TOKEN_PASTE(hash_free_ ,D) -+ -+#define findptsms_local_setup GS_TOKEN_PASTE(GS_PREFIXED_NAME(findptsms_local_setup_),D) -+#define findptsms_local_free GS_TOKEN_PASTE(GS_PREFIXED_NAME(findptsms_local_free_ ),D) -+#define findptsms_local GS_TOKEN_PASTE(GS_PREFIXED_NAME(findptsms_local_ ),D) -+#define findptsms_local_eval GS_TOKEN_PASTE(GS_PREFIXED_NAME(findptsms_local_eval_ ),D) -+#define findpts_dummy_ms_data GS_TOKEN_PASTE(findpts_dummy_ms_data_,D) -+#define findpts_data GS_TOKEN_PASTE(findpts_data_,D) -+#define src_pt GS_TOKEN_PASTE(src_pt_ ,D) -+#define out_pt GS_TOKEN_PASTE(out_pt_ ,D) -+#define eval_src_pt GS_TOKEN_PASTE(eval_src_pt_ ,D) -+#define eval_out_pt GS_TOKEN_PASTE(eval_out_pt_ ,D) -+#define setupms_aux GS_TOKEN_PASTE(setupms_aux_,D) -+#define findptsms_setup GS_TOKEN_PASTE(GS_PREFIXED_NAME(findptsms_setup_),D) -+#define findptsms_free GS_TOKEN_PASTE(GS_PREFIXED_NAME(findptsms_free_ ),D) -+#define findptsms GS_TOKEN_PASTE(GS_PREFIXED_NAME(findptsms_ ),D) -+#define findptsms_eval GS_TOKEN_PASTE(GS_PREFIXED_NAME(findptsms_eval_ ),D) -+ -+#define findpts_local_data GS_TOKEN_PASTE(findpts_local_data_,D) -+#define findpts_local_setup GS_TOKEN_PASTE(GS_PREFIXED_NAME(findpts_local_setup_),D) -+#define findpts_local_free GS_TOKEN_PASTE(GS_PREFIXED_NAME(findpts_local_free_ ),D) -+#define findpts_local GS_TOKEN_PASTE(GS_PREFIXED_NAME(findpts_local_ ),D) -+#define findpts_local_eval GS_TOKEN_PASTE(GS_PREFIXED_NAME(findpts_local_eval_ ),D) -+#define findpts_setup GS_TOKEN_PASTE(GS_PREFIXED_NAME(findpts_setup_),D) -+#define findpts_free GS_TOKEN_PASTE(GS_PREFIXED_NAME(findpts_free_ ),D) -+#define findpts GS_TOKEN_PASTE(GS_PREFIXED_NAME(findpts_ ),D) -+#define findpts_eval GS_TOKEN_PASTE(GS_PREFIXED_NAME(findpts_eval_ ),D) -+#define findpts_local_eval GS_TOKEN_PASTE(GS_PREFIXED_NAME(findpts_local_eval_ ),D) -+#define setup_fev_aux GS_TOKEN_PASTE(setup_fev_aux_,D) - - struct hash_data { - ulong hash_n; -diff --git a/src/findpts_local.h b/src/findpts_local.h -index 2a9d9da..30be675 100644 ---- a/src/findpts_local.h -+++ b/src/findpts_local.h -@@ -1,19 +1,19 @@ --#ifndef FINDPTS_LOCAL_H --#define FINDPTS_LOCAL_H -+#ifndef GS_FINDPTS_LOCAL_H -+#define GS_FINDPTS_LOCAL_H - --#if !defined(MEM_H) || !defined(FINDPTS_EL_H) || !defined(OBBOX_H) -+#if !defined(GS_MEM_H) || !defined(GS_FINDPTS_EL_H) || !defined(GS_OBBOX_H) - #warning "findpts_local.h" requires "mem.h", "findpts_el.h", "obbox.h" - #endif - --#define findptsms_local_setup_2 PREFIXED_NAME(findptsms_local_setup_2) --#define findptsms_local_free_2 PREFIXED_NAME(findptsms_local_free_2 ) --#define findptsms_local_2 PREFIXED_NAME(findptsms_local_2 ) --#define findptsms_local_eval_2 PREFIXED_NAME(findptsms_local_eval_2 ) -+#define findptsms_local_setup_2 GS_PREFIXED_NAME(findptsms_local_setup_2) -+#define findptsms_local_free_2 GS_PREFIXED_NAME(findptsms_local_free_2 ) -+#define findptsms_local_2 GS_PREFIXED_NAME(findptsms_local_2 ) -+#define findptsms_local_eval_2 GS_PREFIXED_NAME(findptsms_local_eval_2 ) - --#define findpts_local_setup_2 
PREFIXED_NAME(findpts_local_setup_2) --#define findpts_local_free_2 PREFIXED_NAME(findpts_local_free_2 ) --#define findpts_local_2 PREFIXED_NAME(findpts_local_2 ) --#define findpts_local_eval_2 PREFIXED_NAME(findpts_local_eval_2 ) -+#define findpts_local_setup_2 GS_PREFIXED_NAME(findpts_local_setup_2) -+#define findpts_local_free_2 GS_PREFIXED_NAME(findpts_local_free_2 ) -+#define findpts_local_2 GS_PREFIXED_NAME(findpts_local_2 ) -+#define findpts_local_eval_2 GS_PREFIXED_NAME(findpts_local_eval_2 ) - - struct findpts_local_hash_data_2 { - uint hash_n; -@@ -85,15 +85,15 @@ void findpts_local_eval_2( - const uint npt, - const double *const in, struct findpts_local_data_2 *const fd); - --#define findptsms_local_setup_3 PREFIXED_NAME(findptsms_local_setup_3) --#define findptsms_local_free_3 PREFIXED_NAME(findptsms_local_free_3 ) --#define findptsms_local_3 PREFIXED_NAME(findptsms_local_3 ) --#define findptsms_local_eval_3 PREFIXED_NAME(findptsms_local_eval_3 ) -+#define findptsms_local_setup_3 GS_PREFIXED_NAME(findptsms_local_setup_3) -+#define findptsms_local_free_3 GS_PREFIXED_NAME(findptsms_local_free_3 ) -+#define findptsms_local_3 GS_PREFIXED_NAME(findptsms_local_3 ) -+#define findptsms_local_eval_3 GS_PREFIXED_NAME(findptsms_local_eval_3 ) - --#define findpts_local_setup_3 PREFIXED_NAME(findpts_local_setup_3) --#define findpts_local_free_3 PREFIXED_NAME(findpts_local_free_3 ) --#define findpts_local_3 PREFIXED_NAME(findpts_local_3 ) --#define findpts_local_eval_3 PREFIXED_NAME(findpts_local_eval_3 ) -+#define findpts_local_setup_3 GS_PREFIXED_NAME(findpts_local_setup_3) -+#define findpts_local_free_3 GS_PREFIXED_NAME(findpts_local_free_3 ) -+#define findpts_local_3 GS_PREFIXED_NAME(findpts_local_3 ) -+#define findpts_local_eval_3 GS_PREFIXED_NAME(findpts_local_eval_3 ) - struct findpts_local_hash_data_3 { - uint hash_n; - struct dbl_range bnd[3]; -diff --git a/src/findpts_local_imp.h b/src/findpts_local_imp.h -index e5310b1..0ca79fd 100644 ---- a/src/findpts_local_imp.h -+++ b/src/findpts_local_imp.h -@@ -1,36 +1,36 @@ - #include --#define obbox TOKEN_PASTE(obbox_ ,D) --#define obbox_calc TOKEN_PASTE(PREFIXED_NAME(obbox_calc_),D) --#define obbox_test TOKEN_PASTE(obbox_test_ ,D) --#define hash_data TOKEN_PASTE(findpts_local_hash_data_,D) --#define hash_index TOKEN_PASTE(hash_index_ ,D) --#define hash_setfac TOKEN_PASTE(hash_setfac_ ,D) --#define hash_range TOKEN_PASTE(hash_range_ ,D) --#define hash_count TOKEN_PASTE(hash_count_ ,D) --#define hash_opt_size TOKEN_PASTE(hash_opt_size_ ,D) --#define hash_bb TOKEN_PASTE(hash_bb_ ,D) --#define hash_build TOKEN_PASTE(hash_build_ ,D) --#define hash_free TOKEN_PASTE(hash_free_ ,D) --#define findpts_el_data TOKEN_PASTE(findpts_el_data_ ,D) --#define findpts_el_pt TOKEN_PASTE(findpts_el_pt_ ,D) --#define findpts_el_setup TOKEN_PASTE(PREFIXED_NAME(findpts_el_setup_),D) --#define findpts_el_free TOKEN_PASTE(PREFIXED_NAME(findpts_el_free_ ),D) --#define findpts_el TOKEN_PASTE(PREFIXED_NAME(findpts_el_ ),D) --#define findpts_el_eval TOKEN_PASTE(PREFIXED_NAME(findpts_el_eval_ ),D) --#define findpts_el_start TOKEN_PASTE(findpts_el_start_ ,D) --#define findpts_el_points TOKEN_PASTE(findpts_el_points_ ,D) --#define findpts_local_data TOKEN_PASTE(findpts_local_data_,D) --#define map_points_to_els TOKEN_PASTE(map_points_to_els_ ,D) -- --#define findptsms_local_setup TOKEN_PASTE(PREFIXED_NAME(findptsms_local_setup_),D) --#define findptsms_local_free TOKEN_PASTE(PREFIXED_NAME(findptsms_local_free_ ),D) --#define findptsms_local 
TOKEN_PASTE(PREFIXED_NAME(findptsms_local_ ),D) --#define findptsms_local_eval TOKEN_PASTE(PREFIXED_NAME(findptsms_local_eval_ ),D) -- --#define findpts_local_setup TOKEN_PASTE(PREFIXED_NAME(findpts_local_setup_),D) --#define findpts_local_free TOKEN_PASTE(PREFIXED_NAME(findpts_local_free_ ),D) --#define findpts_local TOKEN_PASTE(PREFIXED_NAME(findpts_local_ ),D) --#define findpts_local_eval TOKEN_PASTE(PREFIXED_NAME(findpts_local_eval_ ),D) -+#define obbox GS_TOKEN_PASTE(obbox_ ,D) -+#define obbox_calc GS_TOKEN_PASTE(GS_PREFIXED_NAME(obbox_calc_),D) -+#define obbox_test GS_TOKEN_PASTE(obbox_test_ ,D) -+#define hash_data GS_TOKEN_PASTE(findpts_local_hash_data_,D) -+#define hash_index GS_TOKEN_PASTE(hash_index_ ,D) -+#define hash_setfac GS_TOKEN_PASTE(hash_setfac_ ,D) -+#define hash_range GS_TOKEN_PASTE(hash_range_ ,D) -+#define hash_count GS_TOKEN_PASTE(hash_count_ ,D) -+#define hash_opt_size GS_TOKEN_PASTE(hash_opt_size_ ,D) -+#define hash_bb GS_TOKEN_PASTE(hash_bb_ ,D) -+#define hash_build GS_TOKEN_PASTE(hash_build_ ,D) -+#define hash_free GS_TOKEN_PASTE(hash_free_ ,D) -+#define findpts_el_data GS_TOKEN_PASTE(findpts_el_data_ ,D) -+#define findpts_el_pt GS_TOKEN_PASTE(findpts_el_pt_ ,D) -+#define findpts_el_setup GS_TOKEN_PASTE(GS_PREFIXED_NAME(findpts_el_setup_),D) -+#define findpts_el_free GS_TOKEN_PASTE(GS_PREFIXED_NAME(findpts_el_free_ ),D) -+#define findpts_el GS_TOKEN_PASTE(GS_PREFIXED_NAME(findpts_el_ ),D) -+#define findpts_el_eval GS_TOKEN_PASTE(GS_PREFIXED_NAME(findpts_el_eval_ ),D) -+#define findpts_el_start GS_TOKEN_PASTE(findpts_el_start_ ,D) -+#define findpts_el_points GS_TOKEN_PASTE(findpts_el_points_ ,D) -+#define findpts_local_data GS_TOKEN_PASTE(findpts_local_data_,D) -+#define map_points_to_els GS_TOKEN_PASTE(map_points_to_els_ ,D) -+ -+#define findptsms_local_setup GS_TOKEN_PASTE(GS_PREFIXED_NAME(findptsms_local_setup_),D) -+#define findptsms_local_free GS_TOKEN_PASTE(GS_PREFIXED_NAME(findptsms_local_free_ ),D) -+#define findptsms_local GS_TOKEN_PASTE(GS_PREFIXED_NAME(findptsms_local_ ),D) -+#define findptsms_local_eval GS_TOKEN_PASTE(GS_PREFIXED_NAME(findptsms_local_eval_ ),D) -+ -+#define findpts_local_setup GS_TOKEN_PASTE(GS_PREFIXED_NAME(findpts_local_setup_),D) -+#define findpts_local_free GS_TOKEN_PASTE(GS_PREFIXED_NAME(findpts_local_free_ ),D) -+#define findpts_local GS_TOKEN_PASTE(GS_PREFIXED_NAME(findpts_local_ ),D) -+#define findpts_local_eval GS_TOKEN_PASTE(GS_PREFIXED_NAME(findpts_local_eval_ ),D) - /*-------------------------------------------------------------------------- - Point to Possible Elements Hashing - -diff --git a/src/gs.c b/src/gs.c -index b1a9aa7..3cf4471 100644 ---- a/src/gs.c -+++ b/src/gs.c -@@ -19,19 +19,19 @@ - #include "sarray_sort.h" - #include "sarray_transfer.h" - --#define gs PREFIXED_NAME(gs ) --#define gs_vec PREFIXED_NAME(gs_vec ) --#define gs_many PREFIXED_NAME(gs_many ) --#define igs PREFIXED_NAME(igs ) --#define igs_vec PREFIXED_NAME(igs_vec ) --#define igs_many PREFIXED_NAME(igs_many ) --#define gs_wait PREFIXED_NAME(gs_wait ) --#define gs_setup PREFIXED_NAME(gs_setup ) --#define gs_free PREFIXED_NAME(gs_free ) --#define gs_unique PREFIXED_NAME(gs_unique) --#define gs_hf2c PREFIXED_NAME(gs_hf2c ) --#define pw_data_nmsg PREFIXED_NAME(pw_data_nmsg ) --#define pw_data_size PREFIXED_NAME(pw_data_size ) -+#define gs GS_PREFIXED_NAME(gs ) -+#define gs_vec GS_PREFIXED_NAME(gs_vec ) -+#define gs_many GS_PREFIXED_NAME(gs_many ) -+#define igs GS_PREFIXED_NAME(igs ) -+#define igs_vec GS_PREFIXED_NAME(igs_vec ) -+#define igs_many 
GS_PREFIXED_NAME(igs_many ) -+#define gs_wait GS_PREFIXED_NAME(gs_wait ) -+#define gs_setup GS_PREFIXED_NAME(gs_setup ) -+#define gs_free GS_PREFIXED_NAME(gs_free ) -+#define gs_unique GS_PREFIXED_NAME(gs_unique) -+#define gs_hf2c GS_PREFIXED_NAME(gs_hf2c ) -+#define pw_data_nmsg GS_PREFIXED_NAME(pw_data_nmsg ) -+#define pw_data_size GS_PREFIXED_NAME(pw_data_size ) - - GS_DEFINE_DOM_SIZES() - -@@ -1011,7 +1011,7 @@ static void allreduce_exec_wait( - /* Why do I need this? Ugly */ - if (comm->np > 1) - comm_wait(ard->req, 1); --#ifdef MPI -+#ifdef GS_MPI - memcpy(buf,ardbuf,gvn*gs_dom_size[dom]); - #endif - /* buffer -> user array */ -@@ -1497,29 +1497,29 @@ void pw_data_size(struct gs_data *gsh, int *n) - #undef igs_many - #undef gs_wait - --#define cgs PREFIXED_NAME(gs ) --#define cgs_vec PREFIXED_NAME(gs_vec ) --#define cgs_many PREFIXED_NAME(gs_many ) --#define cgs_setup PREFIXED_NAME(gs_setup) --#define cgs_free PREFIXED_NAME(gs_free ) --#define cgs_unique PREFIXED_NAME(gs_unique) --#define cigs PREFIXED_NAME(igs ) --#define cigs_vec PREFIXED_NAME(igs_vec ) --#define cigs_many PREFIXED_NAME(igs_many) --#define cgs_wait PREFIXED_NAME(gs_wait ) -- --#define fgs_setup_pick FORTRAN_NAME(gs_setup_pick,GS_SETUP_PICK) --#define fgs_setup FORTRAN_NAME(gs_setup ,GS_SETUP ) --#define fgs FORTRAN_NAME(gs_op ,GS_OP ) --#define fgs_vec FORTRAN_NAME(gs_op_vec ,GS_OP_VEC ) --#define fgs_many FORTRAN_NAME(gs_op_many ,GS_OP_MANY ) --#define figs FORTRAN_NAME(igs_op ,IGS_OP ) --#define figs_vec FORTRAN_NAME(igs_op_vec ,IGS_OP_VEC ) --#define figs_many FORTRAN_NAME(igs_op_many ,IGS_OP_MANY ) --#define fgs_wait FORTRAN_NAME(gs_op_wait ,GS_OP_WAIT ) --#define fgs_fields FORTRAN_NAME(gs_op_fields ,GS_OP_FIELDS ) --#define fgs_free FORTRAN_NAME(gs_free ,GS_FREE ) --#define fgs_unique FORTRAN_NAME(gs_unique ,GS_UNIQUE ) -+#define cgs GS_PREFIXED_NAME(gs ) -+#define cgs_vec GS_PREFIXED_NAME(gs_vec ) -+#define cgs_many GS_PREFIXED_NAME(gs_many ) -+#define cgs_setup GS_PREFIXED_NAME(gs_setup) -+#define cgs_free GS_PREFIXED_NAME(gs_free ) -+#define cgs_unique GS_PREFIXED_NAME(gs_unique) -+#define cigs GS_PREFIXED_NAME(igs ) -+#define cigs_vec GS_PREFIXED_NAME(igs_vec ) -+#define cigs_many GS_PREFIXED_NAME(igs_many) -+#define cgs_wait GS_PREFIXED_NAME(gs_wait ) -+ -+#define fgs_setup_pick GS_FORTRAN_NAME(gs_setup_pick,GS_SETUP_PICK) -+#define fgs_setup GS_FORTRAN_NAME(gs_setup ,GS_SETUP ) -+#define fgs GS_FORTRAN_NAME(gs_op ,GS_OP ) -+#define fgs_vec GS_FORTRAN_NAME(gs_op_vec ,GS_OP_VEC ) -+#define fgs_many GS_FORTRAN_NAME(gs_op_many ,GS_OP_MANY ) -+#define figs GS_FORTRAN_NAME(igs_op ,IGS_OP ) -+#define figs_vec GS_FORTRAN_NAME(igs_op_vec ,IGS_OP_VEC ) -+#define figs_many GS_FORTRAN_NAME(igs_op_many ,IGS_OP_MANY ) -+#define fgs_wait GS_FORTRAN_NAME(gs_op_wait ,GS_OP_WAIT ) -+#define fgs_fields GS_FORTRAN_NAME(gs_op_fields ,GS_OP_FIELDS ) -+#define fgs_free GS_FORTRAN_NAME(gs_free ,GS_FREE ) -+#define fgs_unique GS_FORTRAN_NAME(gs_unique ,GS_UNIQUE ) - - static struct gs_data **fgs_info = 0; - static int fgs_max = 0; -diff --git a/src/gs.h b/src/gs.h -index a06c99f..83a0591 100644 ---- a/src/gs.h -+++ b/src/gs.h -@@ -1,7 +1,7 @@ - #ifndef GS_H - #define GS_H - --#if !defined(COMM_H) || !defined(GS_DEFS_H) || !defined(MEM_H) -+#if !defined(GS_COMM_H) || !defined(GS_DEFS_H) || !defined(GS_MEM_H) - #warning "gs.h" requires "comm.h", "gs_defs.h", and "mem.h" - #endif - -@@ -116,19 +116,19 @@ - - */ - --#define gs PREFIXED_NAME(gs ) --#define gs_vec PREFIXED_NAME(gs_vec ) --#define gs_many PREFIXED_NAME(gs_many ) 
--#define igs PREFIXED_NAME(igs ) --#define igs_vec PREFIXED_NAME(igs_vec ) --#define igs_many PREFIXED_NAME(igs_many ) --#define gs_wait PREFIXED_NAME(gs_wait ) --#define gs_setup PREFIXED_NAME(gs_setup ) --#define gs_free PREFIXED_NAME(gs_free ) --#define gs_unique PREFIXED_NAME(gs_unique) --#define gs_hf2c PREFIXED_NAME(gs_hf2c ) --#define pw_data_nmsg PREFIXED_NAME(pw_data_nmsg ) --#define pw_data_size PREFIXED_NAME(pw_data_size ) -+#define gs GS_PREFIXED_NAME(gs ) -+#define gs_vec GS_PREFIXED_NAME(gs_vec ) -+#define gs_many GS_PREFIXED_NAME(gs_many ) -+#define igs GS_PREFIXED_NAME(igs ) -+#define igs_vec GS_PREFIXED_NAME(igs_vec ) -+#define igs_many GS_PREFIXED_NAME(igs_many ) -+#define gs_wait GS_PREFIXED_NAME(gs_wait ) -+#define gs_setup GS_PREFIXED_NAME(gs_setup ) -+#define gs_free GS_PREFIXED_NAME(gs_free ) -+#define gs_unique GS_PREFIXED_NAME(gs_unique) -+#define gs_hf2c GS_PREFIXED_NAME(gs_hf2c ) -+#define pw_data_nmsg GS_PREFIXED_NAME(pw_data_nmsg ) -+#define pw_data_size GS_PREFIXED_NAME(pw_data_size ) - - struct gs_data; - typedef enum {gs_auto, gs_pairwise, gs_crystal_router, gs_all_reduce} gs_method; -diff --git a/src/gs_defs.h b/src/gs_defs.h -index df4ad7b..3914442 100644 ---- a/src/gs_defs.h -+++ b/src/gs_defs.h -@@ -20,7 +20,7 @@ - macro(float ) \ - macro(int ) \ - macro(long ) \ -- WHEN_LONG_LONG(macro(long_long)) -+ GS_WHEN_LONG_LONG(macro(long_long)) - - /* the supported ops */ - #define GS_FOR_EACH_OP(T,macro) \ -@@ -49,7 +49,7 @@ - GS_DEFINE_MONOID_ID(float , -FLT_MAX, FLT_MAX) \ - GS_DEFINE_MONOID_ID(int , INT_MIN, INT_MAX) \ - GS_DEFINE_MONOID_ID(long , LONG_MIN, LONG_MAX) \ -- WHEN_LONG_LONG(GS_DEFINE_MONOID_ID(long_long,LLONG_MIN,LLONG_MAX)) -+ GS_WHEN_LONG_LONG(GS_DEFINE_MONOID_ID(long_long,LLONG_MIN,LLONG_MAX)) - - /*------------------------------------------------------------------------------ - Enums and constants -@@ -62,8 +62,8 @@ typedef enum { LIST } gs_dom; - #undef ITEM - #undef LIST - --#define gs_sint TYPE_LOCAL(gs_int,gs_long,gs_long_long) --#define gs_slong TYPE_GLOBAL(gs_int,gs_long,gs_long_long) -+#define gs_sint GS_TYPE_LOCAL(gs_int,gs_long,gs_long_long) -+#define gs_slong GS_TYPE_GLOBAL(gs_int,gs_long,gs_long_long) - - /* domain type size array */ - #define GS_DOM_SIZE_ITEM(T) sizeof(T), -diff --git a/src/gs_local.c b/src/gs_local.c -index 170e94d..fa758c9 100644 ---- a/src/gs_local.c -+++ b/src/gs_local.c -@@ -5,20 +5,20 @@ - #include "name.h" - #include "types.h" - --#define gs_gather_array PREFIXED_NAME(gs_gather_array ) --#define gs_init_array PREFIXED_NAME(gs_init_array ) --#define gs_gather PREFIXED_NAME(gs_gather ) --#define gs_scatter PREFIXED_NAME(gs_scatter ) --#define gs_init PREFIXED_NAME(gs_init ) --#define gs_gather_vec PREFIXED_NAME(gs_gather_vec ) --#define gs_scatter_vec PREFIXED_NAME(gs_scatter_vec ) --#define gs_init_vec PREFIXED_NAME(gs_init_vec ) --#define gs_gather_many PREFIXED_NAME(gs_gather_many ) --#define gs_scatter_many PREFIXED_NAME(gs_scatter_many ) --#define gs_init_many PREFIXED_NAME(gs_init_many ) --#define gs_gather_vec_to_many PREFIXED_NAME(gs_gather_vec_to_many ) --#define gs_scatter_many_to_vec PREFIXED_NAME(gs_scatter_many_to_vec) --#define gs_scatter_vec_to_many PREFIXED_NAME(gs_scatter_vec_to_many) -+#define gs_gather_array GS_PREFIXED_NAME(gs_gather_array ) -+#define gs_init_array GS_PREFIXED_NAME(gs_init_array ) -+#define gs_gather GS_PREFIXED_NAME(gs_gather ) -+#define gs_scatter GS_PREFIXED_NAME(gs_scatter ) -+#define gs_init GS_PREFIXED_NAME(gs_init ) -+#define gs_gather_vec 
GS_PREFIXED_NAME(gs_gather_vec ) -+#define gs_scatter_vec GS_PREFIXED_NAME(gs_scatter_vec ) -+#define gs_init_vec GS_PREFIXED_NAME(gs_init_vec ) -+#define gs_gather_many GS_PREFIXED_NAME(gs_gather_many ) -+#define gs_scatter_many GS_PREFIXED_NAME(gs_scatter_many ) -+#define gs_init_many GS_PREFIXED_NAME(gs_init_many ) -+#define gs_gather_vec_to_many GS_PREFIXED_NAME(gs_gather_vec_to_many ) -+#define gs_scatter_many_to_vec GS_PREFIXED_NAME(gs_scatter_many_to_vec) -+#define gs_scatter_vec_to_many GS_PREFIXED_NAME(gs_scatter_vec_to_many) - - #include "gs_defs.h" - GS_DEFINE_IDENTITIES() -diff --git a/src/gs_local.h b/src/gs_local.h -index fc7c414..d09a420 100644 ---- a/src/gs_local.h -+++ b/src/gs_local.h -@@ -1,24 +1,24 @@ - #ifndef GS_LOCAL_H - #define GS_LOCAL_H - --#if !defined(NAME_H) || !defined(TYPES_H) || !defined(GS_DEFS_H) -+#if !defined(GS_NAME_H) || !defined(GS_TYPES_H) || !defined(GS_DEFS_H) - #warning "gs_local.h" requires "name.h", "types.h", and "gs_defs.h" - #endif - --#define gs_gather_array PREFIXED_NAME(gs_gather_array ) --#define gs_init_array PREFIXED_NAME(gs_init_array ) --#define gs_gather PREFIXED_NAME(gs_gather ) --#define gs_scatter PREFIXED_NAME(gs_scatter ) --#define gs_init PREFIXED_NAME(gs_init ) --#define gs_gather_vec PREFIXED_NAME(gs_gather_vec ) --#define gs_scatter_vec PREFIXED_NAME(gs_scatter_vec ) --#define gs_init_vec PREFIXED_NAME(gs_init_vec ) --#define gs_gather_many PREFIXED_NAME(gs_gather_many ) --#define gs_scatter_many PREFIXED_NAME(gs_scatter_many ) --#define gs_init_many PREFIXED_NAME(gs_init_many ) --#define gs_gather_vec_to_many PREFIXED_NAME(gs_gather_vec_to_many ) --#define gs_scatter_many_to_vec PREFIXED_NAME(gs_scatter_many_to_vec) --#define gs_scatter_vec_to_many PREFIXED_NAME(gs_scatter_vec_to_many) -+#define gs_gather_array GS_PREFIXED_NAME(gs_gather_array ) -+#define gs_init_array GS_PREFIXED_NAME(gs_init_array ) -+#define gs_gather GS_PREFIXED_NAME(gs_gather ) -+#define gs_scatter GS_PREFIXED_NAME(gs_scatter ) -+#define gs_init GS_PREFIXED_NAME(gs_init ) -+#define gs_gather_vec GS_PREFIXED_NAME(gs_gather_vec ) -+#define gs_scatter_vec GS_PREFIXED_NAME(gs_scatter_vec ) -+#define gs_init_vec GS_PREFIXED_NAME(gs_init_vec ) -+#define gs_gather_many GS_PREFIXED_NAME(gs_gather_many ) -+#define gs_scatter_many GS_PREFIXED_NAME(gs_scatter_many ) -+#define gs_init_many GS_PREFIXED_NAME(gs_init_many ) -+#define gs_gather_vec_to_many GS_PREFIXED_NAME(gs_gather_vec_to_many ) -+#define gs_scatter_many_to_vec GS_PREFIXED_NAME(gs_scatter_many_to_vec) -+#define gs_scatter_vec_to_many GS_PREFIXED_NAME(gs_scatter_vec_to_many) - - void gs_gather_array(void *out, const void *in, uint n, - gs_dom dom, gs_op op); -diff --git a/src/gslib.h b/src/gslib.h -index e80d7a3..4bf4d72 100644 ---- a/src/gslib.h -+++ b/src/gslib.h -@@ -1,5 +1,5 @@ --#ifndef GSLIB_H --#define GSLIB_H -+#ifndef GS_GSLIB_H -+#define GS_GSLIB_H - - #include - #include -diff --git a/src/lob_bnd.c b/src/lob_bnd.c -index 9d02ca4..0aa0492 100644 ---- a/src/lob_bnd.c -+++ b/src/lob_bnd.c -@@ -10,13 +10,13 @@ - #include "mem.h" - #include "poly.h" - --#define lob_bnd_setup PREFIXED_NAME(lob_bnd_setup) --#define lob_bnd_lin_1 PREFIXED_NAME(lob_bnd_lin_1) --#define lob_bnd_lin_2 PREFIXED_NAME(lob_bnd_lin_2) --#define lob_bnd_lin_3 PREFIXED_NAME(lob_bnd_lin_3) --#define lob_bnd_1 PREFIXED_NAME(lob_bnd_1 ) --#define lob_bnd_2 PREFIXED_NAME(lob_bnd_2 ) --#define lob_bnd_3 PREFIXED_NAME(lob_bnd_3 ) -+#define lob_bnd_setup GS_PREFIXED_NAME(lob_bnd_setup) -+#define lob_bnd_lin_1 
GS_PREFIXED_NAME(lob_bnd_lin_1) -+#define lob_bnd_lin_2 GS_PREFIXED_NAME(lob_bnd_lin_2) -+#define lob_bnd_lin_3 GS_PREFIXED_NAME(lob_bnd_lin_3) -+#define lob_bnd_1 GS_PREFIXED_NAME(lob_bnd_1 ) -+#define lob_bnd_2 GS_PREFIXED_NAME(lob_bnd_2 ) -+#define lob_bnd_3 GS_PREFIXED_NAME(lob_bnd_3 ) - - struct dbl_range { double min,max; }; - -diff --git a/src/lob_bnd.h b/src/lob_bnd.h -index 7ecc8a3..52f30ee 100644 ---- a/src/lob_bnd.h -+++ b/src/lob_bnd.h -@@ -1,17 +1,17 @@ --#ifndef LOB_BND_H --#define LOB_BND_H -+#ifndef GS_LOB_BND_H -+#define GS_LOB_BND_H - --#if !defined(TYPES_H) || !defined(NAME_H) -+#if !defined(GS_TYPES_H) || !defined(GS_NAME_H) - #warning "lob_bnd.h" requires "types.h" and "name.h" - #endif - --#define lob_bnd_setup PREFIXED_NAME(lob_bnd_setup) --#define lob_bnd_lin_1 PREFIXED_NAME(lob_bnd_lin_1) --#define lob_bnd_lin_2 PREFIXED_NAME(lob_bnd_lin_2) --#define lob_bnd_lin_3 PREFIXED_NAME(lob_bnd_lin_3) --#define lob_bnd_1 PREFIXED_NAME(lob_bnd_1 ) --#define lob_bnd_2 PREFIXED_NAME(lob_bnd_2 ) --#define lob_bnd_3 PREFIXED_NAME(lob_bnd_3 ) -+#define lob_bnd_setup GS_PREFIXED_NAME(lob_bnd_setup) -+#define lob_bnd_lin_1 GS_PREFIXED_NAME(lob_bnd_lin_1) -+#define lob_bnd_lin_2 GS_PREFIXED_NAME(lob_bnd_lin_2) -+#define lob_bnd_lin_3 GS_PREFIXED_NAME(lob_bnd_lin_3) -+#define lob_bnd_1 GS_PREFIXED_NAME(lob_bnd_1 ) -+#define lob_bnd_2 GS_PREFIXED_NAME(lob_bnd_2 ) -+#define lob_bnd_3 GS_PREFIXED_NAME(lob_bnd_3 ) - - /*-------------------------------------------------------------------------- - Bounds for Polynomials on [-1,1]^d -@@ -85,7 +85,7 @@ void lob_bnd_lin_3( - const double *lob_bnd_data_t, unsigned nt, unsigned mt, - const double *restrict u, uint un, double *restrict work); - --#ifndef OBBOX_H -+#ifndef GS_OBBOX_H - struct dbl_range { double min, max; }; - #endif - -diff --git a/src/mem.h b/src/mem.h -index b68e309..01ba6cb 100644 ---- a/src/mem.h -+++ b/src/mem.h -@@ -1,5 +1,5 @@ --#ifndef MEM_H --#define MEM_H -+#ifndef GS_MEM_H -+#define GS_MEM_H - - /* requires: - for size_t, offsetof -@@ -9,7 +9,7 @@ - "fail.h" - */ - --#if !defined(C99_H) || !defined(FAIL_H) -+#if !defined(GS_C99_H) || !defined(GS_FAIL_H) - #error "mem.h" requires "c99.h" and "fail.h" - #endif - -@@ -31,8 +31,8 @@ - #else - # include - # ifndef comm_gbl_id --# define comm_gbl_id PREFIXED_NAME(comm_gbl_id) --# define comm_gbl_np PREFIXED_NAME(comm_gbl_np) -+# define comm_gbl_id GS_PREFIXED_NAME(comm_gbl_id) -+# define comm_gbl_np GS_PREFIXED_NAME(comm_gbl_np) - # include "types.h" - extern uint comm_gbl_id, comm_gbl_np; - # endif -diff --git a/src/name.h b/src/name.h -index b4bcd91..ac54032 100644 ---- a/src/name.h -+++ b/src/name.h -@@ -1,43 +1,43 @@ --#ifndef NAME_H --#define NAME_H -+#ifndef GS_NAME_H -+#define GS_NAME_H - - /* establishes some macros to establish - * the FORTRAN naming convention -- default gs_setup, etc. -- -DUPCASE GS_SETUP, etc. -- -DUNDERSCORE gs_setup_, etc. -+ default gs_setup, etc. -+ -DGS_UPCASE GS_SETUP, etc. -+ -DGS_UNDERSCORE gs_setup_, etc. 
- * a prefix for all external (non-FORTRAN) function names -- for example, -DPREFIX=jl_ transforms fail -> jl_fail -+ for example, -DGS_PREFIX=jl_ transforms fail -> jl_fail - * a prefix for all external FORTRAN function names -- for example, -DFPREFIX=jlf_ transforms gs_setup_ -> jlf_gs_setup_ -+ for example, -DGS_FPREFIX=jlf_ transforms gs_setup_ -> jlf_gs_setup_ - */ - - /* the following macro functions like a##b, - but will expand a and/or b if they are themselves macros */ --#define TOKEN_PASTE_(a,b) a##b --#define TOKEN_PASTE(a,b) TOKEN_PASTE_(a,b) -+#define GS_TOKEN_PASTE_(a,b) a##b -+#define GS_TOKEN_PASTE(a,b) GS_TOKEN_PASTE_(a,b) - --#ifdef PREFIX --# define PREFIXED_NAME(x) TOKEN_PASTE(PREFIX,x) -+#ifdef GS_PREFIX -+# define GS_PREFIXED_NAME(x) GS_TOKEN_PASTE(GS_PREFIX,x) - #else --# define PREFIXED_NAME(x) x -+# define GS_PREFIXED_NAME(x) x - #endif - --#ifdef FPREFIX --# define FPREFIXED_NAME(x) TOKEN_PASTE(FPREFIX,x) -+#ifdef GS_FPREFIX -+# define GS_FPREFIXED_NAME(x) GS_TOKEN_PASTE(GS_FPREFIX,x) - #else --# define FPREFIXED_NAME(x) x -+# define GS_FPREFIXED_NAME(x) x - #endif - --#if defined(UPCASE) --# define FORTRAN_NAME(low,up) FPREFIXED_NAME(up) --# define FORTRAN_UNPREFIXED(low,up) up --#elif defined(UNDERSCORE) --# define FORTRAN_NAME(low,up) FPREFIXED_NAME(TOKEN_PASTE(low,_)) --# define FORTRAN_UNPREFIXED(low,up) TOKEN_PASTE(low,_) -+#if defined(GS_UPCASE) -+# define GS_FORTRAN_NAME(low,up) GS_FPREFIXED_NAME(up) -+# define GS_FORTRAN_UNPREFIXED(low,up) up -+#elif defined(GS_UNDERSCORE) -+# define GS_FORTRAN_NAME(low,up) GS_FPREFIXED_NAME(GS_TOKEN_PASTE(low,_)) -+# define GS_FORTRAN_UNPREFIXED(low,up) GS_TOKEN_PASTE(low,_) - #else --# define FORTRAN_NAME(low,up) FPREFIXED_NAME(low) --# define FORTRAN_UNPREFIXED(low,up) low -+# define GS_FORTRAN_NAME(low,up) GS_FPREFIXED_NAME(low) -+# define GS_FORTRAN_UNPREFIXED(low,up) low - #endif - - #endif -diff --git a/src/obbox.c b/src/obbox.c -index 22c4614..611f3ac 100644 ---- a/src/obbox.c -+++ b/src/obbox.c -@@ -10,8 +10,8 @@ - #include "poly.h" - #include "lob_bnd.h" - --#define obbox_calc_2 PREFIXED_NAME(obbox_calc_2) --#define obbox_calc_3 PREFIXED_NAME(obbox_calc_3) -+#define obbox_calc_2 GS_PREFIXED_NAME(obbox_calc_2) -+#define obbox_calc_3 GS_PREFIXED_NAME(obbox_calc_3) - - struct obbox_2 { double c0[2], A[4]; - struct dbl_range x[2]; }; -diff --git a/src/obbox.h b/src/obbox.h -index 8e5764f..86ba0ce 100644 ---- a/src/obbox.h -+++ b/src/obbox.h -@@ -1,12 +1,12 @@ --#ifndef OBBOX_H --#define OBBOX_H -+#ifndef GS_OBBOX_H -+#define GS_OBBOX_H - --#if !defined(TYPES_H) || !defined(NAME_H) -+#if !defined(GS_TYPES_H) || !defined(GS_NAME_H) - #warning "obbox.h" requires "types.h" and "name.h" - #endif - --#define obbox_calc_2 PREFIXED_NAME(obbox_calc_2) --#define obbox_calc_3 PREFIXED_NAME(obbox_calc_3) -+#define obbox_calc_2 GS_PREFIXED_NAME(obbox_calc_2) -+#define obbox_calc_3 GS_PREFIXED_NAME(obbox_calc_3) - - /*-------------------------------------------------------------------------- - Oriented and axis-aligned bounding box computation for spectral elements -@@ -45,7 +45,7 @@ - - --------------------------------------------------------------------------*/ - --#ifndef LOB_BND_H -+#ifndef GS_LOB_BND_H - struct dbl_range { double min, max; }; - #endif - -diff --git a/src/poly.c b/src/poly.c -index 00ad22b..d8585be 100644 ---- a/src/poly.c -+++ b/src/poly.c -@@ -8,14 +8,14 @@ - #include "fail.h" - #include "mem.h" - --#define lagrange_size PREFIXED_NAME(lagrange_size ) --#define lagrange_setup PREFIXED_NAME(lagrange_setup) 
--#define gauss_nodes PREFIXED_NAME(gauss_nodes ) --#define gauss_quad PREFIXED_NAME(gauss_quad ) --#define lobatto_nodes PREFIXED_NAME(lobatto_nodes ) --#define lobatto_quad PREFIXED_NAME(lobatto_quad ) --#define gll_lag_size PREFIXED_NAME(gll_lag_size ) --#define gll_lag_setup PREFIXED_NAME(gll_lag_setup ) -+#define lagrange_size GS_PREFIXED_NAME(lagrange_size ) -+#define lagrange_setup GS_PREFIXED_NAME(lagrange_setup) -+#define gauss_nodes GS_PREFIXED_NAME(gauss_nodes ) -+#define gauss_quad GS_PREFIXED_NAME(gauss_quad ) -+#define lobatto_nodes GS_PREFIXED_NAME(lobatto_nodes ) -+#define lobatto_quad GS_PREFIXED_NAME(lobatto_quad ) -+#define gll_lag_size GS_PREFIXED_NAME(gll_lag_size ) -+#define gll_lag_setup GS_PREFIXED_NAME(gll_lag_setup ) - - typedef void lagrange_fun(double *restrict p, - double *restrict data, unsigned n, int d, double x); -diff --git a/src/poly.h b/src/poly.h -index 2fa162a..2781b22 100644 ---- a/src/poly.h -+++ b/src/poly.h -@@ -1,18 +1,18 @@ --#ifndef POLY_H --#define POLY_H -+#ifndef GS_POLY_H -+#define GS_POLY_H - --#if !defined(NAME_H) -+#if !defined(GS_NAME_H) - #warning "poly.h" requires "name.h" - #endif - --#define lagrange_size PREFIXED_NAME(lagrange_size ) --#define lagrange_setup PREFIXED_NAME(lagrange_setup) --#define gauss_nodes PREFIXED_NAME(gauss_nodes ) --#define gauss_quad PREFIXED_NAME(gauss_quad ) --#define lobatto_nodes PREFIXED_NAME(lobatto_nodes ) --#define lobatto_quad PREFIXED_NAME(lobatto_quad ) --#define gll_lag_size PREFIXED_NAME(gll_lag_size ) --#define gll_lag_setup PREFIXED_NAME(gll_lag_setup ) -+#define lagrange_size GS_PREFIXED_NAME(lagrange_size ) -+#define lagrange_setup GS_PREFIXED_NAME(lagrange_setup) -+#define gauss_nodes GS_PREFIXED_NAME(gauss_nodes ) -+#define gauss_quad GS_PREFIXED_NAME(gauss_quad ) -+#define lobatto_nodes GS_PREFIXED_NAME(lobatto_nodes ) -+#define lobatto_quad GS_PREFIXED_NAME(lobatto_quad ) -+#define gll_lag_size GS_PREFIXED_NAME(gll_lag_size ) -+#define gll_lag_setup GS_PREFIXED_NAME(gll_lag_setup ) - - /*-------------------------------------------------------------------------- - Quadrature Nodes and Weights Calculation -diff --git a/src/sarray_sort.c b/src/sarray_sort.c -index 0ec26d1..9ba8fc4 100644 ---- a/src/sarray_sort.c -+++ b/src/sarray_sort.c -@@ -8,8 +8,8 @@ - #include "mem.h" - #include "sort.h" - --#define sarray_permute_ PREFIXED_NAME(sarray_permute_) --#define sarray_permute_buf_ PREFIXED_NAME(sarray_permute_buf_) -+#define sarray_permute_ GS_PREFIXED_NAME(sarray_permute_) -+#define sarray_permute_buf_ GS_PREFIXED_NAME(sarray_permute_buf_) - - void sarray_permute_(size_t size, void *A, size_t n, uint *perm, void *work) - { -diff --git a/src/sarray_sort.h b/src/sarray_sort.h -index cd30d7c..97fdba1 100644 ---- a/src/sarray_sort.h -+++ b/src/sarray_sort.h -@@ -1,7 +1,7 @@ --#ifndef SARRAY_SORT_H --#define SARRAY_SORT_H -+#ifndef GS_SARRAY_SORT_H -+#define GS_SARRAY_SORT_H - --#if !defined(SORT_H) -+#if !defined(GS_SORT_H) - #warning "sarray_sort.h" requires "sort.h" - #endif - -@@ -33,8 +33,8 @@ - ----------------------------------------------------------------------------*/ - - --#define sarray_permute_ PREFIXED_NAME(sarray_permute_) --#define sarray_permute_buf_ PREFIXED_NAME(sarray_permute_buf_) -+#define sarray_permute_ GS_PREFIXED_NAME(sarray_permute_) -+#define sarray_permute_buf_ GS_PREFIXED_NAME(sarray_permute_buf_) - - void sarray_permute_(size_t size, void *A, size_t n, uint *perm, void *work); - void sarray_permute_buf_( -diff --git a/src/sarray_transfer.c b/src/sarray_transfer.c 
-index c5dfd2b..5f94192 100644 ---- a/src/sarray_transfer.c -+++ b/src/sarray_transfer.c -@@ -11,9 +11,9 @@ - #include "crystal.h" - #include "sort.h" - --#define sarray_transfer_many PREFIXED_NAME(sarray_transfer_many) --#define sarray_transfer_ PREFIXED_NAME(sarray_transfer_ ) --#define sarray_transfer_ext_ PREFIXED_NAME(sarray_transfer_ext_) -+#define sarray_transfer_many GS_PREFIXED_NAME(sarray_transfer_many) -+#define sarray_transfer_ GS_PREFIXED_NAME(sarray_transfer_ ) -+#define sarray_transfer_ext_ GS_PREFIXED_NAME(sarray_transfer_ext_) - - static void pack_int( - buffer *const data, const unsigned row_size, const uint id, -diff --git a/src/sarray_transfer.h b/src/sarray_transfer.h -index c195e21..cc441ae 100644 ---- a/src/sarray_transfer.h -+++ b/src/sarray_transfer.h -@@ -1,7 +1,7 @@ --#ifndef SARRAY_TRANSFER_H --#define SARRAY_TRANSFER_H -+#ifndef GS_SARRAY_TRANSFER_H -+#define GS_SARRAY_TRANSFER_H - --#if !defined(CRYSTAL_H) -+#if !defined(GS_CRYSTAL_H) - #warning "sarray_transfer.h" requires "crystal.h" - #endif - -@@ -70,9 +70,9 @@ - - */ - --#define sarray_transfer_many PREFIXED_NAME(sarray_transfer_many) --#define sarray_transfer_ PREFIXED_NAME(sarray_transfer_ ) --#define sarray_transfer_ext_ PREFIXED_NAME(sarray_transfer_ext_) -+#define sarray_transfer_many GS_PREFIXED_NAME(sarray_transfer_many) -+#define sarray_transfer_ GS_PREFIXED_NAME(sarray_transfer_ ) -+#define sarray_transfer_ext_ GS_PREFIXED_NAME(sarray_transfer_ext_) - - uint sarray_transfer_many( - struct array *const *const A, const unsigned *const size, const unsigned An, -diff --git a/src/sort.c b/src/sort.c -index 2bb061b..b642a3f 100644 ---- a/src/sort.c -+++ b/src/sort.c -@@ -14,7 +14,7 @@ - #undef SORT_SUFFIX - #undef T - --#if defined(USE_LONG) || defined(GLOBAL_LONG) -+#if defined(GS_USE_LONG) || defined(GS_GLOBAL_LONG) - # define T unsigned long - # define SORT_SUFFIX _ul - # include "sort_imp.h" -@@ -22,7 +22,7 @@ - # undef T - #endif - --#if defined(USE_LONG_LONG) || defined(GLOBAL_LONG_LONG) -+#if defined(GS_USE_LONG_LONG) || defined(GS_GLOBAL_LONG_LONG) - # define T unsigned long long - # define SORT_SUFFIX _ull - # include "sort_imp.h" -diff --git a/src/sort.h b/src/sort.h -index 0b0ee53..4cb2fd2 100644 ---- a/src/sort.h -+++ b/src/sort.h -@@ -1,7 +1,7 @@ --#ifndef SORT_H --#define SORT_H -+#ifndef GS_SORT_H -+#define GS_SORT_H - --#if !defined(TYPES_H) || !defined(MEM_H) -+#if !defined(GS_TYPES_H) || !defined(GS_MEM_H) - #warning "sort.h" requires "types.h" and "mem.h" - /* types.h defines uint, ulong - mem.h defines buffer */ -@@ -44,21 +44,21 @@ - - ----------------------------------------------------------------------------*/ - --#define sortv_ui PREFIXED_NAME(sortv_ui) --#define sortv_ul PREFIXED_NAME(sortv_ul) --#define sortv_ull PREFIXED_NAME(sortv_ull) --#define sortv_double PREFIXED_NAME(sortv_double) --#define sortv_float PREFIXED_NAME(sortv_float) --#define sortp_ui PREFIXED_NAME(sortp_ui) --#define sortp_ul PREFIXED_NAME(sortp_ul) --#define sortp_ull PREFIXED_NAME(sortp_ull) --#define sortp_double PREFIXED_NAME(sortp_double) --#define sortp_float PREFIXED_NAME(sortp_float) -+#define sortv_ui GS_PREFIXED_NAME(sortv_ui) -+#define sortv_ul GS_PREFIXED_NAME(sortv_ul) -+#define sortv_ull GS_PREFIXED_NAME(sortv_ull) -+#define sortv_double GS_PREFIXED_NAME(sortv_double) -+#define sortv_float GS_PREFIXED_NAME(sortv_float) -+#define sortp_ui GS_PREFIXED_NAME(sortp_ui) -+#define sortp_ul GS_PREFIXED_NAME(sortp_ul) -+#define sortp_ull GS_PREFIXED_NAME(sortp_ull) -+#define sortp_double 
GS_PREFIXED_NAME(sortp_double) -+#define sortp_float GS_PREFIXED_NAME(sortp_float) - --#define sortv TYPE_LOCAL(sortv_ui,sortv_ul,sortv_ull) --#define sortp TYPE_LOCAL(sortp_ui,sortp_ul,sortp_ull) --#define sortv_long TYPE_GLOBAL(sortv_ui,sortv_ul,sortv_ull) --#define sortp_long TYPE_GLOBAL(sortp_ui,sortp_ul,sortp_ull) -+#define sortv GS_TYPE_LOCAL(sortv_ui,sortv_ul,sortv_ull) -+#define sortp GS_TYPE_LOCAL(sortp_ui,sortp_ul,sortp_ull) -+#define sortv_long GS_TYPE_GLOBAL(sortv_ui,sortv_ul,sortv_ull) -+#define sortp_long GS_TYPE_GLOBAL(sortp_ui,sortp_ul,sortp_ull) - - void sortv_ui(unsigned *out, const unsigned *A, uint n, unsigned stride, - buffer *restrict buf); -@@ -79,7 +79,7 @@ uint *sortp_double(buffer *restrict buf, int start_perm, - const double *restrict A, uint n, unsigned stride); - uint *sortp_float(buffer *restrict buf, int start_perm, - const float *restrict A, uint n, unsigned stride); --#if defined(USE_LONG_LONG) || defined(GLOBAL_LONG_LONG) -+#if defined(GS_USE_LONG_LONG) || defined(GS_GLOBAL_LONG_LONG) - void sortv_ull(unsigned long long *out, - const unsigned long long *A, uint n, unsigned stride, - buffer *restrict buf); -diff --git a/src/sort_imp.h b/src/sort_imp.h -index 3ec8e0c..a6a0426 100644 ---- a/src/sort_imp.h -+++ b/src/sort_imp.h -@@ -2,27 +2,27 @@ - #error sort_imp.h not meant to be compiled by itself - #endif - --#define sort_data TOKEN_PASTE(sort_data ,SORT_SUFFIX) --#define radix_count TOKEN_PASTE(radix_count ,SORT_SUFFIX) --#define radix_offsets TOKEN_PASTE(radix_offsets ,SORT_SUFFIX) --#define radix_zeros TOKEN_PASTE(radix_zeros ,SORT_SUFFIX) --#define radix_passv TOKEN_PASTE(radix_passv ,SORT_SUFFIX) --#define radix_sortv TOKEN_PASTE(radix_sortv ,SORT_SUFFIX) --#define radix_passp0_b TOKEN_PASTE(radix_passp0_b ,SORT_SUFFIX) --#define radix_passp_b TOKEN_PASTE(radix_passp_b ,SORT_SUFFIX) --#define radix_passp_m TOKEN_PASTE(radix_passp_m ,SORT_SUFFIX) --#define radix_passp_e TOKEN_PASTE(radix_passp_e ,SORT_SUFFIX) --#define radix_passp0_be TOKEN_PASTE(radix_passp0_be,SORT_SUFFIX) --#define radix_passp_be TOKEN_PASTE(radix_passp_be, SORT_SUFFIX) --#define radix_sortp TOKEN_PASTE(radix_sortp ,SORT_SUFFIX) --#define merge_sortv TOKEN_PASTE(merge_sortv ,SORT_SUFFIX) --#define merge_copy_perm TOKEN_PASTE(merge_copy_perm,SORT_SUFFIX) --#define merge_sortp0 TOKEN_PASTE(merge_sortp0 ,SORT_SUFFIX) --#define merge_sortp TOKEN_PASTE(merge_sortp ,SORT_SUFFIX) --#define heap_sortv TOKEN_PASTE(heap_sortv ,SORT_SUFFIX) -- --#define sortv PREFIXED_NAME(TOKEN_PASTE(sortv,SORT_SUFFIX)) --#define sortp PREFIXED_NAME(TOKEN_PASTE(sortp,SORT_SUFFIX)) -+#define sort_data GS_TOKEN_PASTE(sort_data ,SORT_SUFFIX) -+#define radix_count GS_TOKEN_PASTE(radix_count ,SORT_SUFFIX) -+#define radix_offsets GS_TOKEN_PASTE(radix_offsets ,SORT_SUFFIX) -+#define radix_zeros GS_TOKEN_PASTE(radix_zeros ,SORT_SUFFIX) -+#define radix_passv GS_TOKEN_PASTE(radix_passv ,SORT_SUFFIX) -+#define radix_sortv GS_TOKEN_PASTE(radix_sortv ,SORT_SUFFIX) -+#define radix_passp0_b GS_TOKEN_PASTE(radix_passp0_b ,SORT_SUFFIX) -+#define radix_passp_b GS_TOKEN_PASTE(radix_passp_b ,SORT_SUFFIX) -+#define radix_passp_m GS_TOKEN_PASTE(radix_passp_m ,SORT_SUFFIX) -+#define radix_passp_e GS_TOKEN_PASTE(radix_passp_e ,SORT_SUFFIX) -+#define radix_passp0_be GS_TOKEN_PASTE(radix_passp0_be,SORT_SUFFIX) -+#define radix_passp_be GS_TOKEN_PASTE(radix_passp_be, SORT_SUFFIX) -+#define radix_sortp GS_TOKEN_PASTE(radix_sortp ,SORT_SUFFIX) -+#define merge_sortv GS_TOKEN_PASTE(merge_sortv ,SORT_SUFFIX) -+#define merge_copy_perm 
GS_TOKEN_PASTE(merge_copy_perm,SORT_SUFFIX) -+#define merge_sortp0 GS_TOKEN_PASTE(merge_sortp0 ,SORT_SUFFIX) -+#define merge_sortp GS_TOKEN_PASTE(merge_sortp ,SORT_SUFFIX) -+#define heap_sortv GS_TOKEN_PASTE(heap_sortv ,SORT_SUFFIX) -+ -+#define sortv GS_PREFIXED_NAME(GS_TOKEN_PASTE(sortv,SORT_SUFFIX)) -+#define sortp GS_PREFIXED_NAME(GS_TOKEN_PASTE(sortp,SORT_SUFFIX)) - - typedef struct { T v; uint i; } sort_data; - -diff --git a/src/tensor.c b/src/tensor.c -index a724714..a9cf442 100644 ---- a/src/tensor.c -+++ b/src/tensor.c -@@ -2,10 +2,10 @@ - #include "name.h" - #include "types.h" - --#if !defined(USE_CBLAS) -+#if !defined(GS_USE_CBLAS) - --#define tensor_dot PREFIXED_NAME(tensor_dot ) --#define tensor_mtxm PREFIXED_NAME(tensor_mtxm) -+#define tensor_dot GS_PREFIXED_NAME(tensor_dot ) -+#define tensor_mtxm GS_PREFIXED_NAME(tensor_mtxm) - - /* Matrices are always column-major (FORTRAN style) */ - -@@ -16,10 +16,10 @@ double tensor_dot(const double *a, const double *b, uint n) - return sum; - } - --# if defined(USE_NAIVE_BLAS) --# define tensor_mxv PREFIXED_NAME(tensor_mxv ) --# define tensor_mtxv PREFIXED_NAME(tensor_mtxv) --# define tensor_mxm PREFIXED_NAME(tensor_mxm ) -+# if defined(GS_USE_NAIVE_BLAS) -+# define tensor_mxv GS_PREFIXED_NAME(tensor_mxv ) -+# define tensor_mtxv GS_PREFIXED_NAME(tensor_mtxv) -+# define tensor_mxm GS_PREFIXED_NAME(tensor_mxm ) - - /* y = A x */ - void tensor_mxv( -diff --git a/src/tensor.h b/src/tensor.h -index bb65be1..c692398 100644 ---- a/src/tensor.h -+++ b/src/tensor.h -@@ -1,12 +1,16 @@ --#ifndef TENSOR_H --#define TENSOR_H -+#ifndef GS_TENSOR_H -+#define GS_TENSOR_H - --#if !defined(TYPES_H) || !defined(NAME_H) -+#if !defined(GS_TYPES_H) || !defined(GS_NAME_H) - #warning "tensor.h" requires "types.h" and "name.h" - #endif - --#if defined(USE_CBLAS) -+#if defined(GS_USE_CBLAS) -+#if defined(GS_USE_MKL) -+# include -+#else - # include -+#endif - # define tensor_dot(a,b,n) cblas_ddot((int)(n),a,1,b,1) - # define tensor_mxv(y,ny,A,x,nx) \ - cblas_dgemv(CblasColMajor,CblasNoTrans,(int)ny,(int)nx, \ -@@ -23,17 +27,17 @@ - (int)nc,(int)nb,(int)na,1.0, \ - A,(int)na,B,(int)na,0.0,C,(int)nc) - #else --# define tensor_dot PREFIXED_NAME(tensor_dot ) --# define tensor_mtxm PREFIXED_NAME(tensor_mtxm) -+# define tensor_dot GS_PREFIXED_NAME(tensor_dot ) -+# define tensor_mtxm GS_PREFIXED_NAME(tensor_mtxm) - double tensor_dot(const double *a, const double *b, uint n); - - /* C (nc x nb) = [A (na x nc)]^T * B (na x nb); all column-major */ - void tensor_mtxm(double *C, uint nc, - const double *A, uint na, const double *B, uint nb); --# if defined(USE_NAIVE_BLAS) --# define tensor_mxv PREFIXED_NAME(tensor_mxv ) --# define tensor_mtxv PREFIXED_NAME(tensor_mtxv) --# define tensor_mxm PREFIXED_NAME(tensor_mxm ) -+# if defined(GS_USE_NAIVE_BLAS) -+# define tensor_mxv GS_PREFIXED_NAME(tensor_mxv ) -+# define tensor_mtxv GS_PREFIXED_NAME(tensor_mtxv) -+# define tensor_mxm GS_PREFIXED_NAME(tensor_mxm ) - /* y = A x */ - void tensor_mxv(double *y, uint ny, const double *A, const double *x, uint nx); - -@@ -44,7 +48,7 @@ void tensor_mtxv(double *y, uint ny, const double *A, const double *x, uint nx); - void tensor_mxm(double *C, uint nc, - const double *A, uint na, const double *B, uint nb); - # else --# define mxm FORTRAN_NAME(mxm,MXM) -+# define mxm GS_FORTRAN_NAME(mxm,MXM) - /* C (na x nc) = A (na x nb) * B (nb x nc); all column-major */ - void mxm(const double *A, const uint *na, - const double *B, const uint *nb, -diff --git a/src/types.h b/src/types.h -index 
14a94bf..d76cef3 100644 ---- a/src/types.h -+++ b/src/types.h -@@ -1,5 +1,5 @@ --#ifndef TYPES_H --#define TYPES_H -+#ifndef GS_TYPES_H -+#define GS_TYPES_H - #include - - /* -@@ -10,20 +10,20 @@ - most frequently, e.g., for indexing into local arrays, - and for processor ids. It can be one of - -- macro sint/uint type -+ macro sint/uint type - -- (default) int -- USE_LONG long -- USE_LONG_LONG long long -+ (default) int -+ GS_USE_LONG long -+ GS_USE_LONG_LONG long long - - The slong/ulong type is used in relatively few places - for global identifiers and indices. It can be one of - -- macro slong/ulong type -+ macro slong/ulong type - -- (default) int -- GLOBAL_LONG long -- GLOBAL_LONG_LONG long long -+ (default) int -+ GS_GLOBAL_LONG long -+ GS_GLOBAL_LONG_LONG long long - - Since the long long type is not ISO C90, it is never - used unless explicitly asked for. -@@ -34,9 +34,9 @@ - - */ - --#if defined(USE_LONG_LONG) || defined(GLOBAL_LONG_LONG) -+#if defined(GS_USE_LONG_LONG) || defined(GS_GLOBAL_LONG_LONG) - typedef long long long_long; --# define WHEN_LONG_LONG(x) x -+# define GS_WHEN_LONG_LONG(x) x - # if !defined(LLONG_MAX) - # if defined(LONG_LONG_MAX) - # define LLONG_MAX LONG_LONG_MAX -@@ -52,34 +52,34 @@ typedef long long long_long; - # endif - # endif - #else --# define WHEN_LONG_LONG(x) -+# define GS_WHEN_LONG_LONG(x) - #endif - --#if !defined(USE_LONG) && !defined(USE_LONG_LONG) --# define TYPE_LOCAL(i,l,ll) i --#elif defined(USE_LONG) --# define TYPE_LOCAL(i,l,ll) l --#elif defined(USE_LONG_LONG) --# define TYPE_LOCAL(i,l,ll) ll -+#if !defined(GS_USE_LONG) && !defined(GS_USE_LONG_LONG) -+# define GS_TYPE_LOCAL(i,l,ll) i -+#elif defined(GS_USE_LONG) -+# define GS_TYPE_LOCAL(i,l,ll) l -+#elif defined(GS_USE_LONG_LONG) -+# define GS_TYPE_LOCAL(i,l,ll) ll - #endif - --#if !defined(GLOBAL_LONG) && !defined(GLOBAL_LONG_LONG) --# define TYPE_GLOBAL(i,l,ll) i --#elif defined(GLOBAL_LONG) --# define TYPE_GLOBAL(i,l,ll) l -+#if !defined(GS_GLOBAL_LONG) && !defined(GS_GLOBAL_LONG_LONG) -+# define GS_TYPE_GLOBAL(i,l,ll) i -+#elif defined(GS_GLOBAL_LONG) -+# define GS_TYPE_GLOBAL(i,l,ll) l - #else --# define TYPE_GLOBAL(i,l,ll) ll -+# define GS_TYPE_GLOBAL(i,l,ll) ll - #endif - - /* local integer type: for quantities O(N/P) */ --#define sint signed TYPE_LOCAL(int,long,long long) --#define uint unsigned TYPE_LOCAL(int,long,long long) --#define iabs TYPE_LOCAL(abs,labs,llabs) -+#define sint signed GS_TYPE_LOCAL(int,long,long long) -+#define uint unsigned GS_TYPE_LOCAL(int,long,long long) -+#define iabs GS_TYPE_LOCAL(abs,labs,llabs) - - /* global integer type: for quantities O(N) */ --#define slong signed TYPE_GLOBAL(int,long,long long) --#define ulong unsigned TYPE_GLOBAL(int,long,long long) --#define iabsl TYPE_GLOBAL(abs,labs,llabs) -+#define slong signed GS_TYPE_GLOBAL(int,long,long long) -+#define ulong unsigned GS_TYPE_GLOBAL(int,long,long long) -+#define iabsl GS_TYPE_GLOBAL(abs,labs,llabs) - - #endif - +diff --git a/Makefile b/Makefile +index 63364d8..7cc5683 100644 +--- a/Makefile ++++ b/Makefile +@@ -2,127 +2,163 @@ MPI ?= 1 + ADDUS ?= 1 + USREXIT ?= 0 + NBC ?= 0 +-LIBNAME ?= gs + BLAS ?= 0 +-DEBUG ?= 0 ++MKL ?= 0 ++ + CFLAGS ?= -O2 + FFLAGS ?= -O2 ++ARFLAGS ?= cr ++ ++LIBNAME ?= gs + CPREFIX ?= gslib_ + FPREFIX ?= fgslib_ + +-SRCROOT=. +-TESTDIR=$(SRCROOT)/tests +-FTESTDIR=$(TESTDIR)/fortran +-SRCDIR=$(SRCROOT)/src +-INCDIR=$(SRCROOT)/src +-LIBDIR=$(SRCROOT)/lib ++STATIC ?= 1 ++SHARED ?= 0 ++ ++SRCROOT = . 
++TESTDIR = $(SRCROOT)/tests ++FTESTDIR = $(TESTDIR)/fortran ++SRCDIR = $(SRCROOT)/src ++INCDIR = $(SRCROOT)/src ++LIBDIR = $(SRCROOT)/lib ++ ++DARWIN := $(filter Darwin,$(shell uname -s)) ++SO_EXT := $(if $(DARWIN),dylib,so) + + ifneq (,$(strip $(DESTDIR))) +-INSTALL_ROOT = $(DESTDIR) ++ INSTALL_ROOT = $(DESTDIR) + else +-INSTALL_ROOT = $(SRCROOT)/build ++ INSTALL_ROOT = $(SRCROOT)/build ++endif ++ ++ifneq (0,$(SHARED)) ++ ifneq (0,$(STATIC)) ++ $(warning Cannot build with both STATIC=1 and SHARED=1, setting SHARED=0) ++ override SHARED = 0 ++ endif ++endif ++ ++ifneq (0,$(SHARED)) ++ ifeq ($(filter -fPIC,$(CFLAGS)),) ++ override CFLAGS += -fPIC ++ endif ++ ifneq ($(DARWIN),) ++ override LDFLAGS += -install_name @rpath/lib$(LIBNAME).$(SO_EXT) ++ endif + endif + + $(shell >config.h) + ifneq (0,$(MPI)) +- SN=MPI +- G:=$(G) -D$(SN) ++ SN = GS_MPI ++ G := $(G) -D$(SN) ++ $(shell printf "#ifndef ${SN}\n#define ${SN}\n#endif\n" >>config.h) + ifeq ($(origin CC),default) + CC = mpicc + endif + ifeq ($(origin FC),default) + FC = mpif77 + endif +- $(shell printf "#ifndef ${SN}\n#define ${SN}\n#endif\n" >>config.h) + endif + + ifneq (0,$(ADDUS)) +- SN=UNDERSCORE +- G:=$(G) -D$(SN) ++ SN = GS_UNDERSCORE ++ G := $(G) -D$(SN) + $(shell printf "#ifndef ${SN}\n#define ${SN}\n#endif\n" >>config.h) + endif + +-SN=GLOBAL_LONG_LONG +-G:=$(G) -D$(SN) +-$(shell printf "#ifndef ${SN}\n#define ${SN}\n#endif\n" >>config.h) +- +-SN=PREFIX +-G:=$(G) -D$(SN)=$(CPREFIX) ++SN = GS_PREFIX ++G := $(G) -D$(SN)=$(CPREFIX) + $(shell printf "#ifndef ${SN}\n#define ${SN} ${CPREFIX}\n#endif\n" >>config.h) + +-SN=FPREFIX +-G:=$(G) -D$(SN)=$(FPREFIX) ++SN = GS_FPREFIX ++G := $(G) -D$(SN)=$(FPREFIX) + $(shell printf "#ifndef ${SN}\n#define ${SN} ${FPREFIX}\n#endif\n" >>config.h) + ++SN = GS_GLOBAL_LONG_LONG ++G := $(G) -D$(SN) ++$(shell printf "#ifndef ${SN}\n#define ${SN}\n#endif\n" >>config.h) ++ + ifneq (0,$(USREXIT)) +- G+=-DUSE_USR_EXIT ++ G += -DGS_USE_USR_EXIT + endif + + ifneq (0,$(NBC)) +- G+=-DUSE_NBC ++ G += -DGS_USE_NBC + endif + + ifeq (0,$(BLAS)) +- SN=USE_NAIVE_BLAS +- G:=$(G) -D$(SN) ++ SN = GS_USE_NAIVE_BLAS ++ G := $(G) -D$(SN) + $(shell printf "#ifndef ${SN}\n#define ${SN}\n#endif\n" >>config.h) + endif + + ifeq (1,$(BLAS)) +- G+=-DUSE_CBLAS ++ SN = GS_USE_CBLAS ++ G := $(G) -D$(SN) ++ $(shell printf "#ifndef ${SN}\n#define ${SN}\n#endif\n" >>config.h) ++ ifeq (1,$(MKL)) ++ SN = GS_USE_MKL ++ G := $(G) -D$(SN) ++ $(shell printf "#ifndef ${SN}\n#define ${SN}\n#endif\n" >>config.h) ++ endif + endif + +-ifneq (0,$(DEBUG)) +- G+=-DGSLIB_DEBUG +- CFLAGS+=-g +-endif ++CCCMD = $(CC) $(CFLAGS) -I$(INCDIR) $(G) ++FCCMD = $(FC) $(FFLAGS) -I$(INCDIR) $(G) + +-CCCMD=$(CC) $(CFLAGS) -I$(INCDIR) $(G) +-FCCMD=$(FC) $(FFLAGS) -I$(INCDIR) $(G) ++TESTS = $(TESTDIR)/sort_test $(TESTDIR)/sort_test2 $(TESTDIR)/sarray_sort_test \ ++ $(TESTDIR)/comm_test $(TESTDIR)/crystal_test \ ++ $(TESTDIR)/sarray_transfer_test $(TESTDIR)/gs_test \ ++ $(TESTDIR)/gs_test_gop_blocking $(TESTDIR)/gs_test_gop_nonblocking \ ++ $(TESTDIR)/gs_unique_test \ ++ $(TESTDIR)/findpts_el_2_test \ ++ $(TESTDIR)/findpts_el_2_test2 $(TESTDIR)/findpts_el_3_test \ ++ $(TESTDIR)/findpts_el_3_test2 $(TESTDIR)/findpts_local_test \ ++ $(TESTDIR)/findpts_test $(TESTDIR)/findpts_test_ms $(TESTDIR)/poly_test \ ++ $(TESTDIR)/lob_bnd_test $(TESTDIR)/obbox_test + +-TESTS=$(TESTDIR)/sort_test $(TESTDIR)/sort_test2 $(TESTDIR)/sarray_sort_test \ +- $(TESTDIR)/comm_test $(TESTDIR)/crystal_test \ +- $(TESTDIR)/sarray_transfer_test $(TESTDIR)/gs_test \ +- 
$(TESTDIR)/gs_test_gop_blocking $(TESTDIR)/gs_test_gop_nonblocking \ +- $(TESTDIR)/gs_unique_test \ +- $(TESTDIR)/findpts_el_2_test \ +- $(TESTDIR)/findpts_el_2_test2 $(TESTDIR)/findpts_el_3_test \ +- $(TESTDIR)/findpts_el_3_test2 $(TESTDIR)/findpts_local_test \ +- $(TESTDIR)/findpts_test $(TESTDIR)/findpts_test_ms $(TESTDIR)/poly_test \ +- $(TESTDIR)/lob_bnd_test $(TESTDIR)/obbox_test ++FTESTS = $(FTESTDIR)/f-igs + +-FTESTS=$(FTESTDIR)/f-igs ++GS = $(SRCDIR)/gs.o $(SRCDIR)/sort.o $(SRCDIR)/sarray_transfer.o \ ++ $(SRCDIR)/sarray_sort.o $(SRCDIR)/gs_local.o $(SRCDIR)/fail.o \ ++ $(SRCDIR)/crystal.o $(SRCDIR)/comm.o $(SRCDIR)/tensor.o + +-GS=$(SRCDIR)/gs.o $(SRCDIR)/sort.o $(SRCDIR)/sarray_transfer.o \ +- $(SRCDIR)/sarray_sort.o $(SRCDIR)/gs_local.o $(SRCDIR)/fail.o \ +- $(SRCDIR)/crystal.o $(SRCDIR)/comm.o $(SRCDIR)/tensor.o ++FWRAPPER = $(SRCDIR)/fcrystal.o $(SRCDIR)/findpts.o + +-FWRAPPER=$(SRCDIR)/fcrystal.o $(SRCDIR)/findpts.o +-INTP=$(SRCDIR)/findpts_local.o $(SRCDIR)/obbox.o $(SRCDIR)/poly.o \ +- $(SRCDIR)/lob_bnd.o $(SRCDIR)/findpts_el_3.o $(SRCDIR)/findpts_el_2.o ++INTP = $(SRCDIR)/findpts_local.o $(SRCDIR)/obbox.o $(SRCDIR)/poly.o \ ++ $(SRCDIR)/lob_bnd.o $(SRCDIR)/findpts_el_3.o $(SRCDIR)/findpts_el_2.o + + .PHONY: all lib install tests clean objects + + all : lib install + +-lib: $(GS) $(FWRAPPER) $(INTP) +- @$(AR) cr $(SRCDIR)/lib$(LIBNAME).a $? +- @ranlib $(SRCDIR)/lib$(LIBNAME).a ++lib: $(if $(filter-out 0,$(STATIC)),$(SRCDIR)/lib$(LIBNAME).a) $(if $(filter-out 0,$(SHARED)),$(SRCDIR)/lib$(LIBNAME).$(SO_EXT)) ++ ++$(SRCDIR)/lib$(LIBNAME).a: $(GS) $(FWRAPPER) $(INTP) ++ $(AR) $(ARFLAGS) $@ $^ ++ ranlib $@ ++ ++$(SRCDIR)/lib$(LIBNAME).$(SO_EXT): $(GS) $(FWRAPPER) $(INTP) ++ $(CC) $(CFLAGS) -shared -o $@ $^ $(LDFLAGS) + + install: lib + @mkdir -p $(INSTALL_ROOT)/lib 2>/dev/null +- @cp -v $(SRCDIR)/lib$(LIBNAME).a $(INSTALL_ROOT)/lib 2>/dev/null +- @mkdir -p $(INSTALL_ROOT)/include 2>/dev/null +- @cp $(SRCDIR)/*.h $(INSTALL_ROOT)/include 2>/dev/null +- @cp $(SRCDIR)/*.h $(INSTALL_ROOT)/include 2>/dev/null +- @mv config.h $(INSTALL_ROOT)/include 2>/dev/null ++ $(if $(filter-out 0,$(STATIC)),cp $(SRCDIR)/lib$(LIBNAME).a $(INSTALL_ROOT)/lib) ++ $(if $(filter-out 0,$(SHARED)),cp $(SRCDIR)/lib$(LIBNAME).$(SO_EXT) $(INSTALL_ROOT)/lib) ++ @mkdir -p $(INSTALL_ROOT)/include/gslib 2>/dev/null ++ cp $(SRCDIR)/*.h $(INSTALL_ROOT)/include/gslib ++ mv config.h $(INSTALL_ROOT)/include/gslib ++ @printf '// Automatically generated file\n#include "gslib/gslib.h"\n' \ ++ > $(INSTALL_ROOT)/include/gslib.h && chmod 644 $(INSTALL_ROOT)/include/gslib.h + + tests: $(TESTS) + +-clean: ; @$(RM) config.h $(SRCDIR)/*.o $(SRCDIR)/*.s $(SRCDIR)/*.a $(TESTDIR)/*.o $(FTESTDIR)/*.o $(TESTS) ++clean: ++ $(RM) config.h $(SRCDIR)/*.o $(SRCDIR)/*.s $(SRCDIR)/*.a $(SRCDIR)/*.$(SO_EXT) $(TESTDIR)/*.o $(FTESTDIR)/*.o $(TESTS) + + $(TESTS): % : %.c | lib install +- $(CC) $(CFLAGS) -I$(INSTALL_ROOT)/include $< -o $@ -L$(INSTALL_ROOT)/lib -l$(LIBNAME) -lm $(LDFLAGS) ++ $(CC) $(CFLAGS) -I$(INSTALL_ROOT)/include $< -o $@ -L$(INSTALL_ROOT)/lib -l$(LIBNAME) -lm $(LDFLAGS) + + $(FTESTS): % : %.o | lib install + $(FCCMD) $^ -o $@ -L$(SRCDIR) -l$(LIBNAME) +diff --git a/src/c99.h b/src/c99.h +index a5a44e3..62c3ced 100644 +--- a/src/c99.h ++++ b/src/c99.h +@@ -1,16 +1,16 @@ +-#ifndef C99_H +-#define C99_H ++#ifndef GS_C99_H ++#define GS_C99_H + + #ifndef __STDC_VERSION__ +-# define NO_C99 ++# define GS_NO_C99 + #elif __STDC_VERSION__ < 199901L +-# define NO_C99 ++# define GS_NO_C99 + #endif + +-#ifdef NO_C99 ++#ifdef GS_NO_C99 + # 
define restrict + # define inline +-# undef NO_C99 ++# undef GS_NO_C99 + #endif + + #endif +diff --git a/src/comm.c b/src/comm.c +index 5e05739..225788c 100644 +--- a/src/comm.c ++++ b/src/comm.c +@@ -108,7 +108,7 @@ void comm_allreduce(const struct comm *com, gs_dom dom, gs_op op, + void *v, uint vn, void *buf) + { + if(vn==0) return; +-#ifdef MPI ++#ifdef GS_MPI + { + MPI_Datatype mpitype; + MPI_Op mpiop; +@@ -117,8 +117,8 @@ void comm_allreduce(const struct comm *com, gs_dom dom, gs_op op, + case gs_float: mpitype=MPI_FLOAT; break; \ + case gs_int: mpitype=MPI_INT; break; \ + case gs_long: mpitype=MPI_LONG; break; \ +- WHEN_LONG_LONG(case gs_long_long: mpitype=MPI_LONG_LONG; break;) \ +- default: goto comm_allreduce_byhand; \ ++ GS_WHEN_LONG_LONG(case gs_long_long: mpitype=MPI_LONG_LONG; break;) \ ++ default: goto comm_allreduce_byhand; \ + } \ + } while(0) + DOMAIN_SWITCH(); +@@ -134,7 +134,7 @@ void comm_allreduce(const struct comm *com, gs_dom dom, gs_op op, + return; + } + #endif +-#ifdef MPI ++#ifdef GS_MPI + comm_allreduce_byhand: + allreduce_imp(com,dom,op, v,vn, buf); + #endif +@@ -144,7 +144,7 @@ void comm_iallreduce(comm_req *req, const struct comm *com, gs_dom dom, gs_op op + void *v, uint vn, void *buf) + { + if(vn==0) return; +-#ifdef MPI ++#ifdef GS_MPI + { + MPI_Datatype mpitype; + MPI_Op mpiop; +@@ -153,8 +153,8 @@ void comm_iallreduce(comm_req *req, const struct comm *com, gs_dom dom, gs_op op + case gs_float: mpitype=MPI_FLOAT; break; \ + case gs_int: mpitype=MPI_INT; break; \ + case gs_long: mpitype=MPI_LONG; break; \ +- WHEN_LONG_LONG(case gs_long_long: mpitype=MPI_LONG_LONG; break;) \ +- default: goto comm_allreduce_byhand; \ ++ GS_WHEN_LONG_LONG(case gs_long_long: mpitype=MPI_LONG_LONG; break;) \ ++ default: goto comm_allreduce_byhand; \ + } \ + } while(0) + DOMAIN_SWITCH(); +@@ -165,7 +165,7 @@ void comm_iallreduce(comm_req *req, const struct comm *com, gs_dom dom, gs_op op + case gs_max: mpiop=MPI_MAX; break; + default: goto comm_allreduce_byhand; + } +-#ifdef USE_NBC ++#ifdef GS_USE_NBC + MPI_Iallreduce(v,buf,vn,mpitype,mpiop,com->c,req); + #else + fail(1,"comm_iallreduce",__LINE__,"Invalid call to MPI_Iallreduce!\n"); +@@ -174,7 +174,7 @@ void comm_iallreduce(comm_req *req, const struct comm *com, gs_dom dom, gs_op op + return; + } + #endif +-#ifdef MPI ++#ifdef GS_MPI + comm_allreduce_byhand: + allreduce_imp(com,dom,op, v,vn, buf); + #endif +@@ -197,7 +197,7 @@ double comm_dot(const struct comm *comm, double *v, double *w, uint n) + do { T v = *in++; GS_DO_##OP(accum,v); } while(--n) + + #define DEFINE_REDUCE(T) \ +-T PREFIXED_NAME(comm_reduce__##T)( \ ++T GS_PREFIXED_NAME(comm_reduce__##T)( \ + const struct comm *comm, gs_op op, const T *in, uint n) \ + { \ + T accum = gs_identity_##T[op], buf; \ +diff --git a/src/comm.h b/src/comm.h +index bb5b290..ac4c920 100644 +--- a/src/comm.h ++++ b/src/comm.h +@@ -1,5 +1,5 @@ +-#ifndef COMM_H +-#define COMM_H ++#ifndef GS_COMM_H ++#define GS_COMM_H + + /* requires: + for size_t +@@ -10,7 +10,7 @@ + + #include + #include +-#if !defined(FAIL_H) || !defined(TYPES_H) ++#if !defined(GS_FAIL_H) || !defined(GS_TYPES_H) + #warning "comm.h" requires "fail.h" and "types.h" + #endif + +@@ -63,7 +63,7 @@ + + */ + +-#ifdef MPI ++#ifdef GS_MPI + #include + typedef MPI_Comm comm_ext; + typedef MPI_Request comm_req; +@@ -73,15 +73,15 @@ typedef int comm_req; + typedef int MPI_Fint; + #endif + +-#define comm_allreduce PREFIXED_NAME(comm_allreduce ) +-#define comm_iallreduce PREFIXED_NAME(comm_iallreduce) +-#define comm_scan 
PREFIXED_NAME(comm_scan ) +-#define comm_dot PREFIXED_NAME(comm_dot ) ++#define comm_allreduce GS_PREFIXED_NAME(comm_allreduce ) ++#define comm_iallreduce GS_PREFIXED_NAME(comm_iallreduce) ++#define comm_scan GS_PREFIXED_NAME(comm_scan ) ++#define comm_dot GS_PREFIXED_NAME(comm_dot ) + + /* global id, np vars strictly for diagnostic messages (fail.c) */ + #ifndef comm_gbl_id +-#define comm_gbl_id PREFIXED_NAME(comm_gbl_id) +-#define comm_gbl_np PREFIXED_NAME(comm_gbl_np) ++#define comm_gbl_id GS_PREFIXED_NAME(comm_gbl_id) ++#define comm_gbl_np GS_PREFIXED_NAME(comm_gbl_np) + extern uint comm_gbl_id, comm_gbl_np; + #endif + +@@ -122,17 +122,17 @@ void comm_scan(void *scan, const struct comm *com, gs_dom dom, gs_op op, + const void *v, uint vn, void *buffer); + + #define DEFINE_REDUCE(T) \ +-T PREFIXED_NAME(comm_reduce__##T)( \ ++T GS_PREFIXED_NAME(comm_reduce__##T)( \ + const struct comm *comm, gs_op op, const T *in, uint n); \ + static T comm_reduce_##T(const struct comm *c, gs_op op, const T *v, uint vn) \ +-{ return PREFIXED_NAME(comm_reduce__##T)(c,op,v,vn); } ++{ return GS_PREFIXED_NAME(comm_reduce__##T)(c,op,v,vn); } + GS_FOR_EACH_DOMAIN(DEFINE_REDUCE) + #undef DEFINE_REDUCE + + #define comm_reduce_sint \ +- TYPE_LOCAL(comm_reduce_int,comm_reduce_long,comm_reduce_long_long) ++ GS_TYPE_LOCAL(comm_reduce_int,comm_reduce_long,comm_reduce_long_long) + #define comm_reduce_slong \ +- TYPE_GLOBAL(comm_reduce_int,comm_reduce_long,comm_reduce_long_long) ++ GS_TYPE_GLOBAL(comm_reduce_int,comm_reduce_long,comm_reduce_long_long) + + #endif + +@@ -142,7 +142,7 @@ GS_FOR_EACH_DOMAIN(DEFINE_REDUCE) + + static void comm_init(struct comm *c, comm_ext ce) + { +-#ifdef MPI ++#ifdef GS_MPI + int i; + MPI_Comm_dup(ce, &c->c); + MPI_Comm_rank(c->c,&i), comm_gbl_id=c->id=i; +@@ -155,7 +155,7 @@ static void comm_init(struct comm *c, comm_ext ce) + static void comm_init_check_(struct comm *c, MPI_Fint ce, uint np, + const char *file, unsigned line) + { +-#ifdef MPI ++#ifdef GS_MPI + comm_init(c,MPI_Comm_f2c(ce)); + if(c->np != np) + fail(1,file,line,"comm_init_check: passed P=%u, " +@@ -175,7 +175,7 @@ static void comm_dup_(struct comm *d, const struct comm *s, + const char *file, unsigned line) + { + d->id = s->id, d->np = s->np; +-#ifdef MPI ++#ifdef GS_MPI + MPI_Comm_dup(s->c,&d->c); + #else + if(s->np!=1) fail(1,file,line,"%s not compiled with -DMPI\n",file); +@@ -185,7 +185,7 @@ static void comm_dup_(struct comm *d, const struct comm *s, + + static void comm_split_(const struct comm *s, int bin, int key, struct comm *d, + const char *file, unsigned line) { +-#if defined(MPI) ++#if defined(GS_MPI) + MPI_Comm nc; + MPI_Comm_split(s->c, bin, key, &nc); + comm_init(d, nc); +@@ -198,14 +198,14 @@ static void comm_split_(const struct comm *s, int bin, int key, struct comm *d, + + static void comm_free(struct comm *c) + { +-#ifdef MPI ++#ifdef GS_MPI + MPI_Comm_free(&c->c); + #endif + } + + static double comm_time(void) + { +-#ifdef MPI ++#ifdef GS_MPI + return MPI_Wtime(); + #else + return 0; +@@ -214,7 +214,7 @@ static double comm_time(void) + + static void comm_barrier(const struct comm *c) + { +-#ifdef MPI ++#ifdef GS_MPI + MPI_Barrier(c->c); + #endif + } +@@ -222,7 +222,7 @@ static void comm_barrier(const struct comm *c) + static void comm_recv(const struct comm *c, void *p, size_t n, + uint src, int tag) + { +-#ifdef MPI ++#ifdef GS_MPI + # ifndef MPI_STATUS_IGNORE + MPI_Status stat; + MPI_Recv(p,n,MPI_UNSIGNED_CHAR,src,tag,c->c,&stat); +@@ -235,7 +235,7 @@ static void comm_recv(const struct comm *c, void 
*p, size_t n, + static void comm_send(const struct comm *c, void *p, size_t n, + uint dst, int tag) + { +-#ifdef MPI ++#ifdef GS_MPI + MPI_Send(p,n,MPI_UNSIGNED_CHAR,dst,tag,c->c); + #endif + } +@@ -243,7 +243,7 @@ static void comm_send(const struct comm *c, void *p, size_t n, + static void comm_irecv(comm_req *req, const struct comm *c, + void *p, size_t n, uint src, int tag) + { +-#ifdef MPI ++#ifdef GS_MPI + MPI_Irecv(p,n,MPI_UNSIGNED_CHAR,src,tag,c->c,req); + #endif + } +@@ -251,14 +251,14 @@ static void comm_irecv(comm_req *req, const struct comm *c, + static void comm_isend(comm_req *req, const struct comm *c, + void *p, size_t n, uint dst, int tag) + { +-#ifdef MPI ++#ifdef GS_MPI + MPI_Isend(p,n,MPI_UNSIGNED_CHAR,dst,tag,c->c,req); + #endif + } + + static void comm_wait(comm_req *req, int n) + { +-#ifdef MPI ++#ifdef GS_MPI + # ifndef MPI_STATUSES_IGNORE + MPI_Status status[8]; + while(n>=8) MPI_Waitall(8,req,status), req+=8, n-=8; +@@ -271,7 +271,7 @@ static void comm_wait(comm_req *req, int n) + + static void comm_bcast(const struct comm *c, void *p, size_t n, uint root) + { +-#ifdef MPI ++#ifdef GS_MPI + MPI_Bcast(p,n,MPI_UNSIGNED_CHAR,root,c->c); + #endif + } +@@ -279,7 +279,7 @@ static void comm_bcast(const struct comm *c, void *p, size_t n, uint root) + static void comm_gather(const struct comm *c, void *out, size_t out_n, + void *in, size_t in_n, uint root) + { +-#ifdef MPI ++#ifdef GS_MPI + MPI_Gather(out,out_n,MPI_UNSIGNED_CHAR,in,in_n,MPI_UNSIGNED_CHAR,root,c->c); + #else + assert(out_n == in_n); +diff --git a/src/crystal.c b/src/crystal.c +index a0e8135..c444ad8 100644 +--- a/src/crystal.c ++++ b/src/crystal.c +@@ -43,9 +43,9 @@ + #include "comm.h" + #include "mem.h" + +-#define crystal_init PREFIXED_NAME(crystal_init ) +-#define crystal_free PREFIXED_NAME(crystal_free ) +-#define crystal_router PREFIXED_NAME(crystal_router) ++#define crystal_init GS_PREFIXED_NAME(crystal_init ) ++#define crystal_free GS_PREFIXED_NAME(crystal_free ) ++#define crystal_router GS_PREFIXED_NAME(crystal_router) + + struct crystal { + struct comm comm; +diff --git a/src/crystal.h b/src/crystal.h +index b6d4582..67d3c4e 100644 +--- a/src/crystal.h ++++ b/src/crystal.h +@@ -1,13 +1,13 @@ +-#ifndef CRYSTAL_H +-#define CRYSTAL_H ++#ifndef GS_CRYSTAL_H ++#define GS_CRYSTAL_H + +-#if !defined(COMM_H) || !defined(MEM_H) ++#if !defined(GS_COMM_H) || !defined(GS_MEM_H) + #warning "crystal.h" requires "comm.h" and "mem.h" + #endif + +-#define crystal_init PREFIXED_NAME(crystal_init ) +-#define crystal_free PREFIXED_NAME(crystal_free ) +-#define crystal_router PREFIXED_NAME(crystal_router) ++#define crystal_init GS_PREFIXED_NAME(crystal_init ) ++#define crystal_free GS_PREFIXED_NAME(crystal_free ) ++#define crystal_router GS_PREFIXED_NAME(crystal_router) + + struct crystal { + struct comm comm; +diff --git a/src/fail.c b/src/fail.c +index 9ac04bd..c12cd83 100644 +--- a/src/fail.c ++++ b/src/fail.c +@@ -7,8 +7,8 @@ + #include "types.h" + #include "comm.h" + +-#ifdef USE_USR_EXIT +-#define userExitHandler FORTRAN_NAME(userexithandler,USEREXITHANDLER) ++#ifdef GS_USE_USR_EXIT ++#define userExitHandler GS_FORTRAN_NAME(userexithandler,USEREXITHANDLER) + #define USEREXIT 1 + extern void userExitHandler(int status); + #else +diff --git a/src/fail.h b/src/fail.h +index 0185110..1ce10b2 100644 +--- a/src/fail.h ++++ b/src/fail.h +@@ -1,15 +1,15 @@ +-#ifndef FAIL_H +-#define FAIL_H ++#ifndef GS_FAIL_H ++#define GS_FAIL_H + +-#if !defined(NAME_H) ++#if !defined(GS_NAME_H) + #warning "fail.h" requires "name.h" + 
#endif + +-#define die PREFIXED_NAME( die ) +-#define vdiagnostic PREFIXED_NAME(vdiagnostic) +-#define diagnostic PREFIXED_NAME( diagnostic) +-#define vfail PREFIXED_NAME(vfail ) +-#define fail PREFIXED_NAME( fail ) ++#define die GS_PREFIXED_NAME( die ) ++#define vdiagnostic GS_PREFIXED_NAME(vdiagnostic) ++#define diagnostic GS_PREFIXED_NAME( diagnostic) ++#define vfail GS_PREFIXED_NAME(vfail ) ++#define fail GS_PREFIXED_NAME( fail ) + + #ifdef __GNUC__ + # define ATTRBD __attribute__ ((noreturn)) +diff --git a/src/fcrystal.c b/src/fcrystal.c +index 3fe4c9a..44f96f5 100644 +--- a/src/fcrystal.c ++++ b/src/fcrystal.c +@@ -65,20 +65,20 @@ + --------------------------------------------------------------------------*/ + + #undef crystal_free +-#define ccrystal_free PREFIXED_NAME(crystal_free) ++#define ccrystal_free GS_PREFIXED_NAME(crystal_free) + + #define fcrystal_setup \ +- FORTRAN_NAME(crystal_setup ,CRYSTAL_SETUP ) ++ GS_FORTRAN_NAME(crystal_setup ,CRYSTAL_SETUP ) + #define fcrystal_ituple_sort \ +- FORTRAN_NAME(crystal_ituple_sort ,CRYSTAL_ITUPLE_SORT ) ++ GS_FORTRAN_NAME(crystal_ituple_sort ,CRYSTAL_ITUPLE_SORT ) + #define fcrystal_tuple_sort \ +- FORTRAN_NAME(crystal_tuple_sort ,CRYSTAL_TUPLE_SORT ) ++ GS_FORTRAN_NAME(crystal_tuple_sort ,CRYSTAL_TUPLE_SORT ) + #define fcrystal_ituple_transfer \ +- FORTRAN_NAME(crystal_ituple_transfer,CRYSTAL_ITUPLE_TRANSFER) ++ GS_FORTRAN_NAME(crystal_ituple_transfer,CRYSTAL_ITUPLE_TRANSFER) + #define fcrystal_tuple_transfer \ +- FORTRAN_NAME(crystal_tuple_transfer ,CRYSTAL_TUPLE_TRANSFER ) ++ GS_FORTRAN_NAME(crystal_tuple_transfer ,CRYSTAL_TUPLE_TRANSFER ) + #define fcrystal_free \ +- FORTRAN_NAME(crystal_free ,CRYSTAL_FREE ) ++ GS_FORTRAN_NAME(crystal_free ,CRYSTAL_FREE ) + + static struct crystal **handle_array = 0; + static int handle_max = 0; +diff --git a/src/findpts.c b/src/findpts.c +index 86ac948..c3a3e9b 100644 +--- a/src/findpts.c ++++ b/src/findpts.c +@@ -117,7 +117,7 @@ static uint count_bits(unsigned char *p, uint n) + approx number of cells per proc for the distributed + global geometric hash table + NOTE: gbl_hash_size*np needs to fit in a "global" integer +- (controlled by -DGLOBAL_LONG or -DGLOBAL_LONG_LONG; ++ (controlled by -DGS_GLOBAL_LONG or -DGS_GLOBAL_LONG_LONG; + see "types.h") + actual number of cells per proc will be greater by + ~ 3 gbl_hash_size^(2/3) / np^(1/3) +@@ -227,17 +227,17 @@ static uint count_bits(unsigned char *p, uint n) + + --------------------------------------------------------------------------*/ + +-#define ffindptsms_setup FORTRAN_NAME(findptsms_setup ,FINDPTSMS_SETUP ) +-#define ffindptsms_free FORTRAN_NAME(findptsms_free ,FINDPTSMS_FREE ) +-#define ffindptsms FORTRAN_NAME(findptsms ,FINDPTSMS ) +-#define ffindptsms_eval FORTRAN_NAME(findptsms_eval ,FINDPTSMS_EVAL ) +-#define ffindptsms_eval_local FORTRAN_NAME(findptsms_eval_local,FINDPTSMS_EVAL_LOCAL) ++#define ffindptsms_setup GS_FORTRAN_NAME(findptsms_setup ,FINDPTSMS_SETUP ) ++#define ffindptsms_free GS_FORTRAN_NAME(findptsms_free ,FINDPTSMS_FREE ) ++#define ffindptsms GS_FORTRAN_NAME(findptsms ,FINDPTSMS ) ++#define ffindptsms_eval GS_FORTRAN_NAME(findptsms_eval ,FINDPTSMS_EVAL ) ++#define ffindptsms_eval_local GS_FORTRAN_NAME(findptsms_eval_local,FINDPTSMS_EVAL_LOCAL) + +-#define ffindpts_setup FORTRAN_NAME(findpts_setup ,FINDPTS_SETUP ) +-#define ffindpts_free FORTRAN_NAME(findpts_free ,FINDPTS_FREE ) +-#define ffindpts FORTRAN_NAME(findpts ,FINDPTS ) +-#define ffindpts_eval FORTRAN_NAME(findpts_eval ,FINDPTS_EVAL ) +-#define ffindpts_eval_local 
FORTRAN_NAME(findpts_eval_local,FINDPTS_EVAL_LOCAL) ++#define ffindpts_setup GS_FORTRAN_NAME(findpts_setup ,FINDPTS_SETUP ) ++#define ffindpts_free GS_FORTRAN_NAME(findpts_free ,FINDPTS_FREE ) ++#define ffindpts GS_FORTRAN_NAME(findpts ,FINDPTS ) ++#define ffindpts_eval GS_FORTRAN_NAME(findpts_eval ,FINDPTS_EVAL ) ++#define ffindpts_eval_local GS_FORTRAN_NAME(findpts_eval_local,FINDPTS_EVAL_LOCAL) + + struct handle { void *data; unsigned ndim; }; + static struct handle *handle_array = 0; +@@ -363,9 +363,9 @@ void ffindptsms_free(const sint *const handle) + { + CHECK_HANDLE("findptsms_free"); + if(h->ndim==2) +- PREFIXED_NAME(findptsms_free_2)(h->data); ++ GS_PREFIXED_NAME(findptsms_free_2)(h->data); + else +- PREFIXED_NAME(findptsms_free_3)(h->data); ++ GS_PREFIXED_NAME(findptsms_free_3)(h->data); + h->data = 0; + } + +@@ -373,9 +373,9 @@ void ffindpts_free(const sint *const handle) + { + CHECK_HANDLE("findpts_free"); + if(h->ndim==2) +- PREFIXED_NAME(findpts_free_2)(h->data); ++ GS_PREFIXED_NAME(findpts_free_2)(h->data); + else +- PREFIXED_NAME(findpts_free_3)(h->data); ++ GS_PREFIXED_NAME(findpts_free_3)(h->data); + h->data = 0; + } + +@@ -405,7 +405,7 @@ void ffindptsms(const sint *const handle, + sess_stride = *session_id_stride*sizeof(uint); + sess_match = session_id_match; + +- PREFIXED_NAME(findptsms_2)( ++ GS_PREFIXED_NAME(findptsms_2)( + (uint*) code_base,(* code_stride)*sizeof(sint ), + (uint*) proc_base,(* proc_stride)*sizeof(sint ), + (uint*) el_base,(* el_stride)*sizeof(sint ), +@@ -428,7 +428,7 @@ void ffindptsms(const sint *const handle, + sess_stride = *session_id_stride*sizeof(uint); + sess_match = session_id_match; + +- PREFIXED_NAME(findptsms_3)( ++ GS_PREFIXED_NAME(findptsms_3)( + (uint*) code_base,(* code_stride)*sizeof(sint ), + (uint*) proc_base,(* proc_stride)*sizeof(sint ), + (uint*) el_base,(* el_stride)*sizeof(sint ), +@@ -459,7 +459,7 @@ void ffindpts(const sint *const handle, + xv_stride[0] = *x_stride*sizeof(double), + xv_stride[1] = *y_stride*sizeof(double); + +- PREFIXED_NAME(findpts_2)( ++ GS_PREFIXED_NAME(findpts_2)( + (uint*) code_base,(* code_stride)*sizeof(sint ), + (uint*) proc_base,(* proc_stride)*sizeof(sint ), + (uint*) el_base,(* el_stride)*sizeof(sint ), +@@ -475,7 +475,7 @@ void ffindpts(const sint *const handle, + xv_stride[1] = *y_stride*sizeof(double), + xv_stride[2] = *z_stride*sizeof(double); + +- PREFIXED_NAME(findpts_3)( ++ GS_PREFIXED_NAME(findpts_3)( + (uint*) code_base,(* code_stride)*sizeof(sint ), + (uint*) proc_base,(* proc_stride)*sizeof(sint ), + (uint*) el_base,(* el_stride)*sizeof(sint ), +@@ -497,7 +497,7 @@ void ffindptsms_eval(const sint *const handle, + { + CHECK_HANDLE("findptsms_eval"); + if(h->ndim==2) +- PREFIXED_NAME(findptsms_eval_2)( ++ GS_PREFIXED_NAME(findptsms_eval_2)( + out_base,(* out_stride)*sizeof(double), + (uint*)code_base,(*code_stride)*sizeof(sint ), + (uint*)proc_base,(*proc_stride)*sizeof(sint ), +@@ -505,7 +505,7 @@ void ffindptsms_eval(const sint *const handle, + r_base,(* r_stride)*sizeof(double), + *npt, in, h->data); + else +- PREFIXED_NAME(findptsms_eval_3)( ++ GS_PREFIXED_NAME(findptsms_eval_3)( + out_base,(* out_stride)*sizeof(double), + (uint*)code_base,(*code_stride)*sizeof(sint ), + (uint*)proc_base,(*proc_stride)*sizeof(sint ), +@@ -524,7 +524,7 @@ void ffindpts_eval(const sint *const handle, + { + CHECK_HANDLE("findpts_eval"); + if(h->ndim==2) +- PREFIXED_NAME(findpts_eval_2)( ++ GS_PREFIXED_NAME(findpts_eval_2)( + out_base,(* out_stride)*sizeof(double), + 
(uint*)code_base,(*code_stride)*sizeof(sint ), + (uint*)proc_base,(*proc_stride)*sizeof(sint ), +@@ -532,7 +532,7 @@ void ffindpts_eval(const sint *const handle, + r_base,(* r_stride)*sizeof(double), + *npt, in, h->data); + else +- PREFIXED_NAME(findpts_eval_3)( ++ GS_PREFIXED_NAME(findpts_eval_3)( + out_base,(* out_stride)*sizeof(double), + (uint*)code_base,(*code_stride)*sizeof(sint ), + (uint*)proc_base,(*proc_stride)*sizeof(sint ), +diff --git a/src/findpts.h b/src/findpts.h +index c8a1ab3..af2071d 100644 +--- a/src/findpts.h ++++ b/src/findpts.h +@@ -1,27 +1,27 @@ +-#ifndef FINDPTSMS_H +-#define FINDPTSMS_H ++#ifndef GS_FINDPTSMS_H ++#define GS_FINDPTSMS_H + +-#if !defined(COMM_H) ++#if !defined(GS_COMM_H) + #warning "findpts.h" requires "comm.h" + #endif + +-#define findptsms_setup_2 PREFIXED_NAME(findptsms_setup_2) +-#define findptsms_free_2 PREFIXED_NAME(findptsms_free_2 ) +-#define findptsms_2 PREFIXED_NAME(findptsms_2 ) +-#define findptsms_eval_2 PREFIXED_NAME(findptsms_eval_2 ) +-#define findptsms_setup_3 PREFIXED_NAME(findptsms_setup_3) +-#define findptsms_free_3 PREFIXED_NAME(findptsms_free_3 ) +-#define findptsms_3 PREFIXED_NAME(findptsms_3 ) +-#define findptsms_eval_3 PREFIXED_NAME(findptsms_eval_3 ) +- +-#define findpts_setup_2 PREFIXED_NAME(findpts_setup_2) +-#define findpts_free_2 PREFIXED_NAME(findpts_free_2 ) +-#define findpts_2 PREFIXED_NAME(findpts_2 ) +-#define findpts_eval_2 PREFIXED_NAME(findpts_eval_2 ) +-#define findpts_setup_3 PREFIXED_NAME(findpts_setup_3) +-#define findpts_free_3 PREFIXED_NAME(findpts_free_3 ) +-#define findpts_3 PREFIXED_NAME(findpts_3 ) +-#define findpts_eval_3 PREFIXED_NAME(findpts_eval_3 ) ++#define findptsms_setup_2 GS_PREFIXED_NAME(findptsms_setup_2) ++#define findptsms_free_2 GS_PREFIXED_NAME(findptsms_free_2 ) ++#define findptsms_2 GS_PREFIXED_NAME(findptsms_2 ) ++#define findptsms_eval_2 GS_PREFIXED_NAME(findptsms_eval_2 ) ++#define findptsms_setup_3 GS_PREFIXED_NAME(findptsms_setup_3) ++#define findptsms_free_3 GS_PREFIXED_NAME(findptsms_free_3 ) ++#define findptsms_3 GS_PREFIXED_NAME(findptsms_3 ) ++#define findptsms_eval_3 GS_PREFIXED_NAME(findptsms_eval_3 ) ++ ++#define findpts_setup_2 GS_PREFIXED_NAME(findpts_setup_2) ++#define findpts_free_2 GS_PREFIXED_NAME(findpts_free_2 ) ++#define findpts_2 GS_PREFIXED_NAME(findpts_2 ) ++#define findpts_eval_2 GS_PREFIXED_NAME(findpts_eval_2 ) ++#define findpts_setup_3 GS_PREFIXED_NAME(findpts_setup_3) ++#define findpts_free_3 GS_PREFIXED_NAME(findpts_free_3 ) ++#define findpts_3 GS_PREFIXED_NAME(findpts_3 ) ++#define findpts_eval_3 GS_PREFIXED_NAME(findpts_eval_3 ) + + struct findpts_data_2; + struct findpts_data_3; +diff --git a/src/findpts_el.h b/src/findpts_el.h +index 4ed119a..ba731cd 100644 +--- a/src/findpts_el.h ++++ b/src/findpts_el.h +@@ -1,14 +1,14 @@ +-#ifndef FINDPTS_EL_H +-#define FINDPTS_EL_H ++#ifndef GS_FINDPTS_EL_H ++#define GS_FINDPTS_EL_H + +-#if !defined(NAME_H) || !defined(POLY_H) ++#if !defined(GS_NAME_H) || !defined(GS_POLY_H) + #warning "findpts_el.h" requires "name.h", "poly.h" + #endif + +-#define findpts_el_setup_2 PREFIXED_NAME(findpts_el_setup_2) +-#define findpts_el_free_2 PREFIXED_NAME(findpts_el_free_2 ) +-#define findpts_el_2 PREFIXED_NAME(findpts_el_2 ) +-#define findpts_el_eval_2 PREFIXED_NAME(findpts_el_eval_2 ) ++#define findpts_el_setup_2 GS_PREFIXED_NAME(findpts_el_setup_2) ++#define findpts_el_free_2 GS_PREFIXED_NAME(findpts_el_free_2 ) ++#define findpts_el_2 GS_PREFIXED_NAME(findpts_el_2 ) ++#define findpts_el_eval_2 
GS_PREFIXED_NAME(findpts_el_eval_2 ) + + struct findpts_el_pt_2 { + double x[2],r[2],oldr[2],dist2,dist2p,tr; +@@ -60,10 +60,10 @@ static struct findpts_el_pt_2 *findpts_el_points_2( + return fd->p; + } + +-#define findpts_el_setup_3 PREFIXED_NAME(findpts_el_setup_3) +-#define findpts_el_free_3 PREFIXED_NAME(findpts_el_free_3 ) +-#define findpts_el_3 PREFIXED_NAME(findpts_el_3 ) +-#define findpts_el_eval_3 PREFIXED_NAME(findpts_el_eval_3 ) ++#define findpts_el_setup_3 GS_PREFIXED_NAME(findpts_el_setup_3) ++#define findpts_el_free_3 GS_PREFIXED_NAME(findpts_el_free_3 ) ++#define findpts_el_3 GS_PREFIXED_NAME(findpts_el_3 ) ++#define findpts_el_eval_3 GS_PREFIXED_NAME(findpts_el_eval_3 ) + + struct findpts_el_pt_3 { + double x[3],r[3],oldr[3],dist2,dist2p,tr; +diff --git a/src/findpts_el_2.c b/src/findpts_el_2.c +index b33f768..aad9aad 100644 +--- a/src/findpts_el_2.c ++++ b/src/findpts_el_2.c +@@ -13,10 +13,10 @@ + #include "tensor.h" + #include "poly.h" + +-#define findpts_el_setup_2 PREFIXED_NAME(findpts_el_setup_2) +-#define findpts_el_free_2 PREFIXED_NAME(findpts_el_free_2 ) +-#define findpts_el_2 PREFIXED_NAME(findpts_el_2 ) +-#define findpts_el_eval_2 PREFIXED_NAME(findpts_el_eval_2 ) ++#define findpts_el_setup_2 GS_PREFIXED_NAME(findpts_el_setup_2) ++#define findpts_el_free_2 GS_PREFIXED_NAME(findpts_el_free_2 ) ++#define findpts_el_2 GS_PREFIXED_NAME(findpts_el_2 ) ++#define findpts_el_eval_2 GS_PREFIXED_NAME(findpts_el_eval_2 ) + /* + #define DIAGNOSTICS_1 + #define DIAGNOSTICS_2 +diff --git a/src/findpts_el_3.c b/src/findpts_el_3.c +index 42c335c..62561ff 100644 +--- a/src/findpts_el_3.c ++++ b/src/findpts_el_3.c +@@ -11,10 +11,10 @@ + #include "tensor.h" + #include "poly.h" + +-#define findpts_el_setup_3 PREFIXED_NAME(findpts_el_setup_3) +-#define findpts_el_free_3 PREFIXED_NAME(findpts_el_free_3 ) +-#define findpts_el_3 PREFIXED_NAME(findpts_el_3 ) +-#define findpts_el_eval_3 PREFIXED_NAME(findpts_el_eval_3 ) ++#define findpts_el_setup_3 GS_PREFIXED_NAME(findpts_el_setup_3) ++#define findpts_el_free_3 GS_PREFIXED_NAME(findpts_el_free_3 ) ++#define findpts_el_3 GS_PREFIXED_NAME(findpts_el_3 ) ++#define findpts_el_eval_3 GS_PREFIXED_NAME(findpts_el_eval_3 ) + /* + #define DIAGNOSTICS_1 + #define DIAGNOSTICS_2 +diff --git a/src/findpts_imp.h b/src/findpts_imp.h +index b9759af..44b5c7d 100644 +--- a/src/findpts_imp.h ++++ b/src/findpts_imp.h +@@ -1,45 +1,45 @@ + #include + #include +-#define obbox TOKEN_PASTE(obbox_,D) +-#define local_hash_data TOKEN_PASTE(findpts_local_hash_data_,D) +-#define hash_data TOKEN_PASTE(findpts_hash_data_,D) +-#define hash_index TOKEN_PASTE(hash_index_ ,D) +-#define hash_setfac TOKEN_PASTE(hash_setfac_ ,D) +-#define hash_range TOKEN_PASTE(hash_range_ ,D) +-#define hash_bb TOKEN_PASTE(hash_bb_ ,D) +-#define set_local_mask TOKEN_PASTE(set_local_mask_ ,D) +-#define fill_hash TOKEN_PASTE(fill_hash_ ,D) +-#define table_from_hash TOKEN_PASTE(table_from_hash_ ,D) +-#define hash_build TOKEN_PASTE(hash_build_ ,D) +-#define hash_free TOKEN_PASTE(hash_free_ ,D) +- +-#define findptsms_local_setup TOKEN_PASTE(PREFIXED_NAME(findptsms_local_setup_),D) +-#define findptsms_local_free TOKEN_PASTE(PREFIXED_NAME(findptsms_local_free_ ),D) +-#define findptsms_local TOKEN_PASTE(PREFIXED_NAME(findptsms_local_ ),D) +-#define findptsms_local_eval TOKEN_PASTE(PREFIXED_NAME(findptsms_local_eval_ ),D) +-#define findpts_dummy_ms_data TOKEN_PASTE(findpts_dummy_ms_data_,D) +-#define findpts_data TOKEN_PASTE(findpts_data_,D) +-#define src_pt TOKEN_PASTE(src_pt_ ,D) +-#define out_pt 
TOKEN_PASTE(out_pt_ ,D) +-#define eval_src_pt TOKEN_PASTE(eval_src_pt_ ,D) +-#define eval_out_pt TOKEN_PASTE(eval_out_pt_ ,D) +-#define setupms_aux TOKEN_PASTE(setupms_aux_,D) +-#define findptsms_setup TOKEN_PASTE(PREFIXED_NAME(findptsms_setup_),D) +-#define findptsms_free TOKEN_PASTE(PREFIXED_NAME(findptsms_free_ ),D) +-#define findptsms TOKEN_PASTE(PREFIXED_NAME(findptsms_ ),D) +-#define findptsms_eval TOKEN_PASTE(PREFIXED_NAME(findptsms_eval_ ),D) +- +-#define findpts_local_data TOKEN_PASTE(findpts_local_data_,D) +-#define findpts_local_setup TOKEN_PASTE(PREFIXED_NAME(findpts_local_setup_),D) +-#define findpts_local_free TOKEN_PASTE(PREFIXED_NAME(findpts_local_free_ ),D) +-#define findpts_local TOKEN_PASTE(PREFIXED_NAME(findpts_local_ ),D) +-#define findpts_local_eval TOKEN_PASTE(PREFIXED_NAME(findpts_local_eval_ ),D) +-#define findpts_setup TOKEN_PASTE(PREFIXED_NAME(findpts_setup_),D) +-#define findpts_free TOKEN_PASTE(PREFIXED_NAME(findpts_free_ ),D) +-#define findpts TOKEN_PASTE(PREFIXED_NAME(findpts_ ),D) +-#define findpts_eval TOKEN_PASTE(PREFIXED_NAME(findpts_eval_ ),D) +-#define findpts_local_eval TOKEN_PASTE(PREFIXED_NAME(findpts_local_eval_ ),D) +-#define setup_fev_aux TOKEN_PASTE(setup_fev_aux_,D) ++#define obbox GS_TOKEN_PASTE(obbox_,D) ++#define local_hash_data GS_TOKEN_PASTE(findpts_local_hash_data_,D) ++#define hash_data GS_TOKEN_PASTE(findpts_hash_data_,D) ++#define hash_index GS_TOKEN_PASTE(hash_index_ ,D) ++#define hash_setfac GS_TOKEN_PASTE(hash_setfac_ ,D) ++#define hash_range GS_TOKEN_PASTE(hash_range_ ,D) ++#define hash_bb GS_TOKEN_PASTE(hash_bb_ ,D) ++#define set_local_mask GS_TOKEN_PASTE(set_local_mask_ ,D) ++#define fill_hash GS_TOKEN_PASTE(fill_hash_ ,D) ++#define table_from_hash GS_TOKEN_PASTE(table_from_hash_ ,D) ++#define hash_build GS_TOKEN_PASTE(hash_build_ ,D) ++#define hash_free GS_TOKEN_PASTE(hash_free_ ,D) ++ ++#define findptsms_local_setup GS_TOKEN_PASTE(GS_PREFIXED_NAME(findptsms_local_setup_),D) ++#define findptsms_local_free GS_TOKEN_PASTE(GS_PREFIXED_NAME(findptsms_local_free_ ),D) ++#define findptsms_local GS_TOKEN_PASTE(GS_PREFIXED_NAME(findptsms_local_ ),D) ++#define findptsms_local_eval GS_TOKEN_PASTE(GS_PREFIXED_NAME(findptsms_local_eval_ ),D) ++#define findpts_dummy_ms_data GS_TOKEN_PASTE(findpts_dummy_ms_data_,D) ++#define findpts_data GS_TOKEN_PASTE(findpts_data_,D) ++#define src_pt GS_TOKEN_PASTE(src_pt_ ,D) ++#define out_pt GS_TOKEN_PASTE(out_pt_ ,D) ++#define eval_src_pt GS_TOKEN_PASTE(eval_src_pt_ ,D) ++#define eval_out_pt GS_TOKEN_PASTE(eval_out_pt_ ,D) ++#define setupms_aux GS_TOKEN_PASTE(setupms_aux_,D) ++#define findptsms_setup GS_TOKEN_PASTE(GS_PREFIXED_NAME(findptsms_setup_),D) ++#define findptsms_free GS_TOKEN_PASTE(GS_PREFIXED_NAME(findptsms_free_ ),D) ++#define findptsms GS_TOKEN_PASTE(GS_PREFIXED_NAME(findptsms_ ),D) ++#define findptsms_eval GS_TOKEN_PASTE(GS_PREFIXED_NAME(findptsms_eval_ ),D) ++ ++#define findpts_local_data GS_TOKEN_PASTE(findpts_local_data_,D) ++#define findpts_local_setup GS_TOKEN_PASTE(GS_PREFIXED_NAME(findpts_local_setup_),D) ++#define findpts_local_free GS_TOKEN_PASTE(GS_PREFIXED_NAME(findpts_local_free_ ),D) ++#define findpts_local GS_TOKEN_PASTE(GS_PREFIXED_NAME(findpts_local_ ),D) ++#define findpts_local_eval GS_TOKEN_PASTE(GS_PREFIXED_NAME(findpts_local_eval_ ),D) ++#define findpts_setup GS_TOKEN_PASTE(GS_PREFIXED_NAME(findpts_setup_),D) ++#define findpts_free GS_TOKEN_PASTE(GS_PREFIXED_NAME(findpts_free_ ),D) ++#define findpts GS_TOKEN_PASTE(GS_PREFIXED_NAME(findpts_ ),D) ++#define findpts_eval 
GS_TOKEN_PASTE(GS_PREFIXED_NAME(findpts_eval_ ),D) ++#define findpts_local_eval GS_TOKEN_PASTE(GS_PREFIXED_NAME(findpts_local_eval_ ),D) ++#define setup_fev_aux GS_TOKEN_PASTE(setup_fev_aux_,D) + + struct hash_data { + ulong hash_n; +diff --git a/src/findpts_local.h b/src/findpts_local.h +index 2a9d9da..30be675 100644 +--- a/src/findpts_local.h ++++ b/src/findpts_local.h +@@ -1,19 +1,19 @@ +-#ifndef FINDPTS_LOCAL_H +-#define FINDPTS_LOCAL_H ++#ifndef GS_FINDPTS_LOCAL_H ++#define GS_FINDPTS_LOCAL_H + +-#if !defined(MEM_H) || !defined(FINDPTS_EL_H) || !defined(OBBOX_H) ++#if !defined(GS_MEM_H) || !defined(GS_FINDPTS_EL_H) || !defined(GS_OBBOX_H) + #warning "findpts_local.h" requires "mem.h", "findpts_el.h", "obbox.h" + #endif + +-#define findptsms_local_setup_2 PREFIXED_NAME(findptsms_local_setup_2) +-#define findptsms_local_free_2 PREFIXED_NAME(findptsms_local_free_2 ) +-#define findptsms_local_2 PREFIXED_NAME(findptsms_local_2 ) +-#define findptsms_local_eval_2 PREFIXED_NAME(findptsms_local_eval_2 ) ++#define findptsms_local_setup_2 GS_PREFIXED_NAME(findptsms_local_setup_2) ++#define findptsms_local_free_2 GS_PREFIXED_NAME(findptsms_local_free_2 ) ++#define findptsms_local_2 GS_PREFIXED_NAME(findptsms_local_2 ) ++#define findptsms_local_eval_2 GS_PREFIXED_NAME(findptsms_local_eval_2 ) + +-#define findpts_local_setup_2 PREFIXED_NAME(findpts_local_setup_2) +-#define findpts_local_free_2 PREFIXED_NAME(findpts_local_free_2 ) +-#define findpts_local_2 PREFIXED_NAME(findpts_local_2 ) +-#define findpts_local_eval_2 PREFIXED_NAME(findpts_local_eval_2 ) ++#define findpts_local_setup_2 GS_PREFIXED_NAME(findpts_local_setup_2) ++#define findpts_local_free_2 GS_PREFIXED_NAME(findpts_local_free_2 ) ++#define findpts_local_2 GS_PREFIXED_NAME(findpts_local_2 ) ++#define findpts_local_eval_2 GS_PREFIXED_NAME(findpts_local_eval_2 ) + + struct findpts_local_hash_data_2 { + uint hash_n; +@@ -85,15 +85,15 @@ void findpts_local_eval_2( + const uint npt, + const double *const in, struct findpts_local_data_2 *const fd); + +-#define findptsms_local_setup_3 PREFIXED_NAME(findptsms_local_setup_3) +-#define findptsms_local_free_3 PREFIXED_NAME(findptsms_local_free_3 ) +-#define findptsms_local_3 PREFIXED_NAME(findptsms_local_3 ) +-#define findptsms_local_eval_3 PREFIXED_NAME(findptsms_local_eval_3 ) ++#define findptsms_local_setup_3 GS_PREFIXED_NAME(findptsms_local_setup_3) ++#define findptsms_local_free_3 GS_PREFIXED_NAME(findptsms_local_free_3 ) ++#define findptsms_local_3 GS_PREFIXED_NAME(findptsms_local_3 ) ++#define findptsms_local_eval_3 GS_PREFIXED_NAME(findptsms_local_eval_3 ) + +-#define findpts_local_setup_3 PREFIXED_NAME(findpts_local_setup_3) +-#define findpts_local_free_3 PREFIXED_NAME(findpts_local_free_3 ) +-#define findpts_local_3 PREFIXED_NAME(findpts_local_3 ) +-#define findpts_local_eval_3 PREFIXED_NAME(findpts_local_eval_3 ) ++#define findpts_local_setup_3 GS_PREFIXED_NAME(findpts_local_setup_3) ++#define findpts_local_free_3 GS_PREFIXED_NAME(findpts_local_free_3 ) ++#define findpts_local_3 GS_PREFIXED_NAME(findpts_local_3 ) ++#define findpts_local_eval_3 GS_PREFIXED_NAME(findpts_local_eval_3 ) + struct findpts_local_hash_data_3 { + uint hash_n; + struct dbl_range bnd[3]; +diff --git a/src/findpts_local_imp.h b/src/findpts_local_imp.h +index e5310b1..0ca79fd 100644 +--- a/src/findpts_local_imp.h ++++ b/src/findpts_local_imp.h +@@ -1,36 +1,36 @@ + #include +-#define obbox TOKEN_PASTE(obbox_ ,D) +-#define obbox_calc TOKEN_PASTE(PREFIXED_NAME(obbox_calc_),D) +-#define obbox_test 
TOKEN_PASTE(obbox_test_ ,D) +-#define hash_data TOKEN_PASTE(findpts_local_hash_data_,D) +-#define hash_index TOKEN_PASTE(hash_index_ ,D) +-#define hash_setfac TOKEN_PASTE(hash_setfac_ ,D) +-#define hash_range TOKEN_PASTE(hash_range_ ,D) +-#define hash_count TOKEN_PASTE(hash_count_ ,D) +-#define hash_opt_size TOKEN_PASTE(hash_opt_size_ ,D) +-#define hash_bb TOKEN_PASTE(hash_bb_ ,D) +-#define hash_build TOKEN_PASTE(hash_build_ ,D) +-#define hash_free TOKEN_PASTE(hash_free_ ,D) +-#define findpts_el_data TOKEN_PASTE(findpts_el_data_ ,D) +-#define findpts_el_pt TOKEN_PASTE(findpts_el_pt_ ,D) +-#define findpts_el_setup TOKEN_PASTE(PREFIXED_NAME(findpts_el_setup_),D) +-#define findpts_el_free TOKEN_PASTE(PREFIXED_NAME(findpts_el_free_ ),D) +-#define findpts_el TOKEN_PASTE(PREFIXED_NAME(findpts_el_ ),D) +-#define findpts_el_eval TOKEN_PASTE(PREFIXED_NAME(findpts_el_eval_ ),D) +-#define findpts_el_start TOKEN_PASTE(findpts_el_start_ ,D) +-#define findpts_el_points TOKEN_PASTE(findpts_el_points_ ,D) +-#define findpts_local_data TOKEN_PASTE(findpts_local_data_,D) +-#define map_points_to_els TOKEN_PASTE(map_points_to_els_ ,D) +- +-#define findptsms_local_setup TOKEN_PASTE(PREFIXED_NAME(findptsms_local_setup_),D) +-#define findptsms_local_free TOKEN_PASTE(PREFIXED_NAME(findptsms_local_free_ ),D) +-#define findptsms_local TOKEN_PASTE(PREFIXED_NAME(findptsms_local_ ),D) +-#define findptsms_local_eval TOKEN_PASTE(PREFIXED_NAME(findptsms_local_eval_ ),D) +- +-#define findpts_local_setup TOKEN_PASTE(PREFIXED_NAME(findpts_local_setup_),D) +-#define findpts_local_free TOKEN_PASTE(PREFIXED_NAME(findpts_local_free_ ),D) +-#define findpts_local TOKEN_PASTE(PREFIXED_NAME(findpts_local_ ),D) +-#define findpts_local_eval TOKEN_PASTE(PREFIXED_NAME(findpts_local_eval_ ),D) ++#define obbox GS_TOKEN_PASTE(obbox_ ,D) ++#define obbox_calc GS_TOKEN_PASTE(GS_PREFIXED_NAME(obbox_calc_),D) ++#define obbox_test GS_TOKEN_PASTE(obbox_test_ ,D) ++#define hash_data GS_TOKEN_PASTE(findpts_local_hash_data_,D) ++#define hash_index GS_TOKEN_PASTE(hash_index_ ,D) ++#define hash_setfac GS_TOKEN_PASTE(hash_setfac_ ,D) ++#define hash_range GS_TOKEN_PASTE(hash_range_ ,D) ++#define hash_count GS_TOKEN_PASTE(hash_count_ ,D) ++#define hash_opt_size GS_TOKEN_PASTE(hash_opt_size_ ,D) ++#define hash_bb GS_TOKEN_PASTE(hash_bb_ ,D) ++#define hash_build GS_TOKEN_PASTE(hash_build_ ,D) ++#define hash_free GS_TOKEN_PASTE(hash_free_ ,D) ++#define findpts_el_data GS_TOKEN_PASTE(findpts_el_data_ ,D) ++#define findpts_el_pt GS_TOKEN_PASTE(findpts_el_pt_ ,D) ++#define findpts_el_setup GS_TOKEN_PASTE(GS_PREFIXED_NAME(findpts_el_setup_),D) ++#define findpts_el_free GS_TOKEN_PASTE(GS_PREFIXED_NAME(findpts_el_free_ ),D) ++#define findpts_el GS_TOKEN_PASTE(GS_PREFIXED_NAME(findpts_el_ ),D) ++#define findpts_el_eval GS_TOKEN_PASTE(GS_PREFIXED_NAME(findpts_el_eval_ ),D) ++#define findpts_el_start GS_TOKEN_PASTE(findpts_el_start_ ,D) ++#define findpts_el_points GS_TOKEN_PASTE(findpts_el_points_ ,D) ++#define findpts_local_data GS_TOKEN_PASTE(findpts_local_data_,D) ++#define map_points_to_els GS_TOKEN_PASTE(map_points_to_els_ ,D) ++ ++#define findptsms_local_setup GS_TOKEN_PASTE(GS_PREFIXED_NAME(findptsms_local_setup_),D) ++#define findptsms_local_free GS_TOKEN_PASTE(GS_PREFIXED_NAME(findptsms_local_free_ ),D) ++#define findptsms_local GS_TOKEN_PASTE(GS_PREFIXED_NAME(findptsms_local_ ),D) ++#define findptsms_local_eval GS_TOKEN_PASTE(GS_PREFIXED_NAME(findptsms_local_eval_ ),D) ++ ++#define findpts_local_setup 
GS_TOKEN_PASTE(GS_PREFIXED_NAME(findpts_local_setup_),D) ++#define findpts_local_free GS_TOKEN_PASTE(GS_PREFIXED_NAME(findpts_local_free_ ),D) ++#define findpts_local GS_TOKEN_PASTE(GS_PREFIXED_NAME(findpts_local_ ),D) ++#define findpts_local_eval GS_TOKEN_PASTE(GS_PREFIXED_NAME(findpts_local_eval_ ),D) + /*-------------------------------------------------------------------------- + Point to Possible Elements Hashing + +diff --git a/src/gs.c b/src/gs.c +index b1a9aa7..3cf4471 100644 +--- a/src/gs.c ++++ b/src/gs.c +@@ -19,19 +19,19 @@ + #include "sarray_sort.h" + #include "sarray_transfer.h" + +-#define gs PREFIXED_NAME(gs ) +-#define gs_vec PREFIXED_NAME(gs_vec ) +-#define gs_many PREFIXED_NAME(gs_many ) +-#define igs PREFIXED_NAME(igs ) +-#define igs_vec PREFIXED_NAME(igs_vec ) +-#define igs_many PREFIXED_NAME(igs_many ) +-#define gs_wait PREFIXED_NAME(gs_wait ) +-#define gs_setup PREFIXED_NAME(gs_setup ) +-#define gs_free PREFIXED_NAME(gs_free ) +-#define gs_unique PREFIXED_NAME(gs_unique) +-#define gs_hf2c PREFIXED_NAME(gs_hf2c ) +-#define pw_data_nmsg PREFIXED_NAME(pw_data_nmsg ) +-#define pw_data_size PREFIXED_NAME(pw_data_size ) ++#define gs GS_PREFIXED_NAME(gs ) ++#define gs_vec GS_PREFIXED_NAME(gs_vec ) ++#define gs_many GS_PREFIXED_NAME(gs_many ) ++#define igs GS_PREFIXED_NAME(igs ) ++#define igs_vec GS_PREFIXED_NAME(igs_vec ) ++#define igs_many GS_PREFIXED_NAME(igs_many ) ++#define gs_wait GS_PREFIXED_NAME(gs_wait ) ++#define gs_setup GS_PREFIXED_NAME(gs_setup ) ++#define gs_free GS_PREFIXED_NAME(gs_free ) ++#define gs_unique GS_PREFIXED_NAME(gs_unique) ++#define gs_hf2c GS_PREFIXED_NAME(gs_hf2c ) ++#define pw_data_nmsg GS_PREFIXED_NAME(pw_data_nmsg ) ++#define pw_data_size GS_PREFIXED_NAME(pw_data_size ) + + GS_DEFINE_DOM_SIZES() + +@@ -1011,7 +1011,7 @@ static void allreduce_exec_wait( + /* Why do I need this? 
Ugly */ + if (comm->np > 1) + comm_wait(ard->req, 1); +-#ifdef MPI ++#ifdef GS_MPI + memcpy(buf,ardbuf,gvn*gs_dom_size[dom]); + #endif + /* buffer -> user array */ +@@ -1497,29 +1497,29 @@ void pw_data_size(struct gs_data *gsh, int *n) + #undef igs_many + #undef gs_wait + +-#define cgs PREFIXED_NAME(gs ) +-#define cgs_vec PREFIXED_NAME(gs_vec ) +-#define cgs_many PREFIXED_NAME(gs_many ) +-#define cgs_setup PREFIXED_NAME(gs_setup) +-#define cgs_free PREFIXED_NAME(gs_free ) +-#define cgs_unique PREFIXED_NAME(gs_unique) +-#define cigs PREFIXED_NAME(igs ) +-#define cigs_vec PREFIXED_NAME(igs_vec ) +-#define cigs_many PREFIXED_NAME(igs_many) +-#define cgs_wait PREFIXED_NAME(gs_wait ) +- +-#define fgs_setup_pick FORTRAN_NAME(gs_setup_pick,GS_SETUP_PICK) +-#define fgs_setup FORTRAN_NAME(gs_setup ,GS_SETUP ) +-#define fgs FORTRAN_NAME(gs_op ,GS_OP ) +-#define fgs_vec FORTRAN_NAME(gs_op_vec ,GS_OP_VEC ) +-#define fgs_many FORTRAN_NAME(gs_op_many ,GS_OP_MANY ) +-#define figs FORTRAN_NAME(igs_op ,IGS_OP ) +-#define figs_vec FORTRAN_NAME(igs_op_vec ,IGS_OP_VEC ) +-#define figs_many FORTRAN_NAME(igs_op_many ,IGS_OP_MANY ) +-#define fgs_wait FORTRAN_NAME(gs_op_wait ,GS_OP_WAIT ) +-#define fgs_fields FORTRAN_NAME(gs_op_fields ,GS_OP_FIELDS ) +-#define fgs_free FORTRAN_NAME(gs_free ,GS_FREE ) +-#define fgs_unique FORTRAN_NAME(gs_unique ,GS_UNIQUE ) ++#define cgs GS_PREFIXED_NAME(gs ) ++#define cgs_vec GS_PREFIXED_NAME(gs_vec ) ++#define cgs_many GS_PREFIXED_NAME(gs_many ) ++#define cgs_setup GS_PREFIXED_NAME(gs_setup) ++#define cgs_free GS_PREFIXED_NAME(gs_free ) ++#define cgs_unique GS_PREFIXED_NAME(gs_unique) ++#define cigs GS_PREFIXED_NAME(igs ) ++#define cigs_vec GS_PREFIXED_NAME(igs_vec ) ++#define cigs_many GS_PREFIXED_NAME(igs_many) ++#define cgs_wait GS_PREFIXED_NAME(gs_wait ) ++ ++#define fgs_setup_pick GS_FORTRAN_NAME(gs_setup_pick,GS_SETUP_PICK) ++#define fgs_setup GS_FORTRAN_NAME(gs_setup ,GS_SETUP ) ++#define fgs GS_FORTRAN_NAME(gs_op ,GS_OP ) ++#define fgs_vec GS_FORTRAN_NAME(gs_op_vec ,GS_OP_VEC ) ++#define fgs_many GS_FORTRAN_NAME(gs_op_many ,GS_OP_MANY ) ++#define figs GS_FORTRAN_NAME(igs_op ,IGS_OP ) ++#define figs_vec GS_FORTRAN_NAME(igs_op_vec ,IGS_OP_VEC ) ++#define figs_many GS_FORTRAN_NAME(igs_op_many ,IGS_OP_MANY ) ++#define fgs_wait GS_FORTRAN_NAME(gs_op_wait ,GS_OP_WAIT ) ++#define fgs_fields GS_FORTRAN_NAME(gs_op_fields ,GS_OP_FIELDS ) ++#define fgs_free GS_FORTRAN_NAME(gs_free ,GS_FREE ) ++#define fgs_unique GS_FORTRAN_NAME(gs_unique ,GS_UNIQUE ) + + static struct gs_data **fgs_info = 0; + static int fgs_max = 0; +diff --git a/src/gs.h b/src/gs.h +index a06c99f..83a0591 100644 +--- a/src/gs.h ++++ b/src/gs.h +@@ -1,7 +1,7 @@ + #ifndef GS_H + #define GS_H + +-#if !defined(COMM_H) || !defined(GS_DEFS_H) || !defined(MEM_H) ++#if !defined(GS_COMM_H) || !defined(GS_DEFS_H) || !defined(GS_MEM_H) + #warning "gs.h" requires "comm.h", "gs_defs.h", and "mem.h" + #endif + +@@ -116,19 +116,19 @@ + + */ + +-#define gs PREFIXED_NAME(gs ) +-#define gs_vec PREFIXED_NAME(gs_vec ) +-#define gs_many PREFIXED_NAME(gs_many ) +-#define igs PREFIXED_NAME(igs ) +-#define igs_vec PREFIXED_NAME(igs_vec ) +-#define igs_many PREFIXED_NAME(igs_many ) +-#define gs_wait PREFIXED_NAME(gs_wait ) +-#define gs_setup PREFIXED_NAME(gs_setup ) +-#define gs_free PREFIXED_NAME(gs_free ) +-#define gs_unique PREFIXED_NAME(gs_unique) +-#define gs_hf2c PREFIXED_NAME(gs_hf2c ) +-#define pw_data_nmsg PREFIXED_NAME(pw_data_nmsg ) +-#define pw_data_size PREFIXED_NAME(pw_data_size ) ++#define gs GS_PREFIXED_NAME(gs ) 
++#define gs_vec GS_PREFIXED_NAME(gs_vec ) ++#define gs_many GS_PREFIXED_NAME(gs_many ) ++#define igs GS_PREFIXED_NAME(igs ) ++#define igs_vec GS_PREFIXED_NAME(igs_vec ) ++#define igs_many GS_PREFIXED_NAME(igs_many ) ++#define gs_wait GS_PREFIXED_NAME(gs_wait ) ++#define gs_setup GS_PREFIXED_NAME(gs_setup ) ++#define gs_free GS_PREFIXED_NAME(gs_free ) ++#define gs_unique GS_PREFIXED_NAME(gs_unique) ++#define gs_hf2c GS_PREFIXED_NAME(gs_hf2c ) ++#define pw_data_nmsg GS_PREFIXED_NAME(pw_data_nmsg ) ++#define pw_data_size GS_PREFIXED_NAME(pw_data_size ) + + struct gs_data; + typedef enum {gs_auto, gs_pairwise, gs_crystal_router, gs_all_reduce} gs_method; +diff --git a/src/gs_defs.h b/src/gs_defs.h +index df4ad7b..3914442 100644 +--- a/src/gs_defs.h ++++ b/src/gs_defs.h +@@ -20,7 +20,7 @@ + macro(float ) \ + macro(int ) \ + macro(long ) \ +- WHEN_LONG_LONG(macro(long_long)) ++ GS_WHEN_LONG_LONG(macro(long_long)) + + /* the supported ops */ + #define GS_FOR_EACH_OP(T,macro) \ +@@ -49,7 +49,7 @@ + GS_DEFINE_MONOID_ID(float , -FLT_MAX, FLT_MAX) \ + GS_DEFINE_MONOID_ID(int , INT_MIN, INT_MAX) \ + GS_DEFINE_MONOID_ID(long , LONG_MIN, LONG_MAX) \ +- WHEN_LONG_LONG(GS_DEFINE_MONOID_ID(long_long,LLONG_MIN,LLONG_MAX)) ++ GS_WHEN_LONG_LONG(GS_DEFINE_MONOID_ID(long_long,LLONG_MIN,LLONG_MAX)) + + /*------------------------------------------------------------------------------ + Enums and constants +@@ -62,8 +62,8 @@ typedef enum { LIST } gs_dom; + #undef ITEM + #undef LIST + +-#define gs_sint TYPE_LOCAL(gs_int,gs_long,gs_long_long) +-#define gs_slong TYPE_GLOBAL(gs_int,gs_long,gs_long_long) ++#define gs_sint GS_TYPE_LOCAL(gs_int,gs_long,gs_long_long) ++#define gs_slong GS_TYPE_GLOBAL(gs_int,gs_long,gs_long_long) + + /* domain type size array */ + #define GS_DOM_SIZE_ITEM(T) sizeof(T), +diff --git a/src/gs_local.c b/src/gs_local.c +index 170e94d..fa758c9 100644 +--- a/src/gs_local.c ++++ b/src/gs_local.c +@@ -5,20 +5,20 @@ + #include "name.h" + #include "types.h" + +-#define gs_gather_array PREFIXED_NAME(gs_gather_array ) +-#define gs_init_array PREFIXED_NAME(gs_init_array ) +-#define gs_gather PREFIXED_NAME(gs_gather ) +-#define gs_scatter PREFIXED_NAME(gs_scatter ) +-#define gs_init PREFIXED_NAME(gs_init ) +-#define gs_gather_vec PREFIXED_NAME(gs_gather_vec ) +-#define gs_scatter_vec PREFIXED_NAME(gs_scatter_vec ) +-#define gs_init_vec PREFIXED_NAME(gs_init_vec ) +-#define gs_gather_many PREFIXED_NAME(gs_gather_many ) +-#define gs_scatter_many PREFIXED_NAME(gs_scatter_many ) +-#define gs_init_many PREFIXED_NAME(gs_init_many ) +-#define gs_gather_vec_to_many PREFIXED_NAME(gs_gather_vec_to_many ) +-#define gs_scatter_many_to_vec PREFIXED_NAME(gs_scatter_many_to_vec) +-#define gs_scatter_vec_to_many PREFIXED_NAME(gs_scatter_vec_to_many) ++#define gs_gather_array GS_PREFIXED_NAME(gs_gather_array ) ++#define gs_init_array GS_PREFIXED_NAME(gs_init_array ) ++#define gs_gather GS_PREFIXED_NAME(gs_gather ) ++#define gs_scatter GS_PREFIXED_NAME(gs_scatter ) ++#define gs_init GS_PREFIXED_NAME(gs_init ) ++#define gs_gather_vec GS_PREFIXED_NAME(gs_gather_vec ) ++#define gs_scatter_vec GS_PREFIXED_NAME(gs_scatter_vec ) ++#define gs_init_vec GS_PREFIXED_NAME(gs_init_vec ) ++#define gs_gather_many GS_PREFIXED_NAME(gs_gather_many ) ++#define gs_scatter_many GS_PREFIXED_NAME(gs_scatter_many ) ++#define gs_init_many GS_PREFIXED_NAME(gs_init_many ) ++#define gs_gather_vec_to_many GS_PREFIXED_NAME(gs_gather_vec_to_many ) ++#define gs_scatter_many_to_vec GS_PREFIXED_NAME(gs_scatter_many_to_vec) ++#define 
gs_scatter_vec_to_many GS_PREFIXED_NAME(gs_scatter_vec_to_many) + + #include "gs_defs.h" + GS_DEFINE_IDENTITIES() +diff --git a/src/gs_local.h b/src/gs_local.h +index fc7c414..d09a420 100644 +--- a/src/gs_local.h ++++ b/src/gs_local.h +@@ -1,24 +1,24 @@ + #ifndef GS_LOCAL_H + #define GS_LOCAL_H + +-#if !defined(NAME_H) || !defined(TYPES_H) || !defined(GS_DEFS_H) ++#if !defined(GS_NAME_H) || !defined(GS_TYPES_H) || !defined(GS_DEFS_H) + #warning "gs_local.h" requires "name.h", "types.h", and "gs_defs.h" + #endif + +-#define gs_gather_array PREFIXED_NAME(gs_gather_array ) +-#define gs_init_array PREFIXED_NAME(gs_init_array ) +-#define gs_gather PREFIXED_NAME(gs_gather ) +-#define gs_scatter PREFIXED_NAME(gs_scatter ) +-#define gs_init PREFIXED_NAME(gs_init ) +-#define gs_gather_vec PREFIXED_NAME(gs_gather_vec ) +-#define gs_scatter_vec PREFIXED_NAME(gs_scatter_vec ) +-#define gs_init_vec PREFIXED_NAME(gs_init_vec ) +-#define gs_gather_many PREFIXED_NAME(gs_gather_many ) +-#define gs_scatter_many PREFIXED_NAME(gs_scatter_many ) +-#define gs_init_many PREFIXED_NAME(gs_init_many ) +-#define gs_gather_vec_to_many PREFIXED_NAME(gs_gather_vec_to_many ) +-#define gs_scatter_many_to_vec PREFIXED_NAME(gs_scatter_many_to_vec) +-#define gs_scatter_vec_to_many PREFIXED_NAME(gs_scatter_vec_to_many) ++#define gs_gather_array GS_PREFIXED_NAME(gs_gather_array ) ++#define gs_init_array GS_PREFIXED_NAME(gs_init_array ) ++#define gs_gather GS_PREFIXED_NAME(gs_gather ) ++#define gs_scatter GS_PREFIXED_NAME(gs_scatter ) ++#define gs_init GS_PREFIXED_NAME(gs_init ) ++#define gs_gather_vec GS_PREFIXED_NAME(gs_gather_vec ) ++#define gs_scatter_vec GS_PREFIXED_NAME(gs_scatter_vec ) ++#define gs_init_vec GS_PREFIXED_NAME(gs_init_vec ) ++#define gs_gather_many GS_PREFIXED_NAME(gs_gather_many ) ++#define gs_scatter_many GS_PREFIXED_NAME(gs_scatter_many ) ++#define gs_init_many GS_PREFIXED_NAME(gs_init_many ) ++#define gs_gather_vec_to_many GS_PREFIXED_NAME(gs_gather_vec_to_many ) ++#define gs_scatter_many_to_vec GS_PREFIXED_NAME(gs_scatter_many_to_vec) ++#define gs_scatter_vec_to_many GS_PREFIXED_NAME(gs_scatter_vec_to_many) + + void gs_gather_array(void *out, const void *in, uint n, + gs_dom dom, gs_op op); +diff --git a/src/gslib.h b/src/gslib.h +index e80d7a3..4bf4d72 100644 +--- a/src/gslib.h ++++ b/src/gslib.h +@@ -1,5 +1,5 @@ +-#ifndef GSLIB_H +-#define GSLIB_H ++#ifndef GS_GSLIB_H ++#define GS_GSLIB_H + + #include + #include +diff --git a/src/lob_bnd.c b/src/lob_bnd.c +index 9d02ca4..0aa0492 100644 +--- a/src/lob_bnd.c ++++ b/src/lob_bnd.c +@@ -10,13 +10,13 @@ + #include "mem.h" + #include "poly.h" + +-#define lob_bnd_setup PREFIXED_NAME(lob_bnd_setup) +-#define lob_bnd_lin_1 PREFIXED_NAME(lob_bnd_lin_1) +-#define lob_bnd_lin_2 PREFIXED_NAME(lob_bnd_lin_2) +-#define lob_bnd_lin_3 PREFIXED_NAME(lob_bnd_lin_3) +-#define lob_bnd_1 PREFIXED_NAME(lob_bnd_1 ) +-#define lob_bnd_2 PREFIXED_NAME(lob_bnd_2 ) +-#define lob_bnd_3 PREFIXED_NAME(lob_bnd_3 ) ++#define lob_bnd_setup GS_PREFIXED_NAME(lob_bnd_setup) ++#define lob_bnd_lin_1 GS_PREFIXED_NAME(lob_bnd_lin_1) ++#define lob_bnd_lin_2 GS_PREFIXED_NAME(lob_bnd_lin_2) ++#define lob_bnd_lin_3 GS_PREFIXED_NAME(lob_bnd_lin_3) ++#define lob_bnd_1 GS_PREFIXED_NAME(lob_bnd_1 ) ++#define lob_bnd_2 GS_PREFIXED_NAME(lob_bnd_2 ) ++#define lob_bnd_3 GS_PREFIXED_NAME(lob_bnd_3 ) + + struct dbl_range { double min,max; }; + +diff --git a/src/lob_bnd.h b/src/lob_bnd.h +index 7ecc8a3..52f30ee 100644 +--- a/src/lob_bnd.h ++++ b/src/lob_bnd.h +@@ -1,17 +1,17 @@ +-#ifndef LOB_BND_H 
+-#define LOB_BND_H ++#ifndef GS_LOB_BND_H ++#define GS_LOB_BND_H + +-#if !defined(TYPES_H) || !defined(NAME_H) ++#if !defined(GS_TYPES_H) || !defined(GS_NAME_H) + #warning "lob_bnd.h" requires "types.h" and "name.h" + #endif + +-#define lob_bnd_setup PREFIXED_NAME(lob_bnd_setup) +-#define lob_bnd_lin_1 PREFIXED_NAME(lob_bnd_lin_1) +-#define lob_bnd_lin_2 PREFIXED_NAME(lob_bnd_lin_2) +-#define lob_bnd_lin_3 PREFIXED_NAME(lob_bnd_lin_3) +-#define lob_bnd_1 PREFIXED_NAME(lob_bnd_1 ) +-#define lob_bnd_2 PREFIXED_NAME(lob_bnd_2 ) +-#define lob_bnd_3 PREFIXED_NAME(lob_bnd_3 ) ++#define lob_bnd_setup GS_PREFIXED_NAME(lob_bnd_setup) ++#define lob_bnd_lin_1 GS_PREFIXED_NAME(lob_bnd_lin_1) ++#define lob_bnd_lin_2 GS_PREFIXED_NAME(lob_bnd_lin_2) ++#define lob_bnd_lin_3 GS_PREFIXED_NAME(lob_bnd_lin_3) ++#define lob_bnd_1 GS_PREFIXED_NAME(lob_bnd_1 ) ++#define lob_bnd_2 GS_PREFIXED_NAME(lob_bnd_2 ) ++#define lob_bnd_3 GS_PREFIXED_NAME(lob_bnd_3 ) + + /*-------------------------------------------------------------------------- + Bounds for Polynomials on [-1,1]^d +@@ -85,7 +85,7 @@ void lob_bnd_lin_3( + const double *lob_bnd_data_t, unsigned nt, unsigned mt, + const double *restrict u, uint un, double *restrict work); + +-#ifndef OBBOX_H ++#ifndef GS_OBBOX_H + struct dbl_range { double min, max; }; + #endif + +diff --git a/src/mem.h b/src/mem.h +index b68e309..01ba6cb 100644 +--- a/src/mem.h ++++ b/src/mem.h +@@ -1,5 +1,5 @@ +-#ifndef MEM_H +-#define MEM_H ++#ifndef GS_MEM_H ++#define GS_MEM_H + + /* requires: + for size_t, offsetof +@@ -9,7 +9,7 @@ + "fail.h" + */ + +-#if !defined(C99_H) || !defined(FAIL_H) ++#if !defined(GS_C99_H) || !defined(GS_FAIL_H) + #error "mem.h" requires "c99.h" and "fail.h" + #endif + +@@ -31,8 +31,8 @@ + #else + # include + # ifndef comm_gbl_id +-# define comm_gbl_id PREFIXED_NAME(comm_gbl_id) +-# define comm_gbl_np PREFIXED_NAME(comm_gbl_np) ++# define comm_gbl_id GS_PREFIXED_NAME(comm_gbl_id) ++# define comm_gbl_np GS_PREFIXED_NAME(comm_gbl_np) + # include "types.h" + extern uint comm_gbl_id, comm_gbl_np; + # endif +diff --git a/src/name.h b/src/name.h +index b4bcd91..ac54032 100644 +--- a/src/name.h ++++ b/src/name.h +@@ -1,43 +1,43 @@ +-#ifndef NAME_H +-#define NAME_H ++#ifndef GS_NAME_H ++#define GS_NAME_H + + /* establishes some macros to establish + * the FORTRAN naming convention +- default gs_setup, etc. +- -DUPCASE GS_SETUP, etc. +- -DUNDERSCORE gs_setup_, etc. ++ default gs_setup, etc. ++ -DGS_UPCASE GS_SETUP, etc. ++ -DGS_UNDERSCORE gs_setup_, etc. 
+ * a prefix for all external (non-FORTRAN) function names +- for example, -DPREFIX=jl_ transforms fail -> jl_fail ++ for example, -DGS_PREFIX=jl_ transforms fail -> jl_fail + * a prefix for all external FORTRAN function names +- for example, -DFPREFIX=jlf_ transforms gs_setup_ -> jlf_gs_setup_ ++ for example, -DGS_FPREFIX=jlf_ transforms gs_setup_ -> jlf_gs_setup_ + */ + + /* the following macro functions like a##b, + but will expand a and/or b if they are themselves macros */ +-#define TOKEN_PASTE_(a,b) a##b +-#define TOKEN_PASTE(a,b) TOKEN_PASTE_(a,b) ++#define GS_TOKEN_PASTE_(a,b) a##b ++#define GS_TOKEN_PASTE(a,b) GS_TOKEN_PASTE_(a,b) + +-#ifdef PREFIX +-# define PREFIXED_NAME(x) TOKEN_PASTE(PREFIX,x) ++#ifdef GS_PREFIX ++# define GS_PREFIXED_NAME(x) GS_TOKEN_PASTE(GS_PREFIX,x) + #else +-# define PREFIXED_NAME(x) x ++# define GS_PREFIXED_NAME(x) x + #endif + +-#ifdef FPREFIX +-# define FPREFIXED_NAME(x) TOKEN_PASTE(FPREFIX,x) ++#ifdef GS_FPREFIX ++# define GS_FPREFIXED_NAME(x) GS_TOKEN_PASTE(GS_FPREFIX,x) + #else +-# define FPREFIXED_NAME(x) x ++# define GS_FPREFIXED_NAME(x) x + #endif + +-#if defined(UPCASE) +-# define FORTRAN_NAME(low,up) FPREFIXED_NAME(up) +-# define FORTRAN_UNPREFIXED(low,up) up +-#elif defined(UNDERSCORE) +-# define FORTRAN_NAME(low,up) FPREFIXED_NAME(TOKEN_PASTE(low,_)) +-# define FORTRAN_UNPREFIXED(low,up) TOKEN_PASTE(low,_) ++#if defined(GS_UPCASE) ++# define GS_FORTRAN_NAME(low,up) GS_FPREFIXED_NAME(up) ++# define GS_FORTRAN_UNPREFIXED(low,up) up ++#elif defined(GS_UNDERSCORE) ++# define GS_FORTRAN_NAME(low,up) GS_FPREFIXED_NAME(GS_TOKEN_PASTE(low,_)) ++# define GS_FORTRAN_UNPREFIXED(low,up) GS_TOKEN_PASTE(low,_) + #else +-# define FORTRAN_NAME(low,up) FPREFIXED_NAME(low) +-# define FORTRAN_UNPREFIXED(low,up) low ++# define GS_FORTRAN_NAME(low,up) GS_FPREFIXED_NAME(low) ++# define GS_FORTRAN_UNPREFIXED(low,up) low + #endif + + #endif +diff --git a/src/obbox.c b/src/obbox.c +index 22c4614..611f3ac 100644 +--- a/src/obbox.c ++++ b/src/obbox.c +@@ -10,8 +10,8 @@ + #include "poly.h" + #include "lob_bnd.h" + +-#define obbox_calc_2 PREFIXED_NAME(obbox_calc_2) +-#define obbox_calc_3 PREFIXED_NAME(obbox_calc_3) ++#define obbox_calc_2 GS_PREFIXED_NAME(obbox_calc_2) ++#define obbox_calc_3 GS_PREFIXED_NAME(obbox_calc_3) + + struct obbox_2 { double c0[2], A[4]; + struct dbl_range x[2]; }; +diff --git a/src/obbox.h b/src/obbox.h +index 8e5764f..86ba0ce 100644 +--- a/src/obbox.h ++++ b/src/obbox.h +@@ -1,12 +1,12 @@ +-#ifndef OBBOX_H +-#define OBBOX_H ++#ifndef GS_OBBOX_H ++#define GS_OBBOX_H + +-#if !defined(TYPES_H) || !defined(NAME_H) ++#if !defined(GS_TYPES_H) || !defined(GS_NAME_H) + #warning "obbox.h" requires "types.h" and "name.h" + #endif + +-#define obbox_calc_2 PREFIXED_NAME(obbox_calc_2) +-#define obbox_calc_3 PREFIXED_NAME(obbox_calc_3) ++#define obbox_calc_2 GS_PREFIXED_NAME(obbox_calc_2) ++#define obbox_calc_3 GS_PREFIXED_NAME(obbox_calc_3) + + /*-------------------------------------------------------------------------- + Oriented and axis-aligned bounding box computation for spectral elements +@@ -45,7 +45,7 @@ + + --------------------------------------------------------------------------*/ + +-#ifndef LOB_BND_H ++#ifndef GS_LOB_BND_H + struct dbl_range { double min, max; }; + #endif + +diff --git a/src/poly.c b/src/poly.c +index 00ad22b..d8585be 100644 +--- a/src/poly.c ++++ b/src/poly.c +@@ -8,14 +8,14 @@ + #include "fail.h" + #include "mem.h" + +-#define lagrange_size PREFIXED_NAME(lagrange_size ) +-#define lagrange_setup PREFIXED_NAME(lagrange_setup) 
+-#define gauss_nodes PREFIXED_NAME(gauss_nodes ) +-#define gauss_quad PREFIXED_NAME(gauss_quad ) +-#define lobatto_nodes PREFIXED_NAME(lobatto_nodes ) +-#define lobatto_quad PREFIXED_NAME(lobatto_quad ) +-#define gll_lag_size PREFIXED_NAME(gll_lag_size ) +-#define gll_lag_setup PREFIXED_NAME(gll_lag_setup ) ++#define lagrange_size GS_PREFIXED_NAME(lagrange_size ) ++#define lagrange_setup GS_PREFIXED_NAME(lagrange_setup) ++#define gauss_nodes GS_PREFIXED_NAME(gauss_nodes ) ++#define gauss_quad GS_PREFIXED_NAME(gauss_quad ) ++#define lobatto_nodes GS_PREFIXED_NAME(lobatto_nodes ) ++#define lobatto_quad GS_PREFIXED_NAME(lobatto_quad ) ++#define gll_lag_size GS_PREFIXED_NAME(gll_lag_size ) ++#define gll_lag_setup GS_PREFIXED_NAME(gll_lag_setup ) + + typedef void lagrange_fun(double *restrict p, + double *restrict data, unsigned n, int d, double x); +diff --git a/src/poly.h b/src/poly.h +index 2fa162a..2781b22 100644 +--- a/src/poly.h ++++ b/src/poly.h +@@ -1,18 +1,18 @@ +-#ifndef POLY_H +-#define POLY_H ++#ifndef GS_POLY_H ++#define GS_POLY_H + +-#if !defined(NAME_H) ++#if !defined(GS_NAME_H) + #warning "poly.h" requires "name.h" + #endif + +-#define lagrange_size PREFIXED_NAME(lagrange_size ) +-#define lagrange_setup PREFIXED_NAME(lagrange_setup) +-#define gauss_nodes PREFIXED_NAME(gauss_nodes ) +-#define gauss_quad PREFIXED_NAME(gauss_quad ) +-#define lobatto_nodes PREFIXED_NAME(lobatto_nodes ) +-#define lobatto_quad PREFIXED_NAME(lobatto_quad ) +-#define gll_lag_size PREFIXED_NAME(gll_lag_size ) +-#define gll_lag_setup PREFIXED_NAME(gll_lag_setup ) ++#define lagrange_size GS_PREFIXED_NAME(lagrange_size ) ++#define lagrange_setup GS_PREFIXED_NAME(lagrange_setup) ++#define gauss_nodes GS_PREFIXED_NAME(gauss_nodes ) ++#define gauss_quad GS_PREFIXED_NAME(gauss_quad ) ++#define lobatto_nodes GS_PREFIXED_NAME(lobatto_nodes ) ++#define lobatto_quad GS_PREFIXED_NAME(lobatto_quad ) ++#define gll_lag_size GS_PREFIXED_NAME(gll_lag_size ) ++#define gll_lag_setup GS_PREFIXED_NAME(gll_lag_setup ) + + /*-------------------------------------------------------------------------- + Quadrature Nodes and Weights Calculation +diff --git a/src/sarray_sort.c b/src/sarray_sort.c +index 0ec26d1..9ba8fc4 100644 +--- a/src/sarray_sort.c ++++ b/src/sarray_sort.c +@@ -8,8 +8,8 @@ + #include "mem.h" + #include "sort.h" + +-#define sarray_permute_ PREFIXED_NAME(sarray_permute_) +-#define sarray_permute_buf_ PREFIXED_NAME(sarray_permute_buf_) ++#define sarray_permute_ GS_PREFIXED_NAME(sarray_permute_) ++#define sarray_permute_buf_ GS_PREFIXED_NAME(sarray_permute_buf_) + + void sarray_permute_(size_t size, void *A, size_t n, uint *perm, void *work) + { +diff --git a/src/sarray_sort.h b/src/sarray_sort.h +index cd30d7c..97fdba1 100644 +--- a/src/sarray_sort.h ++++ b/src/sarray_sort.h +@@ -1,7 +1,7 @@ +-#ifndef SARRAY_SORT_H +-#define SARRAY_SORT_H ++#ifndef GS_SARRAY_SORT_H ++#define GS_SARRAY_SORT_H + +-#if !defined(SORT_H) ++#if !defined(GS_SORT_H) + #warning "sarray_sort.h" requires "sort.h" + #endif + +@@ -33,8 +33,8 @@ + ----------------------------------------------------------------------------*/ + + +-#define sarray_permute_ PREFIXED_NAME(sarray_permute_) +-#define sarray_permute_buf_ PREFIXED_NAME(sarray_permute_buf_) ++#define sarray_permute_ GS_PREFIXED_NAME(sarray_permute_) ++#define sarray_permute_buf_ GS_PREFIXED_NAME(sarray_permute_buf_) + + void sarray_permute_(size_t size, void *A, size_t n, uint *perm, void *work); + void sarray_permute_buf_( +diff --git a/src/sarray_transfer.c b/src/sarray_transfer.c 
+index c5dfd2b..5f94192 100644 +--- a/src/sarray_transfer.c ++++ b/src/sarray_transfer.c +@@ -11,9 +11,9 @@ + #include "crystal.h" + #include "sort.h" + +-#define sarray_transfer_many PREFIXED_NAME(sarray_transfer_many) +-#define sarray_transfer_ PREFIXED_NAME(sarray_transfer_ ) +-#define sarray_transfer_ext_ PREFIXED_NAME(sarray_transfer_ext_) ++#define sarray_transfer_many GS_PREFIXED_NAME(sarray_transfer_many) ++#define sarray_transfer_ GS_PREFIXED_NAME(sarray_transfer_ ) ++#define sarray_transfer_ext_ GS_PREFIXED_NAME(sarray_transfer_ext_) + + static void pack_int( + buffer *const data, const unsigned row_size, const uint id, +diff --git a/src/sarray_transfer.h b/src/sarray_transfer.h +index c195e21..cc441ae 100644 +--- a/src/sarray_transfer.h ++++ b/src/sarray_transfer.h +@@ -1,7 +1,7 @@ +-#ifndef SARRAY_TRANSFER_H +-#define SARRAY_TRANSFER_H ++#ifndef GS_SARRAY_TRANSFER_H ++#define GS_SARRAY_TRANSFER_H + +-#if !defined(CRYSTAL_H) ++#if !defined(GS_CRYSTAL_H) + #warning "sarray_transfer.h" requires "crystal.h" + #endif + +@@ -70,9 +70,9 @@ + + */ + +-#define sarray_transfer_many PREFIXED_NAME(sarray_transfer_many) +-#define sarray_transfer_ PREFIXED_NAME(sarray_transfer_ ) +-#define sarray_transfer_ext_ PREFIXED_NAME(sarray_transfer_ext_) ++#define sarray_transfer_many GS_PREFIXED_NAME(sarray_transfer_many) ++#define sarray_transfer_ GS_PREFIXED_NAME(sarray_transfer_ ) ++#define sarray_transfer_ext_ GS_PREFIXED_NAME(sarray_transfer_ext_) + + uint sarray_transfer_many( + struct array *const *const A, const unsigned *const size, const unsigned An, +diff --git a/src/sort.c b/src/sort.c +index 2bb061b..b642a3f 100644 +--- a/src/sort.c ++++ b/src/sort.c +@@ -14,7 +14,7 @@ + #undef SORT_SUFFIX + #undef T + +-#if defined(USE_LONG) || defined(GLOBAL_LONG) ++#if defined(GS_USE_LONG) || defined(GS_GLOBAL_LONG) + # define T unsigned long + # define SORT_SUFFIX _ul + # include "sort_imp.h" +@@ -22,7 +22,7 @@ + # undef T + #endif + +-#if defined(USE_LONG_LONG) || defined(GLOBAL_LONG_LONG) ++#if defined(GS_USE_LONG_LONG) || defined(GS_GLOBAL_LONG_LONG) + # define T unsigned long long + # define SORT_SUFFIX _ull + # include "sort_imp.h" +diff --git a/src/sort.h b/src/sort.h +index 0b0ee53..4cb2fd2 100644 +--- a/src/sort.h ++++ b/src/sort.h +@@ -1,7 +1,7 @@ +-#ifndef SORT_H +-#define SORT_H ++#ifndef GS_SORT_H ++#define GS_SORT_H + +-#if !defined(TYPES_H) || !defined(MEM_H) ++#if !defined(GS_TYPES_H) || !defined(GS_MEM_H) + #warning "sort.h" requires "types.h" and "mem.h" + /* types.h defines uint, ulong + mem.h defines buffer */ +@@ -44,21 +44,21 @@ + + ----------------------------------------------------------------------------*/ + +-#define sortv_ui PREFIXED_NAME(sortv_ui) +-#define sortv_ul PREFIXED_NAME(sortv_ul) +-#define sortv_ull PREFIXED_NAME(sortv_ull) +-#define sortv_double PREFIXED_NAME(sortv_double) +-#define sortv_float PREFIXED_NAME(sortv_float) +-#define sortp_ui PREFIXED_NAME(sortp_ui) +-#define sortp_ul PREFIXED_NAME(sortp_ul) +-#define sortp_ull PREFIXED_NAME(sortp_ull) +-#define sortp_double PREFIXED_NAME(sortp_double) +-#define sortp_float PREFIXED_NAME(sortp_float) ++#define sortv_ui GS_PREFIXED_NAME(sortv_ui) ++#define sortv_ul GS_PREFIXED_NAME(sortv_ul) ++#define sortv_ull GS_PREFIXED_NAME(sortv_ull) ++#define sortv_double GS_PREFIXED_NAME(sortv_double) ++#define sortv_float GS_PREFIXED_NAME(sortv_float) ++#define sortp_ui GS_PREFIXED_NAME(sortp_ui) ++#define sortp_ul GS_PREFIXED_NAME(sortp_ul) ++#define sortp_ull GS_PREFIXED_NAME(sortp_ull) ++#define sortp_double 
GS_PREFIXED_NAME(sortp_double) ++#define sortp_float GS_PREFIXED_NAME(sortp_float) + +-#define sortv TYPE_LOCAL(sortv_ui,sortv_ul,sortv_ull) +-#define sortp TYPE_LOCAL(sortp_ui,sortp_ul,sortp_ull) +-#define sortv_long TYPE_GLOBAL(sortv_ui,sortv_ul,sortv_ull) +-#define sortp_long TYPE_GLOBAL(sortp_ui,sortp_ul,sortp_ull) ++#define sortv GS_TYPE_LOCAL(sortv_ui,sortv_ul,sortv_ull) ++#define sortp GS_TYPE_LOCAL(sortp_ui,sortp_ul,sortp_ull) ++#define sortv_long GS_TYPE_GLOBAL(sortv_ui,sortv_ul,sortv_ull) ++#define sortp_long GS_TYPE_GLOBAL(sortp_ui,sortp_ul,sortp_ull) + + void sortv_ui(unsigned *out, const unsigned *A, uint n, unsigned stride, + buffer *restrict buf); +@@ -79,7 +79,7 @@ uint *sortp_double(buffer *restrict buf, int start_perm, + const double *restrict A, uint n, unsigned stride); + uint *sortp_float(buffer *restrict buf, int start_perm, + const float *restrict A, uint n, unsigned stride); +-#if defined(USE_LONG_LONG) || defined(GLOBAL_LONG_LONG) ++#if defined(GS_USE_LONG_LONG) || defined(GS_GLOBAL_LONG_LONG) + void sortv_ull(unsigned long long *out, + const unsigned long long *A, uint n, unsigned stride, + buffer *restrict buf); +diff --git a/src/sort_imp.h b/src/sort_imp.h +index 3ec8e0c..a6a0426 100644 +--- a/src/sort_imp.h ++++ b/src/sort_imp.h +@@ -2,27 +2,27 @@ + #error sort_imp.h not meant to be compiled by itself + #endif + +-#define sort_data TOKEN_PASTE(sort_data ,SORT_SUFFIX) +-#define radix_count TOKEN_PASTE(radix_count ,SORT_SUFFIX) +-#define radix_offsets TOKEN_PASTE(radix_offsets ,SORT_SUFFIX) +-#define radix_zeros TOKEN_PASTE(radix_zeros ,SORT_SUFFIX) +-#define radix_passv TOKEN_PASTE(radix_passv ,SORT_SUFFIX) +-#define radix_sortv TOKEN_PASTE(radix_sortv ,SORT_SUFFIX) +-#define radix_passp0_b TOKEN_PASTE(radix_passp0_b ,SORT_SUFFIX) +-#define radix_passp_b TOKEN_PASTE(radix_passp_b ,SORT_SUFFIX) +-#define radix_passp_m TOKEN_PASTE(radix_passp_m ,SORT_SUFFIX) +-#define radix_passp_e TOKEN_PASTE(radix_passp_e ,SORT_SUFFIX) +-#define radix_passp0_be TOKEN_PASTE(radix_passp0_be,SORT_SUFFIX) +-#define radix_passp_be TOKEN_PASTE(radix_passp_be, SORT_SUFFIX) +-#define radix_sortp TOKEN_PASTE(radix_sortp ,SORT_SUFFIX) +-#define merge_sortv TOKEN_PASTE(merge_sortv ,SORT_SUFFIX) +-#define merge_copy_perm TOKEN_PASTE(merge_copy_perm,SORT_SUFFIX) +-#define merge_sortp0 TOKEN_PASTE(merge_sortp0 ,SORT_SUFFIX) +-#define merge_sortp TOKEN_PASTE(merge_sortp ,SORT_SUFFIX) +-#define heap_sortv TOKEN_PASTE(heap_sortv ,SORT_SUFFIX) +- +-#define sortv PREFIXED_NAME(TOKEN_PASTE(sortv,SORT_SUFFIX)) +-#define sortp PREFIXED_NAME(TOKEN_PASTE(sortp,SORT_SUFFIX)) ++#define sort_data GS_TOKEN_PASTE(sort_data ,SORT_SUFFIX) ++#define radix_count GS_TOKEN_PASTE(radix_count ,SORT_SUFFIX) ++#define radix_offsets GS_TOKEN_PASTE(radix_offsets ,SORT_SUFFIX) ++#define radix_zeros GS_TOKEN_PASTE(radix_zeros ,SORT_SUFFIX) ++#define radix_passv GS_TOKEN_PASTE(radix_passv ,SORT_SUFFIX) ++#define radix_sortv GS_TOKEN_PASTE(radix_sortv ,SORT_SUFFIX) ++#define radix_passp0_b GS_TOKEN_PASTE(radix_passp0_b ,SORT_SUFFIX) ++#define radix_passp_b GS_TOKEN_PASTE(radix_passp_b ,SORT_SUFFIX) ++#define radix_passp_m GS_TOKEN_PASTE(radix_passp_m ,SORT_SUFFIX) ++#define radix_passp_e GS_TOKEN_PASTE(radix_passp_e ,SORT_SUFFIX) ++#define radix_passp0_be GS_TOKEN_PASTE(radix_passp0_be,SORT_SUFFIX) ++#define radix_passp_be GS_TOKEN_PASTE(radix_passp_be, SORT_SUFFIX) ++#define radix_sortp GS_TOKEN_PASTE(radix_sortp ,SORT_SUFFIX) ++#define merge_sortv GS_TOKEN_PASTE(merge_sortv ,SORT_SUFFIX) ++#define merge_copy_perm 
GS_TOKEN_PASTE(merge_copy_perm,SORT_SUFFIX) ++#define merge_sortp0 GS_TOKEN_PASTE(merge_sortp0 ,SORT_SUFFIX) ++#define merge_sortp GS_TOKEN_PASTE(merge_sortp ,SORT_SUFFIX) ++#define heap_sortv GS_TOKEN_PASTE(heap_sortv ,SORT_SUFFIX) ++ ++#define sortv GS_PREFIXED_NAME(GS_TOKEN_PASTE(sortv,SORT_SUFFIX)) ++#define sortp GS_PREFIXED_NAME(GS_TOKEN_PASTE(sortp,SORT_SUFFIX)) + + typedef struct { T v; uint i; } sort_data; + +diff --git a/src/tensor.c b/src/tensor.c +index a724714..a9cf442 100644 +--- a/src/tensor.c ++++ b/src/tensor.c +@@ -2,10 +2,10 @@ + #include "name.h" + #include "types.h" + +-#if !defined(USE_CBLAS) ++#if !defined(GS_USE_CBLAS) + +-#define tensor_dot PREFIXED_NAME(tensor_dot ) +-#define tensor_mtxm PREFIXED_NAME(tensor_mtxm) ++#define tensor_dot GS_PREFIXED_NAME(tensor_dot ) ++#define tensor_mtxm GS_PREFIXED_NAME(tensor_mtxm) + + /* Matrices are always column-major (FORTRAN style) */ + +@@ -16,10 +16,10 @@ double tensor_dot(const double *a, const double *b, uint n) + return sum; + } + +-# if defined(USE_NAIVE_BLAS) +-# define tensor_mxv PREFIXED_NAME(tensor_mxv ) +-# define tensor_mtxv PREFIXED_NAME(tensor_mtxv) +-# define tensor_mxm PREFIXED_NAME(tensor_mxm ) ++# if defined(GS_USE_NAIVE_BLAS) ++# define tensor_mxv GS_PREFIXED_NAME(tensor_mxv ) ++# define tensor_mtxv GS_PREFIXED_NAME(tensor_mtxv) ++# define tensor_mxm GS_PREFIXED_NAME(tensor_mxm ) + + /* y = A x */ + void tensor_mxv( +diff --git a/src/tensor.h b/src/tensor.h +index bb65be1..c692398 100644 +--- a/src/tensor.h ++++ b/src/tensor.h +@@ -1,12 +1,16 @@ +-#ifndef TENSOR_H +-#define TENSOR_H ++#ifndef GS_TENSOR_H ++#define GS_TENSOR_H + +-#if !defined(TYPES_H) || !defined(NAME_H) ++#if !defined(GS_TYPES_H) || !defined(GS_NAME_H) + #warning "tensor.h" requires "types.h" and "name.h" + #endif + +-#if defined(USE_CBLAS) ++#if defined(GS_USE_CBLAS) ++#if defined(GS_USE_MKL) ++# include ++#else + # include ++#endif + # define tensor_dot(a,b,n) cblas_ddot((int)(n),a,1,b,1) + # define tensor_mxv(y,ny,A,x,nx) \ + cblas_dgemv(CblasColMajor,CblasNoTrans,(int)ny,(int)nx, \ +@@ -23,17 +27,17 @@ + (int)nc,(int)nb,(int)na,1.0, \ + A,(int)na,B,(int)na,0.0,C,(int)nc) + #else +-# define tensor_dot PREFIXED_NAME(tensor_dot ) +-# define tensor_mtxm PREFIXED_NAME(tensor_mtxm) ++# define tensor_dot GS_PREFIXED_NAME(tensor_dot ) ++# define tensor_mtxm GS_PREFIXED_NAME(tensor_mtxm) + double tensor_dot(const double *a, const double *b, uint n); + + /* C (nc x nb) = [A (na x nc)]^T * B (na x nb); all column-major */ + void tensor_mtxm(double *C, uint nc, + const double *A, uint na, const double *B, uint nb); +-# if defined(USE_NAIVE_BLAS) +-# define tensor_mxv PREFIXED_NAME(tensor_mxv ) +-# define tensor_mtxv PREFIXED_NAME(tensor_mtxv) +-# define tensor_mxm PREFIXED_NAME(tensor_mxm ) ++# if defined(GS_USE_NAIVE_BLAS) ++# define tensor_mxv GS_PREFIXED_NAME(tensor_mxv ) ++# define tensor_mtxv GS_PREFIXED_NAME(tensor_mtxv) ++# define tensor_mxm GS_PREFIXED_NAME(tensor_mxm ) + /* y = A x */ + void tensor_mxv(double *y, uint ny, const double *A, const double *x, uint nx); + +@@ -44,7 +48,7 @@ void tensor_mtxv(double *y, uint ny, const double *A, const double *x, uint nx); + void tensor_mxm(double *C, uint nc, + const double *A, uint na, const double *B, uint nb); + # else +-# define mxm FORTRAN_NAME(mxm,MXM) ++# define mxm GS_FORTRAN_NAME(mxm,MXM) + /* C (na x nc) = A (na x nb) * B (nb x nc); all column-major */ + void mxm(const double *A, const uint *na, + const double *B, const uint *nb, +diff --git a/src/types.h b/src/types.h +index 
14a94bf..d76cef3 100644 +--- a/src/types.h ++++ b/src/types.h +@@ -1,5 +1,5 @@ +-#ifndef TYPES_H +-#define TYPES_H ++#ifndef GS_TYPES_H ++#define GS_TYPES_H + #include + + /* +@@ -10,20 +10,20 @@ + most frequently, e.g., for indexing into local arrays, + and for processor ids. It can be one of + +- macro sint/uint type ++ macro sint/uint type + +- (default) int +- USE_LONG long +- USE_LONG_LONG long long ++ (default) int ++ GS_USE_LONG long ++ GS_USE_LONG_LONG long long + + The slong/ulong type is used in relatively few places + for global identifiers and indices. It can be one of + +- macro slong/ulong type ++ macro slong/ulong type + +- (default) int +- GLOBAL_LONG long +- GLOBAL_LONG_LONG long long ++ (default) int ++ GS_GLOBAL_LONG long ++ GS_GLOBAL_LONG_LONG long long + + Since the long long type is not ISO C90, it is never + used unless explicitly asked for. +@@ -34,9 +34,9 @@ + + */ + +-#if defined(USE_LONG_LONG) || defined(GLOBAL_LONG_LONG) ++#if defined(GS_USE_LONG_LONG) || defined(GS_GLOBAL_LONG_LONG) + typedef long long long_long; +-# define WHEN_LONG_LONG(x) x ++# define GS_WHEN_LONG_LONG(x) x + # if !defined(LLONG_MAX) + # if defined(LONG_LONG_MAX) + # define LLONG_MAX LONG_LONG_MAX +@@ -52,34 +52,34 @@ typedef long long long_long; + # endif + # endif + #else +-# define WHEN_LONG_LONG(x) ++# define GS_WHEN_LONG_LONG(x) + #endif + +-#if !defined(USE_LONG) && !defined(USE_LONG_LONG) +-# define TYPE_LOCAL(i,l,ll) i +-#elif defined(USE_LONG) +-# define TYPE_LOCAL(i,l,ll) l +-#elif defined(USE_LONG_LONG) +-# define TYPE_LOCAL(i,l,ll) ll ++#if !defined(GS_USE_LONG) && !defined(GS_USE_LONG_LONG) ++# define GS_TYPE_LOCAL(i,l,ll) i ++#elif defined(GS_USE_LONG) ++# define GS_TYPE_LOCAL(i,l,ll) l ++#elif defined(GS_USE_LONG_LONG) ++# define GS_TYPE_LOCAL(i,l,ll) ll + #endif + +-#if !defined(GLOBAL_LONG) && !defined(GLOBAL_LONG_LONG) +-# define TYPE_GLOBAL(i,l,ll) i +-#elif defined(GLOBAL_LONG) +-# define TYPE_GLOBAL(i,l,ll) l ++#if !defined(GS_GLOBAL_LONG) && !defined(GS_GLOBAL_LONG_LONG) ++# define GS_TYPE_GLOBAL(i,l,ll) i ++#elif defined(GS_GLOBAL_LONG) ++# define GS_TYPE_GLOBAL(i,l,ll) l + #else +-# define TYPE_GLOBAL(i,l,ll) ll ++# define GS_TYPE_GLOBAL(i,l,ll) ll + #endif + + /* local integer type: for quantities O(N/P) */ +-#define sint signed TYPE_LOCAL(int,long,long long) +-#define uint unsigned TYPE_LOCAL(int,long,long long) +-#define iabs TYPE_LOCAL(abs,labs,llabs) ++#define sint signed GS_TYPE_LOCAL(int,long,long long) ++#define uint unsigned GS_TYPE_LOCAL(int,long,long long) ++#define iabs GS_TYPE_LOCAL(abs,labs,llabs) + + /* global integer type: for quantities O(N) */ +-#define slong signed TYPE_GLOBAL(int,long,long long) +-#define ulong unsigned TYPE_GLOBAL(int,long,long long) +-#define iabsl TYPE_GLOBAL(abs,labs,llabs) ++#define slong signed GS_TYPE_GLOBAL(int,long,long long) ++#define ulong unsigned GS_TYPE_GLOBAL(int,long,long long) ++#define iabsl GS_TYPE_GLOBAL(abs,labs,llabs) + + #endif + diff --git a/extern/patch/libCEED/patch_gpu_restriction_dev.diff b/extern/patch/libCEED/patch_gpu_restriction_dev.diff index 9a46b09487..f8bbcbaa17 100644 --- a/extern/patch/libCEED/patch_gpu_restriction_dev.diff +++ b/extern/patch/libCEED/patch_gpu_restriction_dev.diff @@ -1,5871 +1,5871 @@ -diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c -index f47c52b7..29d3b083 100644 ---- a/backends/cuda-ref/ceed-cuda-ref-operator.c -+++ b/backends/cuda-ref/ceed-cuda-ref-operator.c -@@ -54,15 +54,18 @@ static int 
CeedOperatorDestroy_Cuda(CeedOperator op) { - - CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedCallCuda(ceed, cuModuleUnload(impl->diag->module)); -- CeedCallBackend(CeedFree(&impl->diag->h_e_mode_in)); -- CeedCallBackend(CeedFree(&impl->diag->h_e_mode_out)); -- CeedCallCuda(ceed, cudaFree(impl->diag->d_e_mode_in)); -- CeedCallCuda(ceed, cudaFree(impl->diag->d_e_mode_out)); -+ CeedCallCuda(ceed, cudaFree(impl->diag->d_eval_modes_in)); -+ CeedCallCuda(ceed, cudaFree(impl->diag->d_eval_modes_out)); - CeedCallCuda(ceed, cudaFree(impl->diag->d_identity)); - CeedCallCuda(ceed, cudaFree(impl->diag->d_interp_in)); - CeedCallCuda(ceed, cudaFree(impl->diag->d_interp_out)); - CeedCallCuda(ceed, cudaFree(impl->diag->d_grad_in)); - CeedCallCuda(ceed, cudaFree(impl->diag->d_grad_out)); -+ CeedCallCuda(ceed, cudaFree(impl->diag->d_div_in)); -+ CeedCallCuda(ceed, cudaFree(impl->diag->d_div_out)); -+ CeedCallCuda(ceed, cudaFree(impl->diag->d_curl_in)); -+ CeedCallCuda(ceed, cudaFree(impl->diag->d_curl_out)); -+ CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->diag_rstr)); - CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->point_block_diag_rstr)); - CeedCallBackend(CeedVectorDestroy(&impl->diag->elem_diag)); - CeedCallBackend(CeedVectorDestroy(&impl->diag->point_block_elem_diag)); -@@ -86,17 +89,13 @@ static int CeedOperatorDestroy_Cuda(CeedOperator op) { - //------------------------------------------------------------------------------ - // Setup infields or outfields - //------------------------------------------------------------------------------ --static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool is_input, CeedVector *e_vecs, CeedVector *q_vecs, CeedInt e_start, -+static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool is_input, CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e, - CeedInt num_fields, CeedInt Q, CeedInt num_elem) { - Ceed ceed; -- bool is_strided, skip_restriction; -- CeedSize q_size; -- CeedInt dim, size; - CeedQFunctionField *qf_fields; - CeedOperatorField *op_fields; - - CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); -- - if (is_input) { - CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL)); - CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); -@@ -107,30 +106,29 @@ static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool - - // Loop over fields - for (CeedInt i = 0; i < num_fields; i++) { -- CeedEvalMode e_mode; -+ bool is_strided = false, skip_restriction = false; -+ CeedSize q_size; -+ CeedInt size; -+ CeedEvalMode eval_mode; - CeedBasis basis; - -- CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &e_mode)); -- -- is_strided = false; -- skip_restriction = false; -- if (e_mode != CEED_EVAL_WEIGHT) { -+ CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); -+ if (eval_mode != CEED_EVAL_WEIGHT) { - CeedElemRestriction elem_rstr; - -- CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr)); -- - // Check whether this field can skip the element restriction: -- // must be passive input, with e_mode NONE, and have a strided restriction with CEED_STRIDES_BACKEND. -+ // Must be passive input, with eval_mode NONE, and have a strided restriction with CEED_STRIDES_BACKEND. 
-+ CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr)); - - // First, check whether the field is input or output: - if (is_input) { - CeedVector vec; - -- // Check for passive input: -+ // Check for passive input - CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); - if (vec != CEED_VECTOR_ACTIVE) { -- // Check e_mode -- if (e_mode == CEED_EVAL_NONE) { -+ // Check eval_mode -+ if (eval_mode == CEED_EVAL_NONE) { - // Check for strided restriction - CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); - if (is_strided) { -@@ -142,27 +140,23 @@ static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool - } - if (skip_restriction) { - // We do not need an E-Vector, but will use the input field vector's data directly in the operator application. -- e_vecs[i + e_start] = NULL; -+ e_vecs[i + start_e] = NULL; - } else { -- CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs[i + e_start])); -+ CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs[i + start_e])); - } - } - -- switch (e_mode) { -+ switch (eval_mode) { - case CEED_EVAL_NONE: - CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); - q_size = (CeedSize)num_elem * Q * size; - CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); - break; - case CEED_EVAL_INTERP: -- CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); -- q_size = (CeedSize)num_elem * Q * size; -- CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); -- break; - case CEED_EVAL_GRAD: -- CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); -+ case CEED_EVAL_DIV: -+ case CEED_EVAL_CURL: - CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); -- CeedCallBackend(CeedBasisGetDimension(basis, &dim)); - q_size = (CeedSize)num_elem * Q * size; - CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); - break; -@@ -172,10 +166,6 @@ static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool - CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); - CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i])); - break; -- case CEED_EVAL_DIV: -- break; // TODO: Not implemented -- case CEED_EVAL_CURL: -- break; // TODO: Not implemented - } - } - return CEED_ERROR_SUCCESS; -@@ -206,10 +196,8 @@ static int CeedOperatorSetup_Cuda(CeedOperator op) { - - // Allocate - CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs)); -- - CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in)); - CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out)); -- - impl->num_inputs = num_input_fields; - impl->num_outputs = num_output_fields; - -@@ -227,23 +215,25 @@ static int CeedOperatorSetup_Cuda(CeedOperator op) { - // Setup Operator Inputs - //------------------------------------------------------------------------------ - static inline int CeedOperatorSetupInputs_Cuda(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, -- CeedVector in_vec, const bool skip_active_in, CeedScalar *e_data[2 * CEED_FIELD_MAX], -+ CeedVector in_vec, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], - CeedOperator_Cuda *impl, CeedRequest *request) { - for (CeedInt i = 0; i < num_input_fields; i++) { -- CeedEvalMode e_mode; -+ CeedEvalMode eval_mode; - CeedVector vec; - CeedElemRestriction elem_rstr; - - // Get input vector - 
CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) { -- if (skip_active_in) continue; -+ if (skip_active) continue; - else vec = in_vec; - } - -- CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &e_mode)); -- if (e_mode == CEED_EVAL_WEIGHT) { // Skip -+ CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); -+ if (eval_mode == CEED_EVAL_WEIGHT) { // Skip - } else { -+ // Get input vector -+ CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); - // Get input element restriction - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); - if (vec == CEED_VECTOR_ACTIVE) vec = in_vec; -@@ -265,45 +255,40 @@ static inline int CeedOperatorSetupInputs_Cuda(CeedInt num_input_fields, CeedQFu - // Input Basis Action - //------------------------------------------------------------------------------ - static inline int CeedOperatorInputBasis_Cuda(CeedInt num_elem, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, -- CeedInt num_input_fields, const bool skip_active_in, CeedScalar *e_data[2 * CEED_FIELD_MAX], -+ CeedInt num_input_fields, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], - CeedOperator_Cuda *impl) { - for (CeedInt i = 0; i < num_input_fields; i++) { - CeedInt elem_size, size; -- CeedEvalMode e_mode; -+ CeedEvalMode eval_mode; - CeedElemRestriction elem_rstr; - CeedBasis basis; - - // Skip active input -- if (skip_active_in) { -+ if (skip_active) { - CeedVector vec; - - CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) continue; - } -- // Get elem_size, e_mode, size -+ // Get elem_size, eval_mode, size - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); - CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); -- CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &e_mode)); -+ CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); - CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size)); - // Basis action -- switch (e_mode) { -+ switch (eval_mode) { - case CEED_EVAL_NONE: - CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i])); - break; - case CEED_EVAL_INTERP: -- CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); -- CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, impl->e_vecs[i], impl->q_vecs_in[i])); -- break; - case CEED_EVAL_GRAD: -+ case CEED_EVAL_DIV: -+ case CEED_EVAL_CURL: - CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); -- CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, impl->e_vecs[i], impl->q_vecs_in[i])); -+ CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, eval_mode, impl->e_vecs[i], impl->q_vecs_in[i])); - break; - case CEED_EVAL_WEIGHT: - break; // No action -- case CEED_EVAL_DIV: -- break; // TODO: Not implemented -- case CEED_EVAL_CURL: -- break; // TODO: Not implemented - } - } - return CEED_ERROR_SUCCESS; -@@ -313,18 +298,18 @@ static inline int CeedOperatorInputBasis_Cuda(CeedInt num_elem, CeedQFunctionFie - // Restore Input Vectors - //------------------------------------------------------------------------------ - static inline int CeedOperatorRestoreInputs_Cuda(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField 
*op_input_fields, -- const bool skip_active_in, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Cuda *impl) { -+ const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Cuda *impl) { - for (CeedInt i = 0; i < num_input_fields; i++) { -- CeedEvalMode e_mode; -+ CeedEvalMode eval_mode; - CeedVector vec; - - // Skip active input -- if (skip_active_in) { -+ if (skip_active) { - CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) continue; - } -- CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &e_mode)); -- if (e_mode == CEED_EVAL_WEIGHT) { // Skip -+ CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); -+ if (eval_mode == CEED_EVAL_WEIGHT) { // Skip - } else { - if (!impl->e_vecs[i]) { // This was a skip_restriction case - CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); -@@ -341,13 +326,12 @@ static inline int CeedOperatorRestoreInputs_Cuda(CeedInt num_input_fields, CeedQ - // Apply and add to output - //------------------------------------------------------------------------------ - static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) { -- CeedOperator_Cuda *impl; - CeedInt Q, num_elem, elem_size, num_input_fields, num_output_fields, size; -- CeedEvalMode e_mode; - CeedScalar *e_data[2 * CEED_FIELD_MAX] = {NULL}; -- CeedOperatorField *op_input_fields, *op_output_fields; - CeedQFunctionField *qf_input_fields, *qf_output_fields; - CeedQFunction qf; -+ CeedOperatorField *op_input_fields, *op_output_fields; -+ CeedOperator_Cuda *impl; - - CeedCallBackend(CeedOperatorGetData(op, &impl)); - CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); -@@ -359,7 +343,7 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec - // Setup - CeedCallBackend(CeedOperatorSetup_Cuda(op)); - -- // Input e_vecs and Restriction -+ // Input Evecs and Restriction - CeedCallBackend(CeedOperatorSetupInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data, impl, request)); - - // Input basis apply if needed -@@ -367,8 +351,10 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec - - // Output pointers, as necessary - for (CeedInt i = 0; i < num_output_fields; i++) { -- CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &e_mode)); -- if (e_mode == CEED_EVAL_NONE) { -+ CeedEvalMode eval_mode; -+ -+ CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); -+ if (eval_mode == CEED_EVAL_NONE) { - // Set the output Q-Vector to use the E-Vector data directly. 
- CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[i + impl->num_inputs], CEED_MEM_DEVICE, &e_data[i + num_input_fields])); - CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i + num_input_fields])); -@@ -380,49 +366,46 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec - - // Output basis apply if needed - for (CeedInt i = 0; i < num_output_fields; i++) { -+ CeedEvalMode eval_mode; - CeedElemRestriction elem_rstr; - CeedBasis basis; - -- // Get elem_size, e_mode, size -+ // Get elem_size, eval_mode, size - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); - CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); -- CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &e_mode)); -+ CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); - CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size)); - // Basis action -- switch (e_mode) { -+ switch (eval_mode) { - case CEED_EVAL_NONE: -- break; -+ break; // No action - case CEED_EVAL_INTERP: -- CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); -- CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, CEED_EVAL_INTERP, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs])); -- break; - case CEED_EVAL_GRAD: -+ case CEED_EVAL_DIV: -+ case CEED_EVAL_CURL: - CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); -- CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, CEED_EVAL_GRAD, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs])); -+ CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs])); - break; - // LCOV_EXCL_START - case CEED_EVAL_WEIGHT: { - Ceed ceed; -+ - CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); -- break; // Should not occur -+ // LCOV_EXCL_STOP - } -- case CEED_EVAL_DIV: -- break; // TODO: Not implemented -- case CEED_EVAL_CURL: -- break; // TODO: Not implemented -- // LCOV_EXCL_STOP - } - } - - // Output restriction - for (CeedInt i = 0; i < num_output_fields; i++) { -+ CeedEvalMode eval_mode; - CeedVector vec; - CeedElemRestriction elem_rstr; - - // Restore evec -- CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &e_mode)); -- if (e_mode == CEED_EVAL_NONE) { -+ CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); -+ if (eval_mode == CEED_EVAL_NONE) { - CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields])); - } - // Get output vector -@@ -441,13 +424,12 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec - } - - //------------------------------------------------------------------------------ --// Core code for assembling linear QFunction -+// Linear QFunction Assembly Core - //------------------------------------------------------------------------------ - static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr, - CeedRequest *request) { - Ceed ceed, ceed_parent; - CeedInt num_active_in, num_active_out, Q, num_elem, num_input_fields, num_output_fields, size; -- CeedSize q_size; - CeedScalar *assembled_array, *e_data[2 * CEED_FIELD_MAX] = {NULL}; - CeedVector 
*active_inputs; - CeedQFunctionField *qf_input_fields, *qf_output_fields; -@@ -469,7 +451,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op, - // Setup - CeedCallBackend(CeedOperatorSetup_Cuda(op)); - -- // Input e_vecs and Restriction -+ // Input Evecs and Restriction - CeedCallBackend(CeedOperatorSetupInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, request)); - - // Count number of active input fields -@@ -487,7 +469,8 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op, - CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, &q_vec_array)); - CeedCallBackend(CeedRealloc(num_active_in + size, &active_inputs)); - for (CeedInt field = 0; field < size; field++) { -- q_size = (CeedSize)Q * num_elem; -+ CeedSize q_size = (CeedSize)Q * num_elem; -+ - CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_inputs[num_active_in + field])); - CeedCallBackend( - CeedVectorSetArray(active_inputs[num_active_in + field], CEED_MEM_DEVICE, CEED_USE_POINTER, &q_vec_array[field * Q * num_elem])); -@@ -521,12 +504,13 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op, - - // Build objects if needed - if (build_objects) { -+ CeedSize l_size = (CeedSize)num_elem * Q * num_active_in * num_active_out; -+ CeedInt strides[3] = {1, num_elem * Q, Q}; /* *NOPAD* */ -+ - // Create output restriction -- CeedInt strides[3] = {1, num_elem * Q, Q}; /* *NOPAD* */ - CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, num_active_in * num_active_out, - num_active_in * num_active_out * num_elem * Q, strides, rstr)); - // Create assembled vector -- CeedSize l_size = (CeedSize)num_elem * Q * num_active_in * num_active_out; - CeedCallBackend(CeedVectorCreate(ceed_parent, l_size, assembled)); - } - CeedCallBackend(CeedVectorSetValue(*assembled, 0.0)); -@@ -594,14 +578,14 @@ static int CeedOperatorLinearAssembleQFunctionUpdate_Cuda(CeedOperator op, CeedV - } - - //------------------------------------------------------------------------------ --// Assemble diagonal setup -+// Assemble Diagonal Setup - //------------------------------------------------------------------------------ - static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op, CeedInt use_ceedsize_idx) { - Ceed ceed; - char *diagonal_kernel_path, *diagonal_kernel_source; -- CeedInt num_input_fields, num_output_fields, num_e_mode_in = 0, num_comp = 0, dim = 1, num_e_mode_out = 0, num_nodes, num_qpts; -- CeedEvalMode *e_mode_in = NULL, *e_mode_out = NULL; -- CeedElemRestriction rstr_in = NULL, rstr_out = NULL; -+ CeedInt num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0; -+ CeedInt num_comp, q_comp, num_nodes, num_qpts; -+ CeedEvalMode *eval_modes_in = NULL, *eval_modes_out = NULL; - CeedBasis basis_in = NULL, basis_out = NULL; - CeedQFunctionField *qf_fields; - CeedQFunction qf; -@@ -620,33 +604,20 @@ static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op, CeedIn - - CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) { -- CeedEvalMode e_mode; -- CeedElemRestriction rstr; -- -- CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_in)); -- CeedCallBackend(CeedBasisGetNumComponents(basis_in, &num_comp)); -- CeedCallBackend(CeedBasisGetDimension(basis_in, &dim)); -- CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr)); -- CeedCheck(!rstr_in || rstr_in == 
rstr, ceed, CEED_ERROR_BACKEND, -- "Backend does not implement multi-field non-composite operator diagonal assembly"); -- rstr_in = rstr; -- CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &e_mode)); -- switch (e_mode) { -- case CEED_EVAL_NONE: -- case CEED_EVAL_INTERP: -- CeedCallBackend(CeedRealloc(num_e_mode_in + 1, &e_mode_in)); -- e_mode_in[num_e_mode_in] = e_mode; -- num_e_mode_in += 1; -- break; -- case CEED_EVAL_GRAD: -- CeedCallBackend(CeedRealloc(num_e_mode_in + dim, &e_mode_in)); -- for (CeedInt d = 0; d < dim; d++) e_mode_in[num_e_mode_in + d] = e_mode; -- num_e_mode_in += dim; -- break; -- case CEED_EVAL_WEIGHT: -- case CEED_EVAL_DIV: -- case CEED_EVAL_CURL: -- break; // Caught by QF Assembly -+ CeedBasis basis; -+ CeedEvalMode eval_mode; -+ -+ CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); -+ CeedCheck(!basis_in || basis_in == basis, ceed, CEED_ERROR_BACKEND, -+ "Backend does not implement operator diagonal assembly with multiple active bases"); -+ basis_in = basis; -+ CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); -+ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_mode, &q_comp)); -+ if (eval_mode != CEED_EVAL_WEIGHT) { -+ // q_comp = 1 if CEED_EVAL_NONE, CEED_EVAL_WEIGHT caught by QF assembly -+ CeedCallBackend(CeedRealloc(num_eval_modes_in + q_comp, &eval_modes_in)); -+ for (CeedInt d = 0; d < q_comp; d++) eval_modes_in[num_eval_modes_in + d] = eval_mode; -+ num_eval_modes_in += q_comp; - } - } - } -@@ -659,31 +630,20 @@ static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op, CeedIn - - CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) { -- CeedEvalMode e_mode; -- CeedElemRestriction rstr; -- -- CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_out)); -- CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr)); -- CeedCheck(!rstr_out || rstr_out == rstr, ceed, CEED_ERROR_BACKEND, -- "Backend does not implement multi-field non-composite operator diagonal assembly"); -- rstr_out = rstr; -- CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &e_mode)); -- switch (e_mode) { -- case CEED_EVAL_NONE: -- case CEED_EVAL_INTERP: -- CeedCallBackend(CeedRealloc(num_e_mode_out + 1, &e_mode_out)); -- e_mode_out[num_e_mode_out] = e_mode; -- num_e_mode_out += 1; -- break; -- case CEED_EVAL_GRAD: -- CeedCallBackend(CeedRealloc(num_e_mode_out + dim, &e_mode_out)); -- for (CeedInt d = 0; d < dim; d++) e_mode_out[num_e_mode_out + d] = e_mode; -- num_e_mode_out += dim; -- break; -- case CEED_EVAL_WEIGHT: -- case CEED_EVAL_DIV: -- case CEED_EVAL_CURL: -- break; // Caught by QF Assembly -+ CeedBasis basis; -+ CeedEvalMode eval_mode; -+ -+ CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); -+ CeedCheck(!basis_out || basis_out == basis, ceed, CEED_ERROR_BACKEND, -+ "Backend does not implement operator diagonal assembly with multiple active bases"); -+ basis_out = basis; -+ CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); -+ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_mode, &q_comp)); -+ if (eval_mode != CEED_EVAL_WEIGHT) { -+ // q_comp = 1 if CEED_EVAL_NONE, CEED_EVAL_WEIGHT caught by QF assembly -+ CeedCallBackend(CeedRealloc(num_eval_modes_out + q_comp, &eval_modes_out)); -+ for (CeedInt d = 0; d < q_comp; d++) eval_modes_out[num_eval_modes_out + d] = eval_mode; -+ num_eval_modes_out += q_comp; - } - } - } -@@ -693,95 +653,147 @@ static inline int 
CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op, CeedIn - CeedCallBackend(CeedCalloc(1, &impl->diag)); - CeedOperatorDiag_Cuda *diag = impl->diag; - -- diag->basis_in = basis_in; -- diag->basis_out = basis_out; -- diag->h_e_mode_in = e_mode_in; -- diag->h_e_mode_out = e_mode_out; -- diag->num_e_mode_in = num_e_mode_in; -- diag->num_e_mode_out = num_e_mode_out; -- - // Assemble kernel -+ CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_nodes)); -+ CeedCallBackend(CeedBasisGetNumComponents(basis_in, &num_comp)); -+ if (basis_in == CEED_BASIS_NONE) num_qpts = num_nodes; -+ else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts)); - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h", &diagonal_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, diagonal_kernel_path, &diagonal_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Source Complete! -----\n"); -- CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_nodes)); -- CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts)); -- diag->num_nodes = num_nodes; -- CeedCallCuda(ceed, -- CeedCompile_Cuda(ceed, diagonal_kernel_source, &diag->module, 6, "NUM_E_MODE_IN", num_e_mode_in, "NUM_E_MODE_OUT", num_e_mode_out, -- "NUM_NODES", num_nodes, "NUM_QPTS", num_qpts, "NUM_COMP", num_comp, "USE_CEEDSIZE", use_ceedsize_idx)); -- CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, diag->module, "linearDiagonal", &diag->linearDiagonal)); -- CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, diag->module, "linearPointBlockDiagonal", &diag->linearPointBlock)); -+ CeedCallCuda( -+ ceed, CeedCompile_Cuda(ceed, diagonal_kernel_source, &diag->module, 6, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT", -+ num_eval_modes_out, "NUM_COMP", num_comp, "NUM_NODES", num_nodes, "NUM_QPTS", num_qpts, "CEED_SIZE", use_ceedsize_idx)); -+ CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, diag->module, "LinearDiagonal", &diag->LinearDiagonal)); -+ CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, diag->module, "LinearPointBlockDiagonal", &diag->LinearPointBlock)); - CeedCallBackend(CeedFree(&diagonal_kernel_path)); - CeedCallBackend(CeedFree(&diagonal_kernel_source)); - - // Basis matrices -- const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); -- const CeedInt interp_bytes = q_bytes * num_nodes; -- const CeedInt grad_bytes = q_bytes * num_nodes * dim; -- const CeedInt e_mode_bytes = sizeof(CeedEvalMode); -- const CeedScalar *interp_in, *interp_out, *grad_in, *grad_out; -+ const CeedInt interp_bytes = num_nodes * num_qpts * sizeof(CeedScalar); -+ const CeedInt eval_modes_bytes = sizeof(CeedEvalMode); -+ bool has_eval_none = false; - - // CEED_EVAL_NONE -- CeedScalar *identity = NULL; -- bool is_eval_none = false; -+ for (CeedInt i = 0; i < num_eval_modes_in; i++) has_eval_none = has_eval_none || (eval_modes_in[i] == CEED_EVAL_NONE); -+ for (CeedInt i = 0; i < num_eval_modes_out; i++) has_eval_none = has_eval_none || (eval_modes_out[i] == CEED_EVAL_NONE); -+ if (has_eval_none) { -+ CeedScalar *identity = NULL; - -- for (CeedInt i = 0; i < num_e_mode_in; i++) is_eval_none = is_eval_none || (e_mode_in[i] == CEED_EVAL_NONE); -- for (CeedInt i = 0; i < num_e_mode_out; i++) is_eval_none = is_eval_none || (e_mode_out[i] == CEED_EVAL_NONE); -- if (is_eval_none) { -- CeedCallBackend(CeedCalloc(num_qpts * num_nodes, &identity)); -+ 
CeedCallBackend(CeedCalloc(num_nodes * num_qpts, &identity)); - for (CeedInt i = 0; i < (num_nodes < num_qpts ? num_nodes : num_qpts); i++) identity[i * num_nodes + i] = 1.0; - CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_identity, interp_bytes)); - CeedCallCuda(ceed, cudaMemcpy(diag->d_identity, identity, interp_bytes, cudaMemcpyHostToDevice)); -+ CeedCallBackend(CeedFree(&identity)); - } - -- // CEED_EVAL_INTERP -- CeedCallBackend(CeedBasisGetInterp(basis_in, &interp_in)); -- CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_interp_in, interp_bytes)); -- CeedCallCuda(ceed, cudaMemcpy(diag->d_interp_in, interp_in, interp_bytes, cudaMemcpyHostToDevice)); -- CeedCallBackend(CeedBasisGetInterp(basis_out, &interp_out)); -- CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_interp_out, interp_bytes)); -- CeedCallCuda(ceed, cudaMemcpy(diag->d_interp_out, interp_out, interp_bytes, cudaMemcpyHostToDevice)); -- -- // CEED_EVAL_GRAD -- CeedCallBackend(CeedBasisGetGrad(basis_in, &grad_in)); -- CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_grad_in, grad_bytes)); -- CeedCallCuda(ceed, cudaMemcpy(diag->d_grad_in, grad_in, grad_bytes, cudaMemcpyHostToDevice)); -- CeedCallBackend(CeedBasisGetGrad(basis_out, &grad_out)); -- CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_grad_out, grad_bytes)); -- CeedCallCuda(ceed, cudaMemcpy(diag->d_grad_out, grad_out, grad_bytes, cudaMemcpyHostToDevice)); -- -- // Arrays of e_modes -- CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_e_mode_in, num_e_mode_in * e_mode_bytes)); -- CeedCallCuda(ceed, cudaMemcpy(diag->d_e_mode_in, e_mode_in, num_e_mode_in * e_mode_bytes, cudaMemcpyHostToDevice)); -- CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_e_mode_out, num_e_mode_out * e_mode_bytes)); -- CeedCallCuda(ceed, cudaMemcpy(diag->d_e_mode_out, e_mode_out, num_e_mode_out * e_mode_bytes, cudaMemcpyHostToDevice)); -- -- // Restriction -- diag->diag_rstr = rstr_out; -+ // CEED_EVAL_INTERP, CEED_EVAL_GRAD, CEED_EVAL_DIV, and CEED_EVAL_CURL -+ for (CeedInt in = 0; in < 2; in++) { -+ CeedFESpace fespace; -+ CeedBasis basis = in ? 
basis_in : basis_out; -+ -+ CeedCallBackend(CeedBasisGetFESpace(basis, &fespace)); -+ switch (fespace) { -+ case CEED_FE_SPACE_H1: { -+ CeedInt q_comp_interp, q_comp_grad; -+ const CeedScalar *interp, *grad; -+ CeedScalar *d_interp, *d_grad; -+ -+ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); -+ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad)); -+ -+ CeedCallBackend(CeedBasisGetInterp(basis, &interp)); -+ CeedCallCuda(ceed, cudaMalloc((void **)&d_interp, interp_bytes * q_comp_interp)); -+ CeedCallCuda(ceed, cudaMemcpy(d_interp, interp, interp_bytes * q_comp_interp, cudaMemcpyHostToDevice)); -+ CeedCallBackend(CeedBasisGetGrad(basis, &grad)); -+ CeedCallCuda(ceed, cudaMalloc((void **)&d_grad, interp_bytes * q_comp_grad)); -+ CeedCallCuda(ceed, cudaMemcpy(d_grad, grad, interp_bytes * q_comp_grad, cudaMemcpyHostToDevice)); -+ if (in) { -+ diag->d_interp_in = d_interp; -+ diag->d_grad_in = d_grad; -+ } else { -+ diag->d_interp_out = d_interp; -+ diag->d_grad_out = d_grad; -+ } -+ } break; -+ case CEED_FE_SPACE_HDIV: { -+ CeedInt q_comp_interp, q_comp_div; -+ const CeedScalar *interp, *div; -+ CeedScalar *d_interp, *d_div; -+ -+ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); -+ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div)); -+ -+ CeedCallBackend(CeedBasisGetInterp(basis, &interp)); -+ CeedCallCuda(ceed, cudaMalloc((void **)&d_interp, interp_bytes * q_comp_interp)); -+ CeedCallCuda(ceed, cudaMemcpy(d_interp, interp, interp_bytes * q_comp_interp, cudaMemcpyHostToDevice)); -+ CeedCallBackend(CeedBasisGetDiv(basis, &div)); -+ CeedCallCuda(ceed, cudaMalloc((void **)&d_div, interp_bytes * q_comp_div)); -+ CeedCallCuda(ceed, cudaMemcpy(d_div, div, interp_bytes * q_comp_div, cudaMemcpyHostToDevice)); -+ if (in) { -+ diag->d_interp_in = d_interp; -+ diag->d_div_in = d_div; -+ } else { -+ diag->d_interp_out = d_interp; -+ diag->d_div_out = d_div; -+ } -+ } break; -+ case CEED_FE_SPACE_HCURL: { -+ CeedInt q_comp_interp, q_comp_curl; -+ const CeedScalar *interp, *curl; -+ CeedScalar *d_interp, *d_curl; -+ -+ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); -+ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl)); -+ -+ CeedCallBackend(CeedBasisGetInterp(basis, &interp)); -+ CeedCallCuda(ceed, cudaMalloc((void **)&d_interp, interp_bytes * q_comp_interp)); -+ CeedCallCuda(ceed, cudaMemcpy(d_interp, interp, interp_bytes * q_comp_interp, cudaMemcpyHostToDevice)); -+ CeedCallBackend(CeedBasisGetCurl(basis, &curl)); -+ CeedCallCuda(ceed, cudaMalloc((void **)&d_curl, interp_bytes * q_comp_curl)); -+ CeedCallCuda(ceed, cudaMemcpy(d_curl, curl, interp_bytes * q_comp_curl, cudaMemcpyHostToDevice)); -+ if (in) { -+ diag->d_interp_in = d_interp; -+ diag->d_curl_in = d_curl; -+ } else { -+ diag->d_interp_out = d_interp; -+ diag->d_curl_out = d_curl; -+ } -+ } break; -+ } -+ } -+ -+ // Arrays of eval_modes -+ CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_eval_modes_in, num_eval_modes_in * eval_modes_bytes)); -+ CeedCallCuda(ceed, cudaMemcpy(diag->d_eval_modes_in, eval_modes_in, num_eval_modes_in * eval_modes_bytes, cudaMemcpyHostToDevice)); -+ CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_eval_modes_out, num_eval_modes_out * eval_modes_bytes)); -+ CeedCallCuda(ceed, cudaMemcpy(diag->d_eval_modes_out, eval_modes_out, num_eval_modes_out * eval_modes_bytes, 
cudaMemcpyHostToDevice)); -+ CeedCallBackend(CeedFree(&eval_modes_in)); -+ CeedCallBackend(CeedFree(&eval_modes_out)); - return CEED_ERROR_SUCCESS; - } - - //------------------------------------------------------------------------------ --// Assemble diagonal common code -+// Assemble Diagonal Core - //------------------------------------------------------------------------------ - static inline int CeedOperatorAssembleDiagonalCore_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request, const bool is_point_block) { - Ceed ceed; -- CeedSize assembled_length = 0, assembled_qf_length = 0; -- CeedInt use_ceedsize_idx = 0, num_elem; -+ CeedSize assembled_length, assembled_qf_length; -+ CeedInt use_ceedsize_idx = 0, num_elem, num_nodes; - CeedScalar *elem_diag_array; - const CeedScalar *assembled_qf_array; -- CeedVector assembled_qf = NULL; -- CeedElemRestriction rstr = NULL; -+ CeedVector assembled_qf = NULL, elem_diag; -+ CeedElemRestriction assembled_rstr = NULL, rstr_in, rstr_out, diag_rstr; - CeedOperator_Cuda *impl; - - CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedCallBackend(CeedOperatorGetData(op, &impl)); - - // Assemble QFunction -- CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &rstr, request)); -- CeedCallBackend(CeedElemRestrictionDestroy(&rstr)); -+ CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &assembled_rstr, request)); -+ CeedCallBackend(CeedElemRestrictionDestroy(&assembled_rstr)); -+ CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &assembled_qf_array)); - - CeedCallBackend(CeedVectorGetLength(assembled, &assembled_length)); - CeedCallBackend(CeedVectorGetLength(assembled_qf, &assembled_qf_length)); -@@ -793,36 +805,37 @@ static inline int CeedOperatorAssembleDiagonalCore_Cuda(CeedOperator op, CeedVec - - assert(diag != NULL); - -- // Restriction -- if (is_point_block && !diag->point_block_diag_rstr) { -- CeedCallBackend(CeedOperatorCreateActivePointBlockRestriction(diag->diag_rstr, &diag->point_block_diag_rstr)); -- } -- CeedElemRestriction diag_rstr = is_point_block ? diag->point_block_diag_rstr : diag->diag_rstr; -- -- // Create diagonal vector -- CeedVector elem_diag = is_point_block ? diag->point_block_elem_diag : diag->elem_diag; -- -- if (!elem_diag) { -- CeedCallBackend(CeedElemRestrictionCreateVector(diag_rstr, NULL, &elem_diag)); -- if (is_point_block) diag->point_block_elem_diag = elem_diag; -- else diag->elem_diag = elem_diag; -+ // Restriction and diagonal vector -+ CeedCallBackend(CeedOperatorGetActiveElemRestrictions(op, &rstr_in, &rstr_out)); -+ CeedCheck(rstr_in == rstr_out, ceed, CEED_ERROR_BACKEND, -+ "Cannot assemble operator diagonal with different input and output active element restrictions"); -+ if (!is_point_block && !diag->diag_rstr) { -+ CeedCallBackend(CeedElemRestrictionCreateUnsignedCopy(rstr_out, &diag->diag_rstr)); -+ CeedCallBackend(CeedElemRestrictionCreateVector(diag->diag_rstr, NULL, &diag->elem_diag)); -+ } else if (is_point_block && !diag->point_block_diag_rstr) { -+ CeedCallBackend(CeedOperatorCreateActivePointBlockRestriction(rstr_out, &diag->point_block_diag_rstr)); -+ CeedCallBackend(CeedElemRestrictionCreateVector(diag->point_block_diag_rstr, NULL, &diag->point_block_elem_diag)); - } -+ diag_rstr = is_point_block ? diag->point_block_diag_rstr : diag->diag_rstr; -+ elem_diag = is_point_block ? 
diag->point_block_elem_diag : diag->elem_diag; - CeedCallBackend(CeedVectorSetValue(elem_diag, 0.0)); - - // Assemble element operator diagonals - CeedCallBackend(CeedVectorGetArray(elem_diag, CEED_MEM_DEVICE, &elem_diag_array)); -- CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &assembled_qf_array)); - CeedCallBackend(CeedElemRestrictionGetNumElements(diag_rstr, &num_elem)); -+ CeedCallBackend(CeedElemRestrictionGetElementSize(diag_rstr, &num_nodes)); - - // Compute the diagonal of B^T D B -- int elem_per_block = 1; -- int grid = num_elem / elem_per_block + ((num_elem / elem_per_block * elem_per_block < num_elem) ? 1 : 0); -- void *args[] = {(void *)&num_elem, &diag->d_identity, &diag->d_interp_in, &diag->d_grad_in, &diag->d_interp_out, -- &diag->d_grad_out, &diag->d_e_mode_in, &diag->d_e_mode_out, &assembled_qf_array, &elem_diag_array}; -+ CeedInt elems_per_block = 1; -+ CeedInt grid = CeedDivUpInt(num_elem, elems_per_block); -+ void *args[] = {(void *)&num_elem, &diag->d_identity, &diag->d_interp_in, &diag->d_grad_in, &diag->d_div_in, -+ &diag->d_curl_in, &diag->d_interp_out, &diag->d_grad_out, &diag->d_div_out, &diag->d_curl_out, -+ &diag->d_eval_modes_in, &diag->d_eval_modes_out, &assembled_qf_array, &elem_diag_array}; -+ - if (is_point_block) { -- CeedCallBackend(CeedRunKernelDim_Cuda(ceed, diag->linearPointBlock, grid, diag->num_nodes, 1, elem_per_block, args)); -+ CeedCallBackend(CeedRunKernelDim_Cuda(ceed, diag->LinearPointBlock, grid, num_nodes, 1, elems_per_block, args)); - } else { -- CeedCallBackend(CeedRunKernelDim_Cuda(ceed, diag->linearDiagonal, grid, diag->num_nodes, 1, elem_per_block, args)); -+ CeedCallBackend(CeedRunKernelDim_Cuda(ceed, diag->LinearDiagonal, grid, num_nodes, 1, elems_per_block, args)); - } - - // Restore arrays -@@ -854,14 +867,15 @@ static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda(CeedOperator op, - } - - //------------------------------------------------------------------------------ --// Single operator assembly setup -+// Single Operator Assembly Setup - //------------------------------------------------------------------------------ - static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_ceedsize_idx) { -- Ceed ceed; -- char *assembly_kernel_path, *assembly_kernel_source; -- CeedInt num_input_fields, num_output_fields, num_e_mode_in = 0, dim = 1, num_B_in_mats_to_load = 0, size_B_in = 0, num_qpts = 0, elem_size = 0, -- num_e_mode_out = 0, num_B_out_mats_to_load = 0, size_B_out = 0, num_elem, num_comp; -- CeedEvalMode *eval_mode_in = NULL, *eval_mode_out = NULL; -+ Ceed ceed; -+ Ceed_Cuda *cuda_data; -+ char *assembly_kernel_path, *assembly_kernel_source; -+ CeedInt num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0; -+ CeedInt elem_size_in, num_qpts_in, num_comp_in, elem_size_out, num_qpts_out, num_comp_out, q_comp; -+ CeedEvalMode *eval_modes_in = NULL, *eval_modes_out = NULL; - CeedElemRestriction rstr_in = NULL, rstr_out = NULL; - CeedBasis basis_in = NULL, basis_out = NULL; - CeedQFunctionField *qf_fields; -@@ -878,34 +892,30 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee - // Determine active input basis eval mode - CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); - CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); -- // Note that the kernel will treat each dimension of a gradient action separately; -- // i.e., when an active input has a CEED_EVAL_GRAD mode, num_e_mode_in will increment 
by dim. -- // However, for the purposes of loading the B matrices, it will be treated as one mode, and we will load/copy the entire gradient matrix at once, so -- // num_B_in_mats_to_load will be incremented by 1. - for (CeedInt i = 0; i < num_input_fields; i++) { - CeedVector vec; - - CeedCallBackend(CeedOperatorFieldGetVector(input_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) { -+ CeedBasis basis; - CeedEvalMode eval_mode; - -- CeedCallBackend(CeedOperatorFieldGetBasis(input_fields[i], &basis_in)); -- CeedCallBackend(CeedBasisGetDimension(basis_in, &dim)); -- CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts)); -+ CeedCallBackend(CeedOperatorFieldGetBasis(input_fields[i], &basis)); -+ CeedCheck(!basis_in || basis_in == basis, ceed, CEED_ERROR_BACKEND, "Backend does not implement operator assembly with multiple active bases"); -+ basis_in = basis; - CeedCallBackend(CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr_in)); -- CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size)); -+ CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size_in)); -+ if (basis_in == CEED_BASIS_NONE) num_qpts_in = elem_size_in; -+ else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts_in)); - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); -- if (eval_mode != CEED_EVAL_NONE) { -- CeedCallBackend(CeedRealloc(num_B_in_mats_to_load + 1, &eval_mode_in)); -- eval_mode_in[num_B_in_mats_to_load] = eval_mode; -- num_B_in_mats_to_load += 1; -- if (eval_mode == CEED_EVAL_GRAD) { -- num_e_mode_in += dim; -- size_B_in += dim * elem_size * num_qpts; -- } else { -- num_e_mode_in += 1; -- size_B_in += elem_size * num_qpts; -+ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_mode, &q_comp)); -+ if (eval_mode != CEED_EVAL_WEIGHT) { -+ // q_comp = 1 if CEED_EVAL_NONE, CEED_EVAL_WEIGHT caught by QF Assembly -+ CeedCallBackend(CeedRealloc(num_eval_modes_in + q_comp, &eval_modes_in)); -+ for (CeedInt d = 0; d < q_comp; d++) { -+ eval_modes_in[num_eval_modes_in + d] = eval_mode; - } -+ num_eval_modes_in += q_comp; - } - } - } -@@ -917,112 +927,134 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee - - CeedCallBackend(CeedOperatorFieldGetVector(output_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) { -+ CeedBasis basis; - CeedEvalMode eval_mode; - -- CeedCallBackend(CeedOperatorFieldGetBasis(output_fields[i], &basis_out)); -+ CeedCallBackend(CeedOperatorFieldGetBasis(output_fields[i], &basis)); -+ CeedCheck(!basis_out || basis_out == basis, ceed, CEED_ERROR_BACKEND, -+ "Backend does not implement operator assembly with multiple active bases"); -+ basis_out = basis; - CeedCallBackend(CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr_out)); -- CeedCheck(!rstr_out || rstr_out == rstr_in, ceed, CEED_ERROR_BACKEND, "Backend does not implement multi-field non-composite operator assembly"); -+ CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_out, &elem_size_out)); -+ if (basis_out == CEED_BASIS_NONE) num_qpts_out = elem_size_out; -+ else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_out, &num_qpts_out)); -+ CeedCheck(num_qpts_in == num_qpts_out, ceed, CEED_ERROR_UNSUPPORTED, -+ "Active input and output bases must have the same number of quadrature points"); - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); -- if (eval_mode != CEED_EVAL_NONE) { -- CeedCallBackend(CeedRealloc(num_B_out_mats_to_load + 1, &eval_mode_out)); -- 
eval_mode_out[num_B_out_mats_to_load] = eval_mode; -- num_B_out_mats_to_load += 1; -- if (eval_mode == CEED_EVAL_GRAD) { -- num_e_mode_out += dim; -- size_B_out += dim * elem_size * num_qpts; -- } else { -- num_e_mode_out += 1; -- size_B_out += elem_size * num_qpts; -+ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_mode, &q_comp)); -+ if (eval_mode != CEED_EVAL_WEIGHT) { -+ // q_comp = 1 if CEED_EVAL_NONE, CEED_EVAL_WEIGHT caught by QF Assembly -+ CeedCallBackend(CeedRealloc(num_eval_modes_out + q_comp, &eval_modes_out)); -+ for (CeedInt d = 0; d < q_comp; d++) { -+ eval_modes_out[num_eval_modes_out + d] = eval_mode; - } -+ num_eval_modes_out += q_comp; - } - } - } -- CeedCheck(num_e_mode_in > 0 && num_e_mode_out > 0, ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs"); -- -- CeedCallBackend(CeedElemRestrictionGetNumElements(rstr_in, &num_elem)); -- CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_in, &num_comp)); -+ CeedCheck(num_eval_modes_in > 0 && num_eval_modes_out > 0, ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs"); - - CeedCallBackend(CeedCalloc(1, &impl->asmb)); - CeedOperatorAssemble_Cuda *asmb = impl->asmb; -- asmb->num_elem = num_elem; -- -- // Compile kernels -- int elem_per_block = 1; -- asmb->elem_per_block = elem_per_block; -- CeedInt block_size = elem_size * elem_size * elem_per_block; -- Ceed_Cuda *cuda_data; -+ asmb->elems_per_block = 1; -+ asmb->block_size_x = elem_size_in; -+ asmb->block_size_y = elem_size_out; - - CeedCallBackend(CeedGetData(ceed, &cuda_data)); -- CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-operator-assemble.h", &assembly_kernel_path)); -- CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Kernel Source -----\n"); -- CeedCallBackend(CeedLoadSourceToBuffer(ceed, assembly_kernel_path, &assembly_kernel_source)); -- CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Source Complete! -----\n"); -- bool fallback = block_size > cuda_data->device_prop.maxThreadsPerBlock; -+ bool fallback = asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block > cuda_data->device_prop.maxThreadsPerBlock; - - if (fallback) { - // Use fallback kernel with 1D threadblock -- block_size = elem_size * elem_per_block; -- asmb->block_size_x = elem_size; - asmb->block_size_y = 1; -- } else { // Use kernel with 2D threadblock -- asmb->block_size_x = elem_size; -- asmb->block_size_y = elem_size; - } -- CeedCallBackend(CeedCompile_Cuda(ceed, assembly_kernel_source, &asmb->module, 8, "NUM_ELEM", num_elem, "NUM_E_MODE_IN", num_e_mode_in, -- "NUM_E_MODE_OUT", num_e_mode_out, "NUM_QPTS", num_qpts, "NUM_NODES", elem_size, "BLOCK_SIZE", block_size, -- "NUM_COMP", num_comp, "USE_CEEDSIZE", use_ceedsize_idx)); -- CeedCallBackend(CeedGetKernel_Cuda(ceed, asmb->module, fallback ? "linearAssembleFallback" : "linearAssemble", &asmb->linearAssemble)); -+ -+ // Compile kernels -+ CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_in, &num_comp_in)); -+ CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_out, &num_comp_out)); -+ CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-operator-assemble.h", &assembly_kernel_path)); -+ CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Kernel Source -----\n"); -+ CeedCallBackend(CeedLoadSourceToBuffer(ceed, assembly_kernel_path, &assembly_kernel_source)); -+ CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Source Complete! 
-----\n"); -+ CeedCallBackend(CeedCompile_Cuda(ceed, assembly_kernel_source, &asmb->module, 10, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT", -+ num_eval_modes_out, "NUM_COMP_IN", num_comp_in, "NUM_COMP_OUT", num_comp_out, "NUM_NODES_IN", elem_size_in, -+ "NUM_NODES_OUT", elem_size_out, "NUM_QPTS", num_qpts_in, "BLOCK_SIZE", -+ asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block, "BLOCK_SIZE_Y", asmb->block_size_y, "CEED_SIZE", -+ use_ceedsize_idx)); -+ CeedCallBackend(CeedGetKernel_Cuda(ceed, asmb->module, "LinearAssemble", &asmb->LinearAssemble)); - CeedCallBackend(CeedFree(&assembly_kernel_path)); - CeedCallBackend(CeedFree(&assembly_kernel_source)); - -- // Build 'full' B matrices (not 1D arrays used for tensor-product matrices) -- const CeedScalar *interp_in, *grad_in; -+ // Load into B_in, in order that they will be used in eval_modes_in -+ { -+ const CeedInt in_bytes = elem_size_in * num_qpts_in * num_eval_modes_in * sizeof(CeedScalar); -+ CeedInt d_in = 0; -+ CeedEvalMode eval_modes_in_prev = CEED_EVAL_NONE; -+ bool has_eval_none = false; -+ CeedScalar *identity = NULL; - -- CeedCallBackend(CeedBasisGetInterp(basis_in, &interp_in)); -- CeedCallBackend(CeedBasisGetGrad(basis_in, &grad_in)); -+ for (CeedInt i = 0; i < num_eval_modes_in; i++) { -+ has_eval_none = has_eval_none || (eval_modes_in[i] == CEED_EVAL_NONE); -+ } -+ if (has_eval_none) { -+ CeedCallBackend(CeedCalloc(elem_size_in * num_qpts_in, &identity)); -+ for (CeedInt i = 0; i < (elem_size_in < num_qpts_in ? elem_size_in : num_qpts_in); i++) identity[i * elem_size_in + i] = 1.0; -+ } -+ -+ CeedCallCuda(ceed, cudaMalloc((void **)&asmb->d_B_in, in_bytes)); -+ for (CeedInt i = 0; i < num_eval_modes_in; i++) { -+ const CeedScalar *h_B_in; - -- // Load into B_in, in order that they will be used in eval_mode -- const CeedInt inBytes = size_B_in * sizeof(CeedScalar); -- CeedInt mat_start = 0; -+ CeedCallBackend(CeedOperatorGetBasisPointer(basis_in, eval_modes_in[i], identity, &h_B_in)); -+ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_modes_in[i], &q_comp)); -+ if (q_comp > 1) { -+ if (i == 0 || eval_modes_in[i] != eval_modes_in_prev) d_in = 0; -+ else h_B_in = &h_B_in[(++d_in) * elem_size_in * num_qpts_in]; -+ } -+ eval_modes_in_prev = eval_modes_in[i]; - -- CeedCallCuda(ceed, cudaMalloc((void **)&asmb->d_B_in, inBytes)); -- for (int i = 0; i < num_B_in_mats_to_load; i++) { -- CeedEvalMode eval_mode = eval_mode_in[i]; -+ CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_in[i * elem_size_in * num_qpts_in], h_B_in, elem_size_in * num_qpts_in * sizeof(CeedScalar), -+ cudaMemcpyHostToDevice)); -+ } - -- if (eval_mode == CEED_EVAL_INTERP) { -- CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_in[mat_start], interp_in, elem_size * num_qpts * sizeof(CeedScalar), cudaMemcpyHostToDevice)); -- mat_start += elem_size * num_qpts; -- } else if (eval_mode == CEED_EVAL_GRAD) { -- CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_in[mat_start], grad_in, dim * elem_size * num_qpts * sizeof(CeedScalar), cudaMemcpyHostToDevice)); -- mat_start += dim * elem_size * num_qpts; -+ if (identity) { -+ CeedCallBackend(CeedFree(&identity)); - } - } - -- const CeedScalar *interp_out, *grad_out; -+ // Load into B_out, in order that they will be used in eval_modes_out -+ { -+ const CeedInt out_bytes = elem_size_out * num_qpts_out * num_eval_modes_out * sizeof(CeedScalar); -+ CeedInt d_out = 0; -+ CeedEvalMode eval_modes_out_prev = CEED_EVAL_NONE; -+ bool has_eval_none = false; -+ CeedScalar *identity = NULL; - -- // Note that this 
function currently assumes 1 basis, so this should always be true for now -- if (basis_out == basis_in) { -- interp_out = interp_in; -- grad_out = grad_in; -- } else { -- CeedCallBackend(CeedBasisGetInterp(basis_out, &interp_out)); -- CeedCallBackend(CeedBasisGetGrad(basis_out, &grad_out)); -- } -+ for (CeedInt i = 0; i < num_eval_modes_out; i++) { -+ has_eval_none = has_eval_none || (eval_modes_out[i] == CEED_EVAL_NONE); -+ } -+ if (has_eval_none) { -+ CeedCallBackend(CeedCalloc(elem_size_out * num_qpts_out, &identity)); -+ for (CeedInt i = 0; i < (elem_size_out < num_qpts_out ? elem_size_out : num_qpts_out); i++) identity[i * elem_size_out + i] = 1.0; -+ } -+ -+ CeedCallCuda(ceed, cudaMalloc((void **)&asmb->d_B_out, out_bytes)); -+ for (CeedInt i = 0; i < num_eval_modes_out; i++) { -+ const CeedScalar *h_B_out; - -- // Load into B_out, in order that they will be used in eval_mode -- const CeedInt outBytes = size_B_out * sizeof(CeedScalar); -- mat_start = 0; -+ CeedCallBackend(CeedOperatorGetBasisPointer(basis_out, eval_modes_out[i], identity, &h_B_out)); -+ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_modes_out[i], &q_comp)); -+ if (q_comp > 1) { -+ if (i == 0 || eval_modes_out[i] != eval_modes_out_prev) d_out = 0; -+ else h_B_out = &h_B_out[(++d_out) * elem_size_out * num_qpts_out]; -+ } -+ eval_modes_out_prev = eval_modes_out[i]; - -- CeedCallCuda(ceed, cudaMalloc((void **)&asmb->d_B_out, outBytes)); -- for (int i = 0; i < num_B_out_mats_to_load; i++) { -- CeedEvalMode eval_mode = eval_mode_out[i]; -+ CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_out[i * elem_size_out * num_qpts_out], h_B_out, elem_size_out * num_qpts_out * sizeof(CeedScalar), -+ cudaMemcpyHostToDevice)); -+ } - -- if (eval_mode == CEED_EVAL_INTERP) { -- CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_out[mat_start], interp_out, elem_size * num_qpts * sizeof(CeedScalar), cudaMemcpyHostToDevice)); -- mat_start += elem_size * num_qpts; -- } else if (eval_mode == CEED_EVAL_GRAD) { -- CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_out[mat_start], grad_out, dim * elem_size * num_qpts * sizeof(CeedScalar), cudaMemcpyHostToDevice)); -- mat_start += dim * elem_size * num_qpts; -+ if (identity) { -+ CeedCallBackend(CeedFree(&identity)); - } - } - return CEED_ERROR_SUCCESS; -@@ -1039,47 +1071,96 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee - static int CeedSingleOperatorAssemble_Cuda(CeedOperator op, CeedInt offset, CeedVector values) { - Ceed ceed; - CeedSize values_length = 0, assembled_qf_length = 0; -- CeedInt use_ceedsize_idx = 0; -+ CeedInt use_ceedsize_idx = 0, num_elem_in, num_elem_out, elem_size_in, elem_size_out; - CeedScalar *values_array; -- const CeedScalar *qf_array; -- CeedVector assembled_qf = NULL; -- CeedElemRestriction rstr_q = NULL; -+ const CeedScalar *assembled_qf_array; -+ CeedVector assembled_qf = NULL; -+ CeedElemRestriction assembled_rstr = NULL, rstr_in, rstr_out; -+ CeedRestrictionType rstr_type_in, rstr_type_out; -+ const bool *orients_in = NULL, *orients_out = NULL; -+ const CeedInt8 *curl_orients_in = NULL, *curl_orients_out = NULL; - CeedOperator_Cuda *impl; - - CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedCallBackend(CeedOperatorGetData(op, &impl)); - - // Assemble QFunction -- CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &rstr_q, CEED_REQUEST_IMMEDIATE)); -- CeedCallBackend(CeedElemRestrictionDestroy(&rstr_q)); -- CeedCallBackend(CeedVectorGetArray(values, CEED_MEM_DEVICE, &values_array)); -- 
values_array += offset; -- CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &qf_array)); -+ CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &assembled_rstr, CEED_REQUEST_IMMEDIATE)); -+ CeedCallBackend(CeedElemRestrictionDestroy(&assembled_rstr)); -+ CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &assembled_qf_array)); - - CeedCallBackend(CeedVectorGetLength(values, &values_length)); - CeedCallBackend(CeedVectorGetLength(assembled_qf, &assembled_qf_length)); - if ((values_length > INT_MAX) || (assembled_qf_length > INT_MAX)) use_ceedsize_idx = 1; -+ - // Setup -- if (!impl->asmb) { -- CeedCallBackend(CeedSingleOperatorAssembleSetup_Cuda(op, use_ceedsize_idx)); -- assert(impl->asmb != NULL); -+ if (!impl->asmb) CeedCallBackend(CeedSingleOperatorAssembleSetup_Cuda(op, use_ceedsize_idx)); -+ CeedOperatorAssemble_Cuda *asmb = impl->asmb; -+ -+ assert(asmb != NULL); -+ -+ // Assemble element operator -+ CeedCallBackend(CeedVectorGetArray(values, CEED_MEM_DEVICE, &values_array)); -+ values_array += offset; -+ -+ CeedCallBackend(CeedOperatorGetActiveElemRestrictions(op, &rstr_in, &rstr_out)); -+ CeedCallBackend(CeedElemRestrictionGetNumElements(rstr_in, &num_elem_in)); -+ CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size_in)); -+ -+ CeedCallBackend(CeedElemRestrictionGetType(rstr_in, &rstr_type_in)); -+ if (rstr_type_in == CEED_RESTRICTION_ORIENTED) { -+ CeedCallBackend(CeedElemRestrictionGetOrientations(rstr_in, CEED_MEM_DEVICE, &orients_in)); -+ } else if (rstr_type_in == CEED_RESTRICTION_CURL_ORIENTED) { -+ CeedCallBackend(CeedElemRestrictionGetCurlOrientations(rstr_in, CEED_MEM_DEVICE, &curl_orients_in)); -+ } -+ -+ if (rstr_in != rstr_out) { -+ CeedCallBackend(CeedElemRestrictionGetNumElements(rstr_out, &num_elem_out)); -+ CeedCheck(num_elem_in == num_elem_out, ceed, CEED_ERROR_UNSUPPORTED, -+ "Active input and output operator restrictions must have the same number of elements"); -+ CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_out, &elem_size_out)); -+ -+ CeedCallBackend(CeedElemRestrictionGetType(rstr_out, &rstr_type_out)); -+ if (rstr_type_out == CEED_RESTRICTION_ORIENTED) { -+ CeedCallBackend(CeedElemRestrictionGetOrientations(rstr_out, CEED_MEM_DEVICE, &orients_out)); -+ } else if (rstr_type_out == CEED_RESTRICTION_CURL_ORIENTED) { -+ CeedCallBackend(CeedElemRestrictionGetCurlOrientations(rstr_out, CEED_MEM_DEVICE, &curl_orients_out)); -+ } -+ } else { -+ elem_size_out = elem_size_in; -+ orients_out = orients_in; -+ curl_orients_out = curl_orients_in; - } - - // Compute B^T D B -- const CeedInt num_elem = impl->asmb->num_elem; -- const CeedInt elem_per_block = impl->asmb->elem_per_block; -- const CeedInt grid = num_elem / elem_per_block + ((num_elem / elem_per_block * elem_per_block < num_elem) ? 1 : 0); -- void *args[] = {&impl->asmb->d_B_in, &impl->asmb->d_B_out, &qf_array, &values_array}; -+ CeedInt shared_mem = -+ ((curl_orients_in || curl_orients_out ? elem_size_in * elem_size_out : 0) + (curl_orients_in ? 
elem_size_in * asmb->block_size_y : 0)) * -+ sizeof(CeedScalar); -+ CeedInt grid = CeedDivUpInt(num_elem_in, asmb->elems_per_block); -+ void *args[] = {(void *)&num_elem_in, &asmb->d_B_in, &asmb->d_B_out, &orients_in, &curl_orients_in, -+ &orients_out, &curl_orients_out, &assembled_qf_array, &values_array}; - - CeedCallBackend( -- CeedRunKernelDim_Cuda(ceed, impl->asmb->linearAssemble, grid, impl->asmb->block_size_x, impl->asmb->block_size_y, elem_per_block, args)); -+ CeedRunKernelDimShared_Cuda(ceed, asmb->LinearAssemble, grid, asmb->block_size_x, asmb->block_size_y, asmb->elems_per_block, shared_mem, args)); - - // Restore arrays - CeedCallBackend(CeedVectorRestoreArray(values, &values_array)); -- CeedCallBackend(CeedVectorRestoreArrayRead(assembled_qf, &qf_array)); -+ CeedCallBackend(CeedVectorRestoreArrayRead(assembled_qf, &assembled_qf_array)); - - // Cleanup - CeedCallBackend(CeedVectorDestroy(&assembled_qf)); -+ if (rstr_type_in == CEED_RESTRICTION_ORIENTED) { -+ CeedCallBackend(CeedElemRestrictionRestoreOrientations(rstr_in, &orients_in)); -+ } else if (rstr_type_in == CEED_RESTRICTION_CURL_ORIENTED) { -+ CeedCallBackend(CeedElemRestrictionRestoreCurlOrientations(rstr_in, &curl_orients_in)); -+ } -+ if (rstr_in != rstr_out) { -+ if (rstr_type_out == CEED_RESTRICTION_ORIENTED) { -+ CeedCallBackend(CeedElemRestrictionRestoreOrientations(rstr_out, &orients_out)); -+ } else if (rstr_type_out == CEED_RESTRICTION_CURL_ORIENTED) { -+ CeedCallBackend(CeedElemRestrictionRestoreCurlOrientations(rstr_out, &curl_orients_out)); -+ } -+ } - return CEED_ERROR_SUCCESS; - } - -diff --git a/backends/cuda-ref/ceed-cuda-ref-restriction.c b/backends/cuda-ref/ceed-cuda-ref-restriction.c -index 71ed2821..f2b190e5 100644 ---- a/backends/cuda-ref/ceed-cuda-ref-restriction.c -+++ b/backends/cuda-ref/ceed-cuda-ref-restriction.c -@@ -19,22 +19,23 @@ - #include "ceed-cuda-ref.h" - - //------------------------------------------------------------------------------ --// Apply restriction -+// Core apply restriction code - //------------------------------------------------------------------------------ --static int CeedElemRestrictionApply_Cuda(CeedElemRestriction r, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { -+static inline int CeedElemRestrictionApply_Cuda_Core(CeedElemRestriction rstr, CeedTransposeMode t_mode, bool use_signs, bool use_orients, -+ CeedVector u, CeedVector v, CeedRequest *request) { - Ceed ceed; -- Ceed_Cuda *data; -- CUfunction kernel; - CeedInt num_elem, elem_size; -+ CeedRestrictionType rstr_type; - const CeedScalar *d_u; - CeedScalar *d_v; - CeedElemRestriction_Cuda *impl; -+ CUfunction kernel; - -- CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); -- CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); -- CeedCallBackend(CeedGetData(ceed, &data)); -- CeedElemRestrictionGetNumElements(r, &num_elem); -- CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); -+ CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); -+ CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); -+ CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); -+ CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); -+ CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); - const CeedInt num_nodes = impl->num_nodes; - - // Get vectors -@@ -50,45 +51,155 @@ static int CeedElemRestrictionApply_Cuda(CeedElemRestriction r, CeedTransposeMod - // Restrict - if (t_mode == CEED_NOTRANSPOSE) { - // L-vector -> E-vector -- if 
(impl->d_ind) { -- // -- Offsets provided -- kernel = impl->OffsetNoTranspose; -- void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; -- CeedInt block_size = elem_size < 1024 ? (elem_size > 32 ? elem_size : 32) : 1024; -- -- CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); -- } else { -- // -- Strided restriction -- kernel = impl->StridedNoTranspose; -- void *args[] = {&num_elem, &d_u, &d_v}; -- CeedInt block_size = elem_size < 1024 ? (elem_size > 32 ? elem_size : 32) : 1024; -- -- CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); -+ const CeedInt block_size = elem_size < 1024 ? (elem_size > 32 ? elem_size : 32) : 1024; -+ const CeedInt grid = CeedDivUpInt(num_nodes, block_size); -+ -+ switch (rstr_type) { -+ case CEED_RESTRICTION_STRIDED: { -+ kernel = impl->StridedNoTranspose; -+ void *args[] = {&num_elem, &d_u, &d_v}; -+ -+ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); -+ } break; -+ case CEED_RESTRICTION_STANDARD: { -+ kernel = impl->OffsetNoTranspose; -+ void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; -+ -+ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); -+ } break; -+ case CEED_RESTRICTION_ORIENTED: { -+ if (use_signs) { -+ kernel = impl->OrientedNoTranspose; -+ void *args[] = {&num_elem, &impl->d_ind, &impl->d_orients, &d_u, &d_v}; -+ -+ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); -+ } else { -+ kernel = impl->OffsetNoTranspose; -+ void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; -+ -+ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); -+ } -+ } break; -+ case CEED_RESTRICTION_CURL_ORIENTED: { -+ if (use_signs && use_orients) { -+ kernel = impl->CurlOrientedNoTranspose; -+ void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; -+ -+ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); -+ } else if (use_orients) { -+ kernel = impl->CurlOrientedUnsignedNoTranspose; -+ void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; -+ -+ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); -+ } else { -+ kernel = impl->OffsetNoTranspose; -+ void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; -+ -+ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); -+ } -+ } break; -+ case CEED_RESTRICTION_POINTS: { -+ // LCOV_EXCL_START -+ return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement restriction CeedElemRestrictionAtPoints"); -+ // LCOV_EXCL_STOP -+ } break; - } - } else { - // E-vector -> L-vector -- if (impl->d_ind) { -- // -- Offsets provided -- CeedInt block_size = 32; -+ const CeedInt block_size = 32; -+ const CeedInt grid = CeedDivUpInt(num_nodes, block_size); -+ -+ switch (rstr_type) { -+ case CEED_RESTRICTION_STRIDED: { -+ kernel = impl->StridedTranspose; -+ void *args[] = {&num_elem, &d_u, &d_v}; -+ -+ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); -+ } break; -+ case CEED_RESTRICTION_STANDARD: { -+ if (impl->OffsetTranspose) { -+ kernel = impl->OffsetTranspose; -+ void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; -+ -+ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); -+ } else { -+ kernel = impl->OffsetTransposeDet; -+ void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; -+ -+ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, 
block_size, args)); -+ } -+ } break; -+ case CEED_RESTRICTION_ORIENTED: { -+ if (use_signs) { -+ if (impl->OrientedTranspose) { -+ kernel = impl->OrientedTranspose; -+ void *args[] = {&num_elem, &impl->d_ind, &impl->d_orients, &d_u, &d_v}; -+ -+ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); -+ } else { -+ kernel = impl->OrientedTransposeDet; -+ void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &impl->d_orients, &d_u, &d_v}; -+ -+ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); -+ } -+ } else { -+ if (impl->OffsetTranspose) { -+ kernel = impl->OffsetTranspose; -+ void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; - -- if (impl->OffsetTranspose) { -- kernel = impl->OffsetTranspose; -- void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; -+ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); -+ } else { -+ kernel = impl->OffsetTransposeDet; -+ void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; - -- CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); -- } else { -- kernel = impl->OffsetTransposeDet; -- void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; -+ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); -+ } -+ } -+ } break; -+ case CEED_RESTRICTION_CURL_ORIENTED: { -+ if (use_signs && use_orients) { -+ if (impl->CurlOrientedTranspose) { -+ kernel = impl->CurlOrientedTranspose; -+ void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; -+ -+ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); -+ } else { -+ kernel = impl->CurlOrientedTransposeDet; -+ void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &impl->d_curl_orients, &d_u, &d_v}; -+ -+ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); -+ } -+ } else if (use_orients) { -+ if (impl->CurlOrientedUnsignedTranspose) { -+ kernel = impl->CurlOrientedUnsignedTranspose; -+ void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; - -- CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); -- } -- } else { -- // -- Strided restriction -- kernel = impl->StridedTranspose; -- void *args[] = {&num_elem, &d_u, &d_v}; -- CeedInt block_size = 32; -+ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); -+ } else { -+ kernel = impl->CurlOrientedUnsignedTransposeDet; -+ void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &impl->d_curl_orients, &d_u, &d_v}; -+ -+ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); -+ } -+ } else { -+ if (impl->OffsetTranspose) { -+ kernel = impl->OffsetTranspose; -+ void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; - -- CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); -+ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); -+ } else { -+ kernel = impl->OffsetTransposeDet; -+ void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; -+ -+ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); -+ } -+ } -+ } break; -+ case CEED_RESTRICTION_POINTS: { -+ // LCOV_EXCL_START -+ return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement restriction 
CeedElemRestrictionAtPoints"); -+ // LCOV_EXCL_STOP -+ } break; - } - } - -@@ -100,6 +211,29 @@ static int CeedElemRestrictionApply_Cuda(CeedElemRestriction r, CeedTransposeMod - return CEED_ERROR_SUCCESS; - } - -+//------------------------------------------------------------------------------ -+// Apply restriction -+//------------------------------------------------------------------------------ -+static int CeedElemRestrictionApply_Cuda(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { -+ return CeedElemRestrictionApply_Cuda_Core(rstr, t_mode, true, true, u, v, request); -+} -+ -+//------------------------------------------------------------------------------ -+// Apply unsigned restriction -+//------------------------------------------------------------------------------ -+static int CeedElemRestrictionApplyUnsigned_Cuda(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v, -+ CeedRequest *request) { -+ return CeedElemRestrictionApply_Cuda_Core(rstr, t_mode, false, true, u, v, request); -+} -+ -+//------------------------------------------------------------------------------ -+// Apply unoriented restriction -+//------------------------------------------------------------------------------ -+static int CeedElemRestrictionApplyUnoriented_Cuda(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v, -+ CeedRequest *request) { -+ return CeedElemRestrictionApply_Cuda_Core(rstr, t_mode, false, false, u, v, request); -+} -+ - //------------------------------------------------------------------------------ - // Get offsets - //------------------------------------------------------------------------------ -@@ -118,21 +252,61 @@ static int CeedElemRestrictionGetOffsets_Cuda(CeedElemRestriction rstr, CeedMemT - return CEED_ERROR_SUCCESS; - } - -+//------------------------------------------------------------------------------ -+// Get orientations -+//------------------------------------------------------------------------------ -+static int CeedElemRestrictionGetOrientations_Cuda(CeedElemRestriction rstr, CeedMemType mem_type, const bool **orients) { -+ CeedElemRestriction_Cuda *impl; -+ CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); -+ -+ switch (mem_type) { -+ case CEED_MEM_HOST: -+ *orients = impl->h_orients; -+ break; -+ case CEED_MEM_DEVICE: -+ *orients = impl->d_orients; -+ break; -+ } -+ return CEED_ERROR_SUCCESS; -+} -+ -+//------------------------------------------------------------------------------ -+// Get curl-conforming orientations -+//------------------------------------------------------------------------------ -+static int CeedElemRestrictionGetCurlOrientations_Cuda(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt8 **curl_orients) { -+ CeedElemRestriction_Cuda *impl; -+ CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); -+ -+ switch (mem_type) { -+ case CEED_MEM_HOST: -+ *curl_orients = impl->h_curl_orients; -+ break; -+ case CEED_MEM_DEVICE: -+ *curl_orients = impl->d_curl_orients; -+ break; -+ } -+ return CEED_ERROR_SUCCESS; -+} -+ - //------------------------------------------------------------------------------ - // Destroy restriction - //------------------------------------------------------------------------------ --static int CeedElemRestrictionDestroy_Cuda(CeedElemRestriction r) { -+static int CeedElemRestrictionDestroy_Cuda(CeedElemRestriction rstr) { - Ceed ceed; - CeedElemRestriction_Cuda *impl; - -- 
CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); -- CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); -+ CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); -+ CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); - CeedCallCuda(ceed, cuModuleUnload(impl->module)); - CeedCallBackend(CeedFree(&impl->h_ind_allocated)); - CeedCallCuda(ceed, cudaFree(impl->d_ind_allocated)); - CeedCallCuda(ceed, cudaFree(impl->d_t_offsets)); - CeedCallCuda(ceed, cudaFree(impl->d_t_indices)); - CeedCallCuda(ceed, cudaFree(impl->d_l_vec_indices)); -+ CeedCallBackend(CeedFree(&impl->h_orients_allocated)); -+ CeedCallCuda(ceed, cudaFree(impl->d_orients_allocated)); -+ CeedCallBackend(CeedFree(&impl->h_curl_orients_allocated)); -+ CeedCallCuda(ceed, cudaFree(impl->d_curl_orients_allocated)); - CeedCallBackend(CeedFree(&impl)); - return CEED_ERROR_SUCCESS; - } -@@ -140,7 +314,7 @@ static int CeedElemRestrictionDestroy_Cuda(CeedElemRestriction r) { - //------------------------------------------------------------------------------ - // Create transpose offsets and indices - //------------------------------------------------------------------------------ --static int CeedElemRestrictionOffset_Cuda(const CeedElemRestriction r, const CeedInt *indices) { -+static int CeedElemRestrictionOffset_Cuda(const CeedElemRestriction rstr, const CeedInt *indices) { - Ceed ceed; - bool *is_node; - CeedSize l_size; -@@ -148,12 +322,12 @@ static int CeedElemRestrictionOffset_Cuda(const CeedElemRestriction r, const Cee - CeedInt *ind_to_offset, *l_vec_indices, *t_offsets, *t_indices; - CeedElemRestriction_Cuda *impl; - -- CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); -- CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); -- CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem)); -- CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); -- CeedCallBackend(CeedElemRestrictionGetLVectorSize(r, &l_size)); -- CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); -+ CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); -+ CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); -+ CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); -+ CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); -+ CeedCallBackend(CeedElemRestrictionGetLVectorSize(rstr, &l_size)); -+ CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); - const CeedInt size_indices = num_elem * elem_size; - - // Count num_nodes -@@ -221,153 +395,223 @@ static int CeedElemRestrictionOffset_Cuda(const CeedElemRestriction r, const Cee - // Create restriction - //------------------------------------------------------------------------------ - int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *indices, const bool *orients, -- const CeedInt8 *curl_orients, CeedElemRestriction r) { -+ const CeedInt8 *curl_orients, CeedElemRestriction rstr) { - Ceed ceed, ceed_parent; -- bool is_deterministic, is_strided; -+ bool is_deterministic; - CeedInt num_elem, num_comp, elem_size, comp_stride = 1; - CeedRestrictionType rstr_type; -+ char *restriction_kernel_path, *restriction_kernel_source; - CeedElemRestriction_Cuda *impl; - -- CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); -- CeedCallBackend(CeedCalloc(1, &impl)); -+ CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); - CeedCallBackend(CeedGetParent(ceed, &ceed_parent)); - CeedCallBackend(CeedIsDeterministic(ceed_parent, &is_deterministic)); -- 
CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem)); -- CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); -- CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); -+ CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); -+ CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); -+ CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); - const CeedInt size = num_elem * elem_size; - CeedInt strides[3] = {1, size, elem_size}; - CeedInt layout[3] = {1, elem_size * num_elem, elem_size}; - -- CeedCallBackend(CeedElemRestrictionGetType(r, &rstr_type)); -- CeedCheck(rstr_type != CEED_RESTRICTION_ORIENTED && rstr_type != CEED_RESTRICTION_CURL_ORIENTED, ceed, CEED_ERROR_BACKEND, -- "Backend does not implement CeedElemRestrictionCreateOriented or CeedElemRestrictionCreateCurlOriented"); -- - // Stride data -- CeedCallBackend(CeedElemRestrictionIsStrided(r, &is_strided)); -- if (is_strided) { -+ CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); -+ if (rstr_type == CEED_RESTRICTION_STRIDED) { - bool has_backend_strides; - -- CeedCallBackend(CeedElemRestrictionHasBackendStrides(r, &has_backend_strides)); -+ CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides)); - if (!has_backend_strides) { -- CeedCallBackend(CeedElemRestrictionGetStrides(r, &strides)); -+ CeedCallBackend(CeedElemRestrictionGetStrides(rstr, &strides)); - } - } else { -- CeedCallBackend(CeedElemRestrictionGetCompStride(r, &comp_stride)); -+ CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride)); - } - -- impl->h_ind = NULL; -- impl->h_ind_allocated = NULL; -- impl->d_ind = NULL; -- impl->d_ind_allocated = NULL; -- impl->d_t_indices = NULL; -- impl->d_t_offsets = NULL; -- impl->num_nodes = size; -- CeedCallBackend(CeedElemRestrictionSetData(r, impl)); -- CeedCallBackend(CeedElemRestrictionSetELayout(r, layout)); -- -- // Set up device indices/offset arrays -- switch (mem_type) { -- case CEED_MEM_HOST: { -- switch (copy_mode) { -- case CEED_OWN_POINTER: -- impl->h_ind_allocated = (CeedInt *)indices; -- impl->h_ind = (CeedInt *)indices; -- break; -- case CEED_USE_POINTER: -- impl->h_ind = (CeedInt *)indices; -- break; -- case CEED_COPY_VALUES: -- if (indices != NULL) { -- CeedCallBackend(CeedMalloc(elem_size * num_elem, &impl->h_ind_allocated)); -- memcpy(impl->h_ind_allocated, indices, elem_size * num_elem * sizeof(CeedInt)); -+ CeedCallBackend(CeedCalloc(1, &impl)); -+ impl->num_nodes = size; -+ impl->h_ind = NULL; -+ impl->h_ind_allocated = NULL; -+ impl->d_ind = NULL; -+ impl->d_ind_allocated = NULL; -+ impl->d_t_indices = NULL; -+ impl->d_t_offsets = NULL; -+ impl->h_orients = NULL; -+ impl->h_orients_allocated = NULL; -+ impl->d_orients = NULL; -+ impl->d_orients_allocated = NULL; -+ impl->h_curl_orients = NULL; -+ impl->h_curl_orients_allocated = NULL; -+ impl->d_curl_orients = NULL; -+ impl->d_curl_orients_allocated = NULL; -+ CeedCallBackend(CeedElemRestrictionSetData(rstr, impl)); -+ CeedCallBackend(CeedElemRestrictionSetELayout(rstr, layout)); -+ -+ // Set up device offset/orientation arrays -+ if (rstr_type != CEED_RESTRICTION_STRIDED) { -+ switch (mem_type) { -+ case CEED_MEM_HOST: { -+ switch (copy_mode) { -+ case CEED_OWN_POINTER: -+ impl->h_ind_allocated = (CeedInt *)indices; -+ impl->h_ind = (CeedInt *)indices; -+ break; -+ case CEED_USE_POINTER: -+ impl->h_ind = (CeedInt *)indices; -+ break; -+ case CEED_COPY_VALUES: -+ CeedCallBackend(CeedMalloc(size, &impl->h_ind_allocated)); -+ 
memcpy(impl->h_ind_allocated, indices, size * sizeof(CeedInt)); - impl->h_ind = impl->h_ind_allocated; -- } -- break; -- } -- if (indices != NULL) { -+ break; -+ } - CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_ind, size * sizeof(CeedInt))); - impl->d_ind_allocated = impl->d_ind; // We own the device memory - CeedCallCuda(ceed, cudaMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), cudaMemcpyHostToDevice)); -- if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(r, indices)); -- } -- break; -- } -- case CEED_MEM_DEVICE: { -- switch (copy_mode) { -- case CEED_COPY_VALUES: -- if (indices != NULL) { -+ if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(rstr, indices)); -+ } break; -+ case CEED_MEM_DEVICE: { -+ switch (copy_mode) { -+ case CEED_COPY_VALUES: - CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_ind, size * sizeof(CeedInt))); - impl->d_ind_allocated = impl->d_ind; // We own the device memory - CeedCallCuda(ceed, cudaMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), cudaMemcpyDeviceToDevice)); -+ break; -+ case CEED_OWN_POINTER: -+ impl->d_ind = (CeedInt *)indices; -+ impl->d_ind_allocated = impl->d_ind; -+ break; -+ case CEED_USE_POINTER: -+ impl->d_ind = (CeedInt *)indices; -+ break; -+ } -+ CeedCallBackend(CeedMalloc(size, &impl->h_ind_allocated)); -+ CeedCallCuda(ceed, cudaMemcpy(impl->h_ind_allocated, impl->d_ind, size * sizeof(CeedInt), cudaMemcpyDeviceToHost)); -+ impl->h_ind = impl->h_ind_allocated; -+ if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(rstr, indices)); -+ } break; -+ } -+ -+ // Orientation data -+ if (rstr_type == CEED_RESTRICTION_ORIENTED) { -+ switch (mem_type) { -+ case CEED_MEM_HOST: { -+ switch (copy_mode) { -+ case CEED_OWN_POINTER: -+ impl->h_orients_allocated = (bool *)orients; -+ impl->h_orients = (bool *)orients; -+ break; -+ case CEED_USE_POINTER: -+ impl->h_orients = (bool *)orients; -+ break; -+ case CEED_COPY_VALUES: -+ CeedCallBackend(CeedMalloc(size, &impl->h_orients_allocated)); -+ memcpy(impl->h_orients_allocated, orients, size * sizeof(bool)); -+ impl->h_orients = impl->h_orients_allocated; -+ break; - } -- break; -- case CEED_OWN_POINTER: -- impl->d_ind = (CeedInt *)indices; -- impl->d_ind_allocated = impl->d_ind; -- break; -- case CEED_USE_POINTER: -- impl->d_ind = (CeedInt *)indices; -+ CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_orients, size * sizeof(bool))); -+ impl->d_orients_allocated = impl->d_orients; // We own the device memory -+ CeedCallCuda(ceed, cudaMemcpy(impl->d_orients, orients, size * sizeof(bool), cudaMemcpyHostToDevice)); -+ } break; -+ case CEED_MEM_DEVICE: { -+ switch (copy_mode) { -+ case CEED_COPY_VALUES: -+ CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_orients, size * sizeof(bool))); -+ impl->d_orients_allocated = impl->d_orients; // We own the device memory -+ CeedCallCuda(ceed, cudaMemcpy(impl->d_orients, orients, size * sizeof(bool), cudaMemcpyDeviceToDevice)); -+ break; -+ case CEED_OWN_POINTER: -+ impl->d_orients = (bool *)orients; -+ impl->d_orients_allocated = impl->d_orients; -+ break; -+ case CEED_USE_POINTER: -+ impl->d_orients = (bool *)orients; -+ break; -+ } -+ CeedCallBackend(CeedMalloc(size, &impl->h_orients_allocated)); -+ CeedCallCuda(ceed, cudaMemcpy(impl->h_orients_allocated, impl->d_orients, size * sizeof(bool), cudaMemcpyDeviceToHost)); -+ impl->h_orients = impl->h_orients_allocated; -+ } break; - } -- if (indices != NULL) { -- CeedCallBackend(CeedMalloc(elem_size * num_elem, &impl->h_ind_allocated)); -- CeedCallCuda(ceed, 
cudaMemcpy(impl->h_ind_allocated, impl->d_ind, elem_size * num_elem * sizeof(CeedInt), cudaMemcpyDeviceToHost)); -- impl->h_ind = impl->h_ind_allocated; -- if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(r, indices)); -+ } else if (rstr_type == CEED_RESTRICTION_CURL_ORIENTED) { -+ switch (mem_type) { -+ case CEED_MEM_HOST: { -+ switch (copy_mode) { -+ case CEED_OWN_POINTER: -+ impl->h_curl_orients_allocated = (CeedInt8 *)curl_orients; -+ impl->h_curl_orients = (CeedInt8 *)curl_orients; -+ break; -+ case CEED_USE_POINTER: -+ impl->h_curl_orients = (CeedInt8 *)curl_orients; -+ break; -+ case CEED_COPY_VALUES: -+ CeedCallBackend(CeedMalloc(3 * size, &impl->h_curl_orients_allocated)); -+ memcpy(impl->h_curl_orients_allocated, curl_orients, 3 * size * sizeof(CeedInt8)); -+ impl->h_curl_orients = impl->h_curl_orients_allocated; -+ break; -+ } -+ CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_curl_orients, 3 * size * sizeof(CeedInt8))); -+ impl->d_curl_orients_allocated = impl->d_curl_orients; // We own the device memory -+ CeedCallCuda(ceed, cudaMemcpy(impl->d_curl_orients, curl_orients, 3 * size * sizeof(CeedInt8), cudaMemcpyHostToDevice)); -+ } break; -+ case CEED_MEM_DEVICE: { -+ switch (copy_mode) { -+ case CEED_COPY_VALUES: -+ CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_curl_orients, 3 * size * sizeof(CeedInt8))); -+ impl->d_curl_orients_allocated = impl->d_curl_orients; // We own the device memory -+ CeedCallCuda(ceed, cudaMemcpy(impl->d_curl_orients, curl_orients, 3 * size * sizeof(CeedInt8), cudaMemcpyDeviceToDevice)); -+ break; -+ case CEED_OWN_POINTER: -+ impl->d_curl_orients = (CeedInt8 *)curl_orients; -+ impl->d_curl_orients_allocated = impl->d_curl_orients; -+ break; -+ case CEED_USE_POINTER: -+ impl->d_curl_orients = (CeedInt8 *)curl_orients; -+ break; -+ } -+ CeedCallBackend(CeedMalloc(3 * size, &impl->h_curl_orients_allocated)); -+ CeedCallCuda(ceed, cudaMemcpy(impl->h_curl_orients_allocated, impl->d_curl_orients, 3 * size * sizeof(CeedInt8), cudaMemcpyDeviceToHost)); -+ impl->h_curl_orients = impl->h_curl_orients_allocated; -+ } break; - } -- break; - } -- // LCOV_EXCL_START -- default: -- return CeedError(ceed, CEED_ERROR_BACKEND, "Only MemType = HOST or DEVICE supported"); -- // LCOV_EXCL_STOP - } - -- // Compile CUDA kernels (add atomicAdd function for old NVidia architectures) -- CeedInt num_nodes = impl->num_nodes; -- char *restriction_kernel_path, *restriction_kernel_source = NULL; -- -+ // Compile CUDA kernels - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction.h", &restriction_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); -- if (!is_deterministic) { -- struct cudaDeviceProp prop; -- Ceed_Cuda *ceed_data; -- -- CeedCallBackend(CeedGetData(ceed, &ceed_data)); -- CeedCallBackend(cudaGetDeviceProperties(&prop, ceed_data->device_id)); -- if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) { -- char *atomic_add_path; -- -- CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-atomic-add-fallback.h", &atomic_add_path)); -- CeedCallBackend(CeedLoadSourceToBuffer(ceed, atomic_add_path, &restriction_kernel_source)); -- CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); -- CeedCallBackend(CeedFree(&atomic_add_path)); -- } -- } -- if (!restriction_kernel_source) { -- CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); 
-- } -+ CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n"); - CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 8, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, -- "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", num_nodes, "RSTR_COMP_STRIDE", comp_stride, "RSTR_STRIDE_NODES", -+ "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride, "RSTR_STRIDE_NODES", - strides[0], "RSTR_STRIDE_COMP", strides[1], "RSTR_STRIDE_ELEM", strides[2])); - CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedNoTranspose", &impl->StridedNoTranspose)); - CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedTranspose", &impl->StridedTranspose)); - CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->OffsetNoTranspose)); -+ CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OrientedNoTranspose", &impl->OrientedNoTranspose)); -+ CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedNoTranspose", &impl->CurlOrientedNoTranspose)); -+ CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedUnsignedNoTranspose", &impl->CurlOrientedUnsignedNoTranspose)); - if (!is_deterministic) { - CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->OffsetTranspose)); -+ CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OrientedTranspose", &impl->OrientedTranspose)); -+ CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedTranspose", &impl->CurlOrientedTranspose)); -+ CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedUnsignedTranspose", &impl->CurlOrientedUnsignedTranspose)); - } else { - CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTransposeDet", &impl->OffsetTransposeDet)); -+ CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OrientedTransposeDet", &impl->OrientedTransposeDet)); -+ CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedTransposeDet", &impl->CurlOrientedTransposeDet)); -+ CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedUnsignedTransposeDet", &impl->CurlOrientedUnsignedTransposeDet)); - } - CeedCallBackend(CeedFree(&restriction_kernel_path)); - CeedCallBackend(CeedFree(&restriction_kernel_source)); - - // Register backend functions -- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "Apply", CeedElemRestrictionApply_Cuda)); -- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "ApplyUnsigned", CeedElemRestrictionApply_Cuda)); -- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "ApplyUnoriented", CeedElemRestrictionApply_Cuda)); -- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "GetOffsets", CeedElemRestrictionGetOffsets_Cuda)); -- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "Destroy", CeedElemRestrictionDestroy_Cuda)); -+ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Apply", CeedElemRestrictionApply_Cuda)); -+ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnsigned", CeedElemRestrictionApplyUnsigned_Cuda)); -+ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnoriented", CeedElemRestrictionApplyUnoriented_Cuda)); -+ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOffsets", 
CeedElemRestrictionGetOffsets_Cuda)); -+ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOrientations", CeedElemRestrictionGetOrientations_Cuda)); -+ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetCurlOrientations", CeedElemRestrictionGetCurlOrientations_Cuda)); -+ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Cuda)); - return CEED_ERROR_SUCCESS; - } - -diff --git a/backends/cuda-ref/ceed-cuda-ref.h b/backends/cuda-ref/ceed-cuda-ref.h -index 309c1056..93008817 100644 ---- a/backends/cuda-ref/ceed-cuda-ref.h -+++ b/backends/cuda-ref/ceed-cuda-ref.h -@@ -30,6 +30,15 @@ typedef struct { - CUfunction OffsetNoTranspose; - CUfunction OffsetTranspose; - CUfunction OffsetTransposeDet; -+ CUfunction OrientedNoTranspose; -+ CUfunction OrientedTranspose; -+ CUfunction OrientedTransposeDet; -+ CUfunction CurlOrientedNoTranspose; -+ CUfunction CurlOrientedTranspose; -+ CUfunction CurlOrientedTransposeDet; -+ CUfunction CurlOrientedUnsignedNoTranspose; -+ CUfunction CurlOrientedUnsignedTranspose; -+ CUfunction CurlOrientedUnsignedTransposeDet; - CeedInt num_nodes; - CeedInt *h_ind; - CeedInt *h_ind_allocated; -@@ -37,7 +46,15 @@ typedef struct { - CeedInt *d_ind_allocated; - CeedInt *d_t_offsets; - CeedInt *d_t_indices; -- CeedInt *d_l_vec_indices; -+ CeedInt *d_l_indices; -+ bool *h_orients; -+ bool *h_orients_allocated; -+ bool *d_orients; -+ bool *d_orients_allocated; -+ CeedInt8 *h_curl_orients; -+ CeedInt8 *h_curl_orients_allocated; -+ CeedInt8 *d_curl_orients; -+ CeedInt8 *d_curl_orients_allocated; - } CeedElemRestriction_Cuda; - - typedef struct { -@@ -80,21 +97,19 @@ typedef struct { - - typedef struct { - CUmodule module; -- CUfunction linearDiagonal; -- CUfunction linearPointBlock; -- CeedBasis basis_in, basis_out; -+ CUfunction LinearDiagonal; -+ CUfunction LinearPointBlock; - CeedElemRestriction diag_rstr, point_block_diag_rstr; - CeedVector elem_diag, point_block_elem_diag; -- CeedInt num_e_mode_in, num_e_mode_out, num_nodes; -- CeedEvalMode *h_e_mode_in, *h_e_mode_out; -- CeedEvalMode *d_e_mode_in, *d_e_mode_out; -- CeedScalar *d_identity, *d_interp_in, *d_interp_out, *d_grad_in, *d_grad_out; -+ CeedEvalMode *d_eval_modes_in, *d_eval_modes_out; -+ CeedScalar *d_identity, *d_interp_in, *d_grad_in, *d_div_in, *d_curl_in; -+ CeedScalar *d_interp_out, *d_grad_out, *d_div_out, *d_curl_out; - } CeedOperatorDiag_Cuda; - - typedef struct { - CUmodule module; -- CUfunction linearAssemble; -- CeedInt num_elem, block_size_x, block_size_y, elem_per_block; -+ CUfunction LinearAssemble; -+ CeedInt block_size_x, block_size_y, elems_per_block; - CeedScalar *d_B_in, *d_B_out; - } CeedOperatorAssemble_Cuda; - -diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c -index 486269bb..023b3d8a 100644 ---- a/backends/hip-ref/ceed-hip-ref-operator.c -+++ b/backends/hip-ref/ceed-hip-ref-operator.c -@@ -53,15 +53,18 @@ static int CeedOperatorDestroy_Hip(CeedOperator op) { - - CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedCallHip(ceed, hipModuleUnload(impl->diag->module)); -- CeedCallBackend(CeedFree(&impl->diag->h_e_mode_in)); -- CeedCallBackend(CeedFree(&impl->diag->h_e_mode_out)); -- CeedCallHip(ceed, hipFree(impl->diag->d_e_mode_in)); -- CeedCallHip(ceed, hipFree(impl->diag->d_e_mode_out)); -+ CeedCallHip(ceed, hipFree(impl->diag->d_eval_modes_in)); -+ CeedCallHip(ceed, hipFree(impl->diag->d_eval_modes_out)); - CeedCallHip(ceed, 
hipFree(impl->diag->d_identity)); - CeedCallHip(ceed, hipFree(impl->diag->d_interp_in)); - CeedCallHip(ceed, hipFree(impl->diag->d_interp_out)); - CeedCallHip(ceed, hipFree(impl->diag->d_grad_in)); - CeedCallHip(ceed, hipFree(impl->diag->d_grad_out)); -+ CeedCallHip(ceed, hipFree(impl->diag->d_div_in)); -+ CeedCallHip(ceed, hipFree(impl->diag->d_div_out)); -+ CeedCallHip(ceed, hipFree(impl->diag->d_curl_in)); -+ CeedCallHip(ceed, hipFree(impl->diag->d_curl_out)); -+ CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->diag_rstr)); - CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->point_block_diag_rstr)); - CeedCallBackend(CeedVectorDestroy(&impl->diag->elem_diag)); - CeedCallBackend(CeedVectorDestroy(&impl->diag->point_block_elem_diag)); -@@ -102,30 +105,29 @@ static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool i - - // Loop over fields - for (CeedInt i = 0; i < num_fields; i++) { -- bool is_strided, skip_restriction; -- CeedSize q_size; -- CeedInt dim, size; -- CeedEvalMode e_mode; -- CeedVector vec; -- CeedElemRestriction elem_rstr; -- CeedBasis basis; -+ bool is_strided = false, skip_restriction = false; -+ CeedSize q_size; -+ CeedInt size; -+ CeedEvalMode eval_mode; -+ CeedBasis basis; - -- CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &e_mode)); -- is_strided = false; -- skip_restriction = false; -- if (e_mode != CEED_EVAL_WEIGHT) { -- CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr)); -+ CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); -+ if (eval_mode != CEED_EVAL_WEIGHT) { -+ CeedElemRestriction elem_rstr; - - // Check whether this field can skip the element restriction: -- // must be passive input, with e_mode NONE, and have a strided restriction with CEED_STRIDES_BACKEND. -+ // Must be passive input, with eval_mode NONE, and have a strided restriction with CEED_STRIDES_BACKEND. 
-+ CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr)); - - // First, check whether the field is input or output: - if (is_input) { -- // Check for passive input: -+ CeedVector vec; -+ -+ // Check for passive input - CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); - if (vec != CEED_VECTOR_ACTIVE) { -- // Check e_mode -- if (e_mode == CEED_EVAL_NONE) { -+ // Check eval_mode -+ if (eval_mode == CEED_EVAL_NONE) { - // Check for strided restriction - CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); - if (is_strided) { -@@ -143,21 +145,17 @@ static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool i - } - } - -- switch (e_mode) { -+ switch (eval_mode) { - case CEED_EVAL_NONE: - CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); - q_size = (CeedSize)num_elem * Q * size; - CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); - break; - case CEED_EVAL_INTERP: -- CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); -- q_size = (CeedSize)num_elem * Q * size; -- CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); -- break; - case CEED_EVAL_GRAD: -- CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); -+ case CEED_EVAL_DIV: -+ case CEED_EVAL_CURL: - CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); -- CeedCallBackend(CeedBasisGetDimension(basis, &dim)); - q_size = (CeedSize)num_elem * Q * size; - CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); - break; -@@ -167,10 +165,6 @@ static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool i - CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); - CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i])); - break; -- case CEED_EVAL_DIV: -- break; // TODO: Not implemented -- case CEED_EVAL_CURL: -- break; // TODO: Not implemented - } - } - return CEED_ERROR_SUCCESS; -@@ -201,17 +195,14 @@ static int CeedOperatorSetup_Hip(CeedOperator op) { - - // Allocate - CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs)); -- - CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in)); - CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out)); -- - impl->num_inputs = num_input_fields; - impl->num_outputs = num_output_fields; - - // Set up infield and outfield e_vecs and q_vecs - // Infields - CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, true, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem)); -- - // Outfields - CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields, Q, num_elem)); - -@@ -226,7 +217,7 @@ static inline int CeedOperatorSetupInputs_Hip(CeedInt num_input_fields, CeedQFun - CeedVector in_vec, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], - CeedOperator_Hip *impl, CeedRequest *request) { - for (CeedInt i = 0; i < num_input_fields; i++) { -- CeedEvalMode e_mode; -+ CeedEvalMode eval_mode; - CeedVector vec; - CeedElemRestriction elem_rstr; - -@@ -237,8 +228,8 @@ static inline int CeedOperatorSetupInputs_Hip(CeedInt num_input_fields, CeedQFun - else vec = in_vec; - } - -- CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &e_mode)); -- if (e_mode == CEED_EVAL_WEIGHT) { // Skip -+ CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); -+ if (eval_mode == CEED_EVAL_WEIGHT) { // Skip - } else { - // Get input vector - 
CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); -@@ -267,7 +258,7 @@ static inline int CeedOperatorInputBasis_Hip(CeedInt num_elem, CeedQFunctionFiel - CeedOperator_Hip *impl) { - for (CeedInt i = 0; i < num_input_fields; i++) { - CeedInt elem_size, size; -- CeedEvalMode e_mode; -+ CeedEvalMode eval_mode; - CeedElemRestriction elem_rstr; - CeedBasis basis; - -@@ -278,30 +269,25 @@ static inline int CeedOperatorInputBasis_Hip(CeedInt num_elem, CeedQFunctionFiel - CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) continue; - } -- // Get elem_size, e_mode, size -+ // Get elem_size, eval_mode, size - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); - CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); -- CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &e_mode)); -+ CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); - CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size)); - // Basis action -- switch (e_mode) { -+ switch (eval_mode) { - case CEED_EVAL_NONE: - CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i])); - break; - case CEED_EVAL_INTERP: -- CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); -- CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, impl->e_vecs[i], impl->q_vecs_in[i])); -- break; - case CEED_EVAL_GRAD: -+ case CEED_EVAL_DIV: -+ case CEED_EVAL_CURL: - CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); -- CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, impl->e_vecs[i], impl->q_vecs_in[i])); -+ CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, eval_mode, impl->e_vecs[i], impl->q_vecs_in[i])); - break; - case CEED_EVAL_WEIGHT: - break; // No action -- case CEED_EVAL_DIV: -- break; // TODO: Not implemented -- case CEED_EVAL_CURL: -- break; // TODO: Not implemented - } - } - return CEED_ERROR_SUCCESS; -@@ -313,15 +299,16 @@ static inline int CeedOperatorInputBasis_Hip(CeedInt num_elem, CeedQFunctionFiel - static inline int CeedOperatorRestoreInputs_Hip(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, - const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Hip *impl) { - for (CeedInt i = 0; i < num_input_fields; i++) { -- CeedEvalMode e_mode; -+ CeedEvalMode eval_mode; - CeedVector vec; -+ - // Skip active input - if (skip_active) { - CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) continue; - } -- CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &e_mode)); -- if (e_mode == CEED_EVAL_WEIGHT) { // Skip -+ CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); -+ if (eval_mode == CEED_EVAL_WEIGHT) { // Skip - } else { - if (!impl->e_vecs[i]) { // This was a skip_restriction case - CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); -@@ -363,10 +350,10 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect - - // Output pointers, as necessary - for (CeedInt i = 0; i < num_output_fields; i++) { -- CeedEvalMode e_mode; -+ CeedEvalMode eval_mode; - -- CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &e_mode)); -- if (e_mode == CEED_EVAL_NONE) { -+ 
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); -+ if (eval_mode == CEED_EVAL_NONE) { - // Set the output Q-Vector to use the E-Vector data directly. - CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[i + impl->num_inputs], CEED_MEM_DEVICE, &e_data[i + num_input_fields])); - CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i + num_input_fields])); -@@ -378,26 +365,25 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect - - // Output basis apply if needed - for (CeedInt i = 0; i < num_output_fields; i++) { -- CeedEvalMode e_mode; -+ CeedEvalMode eval_mode; - CeedElemRestriction elem_rstr; - CeedBasis basis; - -- // Get elem_size, e_mode, size -+ // Get elem_size, eval_mode, size - CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); - CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); -- CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &e_mode)); -+ CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); - CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size)); - // Basis action -- switch (e_mode) { -+ switch (eval_mode) { - case CEED_EVAL_NONE: -- break; -+ break; // No action - case CEED_EVAL_INTERP: -- CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); -- CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, CEED_EVAL_INTERP, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs])); -- break; - case CEED_EVAL_GRAD: -+ case CEED_EVAL_DIV: -+ case CEED_EVAL_CURL: - CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); -- CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, CEED_EVAL_GRAD, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs])); -+ CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs])); - break; - // LCOV_EXCL_START - case CEED_EVAL_WEIGHT: { -@@ -405,25 +391,20 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect - - CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); -- break; // Should not occur -+ // LCOV_EXCL_STOP - } -- case CEED_EVAL_DIV: -- break; // TODO: Not implemented -- case CEED_EVAL_CURL: -- break; // TODO: Not implemented -- // LCOV_EXCL_STOP - } - } - - // Output restriction - for (CeedInt i = 0; i < num_output_fields; i++) { -- CeedEvalMode e_mode; -+ CeedEvalMode eval_mode; - CeedVector vec; - CeedElemRestriction elem_rstr; - - // Restore evec -- CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &e_mode)); -- if (e_mode == CEED_EVAL_NONE) { -+ CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); -+ if (eval_mode == CEED_EVAL_NONE) { - CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields])); - } - // Get output vector -@@ -442,15 +423,14 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect - } - - //------------------------------------------------------------------------------ --// Core code for assembling linear QFunction -+// Linear QFunction Assembly Core - //------------------------------------------------------------------------------ - static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, bool 
build_objects, CeedVector *assembled, CeedElemRestriction *rstr, - CeedRequest *request) { - Ceed ceed, ceed_parent; -- CeedSize q_size; - CeedInt num_active_in, num_active_out, Q, num_elem, num_input_fields, num_output_fields, size; - CeedScalar *assembled_array, *e_data[2 * CEED_FIELD_MAX] = {NULL}; -- CeedVector *active_in; -+ CeedVector *active_inputs; - CeedQFunctionField *qf_input_fields, *qf_output_fields; - CeedQFunction qf; - CeedOperatorField *op_input_fields, *op_output_fields; -@@ -459,14 +439,13 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b - CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedCallBackend(CeedOperatorGetFallbackParentCeed(op, &ceed_parent)); - CeedCallBackend(CeedOperatorGetData(op, &impl)); -- CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); - CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); - CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); -+ CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); - CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); - CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); -- active_in = impl->qf_active_in; -- num_active_in = impl->num_active_in; -- num_active_out = impl->num_active_out; -+ active_inputs = impl->qf_active_in; -+ num_active_in = impl->num_active_in, num_active_out = impl->num_active_out; - - // Setup - CeedCallBackend(CeedOperatorSetup_Hip(op)); -@@ -487,19 +466,20 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b - CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size)); - CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0)); - CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, &q_vec_array)); -- CeedCallBackend(CeedRealloc(num_active_in + size, &active_in)); -+ CeedCallBackend(CeedRealloc(num_active_in + size, &active_inputs)); - for (CeedInt field = 0; field < size; field++) { -- q_size = (CeedSize)Q * num_elem; -- CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_in[num_active_in + field])); -+ CeedSize q_size = (CeedSize)Q * num_elem; -+ -+ CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_inputs[num_active_in + field])); - CeedCallBackend( -- CeedVectorSetArray(active_in[num_active_in + field], CEED_MEM_DEVICE, CEED_USE_POINTER, &q_vec_array[field * Q * num_elem])); -+ CeedVectorSetArray(active_inputs[num_active_in + field], CEED_MEM_DEVICE, CEED_USE_POINTER, &q_vec_array[field * Q * num_elem])); - } - num_active_in += size; - CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array)); - } - } - impl->num_active_in = num_active_in; -- impl->qf_active_in = active_in; -+ impl->qf_active_in = active_inputs; - } - - // Count number of active output fields -@@ -523,10 +503,10 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b - - // Build objects if needed - if (build_objects) { -- // Create output restriction - CeedSize l_size = (CeedSize)num_elem * Q * num_active_in * num_active_out; - CeedInt strides[3] = {1, num_elem * Q, Q}; /* *NOPAD* */ - -+ // Create output restriction - CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, num_active_in * num_active_out, - num_active_in * num_active_out * num_elem * Q, strides, rstr)); - // Create assembled vector -@@ -541,9 +521,9 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b - // Assemble QFunction - for 
(CeedInt in = 0; in < num_active_in; in++) { - // Set Inputs -- CeedCallBackend(CeedVectorSetValue(active_in[in], 1.0)); -+ CeedCallBackend(CeedVectorSetValue(active_inputs[in], 1.0)); - if (num_active_in > 1) { -- CeedCallBackend(CeedVectorSetValue(active_in[(in + num_active_in - 1) % num_active_in], 0.0)); -+ CeedCallBackend(CeedVectorSetValue(active_inputs[(in + num_active_in - 1) % num_active_in], 0.0)); - } - // Set Outputs - for (CeedInt out = 0; out < num_output_fields; out++) { -@@ -562,7 +542,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b - CeedCallBackend(CeedQFunctionApply(qf, Q * num_elem, impl->q_vecs_in, impl->q_vecs_out)); - } - -- // Un-set output Qvecs to prevent accidental overwrite of Assembled -+ // Un-set output q_vecs to prevent accidental overwrite of Assembled - for (CeedInt out = 0; out < num_output_fields; out++) { - CeedVector vec; - -@@ -597,14 +577,14 @@ static int CeedOperatorLinearAssembleQFunctionUpdate_Hip(CeedOperator op, CeedVe - } - - //------------------------------------------------------------------------------ --// Assemble diagonal setup -+// Assemble Diagonal Setup - //------------------------------------------------------------------------------ - static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op, CeedInt use_ceedsize_idx) { - Ceed ceed; - char *diagonal_kernel_path, *diagonal_kernel_source; -- CeedInt num_input_fields, num_output_fields, num_e_mode_in = 0, num_comp = 0, dim = 1, num_e_mode_out = 0; -- CeedEvalMode *e_mode_in = NULL, *e_mode_out = NULL; -- CeedElemRestriction rstr_in = NULL, rstr_out = NULL; -+ CeedInt num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0; -+ CeedInt num_comp, q_comp, num_nodes, num_qpts; -+ CeedEvalMode *eval_modes_in = NULL, *eval_modes_out = NULL; - CeedBasis basis_in = NULL, basis_out = NULL; - CeedQFunctionField *qf_fields; - CeedQFunction qf; -@@ -623,33 +603,20 @@ static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op, CeedInt - - CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) { -- CeedEvalMode e_mode; -- CeedElemRestriction rstr; -- -- CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_in)); -- CeedCallBackend(CeedBasisGetNumComponents(basis_in, &num_comp)); -- CeedCallBackend(CeedBasisGetDimension(basis_in, &dim)); -- CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr)); -- CeedCheck(!rstr_in || rstr_in == rstr, ceed, CEED_ERROR_BACKEND, -- "Backend does not implement multi-field non-composite operator diagonal assembly"); -- rstr_in = rstr; -- CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &e_mode)); -- switch (e_mode) { -- case CEED_EVAL_NONE: -- case CEED_EVAL_INTERP: -- CeedCallBackend(CeedRealloc(num_e_mode_in + 1, &e_mode_in)); -- e_mode_in[num_e_mode_in] = e_mode; -- num_e_mode_in += 1; -- break; -- case CEED_EVAL_GRAD: -- CeedCallBackend(CeedRealloc(num_e_mode_in + dim, &e_mode_in)); -- for (CeedInt d = 0; d < dim; d++) e_mode_in[num_e_mode_in + d] = e_mode; -- num_e_mode_in += dim; -- break; -- case CEED_EVAL_WEIGHT: -- case CEED_EVAL_DIV: -- case CEED_EVAL_CURL: -- break; // Caught by QF Assembly -+ CeedBasis basis; -+ CeedEvalMode eval_mode; -+ -+ CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); -+ CeedCheck(!basis_in || basis_in == basis, ceed, CEED_ERROR_BACKEND, -+ "Backend does not implement operator diagonal assembly with multiple active bases"); -+ basis_in = basis; -+ 
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); -+ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_mode, &q_comp)); -+ if (eval_mode != CEED_EVAL_WEIGHT) { -+ // q_comp = 1 if CEED_EVAL_NONE, CEED_EVAL_WEIGHT caught by QF assembly -+ CeedCallBackend(CeedRealloc(num_eval_modes_in + q_comp, &eval_modes_in)); -+ for (CeedInt d = 0; d < q_comp; d++) eval_modes_in[num_eval_modes_in + d] = eval_mode; -+ num_eval_modes_in += q_comp; - } - } - } -@@ -662,31 +629,20 @@ static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op, CeedInt - - CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) { -- CeedEvalMode e_mode; -- CeedElemRestriction rstr; -- -- CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_out)); -- CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr)); -- CeedCheck(!rstr_out || rstr_out == rstr, ceed, CEED_ERROR_BACKEND, -- "Backend does not implement multi-field non-composite operator diagonal assembly"); -- rstr_out = rstr; -- CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &e_mode)); -- switch (e_mode) { -- case CEED_EVAL_NONE: -- case CEED_EVAL_INTERP: -- CeedCallBackend(CeedRealloc(num_e_mode_out + 1, &e_mode_out)); -- e_mode_out[num_e_mode_out] = e_mode; -- num_e_mode_out += 1; -- break; -- case CEED_EVAL_GRAD: -- CeedCallBackend(CeedRealloc(num_e_mode_out + dim, &e_mode_out)); -- for (CeedInt d = 0; d < dim; d++) e_mode_out[num_e_mode_out + d] = e_mode; -- num_e_mode_out += dim; -- break; -- case CEED_EVAL_WEIGHT: -- case CEED_EVAL_DIV: -- case CEED_EVAL_CURL: -- break; // Caught by QF Assembly -+ CeedBasis basis; -+ CeedEvalMode eval_mode; -+ -+ CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); -+ CeedCheck(!basis_out || basis_out == basis, ceed, CEED_ERROR_BACKEND, -+ "Backend does not implement operator diagonal assembly with multiple active bases"); -+ basis_out = basis; -+ CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); -+ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_mode, &q_comp)); -+ if (eval_mode != CEED_EVAL_WEIGHT) { -+ // q_comp = 1 if CEED_EVAL_NONE, CEED_EVAL_WEIGHT caught by QF assembly -+ CeedCallBackend(CeedRealloc(num_eval_modes_out + q_comp, &eval_modes_out)); -+ for (CeedInt d = 0; d < q_comp; d++) eval_modes_out[num_eval_modes_out + d] = eval_mode; -+ num_eval_modes_out += q_comp; - } - } - } -@@ -696,95 +652,147 @@ static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op, CeedInt - CeedCallBackend(CeedCalloc(1, &impl->diag)); - CeedOperatorDiag_Hip *diag = impl->diag; - -- diag->basis_in = basis_in; -- diag->basis_out = basis_out; -- diag->h_e_mode_in = e_mode_in; -- diag->h_e_mode_out = e_mode_out; -- diag->num_e_mode_in = num_e_mode_in; -- diag->num_e_mode_out = num_e_mode_out; -- - // Assemble kernel -+ CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_nodes)); -+ CeedCallBackend(CeedBasisGetNumComponents(basis_in, &num_comp)); -+ if (basis_in == CEED_BASIS_NONE) num_qpts = num_nodes; -+ else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts)); - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h", &diagonal_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, diagonal_kernel_path, &diagonal_kernel_source)); - CeedDebug256(ceed, 
CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Source Complete! -----\n"); -- CeedInt num_modes, num_qpts; -- CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_modes)); -- CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts)); -- diag->num_modes = num_modes; -- CeedCallBackend(CeedCompile_Hip(ceed, diagonal_kernel_source, &diag->module, 6, "NUMEMODEIN", num_e_mode_in, "NUMEMODEOUT", num_e_mode_out, -- "NNODES", num_modes, "NQPTS", num_qpts, "NCOMP", num_comp, "CEEDSIZE", use_ceedsize_idx)); -- CeedCallBackend(CeedGetKernel_Hip(ceed, diag->module, "linearDiagonal", &diag->linearDiagonal)); -- CeedCallBackend(CeedGetKernel_Hip(ceed, diag->module, "linearPointBlockDiagonal", &diag->linearPointBlock)); -+ CeedCallHip(ceed, -+ CeedCompile_Hip(ceed, diagonal_kernel_source, &diag->module, 6, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT", -+ num_eval_modes_out, "NUM_COMP", num_comp, "NUM_NODES", num_nodes, "NUM_QPTS", num_qpts, "CEED_SIZE", use_ceedsize_idx)); -+ CeedCallHip(ceed, CeedGetKernel_Hip(ceed, diag->module, "LinearDiagonal", &diag->LinearDiagonal)); -+ CeedCallHip(ceed, CeedGetKernel_Hip(ceed, diag->module, "LinearPointBlockDiagonal", &diag->LinearPointBlock)); - CeedCallBackend(CeedFree(&diagonal_kernel_path)); - CeedCallBackend(CeedFree(&diagonal_kernel_source)); - - // Basis matrices -- const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); -- const CeedInt interp_bytes = q_bytes * num_modes; -- const CeedInt grad_bytes = q_bytes * num_modes * dim; -- const CeedInt e_mode_bytes = sizeof(CeedEvalMode); -- const CeedScalar *interp_in, *interp_out, *grad_in, *grad_out; -+ const CeedInt interp_bytes = num_nodes * num_qpts * sizeof(CeedScalar); -+ const CeedInt eval_modes_bytes = sizeof(CeedEvalMode); -+ bool has_eval_none = false; - - // CEED_EVAL_NONE -- CeedScalar *identity = NULL; -- bool is_eval_none = false; -- -- for (CeedInt i = 0; i < num_e_mode_in; i++) is_eval_none = is_eval_none || (e_mode_in[i] == CEED_EVAL_NONE); -- for (CeedInt i = 0; i < num_e_mode_out; i++) is_eval_none = is_eval_none || (e_mode_out[i] == CEED_EVAL_NONE); -- if (is_eval_none) { -- CeedCallBackend(CeedCalloc(num_qpts * num_modes, &identity)); -- for (CeedInt i = 0; i < (num_modes < num_qpts ? num_modes : num_qpts); i++) identity[i * num_modes + i] = 1.0; -+ for (CeedInt i = 0; i < num_eval_modes_in; i++) has_eval_none = has_eval_none || (eval_modes_in[i] == CEED_EVAL_NONE); -+ for (CeedInt i = 0; i < num_eval_modes_out; i++) has_eval_none = has_eval_none || (eval_modes_out[i] == CEED_EVAL_NONE); -+ if (has_eval_none) { -+ CeedScalar *identity = NULL; -+ -+ CeedCallBackend(CeedCalloc(num_nodes * num_qpts, &identity)); -+ for (CeedInt i = 0; i < (num_nodes < num_qpts ? num_nodes : num_qpts); i++) identity[i * num_nodes + i] = 1.0; - CeedCallHip(ceed, hipMalloc((void **)&diag->d_identity, interp_bytes)); - CeedCallHip(ceed, hipMemcpy(diag->d_identity, identity, interp_bytes, hipMemcpyHostToDevice)); -+ CeedCallBackend(CeedFree(&identity)); -+ } -+ -+ // CEED_EVAL_INTERP, CEED_EVAL_GRAD, CEED_EVAL_DIV, and CEED_EVAL_CURL -+ for (CeedInt in = 0; in < 2; in++) { -+ CeedFESpace fespace; -+ CeedBasis basis = in ? 
basis_in : basis_out; -+ -+ CeedCallBackend(CeedBasisGetFESpace(basis, &fespace)); -+ switch (fespace) { -+ case CEED_FE_SPACE_H1: { -+ CeedInt q_comp_interp, q_comp_grad; -+ const CeedScalar *interp, *grad; -+ CeedScalar *d_interp, *d_grad; -+ -+ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); -+ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad)); -+ -+ CeedCallBackend(CeedBasisGetInterp(basis, &interp)); -+ CeedCallHip(ceed, hipMalloc((void **)&d_interp, interp_bytes * q_comp_interp)); -+ CeedCallHip(ceed, hipMemcpy(d_interp, interp, interp_bytes * q_comp_interp, hipMemcpyHostToDevice)); -+ CeedCallBackend(CeedBasisGetGrad(basis, &grad)); -+ CeedCallHip(ceed, hipMalloc((void **)&d_grad, interp_bytes * q_comp_grad)); -+ CeedCallHip(ceed, hipMemcpy(d_grad, grad, interp_bytes * q_comp_grad, hipMemcpyHostToDevice)); -+ if (in) { -+ diag->d_interp_in = d_interp; -+ diag->d_grad_in = d_grad; -+ } else { -+ diag->d_interp_out = d_interp; -+ diag->d_grad_out = d_grad; -+ } -+ } break; -+ case CEED_FE_SPACE_HDIV: { -+ CeedInt q_comp_interp, q_comp_div; -+ const CeedScalar *interp, *div; -+ CeedScalar *d_interp, *d_div; -+ -+ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); -+ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div)); -+ -+ CeedCallBackend(CeedBasisGetInterp(basis, &interp)); -+ CeedCallHip(ceed, hipMalloc((void **)&d_interp, interp_bytes * q_comp_interp)); -+ CeedCallHip(ceed, hipMemcpy(d_interp, interp, interp_bytes * q_comp_interp, hipMemcpyHostToDevice)); -+ CeedCallBackend(CeedBasisGetDiv(basis, &div)); -+ CeedCallHip(ceed, hipMalloc((void **)&d_div, interp_bytes * q_comp_div)); -+ CeedCallHip(ceed, hipMemcpy(d_div, div, interp_bytes * q_comp_div, hipMemcpyHostToDevice)); -+ if (in) { -+ diag->d_interp_in = d_interp; -+ diag->d_div_in = d_div; -+ } else { -+ diag->d_interp_out = d_interp; -+ diag->d_div_out = d_div; -+ } -+ } break; -+ case CEED_FE_SPACE_HCURL: { -+ CeedInt q_comp_interp, q_comp_curl; -+ const CeedScalar *interp, *curl; -+ CeedScalar *d_interp, *d_curl; -+ -+ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); -+ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl)); -+ -+ CeedCallBackend(CeedBasisGetInterp(basis, &interp)); -+ CeedCallHip(ceed, hipMalloc((void **)&d_interp, interp_bytes * q_comp_interp)); -+ CeedCallHip(ceed, hipMemcpy(d_interp, interp, interp_bytes * q_comp_interp, hipMemcpyHostToDevice)); -+ CeedCallBackend(CeedBasisGetCurl(basis, &curl)); -+ CeedCallHip(ceed, hipMalloc((void **)&d_curl, interp_bytes * q_comp_curl)); -+ CeedCallHip(ceed, hipMemcpy(d_curl, curl, interp_bytes * q_comp_curl, hipMemcpyHostToDevice)); -+ if (in) { -+ diag->d_interp_in = d_interp; -+ diag->d_curl_in = d_curl; -+ } else { -+ diag->d_interp_out = d_interp; -+ diag->d_curl_out = d_curl; -+ } -+ } break; -+ } - } - -- // CEED_EVAL_INTERP -- CeedCallBackend(CeedBasisGetInterp(basis_in, &interp_in)); -- CeedCallHip(ceed, hipMalloc((void **)&diag->d_interp_in, interp_bytes)); -- CeedCallHip(ceed, hipMemcpy(diag->d_interp_in, interp_in, interp_bytes, hipMemcpyHostToDevice)); -- CeedCallBackend(CeedBasisGetInterp(basis_out, &interp_out)); -- CeedCallHip(ceed, hipMalloc((void **)&diag->d_interp_out, interp_bytes)); -- CeedCallHip(ceed, hipMemcpy(diag->d_interp_out, interp_out, interp_bytes, hipMemcpyHostToDevice)); -- -- // 
CEED_EVAL_GRAD -- CeedCallBackend(CeedBasisGetGrad(basis_in, &grad_in)); -- CeedCallHip(ceed, hipMalloc((void **)&diag->d_grad_in, grad_bytes)); -- CeedCallHip(ceed, hipMemcpy(diag->d_grad_in, grad_in, grad_bytes, hipMemcpyHostToDevice)); -- CeedCallBackend(CeedBasisGetGrad(basis_out, &grad_out)); -- CeedCallHip(ceed, hipMalloc((void **)&diag->d_grad_out, grad_bytes)); -- CeedCallHip(ceed, hipMemcpy(diag->d_grad_out, grad_out, grad_bytes, hipMemcpyHostToDevice)); -- -- // Arrays of e_modes -- CeedCallHip(ceed, hipMalloc((void **)&diag->d_e_mode_in, num_e_mode_in * e_mode_bytes)); -- CeedCallHip(ceed, hipMemcpy(diag->d_e_mode_in, e_mode_in, num_e_mode_in * e_mode_bytes, hipMemcpyHostToDevice)); -- CeedCallHip(ceed, hipMalloc((void **)&diag->d_e_mode_out, num_e_mode_out * e_mode_bytes)); -- CeedCallHip(ceed, hipMemcpy(diag->d_e_mode_out, e_mode_out, num_e_mode_out * e_mode_bytes, hipMemcpyHostToDevice)); -- -- // Restriction -- diag->diag_rstr = rstr_out; -+ // Arrays of eval_modes -+ CeedCallHip(ceed, hipMalloc((void **)&diag->d_eval_modes_in, num_eval_modes_in * eval_modes_bytes)); -+ CeedCallHip(ceed, hipMemcpy(diag->d_eval_modes_in, eval_modes_in, num_eval_modes_in * eval_modes_bytes, hipMemcpyHostToDevice)); -+ CeedCallHip(ceed, hipMalloc((void **)&diag->d_eval_modes_out, num_eval_modes_out * eval_modes_bytes)); -+ CeedCallHip(ceed, hipMemcpy(diag->d_eval_modes_out, eval_modes_out, num_eval_modes_out * eval_modes_bytes, hipMemcpyHostToDevice)); -+ CeedCallBackend(CeedFree(&eval_modes_in)); -+ CeedCallBackend(CeedFree(&eval_modes_out)); - return CEED_ERROR_SUCCESS; - } - - //------------------------------------------------------------------------------ --// Assemble diagonal common code -+// Assemble Diagonal Core - //------------------------------------------------------------------------------ - static inline int CeedOperatorAssembleDiagonalCore_Hip(CeedOperator op, CeedVector assembled, CeedRequest *request, const bool is_point_block) { - Ceed ceed; -- CeedSize assembled_length = 0, assembled_qf_length = 0; -- CeedInt use_ceedsize_idx = 0, num_elem; -+ CeedSize assembled_length, assembled_qf_length; -+ CeedInt use_ceedsize_idx = 0, num_elem, num_nodes; - CeedScalar *elem_diag_array; - const CeedScalar *assembled_qf_array; -- CeedVector assembled_qf = NULL; -- CeedElemRestriction rstr = NULL; -+ CeedVector assembled_qf = NULL, elem_diag; -+ CeedElemRestriction assembled_rstr = NULL, rstr_in, rstr_out, diag_rstr; - CeedOperator_Hip *impl; - - CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedCallBackend(CeedOperatorGetData(op, &impl)); - - // Assemble QFunction -- CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &rstr, request)); -- CeedCallBackend(CeedElemRestrictionDestroy(&rstr)); -+ CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &assembled_rstr, request)); -+ CeedCallBackend(CeedElemRestrictionDestroy(&assembled_rstr)); -+ CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &assembled_qf_array)); - - CeedCallBackend(CeedVectorGetLength(assembled, &assembled_length)); - CeedCallBackend(CeedVectorGetLength(assembled_qf, &assembled_qf_length)); -@@ -796,37 +804,37 @@ static inline int CeedOperatorAssembleDiagonalCore_Hip(CeedOperator op, CeedVect - - assert(diag != NULL); - -- // Restriction -- if (is_point_block && !diag->point_block_diag_rstr) { -- CeedCallBackend(CeedOperatorCreateActivePointBlockRestriction(diag->diag_rstr, &diag->point_block_diag_rstr)); -- } -- CeedElemRestriction 
diag_rstr = is_point_block ? diag->point_block_diag_rstr : diag->diag_rstr; -- -- // Create diagonal vector -- CeedVector elem_diag = is_point_block ? diag->point_block_elem_diag : diag->elem_diag; -- -- if (!elem_diag) { -- CeedCallBackend(CeedElemRestrictionCreateVector(diag_rstr, NULL, &elem_diag)); -- if (is_point_block) diag->point_block_elem_diag = elem_diag; -- else diag->elem_diag = elem_diag; -+ // Restriction and diagonal vector -+ CeedCallBackend(CeedOperatorGetActiveElemRestrictions(op, &rstr_in, &rstr_out)); -+ CeedCheck(rstr_in == rstr_out, ceed, CEED_ERROR_BACKEND, -+ "Cannot assemble operator diagonal with different input and output active element restrictions"); -+ if (!is_point_block && !diag->diag_rstr) { -+ CeedCallBackend(CeedElemRestrictionCreateUnsignedCopy(rstr_out, &diag->diag_rstr)); -+ CeedCallBackend(CeedElemRestrictionCreateVector(diag->diag_rstr, NULL, &diag->elem_diag)); -+ } else if (is_point_block && !diag->point_block_diag_rstr) { -+ CeedCallBackend(CeedOperatorCreateActivePointBlockRestriction(rstr_out, &diag->point_block_diag_rstr)); -+ CeedCallBackend(CeedElemRestrictionCreateVector(diag->point_block_diag_rstr, NULL, &diag->point_block_elem_diag)); - } -+ diag_rstr = is_point_block ? diag->point_block_diag_rstr : diag->diag_rstr; -+ elem_diag = is_point_block ? diag->point_block_elem_diag : diag->elem_diag; - CeedCallBackend(CeedVectorSetValue(elem_diag, 0.0)); - - // Assemble element operator diagonals - CeedCallBackend(CeedVectorGetArray(elem_diag, CEED_MEM_DEVICE, &elem_diag_array)); -- CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &assembled_qf_array)); - CeedCallBackend(CeedElemRestrictionGetNumElements(diag_rstr, &num_elem)); -+ CeedCallBackend(CeedElemRestrictionGetElementSize(diag_rstr, &num_nodes)); - - // Compute the diagonal of B^T D B -- int elem_per_block = 1; -- int grid = num_elem / elem_per_block + ((num_elem / elem_per_block * elem_per_block < num_elem) ? 
1 : 0); -- void *args[] = {(void *)&num_elem, &diag->d_identity, &diag->d_interp_in, &diag->d_grad_in, &diag->d_interp_out, -- &diag->d_grad_out, &diag->d_e_mode_in, &diag->d_e_mode_out, &assembled_qf_array, &elem_diag_array}; -+ CeedInt elems_per_block = 1; -+ CeedInt grid = CeedDivUpInt(num_elem, elems_per_block); -+ void *args[] = {(void *)&num_elem, &diag->d_identity, &diag->d_interp_in, &diag->d_grad_in, &diag->d_div_in, -+ &diag->d_curl_in, &diag->d_interp_out, &diag->d_grad_out, &diag->d_div_out, &diag->d_curl_out, -+ &diag->d_eval_modes_in, &diag->d_eval_modes_out, &assembled_qf_array, &elem_diag_array}; - - if (is_point_block) { -- CeedCallBackend(CeedRunKernelDim_Hip(ceed, diag->linearPointBlock, grid, diag->num_modes, 1, elem_per_block, args)); -+ CeedCallBackend(CeedRunKernelDim_Hip(ceed, diag->LinearPointBlock, grid, num_nodes, 1, elems_per_block, args)); - } else { -- CeedCallBackend(CeedRunKernelDim_Hip(ceed, diag->linearDiagonal, grid, diag->num_modes, 1, elem_per_block, args)); -+ CeedCallBackend(CeedRunKernelDim_Hip(ceed, diag->LinearDiagonal, grid, num_nodes, 1, elems_per_block, args)); - } - - // Restore arrays -@@ -858,13 +866,14 @@ static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Hip(CeedOperator op, - } - - //------------------------------------------------------------------------------ --// Single operator assembly setup -+// Single Operator Assembly Setup - //------------------------------------------------------------------------------ - static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceedsize_idx) { -- Ceed ceed; -- CeedInt num_input_fields, num_output_fields, num_e_mode_in = 0, dim = 1, num_B_in_mats_to_load = 0, size_B_in = 0, num_qpts = 0, elem_size = 0, -- num_e_mode_out = 0, num_B_out_mats_to_load = 0, size_B_out = 0, num_elem, num_comp; -- CeedEvalMode *eval_mode_in = NULL, *eval_mode_out = NULL; -+ Ceed ceed; -+ char *assembly_kernel_path, *assembly_kernel_source; -+ CeedInt num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0; -+ CeedInt elem_size_in, num_qpts_in, num_comp_in, elem_size_out, num_qpts_out, num_comp_out, q_comp; -+ CeedEvalMode *eval_modes_in = NULL, *eval_modes_out = NULL; - CeedElemRestriction rstr_in = NULL, rstr_out = NULL; - CeedBasis basis_in = NULL, basis_out = NULL; - CeedQFunctionField *qf_fields; -@@ -881,34 +890,30 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed - // Determine active input basis eval mode - CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); - CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); -- // Note that the kernel will treat each dimension of a gradient action separately; -- // i.e., when an active input has a CEED_EVAL_GRAD mode, num_e_mode_in will increment by dim. -- // However, for the purposes of loading the B matrices, it will be treated as one mode, and we will load/copy the entire gradient matrix at once, so -- // num_B_in_mats_to_load will be incremented by 1. 
- for (CeedInt i = 0; i < num_input_fields; i++) { - CeedVector vec; - - CeedCallBackend(CeedOperatorFieldGetVector(input_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) { -+ CeedBasis basis; - CeedEvalMode eval_mode; - -- CeedCallBackend(CeedOperatorFieldGetBasis(input_fields[i], &basis_in)); -- CeedCallBackend(CeedBasisGetDimension(basis_in, &dim)); -- CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts)); -+ CeedCallBackend(CeedOperatorFieldGetBasis(input_fields[i], &basis)); -+ CeedCheck(!basis_in || basis_in == basis, ceed, CEED_ERROR_BACKEND, "Backend does not implement operator assembly with multiple active bases"); -+ basis_in = basis; - CeedCallBackend(CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr_in)); -- CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size)); -+ CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size_in)); -+ if (basis_in == CEED_BASIS_NONE) num_qpts_in = elem_size_in; -+ else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts_in)); - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); -- if (eval_mode != CEED_EVAL_NONE) { -- CeedCallBackend(CeedRealloc(num_B_in_mats_to_load + 1, &eval_mode_in)); -- eval_mode_in[num_B_in_mats_to_load] = eval_mode; -- num_B_in_mats_to_load += 1; -- if (eval_mode == CEED_EVAL_GRAD) { -- num_e_mode_in += dim; -- size_B_in += dim * elem_size * num_qpts; -- } else { -- num_e_mode_in += 1; -- size_B_in += elem_size * num_qpts; -+ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_mode, &q_comp)); -+ if (eval_mode != CEED_EVAL_WEIGHT) { -+ // q_comp = 1 if CEED_EVAL_NONE, CEED_EVAL_WEIGHT caught by QF Assembly -+ CeedCallBackend(CeedRealloc(num_eval_modes_in + q_comp, &eval_modes_in)); -+ for (CeedInt d = 0; d < q_comp; d++) { -+ eval_modes_in[num_eval_modes_in + d] = eval_mode; - } -+ num_eval_modes_in += q_comp; - } - } - } -@@ -920,106 +925,133 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed - - CeedCallBackend(CeedOperatorFieldGetVector(output_fields[i], &vec)); - if (vec == CEED_VECTOR_ACTIVE) { -+ CeedBasis basis; - CeedEvalMode eval_mode; - -- CeedCallBackend(CeedOperatorFieldGetBasis(output_fields[i], &basis_out)); -+ CeedCallBackend(CeedOperatorFieldGetBasis(output_fields[i], &basis)); -+ CeedCheck(!basis_out || basis_out == basis, ceed, CEED_ERROR_BACKEND, -+ "Backend does not implement operator assembly with multiple active bases"); -+ basis_out = basis; - CeedCallBackend(CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr_out)); -- CeedCheck(!rstr_out || rstr_out == rstr_in, ceed, CEED_ERROR_BACKEND, "Backend does not implement multi-field non-composite operator assembly"); -+ CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_out, &elem_size_out)); -+ if (basis_out == CEED_BASIS_NONE) num_qpts_out = elem_size_out; -+ else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_out, &num_qpts_out)); -+ CeedCheck(num_qpts_in == num_qpts_out, ceed, CEED_ERROR_UNSUPPORTED, -+ "Active input and output bases must have the same number of quadrature points"); - CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); -- if (eval_mode != CEED_EVAL_NONE) { -- CeedCallBackend(CeedRealloc(num_B_out_mats_to_load + 1, &eval_mode_out)); -- eval_mode_out[num_B_out_mats_to_load] = eval_mode; -- num_B_out_mats_to_load += 1; -- if (eval_mode == CEED_EVAL_GRAD) { -- num_e_mode_out += dim; -- size_B_out += dim * elem_size * num_qpts; -- } else { -- num_e_mode_out 
+= 1; -- size_B_out += elem_size * num_qpts; -+ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_mode, &q_comp)); -+ if (eval_mode != CEED_EVAL_WEIGHT) { -+ // q_comp = 1 if CEED_EVAL_NONE, CEED_EVAL_WEIGHT caught by QF Assembly -+ CeedCallBackend(CeedRealloc(num_eval_modes_out + q_comp, &eval_modes_out)); -+ for (CeedInt d = 0; d < q_comp; d++) { -+ eval_modes_out[num_eval_modes_out + d] = eval_mode; - } -+ num_eval_modes_out += q_comp; - } - } - } -- -- CeedCheck(num_e_mode_in > 0 && num_e_mode_out > 0, ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs"); -- -- CeedCallBackend(CeedElemRestrictionGetNumElements(rstr_in, &num_elem)); -- CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_in, &num_comp)); -+ CeedCheck(num_eval_modes_in > 0 && num_eval_modes_out > 0, ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs"); - - CeedCallBackend(CeedCalloc(1, &impl->asmb)); - CeedOperatorAssemble_Hip *asmb = impl->asmb; -- asmb->num_elem = num_elem; -+ asmb->elems_per_block = 1; -+ asmb->block_size_x = elem_size_in; -+ asmb->block_size_y = elem_size_out; -+ -+ bool fallback = asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block > 1024; -+ -+ if (fallback) { -+ // Use fallback kernel with 1D threadblock -+ asmb->block_size_y = 1; -+ } - - // Compile kernels -- int elem_per_block = 1; -- asmb->elem_per_block = elem_per_block; -- CeedInt block_size = elem_size * elem_size * elem_per_block; -- char *assembly_kernel_path, *assembly_kernel_source; -+ CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_in, &num_comp_in)); -+ CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_out, &num_comp_out)); - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-operator-assemble.h", &assembly_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, assembly_kernel_path, &assembly_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Source Complete! -----\n"); -- bool fallback = block_size > 1024; -- if (fallback) { // Use fallback kernel with 1D threadblock -- block_size = elem_size * elem_per_block; -- asmb->block_size_x = elem_size; -- asmb->block_size_y = 1; -- } else { // Use kernel with 2D threadblock -- asmb->block_size_x = elem_size; -- asmb->block_size_y = elem_size; -- } -- CeedCallBackend(CeedCompile_Hip(ceed, assembly_kernel_source, &asmb->module, 8, "NELEM", num_elem, "NUMEMODEIN", num_e_mode_in, "NUMEMODEOUT", -- num_e_mode_out, "NQPTS", num_qpts, "NNODES", elem_size, "BLOCK_SIZE", block_size, "NCOMP", num_comp, "CEEDSIZE", -+ CeedCallBackend(CeedCompile_Hip(ceed, assembly_kernel_source, &asmb->module, 10, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT", -+ num_eval_modes_out, "NUM_COMP_IN", num_comp_in, "NUM_COMP_OUT", num_comp_out, "NUM_NODES_IN", elem_size_in, -+ "NUM_NODES_OUT", elem_size_out, "NUM_QPTS", num_qpts_in, "BLOCK_SIZE", -+ asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block, "BLOCK_SIZE_Y", asmb->block_size_y, "CEED_SIZE", - use_ceedsize_idx)); -- CeedCallBackend(CeedGetKernel_Hip(ceed, asmb->module, fallback ? 
"linearAssembleFallback" : "linearAssemble", &asmb->linearAssemble)); -+ CeedCallBackend(CeedGetKernel_Hip(ceed, asmb->module, "LinearAssemble", &asmb->LinearAssemble)); - CeedCallBackend(CeedFree(&assembly_kernel_path)); - CeedCallBackend(CeedFree(&assembly_kernel_source)); - -- // Build 'full' B matrices (not 1D arrays used for tensor-product matrices) -- const CeedScalar *interp_in, *grad_in; -- CeedCallBackend(CeedBasisGetInterp(basis_in, &interp_in)); -- CeedCallBackend(CeedBasisGetGrad(basis_in, &grad_in)); -- -- // Load into B_in, in order that they will be used in eval_mode -- const CeedInt in_bytes = size_B_in * sizeof(CeedScalar); -- CeedInt mat_start = 0; -- -- CeedCallHip(ceed, hipMalloc((void **)&asmb->d_B_in, in_bytes)); -- for (int i = 0; i < num_B_in_mats_to_load; i++) { -- CeedEvalMode eval_mode = eval_mode_in[i]; -- if (eval_mode == CEED_EVAL_INTERP) { -- CeedCallHip(ceed, hipMemcpy(&asmb->d_B_in[mat_start], interp_in, elem_size * num_qpts * sizeof(CeedScalar), hipMemcpyHostToDevice)); -- mat_start += elem_size * num_qpts; -- } else if (eval_mode == CEED_EVAL_GRAD) { -- CeedCallHip(ceed, hipMemcpy(&asmb->d_B_in[mat_start], grad_in, dim * elem_size * num_qpts * sizeof(CeedScalar), hipMemcpyHostToDevice)); -- mat_start += dim * elem_size * num_qpts; -+ // Load into B_in, in order that they will be used in eval_modes_in -+ { -+ const CeedInt in_bytes = elem_size_in * num_qpts_in * num_eval_modes_in * sizeof(CeedScalar); -+ CeedInt d_in = 0; -+ CeedEvalMode eval_modes_in_prev = CEED_EVAL_NONE; -+ bool has_eval_none = false; -+ CeedScalar *identity = NULL; -+ -+ for (CeedInt i = 0; i < num_eval_modes_in; i++) { -+ has_eval_none = has_eval_none || (eval_modes_in[i] == CEED_EVAL_NONE); -+ } -+ if (has_eval_none) { -+ CeedCallBackend(CeedCalloc(elem_size_in * num_qpts_in, &identity)); -+ for (CeedInt i = 0; i < (elem_size_in < num_qpts_in ? 
elem_size_in : num_qpts_in); i++) identity[i * elem_size_in + i] = 1.0; - } -- } - -- const CeedScalar *interp_out, *grad_out; -+ CeedCallHip(ceed, hipMalloc((void **)&asmb->d_B_in, in_bytes)); -+ for (CeedInt i = 0; i < num_eval_modes_in; i++) { -+ const CeedScalar *h_B_in; - -- // Note that this function currently assumes 1 basis, so this should always be true for now -- if (basis_out == basis_in) { -- interp_out = interp_in; -- grad_out = grad_in; -- } else { -- CeedCallBackend(CeedBasisGetInterp(basis_out, &interp_out)); -- CeedCallBackend(CeedBasisGetGrad(basis_out, &grad_out)); -+ CeedCallBackend(CeedOperatorGetBasisPointer(basis_in, eval_modes_in[i], identity, &h_B_in)); -+ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_modes_in[i], &q_comp)); -+ if (q_comp > 1) { -+ if (i == 0 || eval_modes_in[i] != eval_modes_in_prev) d_in = 0; -+ else h_B_in = &h_B_in[(++d_in) * elem_size_in * num_qpts_in]; -+ } -+ eval_modes_in_prev = eval_modes_in[i]; -+ -+ CeedCallHip(ceed, hipMemcpy(&asmb->d_B_in[i * elem_size_in * num_qpts_in], h_B_in, elem_size_in * num_qpts_in * sizeof(CeedScalar), -+ hipMemcpyHostToDevice)); -+ } -+ -+ if (identity) { -+ CeedCallBackend(CeedFree(&identity)); -+ } - } - -- // Load into B_out, in order that they will be used in eval_mode -- const CeedInt out_bytes = size_B_out * sizeof(CeedScalar); -- -- mat_start = 0; -- CeedCallHip(ceed, hipMalloc((void **)&asmb->d_B_out, out_bytes)); -- for (int i = 0; i < num_B_out_mats_to_load; i++) { -- CeedEvalMode eval_mode = eval_mode_out[i]; -- if (eval_mode == CEED_EVAL_INTERP) { -- CeedCallHip(ceed, hipMemcpy(&asmb->d_B_out[mat_start], interp_out, elem_size * num_qpts * sizeof(CeedScalar), hipMemcpyHostToDevice)); -- mat_start += elem_size * num_qpts; -- } else if (eval_mode == CEED_EVAL_GRAD) { -- CeedCallHip(ceed, hipMemcpy(&asmb->d_B_out[mat_start], grad_out, dim * elem_size * num_qpts * sizeof(CeedScalar), hipMemcpyHostToDevice)); -- mat_start += dim * elem_size * num_qpts; -+ // Load into B_out, in order that they will be used in eval_modes_out -+ { -+ const CeedInt out_bytes = elem_size_out * num_qpts_out * num_eval_modes_out * sizeof(CeedScalar); -+ CeedInt d_out = 0; -+ CeedEvalMode eval_modes_out_prev = CEED_EVAL_NONE; -+ bool has_eval_none = false; -+ CeedScalar *identity = NULL; -+ -+ for (CeedInt i = 0; i < num_eval_modes_out; i++) { -+ has_eval_none = has_eval_none || (eval_modes_out[i] == CEED_EVAL_NONE); -+ } -+ if (has_eval_none) { -+ CeedCallBackend(CeedCalloc(elem_size_out * num_qpts_out, &identity)); -+ for (CeedInt i = 0; i < (elem_size_out < num_qpts_out ? 
elem_size_out : num_qpts_out); i++) identity[i * elem_size_out + i] = 1.0; -+ } -+ -+ CeedCallHip(ceed, hipMalloc((void **)&asmb->d_B_out, out_bytes)); -+ for (CeedInt i = 0; i < num_eval_modes_out; i++) { -+ const CeedScalar *h_B_out; -+ -+ CeedCallBackend(CeedOperatorGetBasisPointer(basis_out, eval_modes_out[i], identity, &h_B_out)); -+ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_modes_out[i], &q_comp)); -+ if (q_comp > 1) { -+ if (i == 0 || eval_modes_out[i] != eval_modes_out_prev) d_out = 0; -+ else h_B_out = &h_B_out[(++d_out) * elem_size_out * num_qpts_out]; -+ } -+ eval_modes_out_prev = eval_modes_out[i]; -+ -+ CeedCallHip(ceed, hipMemcpy(&asmb->d_B_out[i * elem_size_out * num_qpts_out], h_B_out, elem_size_out * num_qpts_out * sizeof(CeedScalar), -+ hipMemcpyHostToDevice)); -+ } -+ -+ if (identity) { -+ CeedCallBackend(CeedFree(&identity)); - } - } - return CEED_ERROR_SUCCESS; -@@ -1036,47 +1068,96 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed - static int CeedSingleOperatorAssemble_Hip(CeedOperator op, CeedInt offset, CeedVector values) { - Ceed ceed; - CeedSize values_length = 0, assembled_qf_length = 0; -- CeedInt use_ceedsize_idx = 0; -+ CeedInt use_ceedsize_idx = 0, num_elem_in, num_elem_out, elem_size_in, elem_size_out; - CeedScalar *values_array; -- const CeedScalar *qf_array; -- CeedVector assembled_qf = NULL; -- CeedElemRestriction rstr_q = NULL; -+ const CeedScalar *assembled_qf_array; -+ CeedVector assembled_qf = NULL; -+ CeedElemRestriction assembled_rstr = NULL, rstr_in, rstr_out; -+ CeedRestrictionType rstr_type_in, rstr_type_out; -+ const bool *orients_in = NULL, *orients_out = NULL; -+ const CeedInt8 *curl_orients_in = NULL, *curl_orients_out = NULL; - CeedOperator_Hip *impl; - - CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedCallBackend(CeedOperatorGetData(op, &impl)); - - // Assemble QFunction -- CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &rstr_q, CEED_REQUEST_IMMEDIATE)); -- CeedCallBackend(CeedElemRestrictionDestroy(&rstr_q)); -- CeedCallBackend(CeedVectorGetArray(values, CEED_MEM_DEVICE, &values_array)); -- values_array += offset; -- CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &qf_array)); -+ CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &assembled_rstr, CEED_REQUEST_IMMEDIATE)); -+ CeedCallBackend(CeedElemRestrictionDestroy(&assembled_rstr)); -+ CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &assembled_qf_array)); - - CeedCallBackend(CeedVectorGetLength(values, &values_length)); - CeedCallBackend(CeedVectorGetLength(assembled_qf, &assembled_qf_length)); - if ((values_length > INT_MAX) || (assembled_qf_length > INT_MAX)) use_ceedsize_idx = 1; -+ - // Setup -- if (!impl->asmb) { -- CeedCallBackend(CeedSingleOperatorAssembleSetup_Hip(op, use_ceedsize_idx)); -- assert(impl->asmb != NULL); -+ if (!impl->asmb) CeedCallBackend(CeedSingleOperatorAssembleSetup_Hip(op, use_ceedsize_idx)); -+ CeedOperatorAssemble_Hip *asmb = impl->asmb; -+ -+ assert(asmb != NULL); -+ -+ // Assemble element operator -+ CeedCallBackend(CeedVectorGetArray(values, CEED_MEM_DEVICE, &values_array)); -+ values_array += offset; -+ -+ CeedCallBackend(CeedOperatorGetActiveElemRestrictions(op, &rstr_in, &rstr_out)); -+ CeedCallBackend(CeedElemRestrictionGetNumElements(rstr_in, &num_elem_in)); -+ CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size_in)); -+ -+ 
CeedCallBackend(CeedElemRestrictionGetType(rstr_in, &rstr_type_in)); -+ if (rstr_type_in == CEED_RESTRICTION_ORIENTED) { -+ CeedCallBackend(CeedElemRestrictionGetOrientations(rstr_in, CEED_MEM_DEVICE, &orients_in)); -+ } else if (rstr_type_in == CEED_RESTRICTION_CURL_ORIENTED) { -+ CeedCallBackend(CeedElemRestrictionGetCurlOrientations(rstr_in, CEED_MEM_DEVICE, &curl_orients_in)); -+ } -+ -+ if (rstr_in != rstr_out) { -+ CeedCallBackend(CeedElemRestrictionGetNumElements(rstr_out, &num_elem_out)); -+ CeedCheck(num_elem_in == num_elem_out, ceed, CEED_ERROR_UNSUPPORTED, -+ "Active input and output operator restrictions must have the same number of elements"); -+ CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_out, &elem_size_out)); -+ -+ CeedCallBackend(CeedElemRestrictionGetType(rstr_out, &rstr_type_out)); -+ if (rstr_type_out == CEED_RESTRICTION_ORIENTED) { -+ CeedCallBackend(CeedElemRestrictionGetOrientations(rstr_out, CEED_MEM_DEVICE, &orients_out)); -+ } else if (rstr_type_out == CEED_RESTRICTION_CURL_ORIENTED) { -+ CeedCallBackend(CeedElemRestrictionGetCurlOrientations(rstr_out, CEED_MEM_DEVICE, &curl_orients_out)); -+ } -+ } else { -+ elem_size_out = elem_size_in; -+ orients_out = orients_in; -+ curl_orients_out = curl_orients_in; - } - - // Compute B^T D B -- const CeedInt num_elem = impl->asmb->num_elem; -- const CeedInt elem_per_block = impl->asmb->elem_per_block; -- const CeedInt grid = num_elem / elem_per_block + ((num_elem / elem_per_block * elem_per_block < num_elem) ? 1 : 0); -- void *args[] = {&impl->asmb->d_B_in, &impl->asmb->d_B_out, &qf_array, &values_array}; -+ CeedInt shared_mem = -+ ((curl_orients_in || curl_orients_out ? elem_size_in * elem_size_out : 0) + (curl_orients_in ? elem_size_in * asmb->block_size_y : 0)) * -+ sizeof(CeedScalar); -+ CeedInt grid = CeedDivUpInt(num_elem_in, asmb->elems_per_block); -+ void *args[] = {(void *)&num_elem_in, &asmb->d_B_in, &asmb->d_B_out, &orients_in, &curl_orients_in, -+ &orients_out, &curl_orients_out, &assembled_qf_array, &values_array}; - - CeedCallBackend( -- CeedRunKernelDim_Hip(ceed, impl->asmb->linearAssemble, grid, impl->asmb->block_size_x, impl->asmb->block_size_y, elem_per_block, args)); -+ CeedRunKernelDimShared_Hip(ceed, asmb->LinearAssemble, grid, asmb->block_size_x, asmb->block_size_y, asmb->elems_per_block, shared_mem, args)); - - // Restore arrays - CeedCallBackend(CeedVectorRestoreArray(values, &values_array)); -- CeedCallBackend(CeedVectorRestoreArrayRead(assembled_qf, &qf_array)); -+ CeedCallBackend(CeedVectorRestoreArrayRead(assembled_qf, &assembled_qf_array)); - - // Cleanup - CeedCallBackend(CeedVectorDestroy(&assembled_qf)); -+ if (rstr_type_in == CEED_RESTRICTION_ORIENTED) { -+ CeedCallBackend(CeedElemRestrictionRestoreOrientations(rstr_in, &orients_in)); -+ } else if (rstr_type_in == CEED_RESTRICTION_CURL_ORIENTED) { -+ CeedCallBackend(CeedElemRestrictionRestoreCurlOrientations(rstr_in, &curl_orients_in)); -+ } -+ if (rstr_in != rstr_out) { -+ if (rstr_type_out == CEED_RESTRICTION_ORIENTED) { -+ CeedCallBackend(CeedElemRestrictionRestoreOrientations(rstr_out, &orients_out)); -+ } else if (rstr_type_out == CEED_RESTRICTION_CURL_ORIENTED) { -+ CeedCallBackend(CeedElemRestrictionRestoreCurlOrientations(rstr_out, &curl_orients_out)); -+ } -+ } - return CEED_ERROR_SUCCESS; - } - -diff --git a/backends/hip-ref/ceed-hip-ref-restriction.c b/backends/hip-ref/ceed-hip-ref-restriction.c -index 0dd11b16..5824fa48 100644 ---- a/backends/hip-ref/ceed-hip-ref-restriction.c -+++ 
b/backends/hip-ref/ceed-hip-ref-restriction.c -@@ -18,22 +18,23 @@ - #include "ceed-hip-ref.h" - - //------------------------------------------------------------------------------ --// Apply restriction -+// Core apply restriction code - //------------------------------------------------------------------------------ --static int CeedElemRestrictionApply_Hip(CeedElemRestriction r, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { -+static inline int CeedElemRestrictionApply_Hip_Core(CeedElemRestriction rstr, CeedTransposeMode t_mode, bool use_signs, bool use_orients, -+ CeedVector u, CeedVector v, CeedRequest *request) { - Ceed ceed; -- Ceed_Hip *data; - CeedInt num_elem, elem_size; -+ CeedRestrictionType rstr_type; - const CeedScalar *d_u; - CeedScalar *d_v; - CeedElemRestriction_Hip *impl; - hipFunction_t kernel; - -- CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); -- CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); -- CeedCallBackend(CeedGetData(ceed, &data)); -- CeedElemRestrictionGetNumElements(r, &num_elem); -- CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); -+ CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); -+ CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); -+ CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); -+ CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); -+ CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); - const CeedInt num_nodes = impl->num_nodes; - - // Get vectors -@@ -49,45 +50,155 @@ static int CeedElemRestrictionApply_Hip(CeedElemRestriction r, CeedTransposeMode - // Restrict - if (t_mode == CEED_NOTRANSPOSE) { - // L-vector -> E-vector -- if (impl->d_ind) { -- // -- Offsets provided -- kernel = impl->OffsetNoTranspose; -- void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; -- CeedInt block_size = elem_size < 256 ? (elem_size > 64 ? elem_size : 64) : 256; -- -- CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); -- } else { -- // -- Strided restriction -- kernel = impl->StridedNoTranspose; -- void *args[] = {&num_elem, &d_u, &d_v}; -- CeedInt block_size = elem_size < 256 ? (elem_size > 64 ? elem_size : 64) : 256; -- -- CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); -+ const CeedInt block_size = elem_size < 256 ? (elem_size > 64 ? 
elem_size : 64) : 256; -+ const CeedInt grid = CeedDivUpInt(num_nodes, block_size); -+ -+ switch (rstr_type) { -+ case CEED_RESTRICTION_STRIDED: { -+ kernel = impl->StridedNoTranspose; -+ void *args[] = {&num_elem, &d_u, &d_v}; -+ -+ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); -+ } break; -+ case CEED_RESTRICTION_STANDARD: { -+ kernel = impl->OffsetNoTranspose; -+ void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; -+ -+ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); -+ } break; -+ case CEED_RESTRICTION_ORIENTED: { -+ if (use_signs) { -+ kernel = impl->OrientedNoTranspose; -+ void *args[] = {&num_elem, &impl->d_ind, &impl->d_orients, &d_u, &d_v}; -+ -+ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); -+ } else { -+ kernel = impl->OffsetNoTranspose; -+ void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; -+ -+ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); -+ } -+ } break; -+ case CEED_RESTRICTION_CURL_ORIENTED: { -+ if (use_signs && use_orients) { -+ kernel = impl->CurlOrientedNoTranspose; -+ void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; -+ -+ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); -+ } else if (use_orients) { -+ kernel = impl->CurlOrientedUnsignedNoTranspose; -+ void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; -+ -+ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); -+ } else { -+ kernel = impl->OffsetNoTranspose; -+ void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; -+ -+ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); -+ } -+ } break; -+ case CEED_RESTRICTION_POINTS: { -+ // LCOV_EXCL_START -+ return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement restriction CeedElemRestrictionAtPoints"); -+ // LCOV_EXCL_STOP -+ } break; - } - } else { - // E-vector -> L-vector -- if (impl->d_ind) { -- // -- Offsets provided -- CeedInt block_size = 64; -+ const CeedInt block_size = 64; -+ const CeedInt grid = CeedDivUpInt(num_nodes, block_size); -+ -+ switch (rstr_type) { -+ case CEED_RESTRICTION_STRIDED: { -+ kernel = impl->StridedTranspose; -+ void *args[] = {&num_elem, &d_u, &d_v}; -+ -+ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); -+ } break; -+ case CEED_RESTRICTION_STANDARD: { -+ if (impl->OffsetTranspose) { -+ kernel = impl->OffsetTranspose; -+ void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; -+ -+ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); -+ } else { -+ kernel = impl->OffsetTransposeDet; -+ void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; -+ -+ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); -+ } -+ } break; -+ case CEED_RESTRICTION_ORIENTED: { -+ if (use_signs) { -+ if (impl->OrientedTranspose) { -+ kernel = impl->OrientedTranspose; -+ void *args[] = {&num_elem, &impl->d_ind, &impl->d_orients, &d_u, &d_v}; -+ -+ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); -+ } else { -+ kernel = impl->OrientedTransposeDet; -+ void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &impl->d_orients, &d_u, &d_v}; -+ -+ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); -+ } -+ } else { -+ if (impl->OffsetTranspose) { -+ kernel = impl->OffsetTranspose; -+ void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; - -- if 
(impl->OffsetTranspose) { -- kernel = impl->OffsetTranspose; -- void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; -+ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); -+ } else { -+ kernel = impl->OffsetTransposeDet; -+ void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; - -- CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); -- } else { -- kernel = impl->OffsetTransposeDet; -- void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; -+ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); -+ } -+ } -+ } break; -+ case CEED_RESTRICTION_CURL_ORIENTED: { -+ if (use_signs && use_orients) { -+ if (impl->CurlOrientedTranspose) { -+ kernel = impl->CurlOrientedTranspose; -+ void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; -+ -+ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); -+ } else { -+ kernel = impl->CurlOrientedTransposeDet; -+ void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &impl->d_curl_orients, &d_u, &d_v}; -+ -+ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); -+ } -+ } else if (use_orients) { -+ if (impl->CurlOrientedUnsignedTranspose) { -+ kernel = impl->CurlOrientedUnsignedTranspose; -+ void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; - -- CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); -- } -- } else { -- // -- Strided restriction -- kernel = impl->StridedTranspose; -- void *args[] = {&num_elem, &d_u, &d_v}; -- CeedInt block_size = 64; -+ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); -+ } else { -+ kernel = impl->CurlOrientedUnsignedTransposeDet; -+ void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &impl->d_curl_orients, &d_u, &d_v}; - -- CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); -+ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); -+ } -+ } else { -+ if (impl->OffsetTranspose) { -+ kernel = impl->OffsetTranspose; -+ void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; -+ -+ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); -+ } else { -+ kernel = impl->OffsetTransposeDet; -+ void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; -+ -+ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); -+ } -+ } -+ } break; -+ case CEED_RESTRICTION_POINTS: { -+ // LCOV_EXCL_START -+ return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement restriction CeedElemRestrictionAtPoints"); -+ // LCOV_EXCL_STOP -+ } break; - } - } - -@@ -99,6 +210,29 @@ static int CeedElemRestrictionApply_Hip(CeedElemRestriction r, CeedTransposeMode - return CEED_ERROR_SUCCESS; - } - -+//------------------------------------------------------------------------------ -+// Apply restriction -+//------------------------------------------------------------------------------ -+static int CeedElemRestrictionApply_Hip(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { -+ return CeedElemRestrictionApply_Hip_Core(rstr, t_mode, true, true, u, v, request); -+} -+ -+//------------------------------------------------------------------------------ -+// Apply unsigned 
restriction -+//------------------------------------------------------------------------------ -+static int CeedElemRestrictionApplyUnsigned_Hip(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v, -+ CeedRequest *request) { -+ return CeedElemRestrictionApply_Hip_Core(rstr, t_mode, false, true, u, v, request); -+} -+ -+//------------------------------------------------------------------------------ -+// Apply unoriented restriction -+//------------------------------------------------------------------------------ -+static int CeedElemRestrictionApplyUnoriented_Hip(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v, -+ CeedRequest *request) { -+ return CeedElemRestrictionApply_Hip_Core(rstr, t_mode, false, false, u, v, request); -+} -+ - //------------------------------------------------------------------------------ - // Get offsets - //------------------------------------------------------------------------------ -@@ -117,21 +251,61 @@ static int CeedElemRestrictionGetOffsets_Hip(CeedElemRestriction rstr, CeedMemTy - return CEED_ERROR_SUCCESS; - } - -+//------------------------------------------------------------------------------ -+// Get orientations -+//------------------------------------------------------------------------------ -+static int CeedElemRestrictionGetOrientations_Hip(CeedElemRestriction rstr, CeedMemType mem_type, const bool **orients) { -+ CeedElemRestriction_Hip *impl; -+ CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); -+ -+ switch (mem_type) { -+ case CEED_MEM_HOST: -+ *orients = impl->h_orients; -+ break; -+ case CEED_MEM_DEVICE: -+ *orients = impl->d_orients; -+ break; -+ } -+ return CEED_ERROR_SUCCESS; -+} -+ -+//------------------------------------------------------------------------------ -+// Get curl-conforming orientations -+//------------------------------------------------------------------------------ -+static int CeedElemRestrictionGetCurlOrientations_Hip(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt8 **curl_orients) { -+ CeedElemRestriction_Hip *impl; -+ CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); -+ -+ switch (mem_type) { -+ case CEED_MEM_HOST: -+ *curl_orients = impl->h_curl_orients; -+ break; -+ case CEED_MEM_DEVICE: -+ *curl_orients = impl->d_curl_orients; -+ break; -+ } -+ return CEED_ERROR_SUCCESS; -+} -+ - //------------------------------------------------------------------------------ - // Destroy restriction - //------------------------------------------------------------------------------ --static int CeedElemRestrictionDestroy_Hip(CeedElemRestriction r) { -+static int CeedElemRestrictionDestroy_Hip(CeedElemRestriction rstr) { - Ceed ceed; - CeedElemRestriction_Hip *impl; - -- CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); -- CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); -+ CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); -+ CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); - CeedCallHip(ceed, hipModuleUnload(impl->module)); - CeedCallBackend(CeedFree(&impl->h_ind_allocated)); - CeedCallHip(ceed, hipFree(impl->d_ind_allocated)); - CeedCallHip(ceed, hipFree(impl->d_t_offsets)); - CeedCallHip(ceed, hipFree(impl->d_t_indices)); - CeedCallHip(ceed, hipFree(impl->d_l_vec_indices)); -+ CeedCallBackend(CeedFree(&impl->h_orients_allocated)); -+ CeedCallHip(ceed, hipFree(impl->d_orients_allocated)); -+ CeedCallBackend(CeedFree(&impl->h_curl_orients_allocated)); -+ CeedCallHip(ceed, hipFree(impl->d_curl_orients_allocated)); - 
CeedCallBackend(CeedFree(&impl)); - return CEED_ERROR_SUCCESS; - } -@@ -139,23 +313,25 @@ static int CeedElemRestrictionDestroy_Hip(CeedElemRestriction r) { - //------------------------------------------------------------------------------ - // Create transpose offsets and indices - //------------------------------------------------------------------------------ --static int CeedElemRestrictionOffset_Hip(const CeedElemRestriction r, const CeedInt *indices) { -+static int CeedElemRestrictionOffset_Hip(const CeedElemRestriction rstr, const CeedInt *indices) { - Ceed ceed; - bool *is_node; - CeedSize l_size; -- CeedInt num_elem, elem_size, num_comp, num_nodes = 0, *ind_to_offset, *l_vec_indices, *t_offsets, *t_indices; -+ CeedInt num_elem, elem_size, num_comp, num_nodes = 0; -+ CeedInt *ind_to_offset, *l_vec_indices, *t_offsets, *t_indices; - CeedElemRestriction_Hip *impl; - -- CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); -- CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); -- CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem)); -- CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); -- CeedCallBackend(CeedElemRestrictionGetLVectorSize(r, &l_size)); -- CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); -+ CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); -+ CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); -+ CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); -+ CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); -+ CeedCallBackend(CeedElemRestrictionGetLVectorSize(rstr, &l_size)); -+ CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); - const CeedInt size_indices = num_elem * elem_size; - - // Count num_nodes - CeedCallBackend(CeedCalloc(l_size, &is_node)); -+ - for (CeedInt i = 0; i < size_indices; i++) is_node[indices[i]] = 1; - for (CeedInt i = 0; i < l_size; i++) num_nodes += is_node[i]; - impl->num_nodes = num_nodes; -@@ -218,136 +394,223 @@ static int CeedElemRestrictionOffset_Hip(const CeedElemRestriction r, const Ceed - // Create restriction - //------------------------------------------------------------------------------ - int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *indices, const bool *orients, -- const CeedInt8 *curl_orients, CeedElemRestriction r) { -+ const CeedInt8 *curl_orients, CeedElemRestriction rstr) { - Ceed ceed, ceed_parent; -- bool is_deterministic, is_strided; -- char *restriction_kernel_path, *restriction_kernel_source; -+ bool is_deterministic; - CeedInt num_elem, num_comp, elem_size, comp_stride = 1; - CeedRestrictionType rstr_type; -+ char *restriction_kernel_path, *restriction_kernel_source; - CeedElemRestriction_Hip *impl; - -- CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); -- CeedCallBackend(CeedCalloc(1, &impl)); -+ CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); - CeedCallBackend(CeedGetParent(ceed, &ceed_parent)); - CeedCallBackend(CeedIsDeterministic(ceed_parent, &is_deterministic)); -- CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem)); -- CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); -- CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); -- CeedInt size = num_elem * elem_size; -- CeedInt strides[3] = {1, size, elem_size}; -- CeedInt layout[3] = {1, elem_size * num_elem, elem_size}; -- -- CeedCallBackend(CeedElemRestrictionGetType(r, &rstr_type)); -- CeedCheck(rstr_type != CEED_RESTRICTION_ORIENTED && 
rstr_type != CEED_RESTRICTION_CURL_ORIENTED, ceed, CEED_ERROR_BACKEND, -- "Backend does not implement CeedElemRestrictionCreateOriented or CeedElemRestrictionCreateCurlOriented"); -+ CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); -+ CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); -+ CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); -+ const CeedInt size = num_elem * elem_size; -+ CeedInt strides[3] = {1, size, elem_size}; -+ CeedInt layout[3] = {1, elem_size * num_elem, elem_size}; - - // Stride data -- CeedCallBackend(CeedElemRestrictionIsStrided(r, &is_strided)); -- if (is_strided) { -+ CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); -+ if (rstr_type == CEED_RESTRICTION_STRIDED) { - bool has_backend_strides; - -- CeedCallBackend(CeedElemRestrictionHasBackendStrides(r, &has_backend_strides)); -+ CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides)); - if (!has_backend_strides) { -- CeedCallBackend(CeedElemRestrictionGetStrides(r, &strides)); -+ CeedCallBackend(CeedElemRestrictionGetStrides(rstr, &strides)); - } - } else { -- CeedCallBackend(CeedElemRestrictionGetCompStride(r, &comp_stride)); -+ CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride)); - } - -- impl->h_ind = NULL; -- impl->h_ind_allocated = NULL; -- impl->d_ind = NULL; -- impl->d_ind_allocated = NULL; -- impl->d_t_indices = NULL; -- impl->d_t_offsets = NULL; -- impl->num_nodes = size; -- CeedCallBackend(CeedElemRestrictionSetData(r, impl)); -- CeedCallBackend(CeedElemRestrictionSetELayout(r, layout)); -- -- // Set up device indices/offset arrays -- switch (mem_type) { -- case CEED_MEM_HOST: { -- switch (copy_mode) { -- case CEED_OWN_POINTER: -- impl->h_ind_allocated = (CeedInt *)indices; -- impl->h_ind = (CeedInt *)indices; -- break; -- case CEED_USE_POINTER: -- impl->h_ind = (CeedInt *)indices; -- break; -- case CEED_COPY_VALUES: -- if (indices != NULL) { -- CeedCallBackend(CeedMalloc(elem_size * num_elem, &impl->h_ind_allocated)); -- memcpy(impl->h_ind_allocated, indices, elem_size * num_elem * sizeof(CeedInt)); -+ CeedCallBackend(CeedCalloc(1, &impl)); -+ impl->num_nodes = size; -+ impl->h_ind = NULL; -+ impl->h_ind_allocated = NULL; -+ impl->d_ind = NULL; -+ impl->d_ind_allocated = NULL; -+ impl->d_t_indices = NULL; -+ impl->d_t_offsets = NULL; -+ impl->h_orients = NULL; -+ impl->h_orients_allocated = NULL; -+ impl->d_orients = NULL; -+ impl->d_orients_allocated = NULL; -+ impl->h_curl_orients = NULL; -+ impl->h_curl_orients_allocated = NULL; -+ impl->d_curl_orients = NULL; -+ impl->d_curl_orients_allocated = NULL; -+ CeedCallBackend(CeedElemRestrictionSetData(rstr, impl)); -+ CeedCallBackend(CeedElemRestrictionSetELayout(rstr, layout)); -+ -+ // Set up device offset/orientation arrays -+ if (rstr_type != CEED_RESTRICTION_STRIDED) { -+ switch (mem_type) { -+ case CEED_MEM_HOST: { -+ switch (copy_mode) { -+ case CEED_OWN_POINTER: -+ impl->h_ind_allocated = (CeedInt *)indices; -+ impl->h_ind = (CeedInt *)indices; -+ break; -+ case CEED_USE_POINTER: -+ impl->h_ind = (CeedInt *)indices; -+ break; -+ case CEED_COPY_VALUES: -+ CeedCallBackend(CeedMalloc(size, &impl->h_ind_allocated)); -+ memcpy(impl->h_ind_allocated, indices, size * sizeof(CeedInt)); - impl->h_ind = impl->h_ind_allocated; -- } -- break; -- } -- if (indices != NULL) { -+ break; -+ } - CeedCallHip(ceed, hipMalloc((void **)&impl->d_ind, size * sizeof(CeedInt))); - impl->d_ind_allocated = impl->d_ind; // We own the device memory - 
CeedCallHip(ceed, hipMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), hipMemcpyHostToDevice)); -- if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Hip(r, indices)); -- } -- break; -- } -- case CEED_MEM_DEVICE: { -- switch (copy_mode) { -- case CEED_COPY_VALUES: -- if (indices != NULL) { -+ if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Hip(rstr, indices)); -+ } break; -+ case CEED_MEM_DEVICE: { -+ switch (copy_mode) { -+ case CEED_COPY_VALUES: - CeedCallHip(ceed, hipMalloc((void **)&impl->d_ind, size * sizeof(CeedInt))); - impl->d_ind_allocated = impl->d_ind; // We own the device memory - CeedCallHip(ceed, hipMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), hipMemcpyDeviceToDevice)); -+ break; -+ case CEED_OWN_POINTER: -+ impl->d_ind = (CeedInt *)indices; -+ impl->d_ind_allocated = impl->d_ind; -+ break; -+ case CEED_USE_POINTER: -+ impl->d_ind = (CeedInt *)indices; -+ break; -+ } -+ CeedCallBackend(CeedMalloc(size, &impl->h_ind_allocated)); -+ CeedCallHip(ceed, hipMemcpy(impl->h_ind_allocated, impl->d_ind, size * sizeof(CeedInt), hipMemcpyDeviceToHost)); -+ impl->h_ind = impl->h_ind_allocated; -+ if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Hip(rstr, indices)); -+ } break; -+ } -+ -+ // Orientation data -+ if (rstr_type == CEED_RESTRICTION_ORIENTED) { -+ switch (mem_type) { -+ case CEED_MEM_HOST: { -+ switch (copy_mode) { -+ case CEED_OWN_POINTER: -+ impl->h_orients_allocated = (bool *)orients; -+ impl->h_orients = (bool *)orients; -+ break; -+ case CEED_USE_POINTER: -+ impl->h_orients = (bool *)orients; -+ break; -+ case CEED_COPY_VALUES: -+ CeedCallBackend(CeedMalloc(size, &impl->h_orients_allocated)); -+ memcpy(impl->h_orients_allocated, orients, size * sizeof(bool)); -+ impl->h_orients = impl->h_orients_allocated; -+ break; - } -- break; -- case CEED_OWN_POINTER: -- impl->d_ind = (CeedInt *)indices; -- impl->d_ind_allocated = impl->d_ind; -- break; -- case CEED_USE_POINTER: -- impl->d_ind = (CeedInt *)indices; -+ CeedCallHip(ceed, hipMalloc((void **)&impl->d_orients, size * sizeof(bool))); -+ impl->d_orients_allocated = impl->d_orients; // We own the device memory -+ CeedCallHip(ceed, hipMemcpy(impl->d_orients, orients, size * sizeof(bool), hipMemcpyHostToDevice)); -+ } break; -+ case CEED_MEM_DEVICE: { -+ switch (copy_mode) { -+ case CEED_COPY_VALUES: -+ CeedCallHip(ceed, hipMalloc((void **)&impl->d_orients, size * sizeof(bool))); -+ impl->d_orients_allocated = impl->d_orients; // We own the device memory -+ CeedCallHip(ceed, hipMemcpy(impl->d_orients, orients, size * sizeof(bool), hipMemcpyDeviceToDevice)); -+ break; -+ case CEED_OWN_POINTER: -+ impl->d_orients = (bool *)orients; -+ impl->d_orients_allocated = impl->d_orients; -+ break; -+ case CEED_USE_POINTER: -+ impl->d_orients = (bool *)orients; -+ break; -+ } -+ CeedCallBackend(CeedMalloc(size, &impl->h_orients_allocated)); -+ CeedCallHip(ceed, hipMemcpy(impl->h_orients_allocated, impl->d_orients, size * sizeof(bool), hipMemcpyDeviceToHost)); -+ impl->h_orients = impl->h_orients_allocated; -+ } break; - } -- if (indices != NULL) { -- CeedCallBackend(CeedMalloc(elem_size * num_elem, &impl->h_ind_allocated)); -- CeedCallHip(ceed, hipMemcpy(impl->h_ind_allocated, impl->d_ind, elem_size * num_elem * sizeof(CeedInt), hipMemcpyDeviceToHost)); -- impl->h_ind = impl->h_ind_allocated; -- if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Hip(r, indices)); -+ } else if (rstr_type == CEED_RESTRICTION_CURL_ORIENTED) { -+ switch (mem_type) { -+ case CEED_MEM_HOST: { -+ 
switch (copy_mode) { -+ case CEED_OWN_POINTER: -+ impl->h_curl_orients_allocated = (CeedInt8 *)curl_orients; -+ impl->h_curl_orients = (CeedInt8 *)curl_orients; -+ break; -+ case CEED_USE_POINTER: -+ impl->h_curl_orients = (CeedInt8 *)curl_orients; -+ break; -+ case CEED_COPY_VALUES: -+ CeedCallBackend(CeedMalloc(3 * size, &impl->h_curl_orients_allocated)); -+ memcpy(impl->h_curl_orients_allocated, curl_orients, 3 * size * sizeof(CeedInt8)); -+ impl->h_curl_orients = impl->h_curl_orients_allocated; -+ break; -+ } -+ CeedCallHip(ceed, hipMalloc((void **)&impl->d_curl_orients, 3 * size * sizeof(CeedInt8))); -+ impl->d_curl_orients_allocated = impl->d_curl_orients; // We own the device memory -+ CeedCallHip(ceed, hipMemcpy(impl->d_curl_orients, curl_orients, 3 * size * sizeof(CeedInt8), hipMemcpyHostToDevice)); -+ } break; -+ case CEED_MEM_DEVICE: { -+ switch (copy_mode) { -+ case CEED_COPY_VALUES: -+ CeedCallHip(ceed, hipMalloc((void **)&impl->d_curl_orients, 3 * size * sizeof(CeedInt8))); -+ impl->d_curl_orients_allocated = impl->d_curl_orients; // We own the device memory -+ CeedCallHip(ceed, hipMemcpy(impl->d_curl_orients, curl_orients, 3 * size * sizeof(CeedInt8), hipMemcpyDeviceToDevice)); -+ break; -+ case CEED_OWN_POINTER: -+ impl->d_curl_orients = (CeedInt8 *)curl_orients; -+ impl->d_curl_orients_allocated = impl->d_curl_orients; -+ break; -+ case CEED_USE_POINTER: -+ impl->d_curl_orients = (CeedInt8 *)curl_orients; -+ break; -+ } -+ CeedCallBackend(CeedMalloc(3 * size, &impl->h_curl_orients_allocated)); -+ CeedCallHip(ceed, hipMemcpy(impl->h_curl_orients_allocated, impl->d_curl_orients, 3 * size * sizeof(CeedInt8), hipMemcpyDeviceToHost)); -+ impl->h_curl_orients = impl->h_curl_orients_allocated; -+ } break; - } -- break; - } -- // LCOV_EXCL_START -- default: -- return CeedError(ceed, CEED_ERROR_BACKEND, "Only MemType = HOST or DEVICE supported"); -- // LCOV_EXCL_STOP - } - - // Compile HIP kernels -- CeedInt num_nodes = impl->num_nodes; -- - CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction.h", &restriction_kernel_path)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! 
-----\n"); -- CeedCallBackend(CeedCompile_Hip(ceed, restriction_kernel_source, &impl->module, 8, "RESTR_ELEM_SIZE", elem_size, "RESTR_NUM_ELEM", num_elem, -- "RESTR_NUM_COMP", num_comp, "RESTR_NUM_NODES", num_nodes, "RESTR_COMP_STRIDE", comp_stride, "RESTR_STRIDE_NODES", -- strides[0], "RESTR_STRIDE_COMP", strides[1], "RESTR_STRIDE_ELEM", strides[2])); -+ CeedCallBackend(CeedCompile_Hip(ceed, restriction_kernel_source, &impl->module, 8, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, -+ "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride, "RSTR_STRIDE_NODES", -+ strides[0], "RSTR_STRIDE_COMP", strides[1], "RSTR_STRIDE_ELEM", strides[2])); - CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "StridedNoTranspose", &impl->StridedNoTranspose)); - CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "StridedTranspose", &impl->StridedTranspose)); - CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetNoTranspose", &impl->OffsetNoTranspose)); -+ CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OrientedNoTranspose", &impl->OrientedNoTranspose)); -+ CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedNoTranspose", &impl->CurlOrientedNoTranspose)); -+ CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedUnsignedNoTranspose", &impl->CurlOrientedUnsignedNoTranspose)); - if (!is_deterministic) { - CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetTranspose", &impl->OffsetTranspose)); -+ CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OrientedTranspose", &impl->OrientedTranspose)); -+ CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedTranspose", &impl->CurlOrientedTranspose)); -+ CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedUnsignedTranspose", &impl->CurlOrientedUnsignedTranspose)); - } else { - CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetTransposeDet", &impl->OffsetTransposeDet)); -+ CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OrientedTransposeDet", &impl->OrientedTransposeDet)); -+ CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedTransposeDet", &impl->CurlOrientedTransposeDet)); -+ CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedUnsignedTransposeDet", &impl->CurlOrientedUnsignedTransposeDet)); - } - CeedCallBackend(CeedFree(&restriction_kernel_path)); - CeedCallBackend(CeedFree(&restriction_kernel_source)); - - // Register backend functions -- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "Apply", CeedElemRestrictionApply_Hip)); -- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "ApplyUnsigned", CeedElemRestrictionApply_Hip)); -- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "ApplyUnoriented", CeedElemRestrictionApply_Hip)); -- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "GetOffsets", CeedElemRestrictionGetOffsets_Hip)); -- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "Destroy", CeedElemRestrictionDestroy_Hip)); -+ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Apply", CeedElemRestrictionApply_Hip)); -+ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnsigned", CeedElemRestrictionApplyUnsigned_Hip)); -+ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnoriented", CeedElemRestrictionApplyUnoriented_Hip)); -+ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOffsets", 
CeedElemRestrictionGetOffsets_Hip)); -+ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOrientations", CeedElemRestrictionGetOrientations_Hip)); -+ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetCurlOrientations", CeedElemRestrictionGetCurlOrientations_Hip)); -+ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Hip)); - return CEED_ERROR_SUCCESS; - } - -diff --git a/backends/hip-ref/ceed-hip-ref.h b/backends/hip-ref/ceed-hip-ref.h -index 634bb68d..ecfa8874 100644 ---- a/backends/hip-ref/ceed-hip-ref.h -+++ b/backends/hip-ref/ceed-hip-ref.h -@@ -34,6 +34,15 @@ typedef struct { - hipFunction_t OffsetNoTranspose; - hipFunction_t OffsetTranspose; - hipFunction_t OffsetTransposeDet; -+ hipFunction_t OrientedNoTranspose; -+ hipFunction_t OrientedTranspose; -+ hipFunction_t OrientedTransposeDet; -+ hipFunction_t CurlOrientedNoTranspose; -+ hipFunction_t CurlOrientedTranspose; -+ hipFunction_t CurlOrientedTransposeDet; -+ hipFunction_t CurlOrientedUnsignedNoTranspose; -+ hipFunction_t CurlOrientedUnsignedTranspose; -+ hipFunction_t CurlOrientedUnsignedTransposeDet; - CeedInt num_nodes; - CeedInt *h_ind; - CeedInt *h_ind_allocated; -@@ -42,6 +51,14 @@ typedef struct { - CeedInt *d_t_offsets; - CeedInt *d_t_indices; - CeedInt *d_l_vec_indices; -+ bool *h_orients; -+ bool *h_orients_allocated; -+ bool *d_orients; -+ bool *d_orients_allocated; -+ CeedInt8 *h_curl_orients; -+ CeedInt8 *h_curl_orients_allocated; -+ CeedInt8 *d_curl_orients; -+ CeedInt8 *d_curl_orients_allocated; - } CeedElemRestriction_Hip; - - typedef struct { -@@ -84,21 +101,19 @@ typedef struct { - - typedef struct { - hipModule_t module; -- hipFunction_t linearDiagonal; -- hipFunction_t linearPointBlock; -- CeedBasis basis_in, basis_out; -+ hipFunction_t LinearDiagonal; -+ hipFunction_t LinearPointBlock; - CeedElemRestriction diag_rstr, point_block_diag_rstr; - CeedVector elem_diag, point_block_elem_diag; -- CeedInt num_e_mode_in, num_e_mode_out, num_modes; -- CeedEvalMode *h_e_mode_in, *h_e_mode_out; -- CeedEvalMode *d_e_mode_in, *d_e_mode_out; -- CeedScalar *d_identity, *d_interp_in, *d_interp_out, *d_grad_in, *d_grad_out; -+ CeedEvalMode *d_eval_modes_in, *d_eval_modes_out; -+ CeedScalar *d_identity, *d_interp_in, *d_grad_in, *d_div_in, *d_curl_in; -+ CeedScalar *d_interp_out, *d_grad_out, *d_div_out, *d_curl_out; - } CeedOperatorDiag_Hip; - - typedef struct { - hipModule_t module; -- hipFunction_t linearAssemble; -- CeedInt num_elem, block_size_x, block_size_y, elem_per_block; -+ hipFunction_t LinearAssemble; -+ CeedInt block_size_x, block_size_y, elems_per_block; - CeedScalar *d_B_in, *d_B_out; - } CeedOperatorAssemble_Hip; - -diff --git a/backends/ref/ceed-ref-restriction.c b/backends/ref/ceed-ref-restriction.c -index 1bf604c4..8deebe3f 100644 ---- a/backends/ref/ceed-ref-restriction.c -+++ b/backends/ref/ceed-ref-restriction.c -@@ -55,9 +55,9 @@ static inline int CeedElemRestrictionApplyStridedNoTranspose_Ref_Core(CeedElemRe - return CEED_ERROR_SUCCESS; - } - --static inline int CeedElemRestrictionApplyStandardNoTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, -- const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem, -- CeedInt elem_size, CeedInt v_offset, const CeedScalar *uu, CeedScalar *vv) { -+static inline int CeedElemRestrictionApplyOffsetNoTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, -+ const 
CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem, -+ CeedInt elem_size, CeedInt v_offset, const CeedScalar *uu, CeedScalar *vv) { - // Default restriction with offsets - CeedElemRestriction_Ref *impl; - -@@ -216,9 +216,9 @@ static inline int CeedElemRestrictionApplyStridedTranspose_Ref_Core(CeedElemRest - return CEED_ERROR_SUCCESS; - } - --static inline int CeedElemRestrictionApplyStandardTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, -- const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem, -- CeedInt elem_size, CeedInt v_offset, const CeedScalar *uu, CeedScalar *vv) { -+static inline int CeedElemRestrictionApplyOffsetTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, -+ const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem, -+ CeedInt elem_size, CeedInt v_offset, const CeedScalar *uu, CeedScalar *vv) { - // Default restriction with offsets - CeedElemRestriction_Ref *impl; - -@@ -367,7 +367,6 @@ static inline int CeedElemRestrictionApplyAtPointsInElement_Ref_Core(CeedElemRes - CeedElemRestriction_Ref *impl; - - CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); -- - for (CeedInt e = start; e < stop; e++) { - l_vec_offset = impl->offsets[e]; - CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr, e, &num_points)); -@@ -418,16 +417,16 @@ static inline int CeedElemRestrictionApply_Ref_Core(CeedElemRestriction rstr, co - CeedElemRestrictionApplyStridedTranspose_Ref_Core(rstr, num_comp, block_size, start, stop, num_elem, elem_size, v_offset, uu, vv)); - break; - case CEED_RESTRICTION_STANDARD: -- CeedCallBackend(CeedElemRestrictionApplyStandardTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, elem_size, -- v_offset, uu, vv)); -+ CeedCallBackend(CeedElemRestrictionApplyOffsetTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, elem_size, -+ v_offset, uu, vv)); - break; - case CEED_RESTRICTION_ORIENTED: - if (use_signs) { - CeedCallBackend(CeedElemRestrictionApplyOrientedTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, - elem_size, v_offset, uu, vv)); - } else { -- CeedCallBackend(CeedElemRestrictionApplyStandardTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, -- elem_size, v_offset, uu, vv)); -+ CeedCallBackend(CeedElemRestrictionApplyOffsetTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, elem_size, -+ v_offset, uu, vv)); - } - break; - case CEED_RESTRICTION_CURL_ORIENTED: -@@ -438,8 +437,8 @@ static inline int CeedElemRestrictionApply_Ref_Core(CeedElemRestriction rstr, co - CeedCallBackend(CeedElemRestrictionApplyCurlOrientedUnsignedTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, - num_elem, elem_size, v_offset, uu, vv)); - } else { -- CeedCallBackend(CeedElemRestrictionApplyStandardTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, -- elem_size, v_offset, uu, vv)); -+ CeedCallBackend(CeedElemRestrictionApplyOffsetTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, elem_size, -+ v_offset, uu, vv)); - } - break; - case CEED_RESTRICTION_POINTS: -@@ -458,16 +457,16 @@ static inline int CeedElemRestrictionApply_Ref_Core(CeedElemRestriction rstr, co - CeedElemRestrictionApplyStridedNoTranspose_Ref_Core(rstr, num_comp, block_size, start, stop, num_elem, elem_size, v_offset, uu, vv)); - 
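Reviewer note: the Standard -> Offset renames in this file are purely cosmetic; the operation itself is the plain gather/scatter between an L-vector and an E-vector through an offsets array. A rough serial reference of both directions, using an element-major E-vector layout chosen for readability (the actual backends use whatever e-layout they register through CeedElemRestrictionSetELayout); offset_gather and offset_scatter_add are illustrative names:

/* No transpose: E[e][c][i]  = L[ind[e*P + i] + c*comp_stride]   (gather)
   Transpose:    L[...]     += E[e][c][i]                        (scatter-add) */
static void offset_gather(int num_elem, int P, int num_comp, int comp_stride,
                          const int *ind, const double *L, double *E) {
  for (int e = 0; e < num_elem; e++)
    for (int c = 0; c < num_comp; c++)
      for (int i = 0; i < P; i++)
        E[(e * num_comp + c) * P + i] = L[ind[e * P + i] + c * comp_stride];
}

static void offset_scatter_add(int num_elem, int P, int num_comp, int comp_stride,
                               const int *ind, const double *E, double *L) {
  for (int e = 0; e < num_elem; e++)
    for (int c = 0; c < num_comp; c++)
      for (int i = 0; i < P; i++)
        L[ind[e * P + i] + c * comp_stride] += E[(e * num_comp + c) * P + i];
}

The oriented and curl-oriented variants only add a +/-1 sign or a three-term weighted combination on top of this; when sign/orientation handling is disabled, the else branches above fall back to exactly this offset path, which is what the renamed calls make explicit.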
break; - case CEED_RESTRICTION_STANDARD: -- CeedCallBackend(CeedElemRestrictionApplyStandardNoTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, -- elem_size, v_offset, uu, vv)); -+ CeedCallBackend(CeedElemRestrictionApplyOffsetNoTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, elem_size, -+ v_offset, uu, vv)); - break; - case CEED_RESTRICTION_ORIENTED: - if (use_signs) { - CeedCallBackend(CeedElemRestrictionApplyOrientedNoTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, - elem_size, v_offset, uu, vv)); - } else { -- CeedCallBackend(CeedElemRestrictionApplyStandardNoTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, -- elem_size, v_offset, uu, vv)); -+ CeedCallBackend(CeedElemRestrictionApplyOffsetNoTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, -+ elem_size, v_offset, uu, vv)); - } - break; - case CEED_RESTRICTION_CURL_ORIENTED: -@@ -478,8 +477,8 @@ static inline int CeedElemRestrictionApply_Ref_Core(CeedElemRestriction rstr, co - CeedCallBackend(CeedElemRestrictionApplyCurlOrientedUnsignedNoTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, - num_elem, elem_size, v_offset, uu, vv)); - } else { -- CeedCallBackend(CeedElemRestrictionApplyStandardNoTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, -- elem_size, v_offset, uu, vv)); -+ CeedCallBackend(CeedElemRestrictionApplyOffsetNoTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, -+ elem_size, v_offset, uu, vv)); - } - break; - case CEED_RESTRICTION_POINTS: -@@ -625,14 +624,14 @@ static int CeedElemRestrictionApplyUnoriented_Ref(CeedElemRestriction rstr, Ceed - //------------------------------------------------------------------------------ - // ElemRestriction Apply Points - //------------------------------------------------------------------------------ --static int CeedElemRestrictionApplyAtPointsInElement_Ref(CeedElemRestriction r, CeedInt elem, CeedTransposeMode t_mode, CeedVector u, CeedVector v, -+static int CeedElemRestrictionApplyAtPointsInElement_Ref(CeedElemRestriction rstr, CeedInt elem, CeedTransposeMode t_mode, CeedVector u, CeedVector v, - CeedRequest *request) { - CeedInt num_comp; - CeedElemRestriction_Ref *impl; - -- CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); -- CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); -- return impl->Apply(r, num_comp, 0, 1, elem, elem + 1, t_mode, false, false, u, v, request); -+ CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); -+ CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); -+ return impl->Apply(rstr, num_comp, 0, 1, elem, elem + 1, t_mode, false, false, u, v, request); - } - - //------------------------------------------------------------------------------ -@@ -733,7 +732,10 @@ int CeedElemRestrictionCreate_Ref(CeedMemType mem_type, CeedCopyMode copy_mode, - CeedInt layout[3] = {1, elem_size, elem_size * num_comp}; - - CeedCheck(mem_type == CEED_MEM_HOST, ceed, CEED_ERROR_BACKEND, "Only MemType = HOST supported"); -+ - CeedCallBackend(CeedCalloc(1, &impl)); -+ CeedCallBackend(CeedElemRestrictionSetData(rstr, impl)); -+ CeedCallBackend(CeedElemRestrictionSetELayout(rstr, layout)); - - // Offsets data - CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); -@@ -813,20 +815,6 @@ int CeedElemRestrictionCreate_Ref(CeedMemType mem_type, CeedCopyMode copy_mode, - } - } - -- 
CeedCallBackend(CeedElemRestrictionSetData(rstr, impl)); -- CeedCallBackend(CeedElemRestrictionSetELayout(rstr, layout)); -- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Apply", CeedElemRestrictionApply_Ref)); -- if (rstr_type == CEED_RESTRICTION_POINTS) { -- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyAtPointsInElement", CeedElemRestrictionApplyAtPointsInElement_Ref)); -- } -- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnsigned", CeedElemRestrictionApplyUnsigned_Ref)); -- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnoriented", CeedElemRestrictionApplyUnoriented_Ref)); -- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyBlock", CeedElemRestrictionApplyBlock_Ref)); -- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOrientations", CeedElemRestrictionGetOrientations_Ref)); -- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetCurlOrientations", CeedElemRestrictionGetCurlOrientations_Ref)); -- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOffsets", CeedElemRestrictionGetOffsets_Ref)); -- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Ref)); -- - // Set apply function based upon num_comp, block_size, and comp_stride - CeedInt index = -1; - -@@ -876,6 +864,19 @@ int CeedElemRestrictionCreate_Ref(CeedMemType mem_type, CeedCopyMode copy_mode, - impl->Apply = CeedElemRestrictionApply_Ref_Core; - break; - } -+ -+ // Register backend functions -+ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Apply", CeedElemRestrictionApply_Ref)); -+ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnsigned", CeedElemRestrictionApplyUnsigned_Ref)); -+ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnoriented", CeedElemRestrictionApplyUnoriented_Ref)); -+ if (rstr_type == CEED_RESTRICTION_POINTS) { -+ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyAtPointsInElement", CeedElemRestrictionApplyAtPointsInElement_Ref)); -+ } -+ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyBlock", CeedElemRestrictionApplyBlock_Ref)); -+ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOffsets", CeedElemRestrictionGetOffsets_Ref)); -+ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOrientations", CeedElemRestrictionGetOrientations_Ref)); -+ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetCurlOrientations", CeedElemRestrictionGetCurlOrientations_Ref)); -+ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Ref)); - return CEED_ERROR_SUCCESS; - } - -diff --git a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp -index 78c71681..867960bb 100644 ---- a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp -+++ b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp -@@ -1071,16 +1071,16 @@ static int CeedSingleOperatorAssembleSetup_Sycl(CeedOperator op) { - CeedCallBackend(CeedGetData(ceed, &sycl_data)); - - // Kernel setup -- int elem_per_block = 1; -- asmb->elem_per_block = elem_per_block; -- CeedInt block_size = elem_size * elem_size * elem_per_block; -+ int elems_per_block = 1; -+ asmb->elems_per_block = elems_per_block; -+ CeedInt block_size = 
elem_size * elem_size * elems_per_block; - - /* CeedInt maxThreadsPerBlock = sycl_data->sycl_device.get_info(); - bool fallback = block_size > maxThreadsPerBlock; - asmb->fallback = fallback; - if (fallback) { - // Use fallback kernel with 1D threadblock -- block_size = elem_size * elem_per_block; -+ block_size = elem_size * elems_per_block; - asmb->block_size_x = elem_size; - asmb->block_size_y = 1; - } else { // Use kernel with 2D threadblock -@@ -1250,13 +1250,13 @@ static int CeedOperatorLinearAssembleFallback_Sycl(sycl::queue &sycl_queue, cons - CeedScalar *B_in, *B_out; - B_in = asmb->d_B_in; - B_out = asmb->d_B_out; -- const CeedInt elem_per_block = asmb->elem_per_block; -+ const CeedInt elems_per_block = asmb->elems_per_block; - const CeedInt block_size_x = asmb->block_size_x; - const CeedInt block_size_y = asmb->block_size_y; // This will be 1 for the fallback kernel - -- const CeedInt grid = num_elem / elem_per_block + ((num_elem / elem_per_block * elem_per_block < num_elem) ? 1 : 0); -- sycl::range<3> local_range(block_size_x, block_size_y, elem_per_block); -- sycl::range<3> global_range(grid * block_size_x, block_size_y, elem_per_block); -+ const CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); -+ sycl::range<3> local_range(block_size_x, block_size_y, elems_per_block); -+ sycl::range<3> global_range(grid * block_size_x, block_size_y, elems_per_block); - sycl::nd_range<3> kernel_range(global_range, local_range); - - sycl_queue.parallel_for(kernel_range, [=](sycl::nd_item<3> work_item) { -diff --git a/backends/sycl-ref/ceed-sycl-ref.hpp b/backends/sycl-ref/ceed-sycl-ref.hpp -index 56544c38..fc7bc775 100644 ---- a/backends/sycl-ref/ceed-sycl-ref.hpp -+++ b/backends/sycl-ref/ceed-sycl-ref.hpp -@@ -94,7 +94,7 @@ typedef struct { - } CeedOperatorDiag_Sycl; - - typedef struct { -- CeedInt num_elem, block_size_x, block_size_y, elem_per_block; -+ CeedInt num_elem, block_size_x, block_size_y, elems_per_block; - CeedInt num_e_mode_in, num_e_mode_out, num_qpts, num_nodes, block_size, num_comp; // Kernel parameters - bool fallback; - CeedScalar *d_B_in, *d_B_out; -diff --git a/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp b/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp -index 4ace9976..8aac0678 100644 ---- a/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp -+++ b/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp -@@ -142,15 +142,15 @@ static int CeedElemRestrictionOffsetTranspose_Sycl(sycl::queue &sycl_queue, cons - //------------------------------------------------------------------------------ - // Apply restriction - //------------------------------------------------------------------------------ --static int CeedElemRestrictionApply_Sycl(CeedElemRestriction r, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { -+static int CeedElemRestrictionApply_Sycl(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { - Ceed ceed; - Ceed_Sycl *data; - const CeedScalar *d_u; - CeedScalar *d_v; - CeedElemRestriction_Sycl *impl; - -- CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); -- CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); -+ CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); -+ CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); - CeedCallBackend(CeedGetData(ceed, &data)); - - // Get vectors -@@ -197,12 +197,12 @@ static int CeedElemRestrictionApply_Sycl(CeedElemRestriction r, CeedTransposeMod - 
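Reviewer note: the grid computation above is a ceiling division, i.e. the smallest grid such that grid * elems_per_block >= num_elem. An equivalent helper (illustrative name):

/* Smallest number of blocks covering num_elem elements. */
static inline int ceil_div(int num_elem, int elems_per_block) {
  return (num_elem + elems_per_block - 1) / elems_per_block;
}

The expression num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0) yields the same value for positive inputs.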
//------------------------------------------------------------------------------ - // Get offsets - //------------------------------------------------------------------------------ --static int CeedElemRestrictionGetOffsets_Sycl(CeedElemRestriction r, CeedMemType m_type, const CeedInt **offsets) { -+static int CeedElemRestrictionGetOffsets_Sycl(CeedElemRestriction rstr, CeedMemType m_type, const CeedInt **offsets) { - Ceed ceed; - CeedElemRestriction_Sycl *impl; - -- CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); -- CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); -+ CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); -+ CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); - - switch (m_type) { - case CEED_MEM_HOST: -@@ -218,13 +218,13 @@ static int CeedElemRestrictionGetOffsets_Sycl(CeedElemRestriction r, CeedMemType - //------------------------------------------------------------------------------ - // Destroy restriction - //------------------------------------------------------------------------------ --static int CeedElemRestrictionDestroy_Sycl(CeedElemRestriction r) { -+static int CeedElemRestrictionDestroy_Sycl(CeedElemRestriction rstr) { - Ceed ceed; - Ceed_Sycl *data; - CeedElemRestriction_Sycl *impl; - -- CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); -- CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); -+ CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); -+ CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); - CeedCallBackend(CeedGetData(ceed, &data)); - - // Wait for all work to finish before freeing memory -@@ -242,7 +242,7 @@ static int CeedElemRestrictionDestroy_Sycl(CeedElemRestriction r) { - //------------------------------------------------------------------------------ - // Create transpose offsets and indices - //------------------------------------------------------------------------------ --static int CeedElemRestrictionOffset_Sycl(const CeedElemRestriction r, const CeedInt *indices) { -+static int CeedElemRestrictionOffset_Sycl(const CeedElemRestriction rstr, const CeedInt *indices) { - Ceed ceed; - Ceed_Sycl *data; - bool *is_node; -@@ -250,12 +250,12 @@ static int CeedElemRestrictionOffset_Sycl(const CeedElemRestriction r, const Cee - CeedInt num_elem, elem_size, num_comp, num_nodes = 0, *ind_to_offset, *l_vec_indices, *t_offsets, *t_indices; - CeedElemRestriction_Sycl *impl; - -- CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); -- CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); -- CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem)); -- CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); -- CeedCallBackend(CeedElemRestrictionGetLVectorSize(r, &l_size)); -- CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); -+ CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); -+ CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); -+ CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); -+ CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); -+ CeedCallBackend(CeedElemRestrictionGetLVectorSize(rstr, &l_size)); -+ CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); - - // Count num_nodes - CeedCallBackend(CeedCalloc(l_size, &is_node)); -@@ -330,7 +330,7 @@ static int CeedElemRestrictionOffset_Sycl(const CeedElemRestriction r, const Cee - // Create restriction - //------------------------------------------------------------------------------ - int CeedElemRestrictionCreate_Sycl(CeedMemType mem_type, CeedCopyMode 
copy_mode, const CeedInt *indices, const bool *orients, -- const CeedInt8 *curl_orients, CeedElemRestriction r) { -+ const CeedInt8 *curl_orients, CeedElemRestriction rstr) { - Ceed ceed; - Ceed_Sycl *data; - bool is_strided; -@@ -338,32 +338,33 @@ int CeedElemRestrictionCreate_Sycl(CeedMemType mem_type, CeedCopyMode copy_mode, - CeedRestrictionType rstr_type; - CeedElemRestriction_Sycl *impl; - -- CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); -+ CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); - CeedCallBackend(CeedGetData(ceed, &data)); -- CeedCallBackend(CeedCalloc(1, &impl)); -- CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem)); -- CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); -- CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); -- CeedInt size = num_elem * elem_size; -- CeedInt strides[3] = {1, size, elem_size}; -- -- CeedCallBackend(CeedElemRestrictionGetType(r, &rstr_type)); -+ CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); -+ CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); -+ CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); -+ const CeedInt size = num_elem * elem_size; -+ CeedInt strides[3] = {1, size, elem_size}; -+ CeedInt layout[3] = {1, elem_size * num_elem, elem_size}; -+ -+ CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); - CeedCheck(rstr_type != CEED_RESTRICTION_ORIENTED && rstr_type != CEED_RESTRICTION_CURL_ORIENTED, ceed, CEED_ERROR_BACKEND, - "Backend does not implement CeedElemRestrictionCreateOriented or CeedElemRestrictionCreateCurlOriented"); - - // Stride data -- CeedCallBackend(CeedElemRestrictionIsStrided(r, &is_strided)); -+ CeedCallBackend(CeedElemRestrictionIsStrided(rstr, &is_strided)); - if (is_strided) { - bool has_backend_strides; - -- CeedCallBackend(CeedElemRestrictionHasBackendStrides(r, &has_backend_strides)); -+ CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides)); - if (!has_backend_strides) { -- CeedCallBackend(CeedElemRestrictionGetStrides(r, &strides)); -+ CeedCallBackend(CeedElemRestrictionGetStrides(rstr, &strides)); - } - } else { -- CeedCallBackend(CeedElemRestrictionGetCompStride(r, &comp_stride)); -+ CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride)); - } - -+ CeedCallBackend(CeedCalloc(1, &impl)); - impl->h_ind = NULL; - impl->h_ind_allocated = NULL; - impl->d_ind = NULL; -@@ -378,9 +379,8 @@ int CeedElemRestrictionCreate_Sycl(CeedMemType mem_type, CeedCopyMode copy_mode, - impl->strides[0] = strides[0]; - impl->strides[1] = strides[1]; - impl->strides[2] = strides[2]; -- CeedCallBackend(CeedElemRestrictionSetData(r, impl)); -- CeedInt layout[3] = {1, elem_size * num_elem, elem_size}; -- CeedCallBackend(CeedElemRestrictionSetELayout(r, layout)); -+ CeedCallBackend(CeedElemRestrictionSetData(rstr, impl)); -+ CeedCallBackend(CeedElemRestrictionSetELayout(rstr, layout)); - - // Set up device indices/offset arrays - if (mem_type == CEED_MEM_HOST) { -@@ -409,7 +409,7 @@ int CeedElemRestrictionCreate_Sycl(CeedMemType mem_type, CeedCopyMode copy_mode, - sycl::event copy_event = data->sycl_queue.copy(indices, impl->d_ind, size, {e}); - // Wait for copy to finish and handle exceptions - CeedCallSycl(ceed, copy_event.wait_and_throw()); -- CeedCallBackend(CeedElemRestrictionOffset_Sycl(r, indices)); -+ CeedCallBackend(CeedElemRestrictionOffset_Sycl(rstr, indices)); - } - } else if (mem_type == CEED_MEM_DEVICE) { - switch (copy_mode) { -@@ -440,7 +440,7 @@ 
int CeedElemRestrictionCreate_Sycl(CeedMemType mem_type, CeedCopyMode copy_mode, - sycl::event copy_event = data->sycl_queue.copy(impl->d_ind, impl->h_ind_allocated, elem_size * num_elem, {e}); - CeedCallSycl(ceed, copy_event.wait_and_throw()); - impl->h_ind = impl->h_ind_allocated; -- CeedCallBackend(CeedElemRestrictionOffset_Sycl(r, indices)); -+ CeedCallBackend(CeedElemRestrictionOffset_Sycl(rstr, indices)); - } - } else { - // LCOV_EXCL_START -@@ -449,10 +449,10 @@ int CeedElemRestrictionCreate_Sycl(CeedMemType mem_type, CeedCopyMode copy_mode, - } - - // Register backend functions -- CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", r, "Apply", CeedElemRestrictionApply_Sycl)); -- CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", r, "ApplyUnsigned", CeedElemRestrictionApply_Sycl)); -- CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", r, "ApplyUnoriented", CeedElemRestrictionApply_Sycl)); -- CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", r, "GetOffsets", CeedElemRestrictionGetOffsets_Sycl)); -- CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", r, "Destroy", CeedElemRestrictionDestroy_Sycl)); -+ CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", rstr, "Apply", CeedElemRestrictionApply_Sycl)); -+ CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", rstr, "ApplyUnsigned", CeedElemRestrictionApply_Sycl)); -+ CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", rstr, "ApplyUnoriented", CeedElemRestrictionApply_Sycl)); -+ CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", rstr, "GetOffsets", CeedElemRestrictionGetOffsets_Sycl)); -+ CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Sycl)); - return CEED_ERROR_SUCCESS; - } -diff --git a/include/ceed/backend.h b/include/ceed/backend.h -index b3d2f97e..ff1f82c1 100644 ---- a/include/ceed/backend.h -+++ b/include/ceed/backend.h -@@ -368,6 +368,7 @@ CEED_EXTERN int CeedQFunctionContextRestoreInt32Read(CeedQFunctionContext ctx, C - CEED_EXTERN int CeedQFunctionContextGetDataDestroy(CeedQFunctionContext ctx, CeedMemType *f_mem_type, CeedQFunctionContextDataDestroyUser *f); - CEED_EXTERN int CeedQFunctionContextReference(CeedQFunctionContext ctx); - -+CEED_EXTERN int CeedOperatorGetBasisPointer(CeedBasis basis, CeedEvalMode eval_mode, const CeedScalar *identity, const CeedScalar **basis_ptr); - CEED_EXTERN int CeedOperatorCreateActivePointBlockRestriction(CeedElemRestriction rstr, CeedElemRestriction *pointblock_rstr); - CEED_EXTERN int CeedQFunctionAssemblyDataCreate(Ceed ceed, CeedQFunctionAssemblyData *data); - CEED_EXTERN int CeedQFunctionAssemblyDataReference(CeedQFunctionAssemblyData data); -diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h -index f75c31de..6535c733 100644 ---- a/include/ceed/ceed.h -+++ b/include/ceed/ceed.h -@@ -392,7 +392,7 @@ CEED_EXTERN int CeedQFunctionContextDestroy(CeedQFunctionContext *ctx); - CEED_EXTERN int CeedOperatorCreate(Ceed ceed, CeedQFunction qf, CeedQFunction dqf, CeedQFunction dqfT, CeedOperator *op); - CEED_EXTERN int CeedCompositeOperatorCreate(Ceed ceed, CeedOperator *op); - CEED_EXTERN int CeedOperatorReferenceCopy(CeedOperator op, CeedOperator *op_copy); --CEED_EXTERN int CeedOperatorSetField(CeedOperator op, const char *field_name, CeedElemRestriction r, CeedBasis b, CeedVector v); -+CEED_EXTERN int CeedOperatorSetField(CeedOperator op, const char *field_name, CeedElemRestriction rstr, 
CeedBasis basis, CeedVector v); - CEED_EXTERN int CeedOperatorGetFields(CeedOperator op, CeedInt *num_input_fields, CeedOperatorField **input_fields, CeedInt *num_output_fields, - CeedOperatorField **output_fields); - CEED_EXTERN int CeedCompositeOperatorAddSub(CeedOperator composite_op, CeedOperator sub_op); -diff --git a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h -index 7c6f8789..ab366e79 100644 ---- a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h -+++ b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h -@@ -19,11 +19,11 @@ typedef CeedInt IndexType; - #endif - - //------------------------------------------------------------------------------ --// Get Basis Emode Pointer -+// Get basis pointer - //------------------------------------------------------------------------------ --extern "C" __device__ void CeedOperatorGetBasisPointer_Cuda(const CeedScalar **basis_ptr, CeedEvalMode e_mode, const CeedScalar *identity, -- const CeedScalar *interp, const CeedScalar *grad) { -- switch (e_mode) { -+static __device__ __inline__ void GetBasisPointer(const CeedScalar **basis_ptr, CeedEvalMode eval_modes, const CeedScalar *identity, -+ const CeedScalar *interp, const CeedScalar *grad, const CeedScalar *div, const CeedScalar *curl) { -+ switch (eval_modes) { - case CEED_EVAL_NONE: - *basis_ptr = identity; - break; -@@ -33,52 +33,67 @@ extern "C" __device__ void CeedOperatorGetBasisPointer_Cuda(const CeedScalar **b - case CEED_EVAL_GRAD: - *basis_ptr = grad; - break; -- case CEED_EVAL_WEIGHT: - case CEED_EVAL_DIV: -+ *basis_ptr = div; -+ break; - case CEED_EVAL_CURL: -- break; // Caught by QF Assembly -+ *basis_ptr = curl; -+ break; -+ case CEED_EVAL_WEIGHT: -+ break; // Caught by QF assembly - } - } - - //------------------------------------------------------------------------------ - // Core code for diagonal assembly - //------------------------------------------------------------------------------ --__device__ void diagonalCore(const CeedInt num_elem, const bool is_point_block, const CeedScalar *identity, const CeedScalar *interp_in, -- const CeedScalar *grad_in, const CeedScalar *interp_out, const CeedScalar *grad_out, const CeedEvalMode *e_mode_in, -- const CeedEvalMode *e_mode_out, const CeedScalar *__restrict__ assembled_qf_array, -- CeedScalar *__restrict__ elem_diag_array) { -- const int tid = threadIdx.x; // running with P threads, tid is evec node -+static __device__ __inline__ void DiagonalCore(const CeedInt num_elem, const bool is_point_block, const CeedScalar *identity, -+ const CeedScalar *interp_in, const CeedScalar *grad_in, const CeedScalar *div_in, -+ const CeedScalar *curl_in, const CeedScalar *interp_out, const CeedScalar *grad_out, -+ const CeedScalar *div_out, const CeedScalar *curl_out, const CeedEvalMode *eval_modes_in, -+ const CeedEvalMode *eval_modes_out, const CeedScalar *__restrict__ assembled_qf_array, -+ CeedScalar *__restrict__ elem_diag_array) { -+ const int tid = threadIdx.x; // Running with P threads -+ - if (tid >= NUM_NODES) return; - - // Compute the diagonal of B^T D B - // Each element - for (IndexType e = blockIdx.x * blockDim.z + threadIdx.z; e < num_elem; e += gridDim.x * blockDim.z) { -- IndexType d_out = -1; -- - // Each basis eval mode pair -- for (IndexType e_out = 0; e_out < NUM_E_MODE_OUT; e_out++) { -- const CeedScalar *b_t = NULL; -+ IndexType d_out = 0; -+ CeedEvalMode eval_modes_out_prev = CEED_EVAL_NONE; - -- if 
(e_mode_out[e_out] == CEED_EVAL_GRAD) d_out += 1; -- CeedOperatorGetBasisPointer_Cuda(&b_t, e_mode_out[e_out], identity, interp_out, &grad_out[d_out * NUM_QPTS * NUM_NODES]); -- IndexType d_in = -1; -+ for (IndexType e_out = 0; e_out < NUM_EVAL_MODES_OUT; e_out++) { -+ IndexType d_in = 0; -+ CeedEvalMode eval_modes_in_prev = CEED_EVAL_NONE; -+ const CeedScalar *b_t = NULL; - -- for (IndexType e_in = 0; e_in < NUM_E_MODE_IN; e_in++) { -+ GetBasisPointer(&b_t, eval_modes_out[e_out], identity, interp_out, grad_out, div_out, curl_out); -+ if (e_out == 0 || eval_modes_out[e_out] != eval_modes_out_prev) d_out = 0; -+ else b_t = &b_t[(++d_out) * NUM_QPTS * NUM_NODES]; -+ eval_modes_out_prev = eval_modes_out[e_out]; -+ -+ for (IndexType e_in = 0; e_in < NUM_EVAL_MODES_IN; e_in++) { - const CeedScalar *b = NULL; - -- if (e_mode_in[e_in] == CEED_EVAL_GRAD) d_in += 1; -- CeedOperatorGetBasisPointer_Cuda(&b, e_mode_in[e_in], identity, interp_in, &grad_in[d_in * NUM_QPTS * NUM_NODES]); -+ GetBasisPointer(&b, eval_modes_in[e_in], identity, interp_in, grad_in, div_in, curl_in); -+ if (e_in == 0 || eval_modes_in[e_in] != eval_modes_in_prev) d_in = 0; -+ else b = &b[(++d_in) * NUM_QPTS * NUM_NODES]; -+ eval_modes_in_prev = eval_modes_in[e_in]; -+ - // Each component - for (IndexType comp_out = 0; comp_out < NUM_COMP; comp_out++) { - // Each qpoint/node pair - if (is_point_block) { -- // Point Block Diagonal -+ // Point block diagonal - for (IndexType comp_in = 0; comp_in < NUM_COMP; comp_in++) { - CeedScalar e_value = 0.; - - for (IndexType q = 0; q < NUM_QPTS; q++) { - const CeedScalar qf_value = -- assembled_qf_array[((((e_in * NUM_COMP + comp_in) * NUM_E_MODE_OUT + e_out) * NUM_COMP + comp_out) * num_elem + e) * NUM_QPTS + -+ assembled_qf_array[((((e_in * NUM_COMP + comp_in) * NUM_EVAL_MODES_OUT + e_out) * NUM_COMP + comp_out) * num_elem + e) * -+ NUM_QPTS + - q]; - - e_value += b_t[q * NUM_NODES + tid] * qf_value * b[q * NUM_NODES + tid]; -@@ -86,12 +101,13 @@ __device__ void diagonalCore(const CeedInt num_elem, const bool is_point_block, - elem_diag_array[((comp_out * NUM_COMP + comp_in) * num_elem + e) * NUM_NODES + tid] += e_value; - } - } else { -- // Diagonal Only -+ // Diagonal only - CeedScalar e_value = 0.; - - for (IndexType q = 0; q < NUM_QPTS; q++) { - const CeedScalar qf_value = -- assembled_qf_array[((((e_in * NUM_COMP + comp_out) * NUM_E_MODE_OUT + e_out) * NUM_COMP + comp_out) * num_elem + e) * NUM_QPTS + q]; -+ assembled_qf_array[((((e_in * NUM_COMP + comp_out) * NUM_EVAL_MODES_OUT + e_out) * NUM_COMP + comp_out) * num_elem + e) * NUM_QPTS + -+ q]; - - e_value += b_t[q * NUM_NODES + tid] * qf_value * b[q * NUM_NODES + tid]; - } -@@ -106,21 +122,25 @@ __device__ void diagonalCore(const CeedInt num_elem, const bool is_point_block, - //------------------------------------------------------------------------------ - // Linear diagonal - //------------------------------------------------------------------------------ --extern "C" __global__ void linearDiagonal(const CeedInt num_elem, const CeedScalar *identity, const CeedScalar *interp_in, const CeedScalar *grad_in, -- const CeedScalar *interp_out, const CeedScalar *grad_out, const CeedEvalMode *e_mode_in, -- const CeedEvalMode *e_mode_out, const CeedScalar *__restrict__ assembled_qf_array, -- CeedScalar *__restrict__ elem_diag_array) { -- diagonalCore(num_elem, false, identity, interp_in, grad_in, interp_out, grad_out, e_mode_in, e_mode_out, assembled_qf_array, elem_diag_array); -+extern "C" __global__ void LinearDiagonal(const CeedInt 
num_elem, const CeedScalar *identity, const CeedScalar *interp_in, const CeedScalar *grad_in, -+ const CeedScalar *div_in, const CeedScalar *curl_in, const CeedScalar *interp_out, -+ const CeedScalar *grad_out, const CeedScalar *div_out, const CeedScalar *curl_out, -+ const CeedEvalMode *eval_modes_in, const CeedEvalMode *eval_modes_out, -+ const CeedScalar *__restrict__ assembled_qf_array, CeedScalar *__restrict__ elem_diag_array) { -+ DiagonalCore(num_elem, false, identity, interp_in, grad_in, div_in, curl_in, interp_out, grad_out, div_out, curl_out, eval_modes_in, eval_modes_out, -+ assembled_qf_array, elem_diag_array); - } - - //------------------------------------------------------------------------------ - // Linear point block diagonal - //------------------------------------------------------------------------------ --extern "C" __global__ void linearPointBlockDiagonal(const CeedInt num_elem, const CeedScalar *identity, const CeedScalar *interp_in, -- const CeedScalar *grad_in, const CeedScalar *interp_out, const CeedScalar *grad_out, -- const CeedEvalMode *e_mode_in, const CeedEvalMode *e_mode_out, -+extern "C" __global__ void LinearPointBlockDiagonal(const CeedInt num_elem, const CeedScalar *identity, const CeedScalar *interp_in, -+ const CeedScalar *grad_in, const CeedScalar *div_in, const CeedScalar *curl_in, -+ const CeedScalar *interp_out, const CeedScalar *grad_out, const CeedScalar *div_out, -+ const CeedScalar *curl_out, const CeedEvalMode *eval_modes_in, const CeedEvalMode *eval_modes_out, - const CeedScalar *__restrict__ assembled_qf_array, CeedScalar *__restrict__ elem_diag_array) { -- diagonalCore(num_elem, true, identity, interp_in, grad_in, interp_out, grad_out, e_mode_in, e_mode_out, assembled_qf_array, elem_diag_array); -+ DiagonalCore(num_elem, true, identity, interp_in, grad_in, div_in, curl_in, interp_out, grad_out, div_out, curl_out, eval_modes_in, eval_modes_out, -+ assembled_qf_array, elem_diag_array); - } - - //------------------------------------------------------------------------------ -diff --git a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h -index eeb256fe..60d641ed 100644 ---- a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h -+++ b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h -@@ -19,108 +19,92 @@ typedef CeedInt IndexType; - #endif - - //------------------------------------------------------------------------------ --// Matrix assembly kernel for low-order elements (2D thread block) -+// Matrix assembly kernel - //------------------------------------------------------------------------------ - extern "C" __launch_bounds__(BLOCK_SIZE) __global__ -- void linearAssemble(const CeedScalar *B_in, const CeedScalar *B_out, const CeedScalar *__restrict__ qf_array, -- CeedScalar *__restrict__ values_array) { -- // This kernel assumes B_in and B_out have the same number of quadrature points and basis points. 
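Reviewer note: DiagonalCore never forms the element matrix; for each (output eval mode, input eval mode, component) combination it accumulates only the diagonal of B_out^T D B_in, one node per thread. A serial sketch of that reduction for a single combination (accumulate_diagonal is an illustrative name; the eval-mode bookkeeping and point-block layout of the real kernel are omitted):

/* elem_diag[i] += sum_q B_out[q*P + i] * qf[q] * B_in[q*P + i] */
static void accumulate_diagonal(int P, int Q, const double *B_out, const double *B_in,
                                const double *qf, double *elem_diag) {
  for (int i = 0; i < P; i++) {
    double value = 0.0;
    for (int q = 0; q < Q; q++) value += B_out[q * P + i] * qf[q] * B_in[q * P + i];
    elem_diag[i] += value;
  }
}

The point-block variant stores a full NUM_COMP x NUM_COMP block per node instead of a single scalar, but the quadrature reduction is the same.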
-- // TODO: expand to more general cases -- const int i = threadIdx.x; // The output row index of each B^TDB operation -- const int l = threadIdx.y; // The output column index of each B^TDB operation -- // such that we have (Bout^T)_ij D_jk Bin_kl = C_il -- -- // Strides for final output ordering, determined by the reference (interface) implementation of the symbolic assembly, slowest --> fastest: element, -- // comp_in, comp_out, node_row, node_col -- const IndexType comp_out_stride = NUM_NODES * NUM_NODES; -- const IndexType comp_in_stride = comp_out_stride * NUM_COMP; -- const IndexType e_stride = comp_in_stride * NUM_COMP; -- // Strides for QF array, slowest --> fastest: e_mode_in, comp_in, e_mode_out, comp_out, elem, qpt -- const IndexType q_e_stride = NUM_QPTS; -- const IndexType q_comp_out_stride = NUM_ELEM * q_e_stride; -- const IndexType q_e_mode_out_stride = q_comp_out_stride * NUM_COMP; -- const IndexType q_comp_in_stride = q_e_mode_out_stride * NUM_E_MODE_OUT; -- const IndexType q_e_mode_in_stride = q_comp_in_stride * NUM_COMP; -- -- // Loop over each element (if necessary) -- for (IndexType e = blockIdx.x * blockDim.z + threadIdx.z; e < NUM_ELEM; e += gridDim.x * blockDim.z) { -- for (IndexType comp_in = 0; comp_in < NUM_COMP; comp_in++) { -- for (IndexType comp_out = 0; comp_out < NUM_COMP; comp_out++) { -- CeedScalar result = 0.0; -- IndexType qf_index_comp = q_comp_in_stride * comp_in + q_comp_out_stride * comp_out + q_e_stride * e; -- -- for (IndexType e_mode_in = 0; e_mode_in < NUM_E_MODE_IN; e_mode_in++) { -- IndexType b_in_index = e_mode_in * NUM_QPTS * NUM_NODES; -+ void LinearAssemble(const CeedInt num_elem, const CeedScalar *B_in, const CeedScalar *B_out, const bool *orients_in, -+ const CeedInt8 *curl_orients_in, const bool *orients_out, const CeedInt8 *curl_orients_out, -+ const CeedScalar *__restrict__ qf_array, CeedScalar *__restrict__ values_array) { -+ extern __shared__ CeedScalar s_CT[]; -+ CeedScalar *s_C = s_CT + NUM_NODES_OUT * NUM_NODES_IN; - -- for (IndexType e_mode_out = 0; e_mode_out < NUM_E_MODE_OUT; e_mode_out++) { -- IndexType b_out_index = e_mode_out * NUM_QPTS * NUM_NODES; -- IndexType qf_index = qf_index_comp + q_e_mode_out_stride * e_mode_out + q_e_mode_in_stride * e_mode_in; -- -- // Perform the B^T D B operation for this 'chunk' of D (the qf_array) -- for (IndexType j = 0; j < NUM_QPTS; j++) { -- result += B_out[b_out_index + j * NUM_NODES + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NUM_NODES + l]; -- } -- } // end of e_mode_out -- } // end of e_mode_in -- IndexType val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + NUM_NODES * i + l; -- -- values_array[val_index] = result; -- } // end of out component -- } // end of in component -- } // end of element loop --} -- --//------------------------------------------------------------------------------ --// Fallback kernel for larger orders (1D thread block) --//------------------------------------------------------------------------------ --extern "C" __launch_bounds__(BLOCK_SIZE) __global__ -- void linearAssembleFallback(const CeedScalar *B_in, const CeedScalar *B_out, const CeedScalar *__restrict__ qf_array, -- CeedScalar *__restrict__ values_array) { -- // This kernel assumes B_in and B_out have the same number of quadrature points and basis points. 
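Reviewer note: the rewritten LinearAssemble kernel in this hunk computes the per-element matrix C = B_out^T D B_in for each component pair and, when curl orientations are present, applies a node-wise tridiagonal transformation on each side, roughly values = T_out^T (B_out^T D B_in) T_in. A serial sketch of the right-hand transform, assuming the same three-weights-per-node storage the curl_orients arrays use (lower, main, and upper diagonal of T, row by row); apply_tridiag_right is an illustrative name:

/* CT[i][l] = C[i][l-1]*T[l-1][l] + C[i][l]*T[l][l] + C[i][l+1]*T[l+1][l],
   with T stored row-wise as triplets t[3*k + {0,1,2}] = {T[k][k-1], T[k][k], T[k][k+1]}. */
static void apply_tridiag_right(int n, const double *C, const signed char *t, double *CT) {
  for (int i = 0; i < n; i++) {
    for (int l = 0; l < n; l++) {
      double v = C[i * n + l] * t[3 * l + 1];
      if (l > 0) v += C[i * n + l - 1] * t[3 * (l - 1) + 2];
      if (l < n - 1) v += C[i * n + l + 1] * t[3 * (l + 1) + 0];
      CT[i * n + l] = v;
    }
  }
}

Plain boolean orientations are the degenerate case where T is a diagonal sign matrix, which is what the orients_in/orients_out sign flips implement without going through shared memory.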
-- // TODO: expand to more general cases -- const int l = threadIdx.x; // The output column index of each B^TDB operation -+ const int l = threadIdx.x; // The output column index of each B^T D B operation - // such that we have (Bout^T)_ij D_jk Bin_kl = C_il - -- // Strides for final output ordering, determined by the reference (interface) implementation of the symbolic assembly, slowest --> fastest: element, -+ // Strides for final output ordering, determined by the reference (interface) implementation of the symbolic assembly, slowest --> fastest: e, - // comp_in, comp_out, node_row, node_col -- const IndexType comp_out_stride = NUM_NODES * NUM_NODES; -- const IndexType comp_in_stride = comp_out_stride * NUM_COMP; -- const IndexType e_stride = comp_in_stride * NUM_COMP; -- // Strides for QF array, slowest --> fastest: e_mode_in, comp_in, e_mode_out, comp_out, elem, qpt -- const IndexType q_e_stride = NUM_QPTS; -- const IndexType q_comp_out_stride = NUM_ELEM * q_e_stride; -- const IndexType q_e_mode_out_stride = q_comp_out_stride * NUM_COMP; -- const IndexType q_comp_in_stride = q_e_mode_out_stride * NUM_E_MODE_OUT; -- const IndexType q_e_mode_in_stride = q_comp_in_stride * NUM_COMP; -+ const IndexType comp_out_stride = NUM_NODES_OUT * NUM_NODES_IN; -+ const IndexType comp_in_stride = comp_out_stride * NUM_COMP_OUT; -+ const IndexType e_stride = comp_in_stride * NUM_COMP_IN; -+ -+ // Strides for QF array, slowest --> fastest: e_in, comp_in, e_out, comp_out, e, q -+ const IndexType q_e_stride = NUM_QPTS; -+ const IndexType q_comp_out_stride = num_elem * q_e_stride; -+ const IndexType q_eval_mode_out_stride = q_comp_out_stride * NUM_COMP_OUT; -+ const IndexType q_comp_in_stride = q_eval_mode_out_stride * NUM_EVAL_MODES_OUT; -+ const IndexType q_eval_mode_in_stride = q_comp_in_stride * NUM_COMP_IN; - - // Loop over each element (if necessary) -- for (IndexType e = blockIdx.x * blockDim.z + threadIdx.z; e < NUM_ELEM; e += gridDim.x * blockDim.z) { -- for (IndexType comp_in = 0; comp_in < NUM_COMP; comp_in++) { -- for (IndexType comp_out = 0; comp_out < NUM_COMP; comp_out++) { -- for (IndexType i = 0; i < NUM_NODES; i++) { -+ for (IndexType e = blockIdx.x * blockDim.z + threadIdx.z; e < num_elem; e += gridDim.x * blockDim.z) { -+ for (IndexType comp_in = 0; comp_in < NUM_COMP_IN; comp_in++) { -+ for (IndexType comp_out = 0; comp_out < NUM_COMP_OUT; comp_out++) { -+ for (IndexType i = threadIdx.y; i < NUM_NODES_OUT; i += BLOCK_SIZE_Y) { - CeedScalar result = 0.0; - IndexType qf_index_comp = q_comp_in_stride * comp_in + q_comp_out_stride * comp_out + q_e_stride * e; - -- for (IndexType e_mode_in = 0; e_mode_in < NUM_E_MODE_IN; e_mode_in++) { -- IndexType b_in_index = e_mode_in * NUM_QPTS * NUM_NODES; -+ for (IndexType e_in = 0; e_in < NUM_EVAL_MODES_IN; e_in++) { -+ IndexType b_in_index = e_in * NUM_QPTS * NUM_NODES_IN; - -- for (IndexType e_mode_out = 0; e_mode_out < NUM_E_MODE_OUT; e_mode_out++) { -- IndexType b_out_index = e_mode_out * NUM_QPTS * NUM_NODES; -- IndexType qf_index = qf_index_comp + q_e_mode_out_stride * e_mode_out + q_e_mode_in_stride * e_mode_in; -+ for (IndexType e_out = 0; e_out < NUM_EVAL_MODES_OUT; e_out++) { -+ IndexType b_out_index = e_out * NUM_QPTS * NUM_NODES_OUT; -+ IndexType qf_index = qf_index_comp + q_eval_mode_out_stride * e_out + q_eval_mode_in_stride * e_in; - - // Perform the B^T D B operation for this 'chunk' of D (the qf_array) - for (IndexType j = 0; j < NUM_QPTS; j++) { -- result += B_out[b_out_index + j * NUM_NODES + i] * qf_array[qf_index + j] * 
B_in[b_in_index + j * NUM_NODES + l]; -+ result += B_out[b_out_index + j * NUM_NODES_OUT + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NUM_NODES_IN + l]; - } -- } // end of e_mode_out -- } // end of e_mode_in -- IndexType val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + NUM_NODES * i + l; -- -- values_array[val_index] = result; -+ } // end of out eval mode -+ } // end of in eval mode -+ if (orients_in) { -+ result *= orients_in[NUM_NODES_IN * e + l] ? -1.0 : 1.0; -+ } -+ if (orients_out) { -+ result *= orients_out[NUM_NODES_OUT * e + i] ? -1.0 : 1.0; -+ } -+ if (!curl_orients_in && !curl_orients_out) { -+ IndexType val_index = e_stride * e + comp_in_stride * comp_in + comp_out_stride * comp_out + NUM_NODES_IN * i + l; -+ -+ values_array[val_index] = result; -+ } else if (curl_orients_in) { -+ s_C[NUM_NODES_IN * threadIdx.y + l] = result; -+ __syncthreads(); -+ s_CT[NUM_NODES_IN * i + l] = -+ (l > 0 ? s_C[NUM_NODES_IN * threadIdx.y + l - 1] * curl_orients_in[3 * NUM_NODES_IN * e + 3 * l - 1] : 0.0) + -+ s_C[NUM_NODES_IN * threadIdx.y + l] * curl_orients_in[3 * NUM_NODES_IN * e + 3 * l + 1] + -+ (l < (NUM_NODES_IN - 1) ? s_C[NUM_NODES_IN * threadIdx.y + l + 1] * curl_orients_in[3 * NUM_NODES_IN * e + 3 * l + 3] : 0.0); -+ } else { -+ s_CT[NUM_NODES_IN * i + l] = result; -+ } - } // end of loop over element node index, i -- } // end of out component -- } // end of in component -- } // end of element loop -+ if (curl_orients_in || curl_orients_out) { -+ // Compute and store the final T^T (B^T D B T) using the fully computed C T product in shared memory -+ if (curl_orients_out) __syncthreads(); -+ for (IndexType i = threadIdx.y; i < NUM_NODES_OUT; i += BLOCK_SIZE_Y) { -+ IndexType val_index = e_stride * e + comp_in_stride * comp_in + comp_out_stride * comp_out + NUM_NODES_IN * i + l; -+ -+ if (curl_orients_out) { -+ values_array[val_index] = -+ (i > 0 ? s_CT[NUM_NODES_IN * (i - 1) + l] * curl_orients_out[3 * NUM_NODES_OUT * e + 3 * i - 1] : 0.0) + -+ s_CT[NUM_NODES_IN * i + l] * curl_orients_out[3 * NUM_NODES_OUT * e + 3 * i + 1] + -+ (i < (NUM_NODES_OUT - 1) ? 
s_CT[NUM_NODES_IN * (i + 1) + l] * curl_orients_out[3 * NUM_NODES_OUT * e + 3 * i + 3] : 0.0); -+ } else { -+ values_array[val_index] = s_CT[NUM_NODES_IN * i + l]; -+ } -+ } -+ } -+ } // end of out component -+ } // end of in component -+ } // end of element loop - } - - //------------------------------------------------------------------------------ -diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction.h b/include/ceed/jit-source/cuda/cuda-ref-restriction.h -index 1df6f049..80011148 100644 ---- a/include/ceed/jit-source/cuda/cuda-ref-restriction.h -+++ b/include/ceed/jit-source/cuda/cuda-ref-restriction.h -@@ -28,38 +28,107 @@ extern "C" __global__ void StridedNoTranspose(const CeedInt num_elem, const Ceed - } - - //------------------------------------------------------------------------------ --// E-vector -> L-vector, strided -+// L-vector -> E-vector, standard (with offsets) - //------------------------------------------------------------------------------ --extern "C" __global__ void StridedTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { -+extern "C" __global__ void OffsetNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ u, -+ CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { -+ const CeedInt ind = indices[node]; - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { -- v[loc_node * RSTR_STRIDE_NODES + comp * RSTR_STRIDE_COMP + elem * RSTR_STRIDE_ELEM] += -- u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]; -+ v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = u[ind + comp * RSTR_COMP_STRIDE]; - } - } - } - - //------------------------------------------------------------------------------ --// L-vector -> E-vector, offsets provided -+// L-vector -> E-vector, oriented - //------------------------------------------------------------------------------ --extern "C" __global__ void OffsetNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ u, -- CeedScalar *__restrict__ v) { -+extern "C" __global__ void OrientedNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const bool *__restrict__ orients, -+ const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { - for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt ind = indices[node]; -+ const bool orient = orients[node]; - const CeedInt loc_node = node % RSTR_ELEM_SIZE; - const CeedInt elem = node / RSTR_ELEM_SIZE; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { -- v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = u[ind + comp * RSTR_COMP_STRIDE]; -+ v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = u[ind + comp * RSTR_COMP_STRIDE] * (orient ? 
-1.0 : 1.0); -+ } -+ } -+} -+ -+//------------------------------------------------------------------------------ -+// L-vector -> E-vector, curl-oriented -+//------------------------------------------------------------------------------ -+extern "C" __global__ void CurlOrientedNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, -+ const CeedInt8 *__restrict__ curl_orients, const CeedScalar *__restrict__ u, -+ CeedScalar *__restrict__ v) { -+ for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { -+ const CeedInt loc_node = node % RSTR_ELEM_SIZE; -+ const CeedInt elem = node / RSTR_ELEM_SIZE; -+ const CeedInt ind_dl = loc_node > 0 ? indices[node - 1] : 0; -+ const CeedInt ind_d = indices[node]; -+ const CeedInt ind_du = loc_node < (RSTR_ELEM_SIZE - 1) ? indices[node + 1] : 0; -+ const CeedInt8 curl_orient_dl = curl_orients[3 * node + 0]; -+ const CeedInt8 curl_orient_d = curl_orients[3 * node + 1]; -+ const CeedInt8 curl_orient_du = curl_orients[3 * node + 2]; -+ -+ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { -+ CeedScalar value = 0.0; -+ value += loc_node > 0 ? u[ind_dl + comp * RSTR_COMP_STRIDE] * curl_orient_dl : 0.0; -+ value += u[ind_d + comp * RSTR_COMP_STRIDE] * curl_orient_d; -+ value += loc_node < (RSTR_ELEM_SIZE - 1) ? u[ind_du + comp * RSTR_COMP_STRIDE] * curl_orient_du : 0.0; -+ v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = value; -+ } -+ } -+} -+ -+//------------------------------------------------------------------------------ -+// L-vector -> E-vector, unsigned curl-oriented -+//------------------------------------------------------------------------------ -+extern "C" __global__ void CurlOrientedUnsignedNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, -+ const CeedInt8 *__restrict__ curl_orients, const CeedScalar *__restrict__ u, -+ CeedScalar *__restrict__ v) { -+ for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { -+ const CeedInt loc_node = node % RSTR_ELEM_SIZE; -+ const CeedInt elem = node / RSTR_ELEM_SIZE; -+ const CeedInt ind_dl = loc_node > 0 ? indices[node - 1] : 0; -+ const CeedInt ind_d = indices[node]; -+ const CeedInt ind_du = loc_node < (RSTR_ELEM_SIZE - 1) ? indices[node + 1] : 0; -+ const CeedInt8 curl_orient_dl = abs(curl_orients[3 * node + 0]); -+ const CeedInt8 curl_orient_d = abs(curl_orients[3 * node + 1]); -+ const CeedInt8 curl_orient_du = abs(curl_orients[3 * node + 2]); -+ -+ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { -+ CeedScalar value = 0.0; -+ value += loc_node > 0 ? u[ind_dl + comp * RSTR_COMP_STRIDE] * curl_orient_dl : 0.0; -+ value += u[ind_d + comp * RSTR_COMP_STRIDE] * curl_orient_d; -+ value += loc_node < (RSTR_ELEM_SIZE - 1) ? 
u[ind_du + comp * RSTR_COMP_STRIDE] * curl_orient_du : 0.0; -+ v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = value; -+ } -+ } -+} -+ -+//------------------------------------------------------------------------------ -+// E-vector -> L-vector, strided -+//------------------------------------------------------------------------------ -+extern "C" __global__ void StridedTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { -+ for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { -+ const CeedInt loc_node = node % RSTR_ELEM_SIZE; -+ const CeedInt elem = node / RSTR_ELEM_SIZE; -+ -+ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { -+ v[loc_node * RSTR_STRIDE_NODES + comp * RSTR_STRIDE_COMP + elem * RSTR_STRIDE_ELEM] += -+ u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]; - } - } - } - - //------------------------------------------------------------------------------ --// E-vector -> L-vector, offsets provided -+// E-vector -> L-vector, standard (with offsets) - //------------------------------------------------------------------------------ - extern "C" __global__ void OffsetTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ u, - CeedScalar *__restrict__ v) { -@@ -87,8 +156,8 @@ extern "C" __global__ void OffsetTransposeDet(const CeedInt *__restrict__ l_vec_ - - for (CeedInt j = range_1; j < range_N; j++) { - const CeedInt t_ind = t_indices[j]; -- CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; -- CeedInt elem = t_ind / RSTR_ELEM_SIZE; -+ const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; -+ const CeedInt elem = t_ind / RSTR_ELEM_SIZE; - - for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { - value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]; -@@ -99,6 +168,165 @@ extern "C" __global__ void OffsetTransposeDet(const CeedInt *__restrict__ l_vec_ - } - } - -+//------------------------------------------------------------------------------ -+// E-vector -> L-vector, oriented -+//------------------------------------------------------------------------------ -+extern "C" __global__ void OrientedTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const bool *__restrict__ orients, -+ const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { -+ for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { -+ const CeedInt ind = indices[node]; -+ const bool orient = orients[node]; -+ const CeedInt loc_node = node % RSTR_ELEM_SIZE; -+ const CeedInt elem = node / RSTR_ELEM_SIZE; -+ -+ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { -+ atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, -+ u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * (orient ? 
-1.0 : 1.0)); -+ } -+ } -+} -+ -+extern "C" __global__ void OrientedTransposeDet(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, -+ const CeedInt *__restrict__ t_offsets, const bool *__restrict__ orients, -+ const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { -+ CeedScalar value[RSTR_NUM_COMP]; -+ -+ for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { -+ const CeedInt ind = l_vec_indices[i]; -+ const CeedInt range_1 = t_offsets[i]; -+ const CeedInt range_N = t_offsets[i + 1]; -+ -+ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; -+ -+ for (CeedInt j = range_1; j < range_N; j++) { -+ const CeedInt t_ind = t_indices[j]; -+ const bool orient = orients[t_ind]; -+ const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; -+ const CeedInt elem = t_ind / RSTR_ELEM_SIZE; -+ -+ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { -+ value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * (orient ? -1.0 : 1.0); -+ } -+ } -+ -+ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; -+ } -+} -+ -+//------------------------------------------------------------------------------ -+// E-vector -> L-vector, curl-oriented -+//------------------------------------------------------------------------------ -+extern "C" __global__ void CurlOrientedTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, -+ const CeedInt8 *__restrict__ curl_orients, const CeedScalar *__restrict__ u, -+ CeedScalar *__restrict__ v) { -+ for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { -+ const CeedInt ind = indices[node]; -+ const CeedInt loc_node = node % RSTR_ELEM_SIZE; -+ const CeedInt elem = node / RSTR_ELEM_SIZE; -+ const CeedInt8 curl_orient_du = loc_node > 0 ? curl_orients[3 * node - 1] : 0.0; -+ const CeedInt8 curl_orient_d = curl_orients[3 * node + 1]; -+ const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? curl_orients[3 * node + 3] : 0.0; -+ -+ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { -+ CeedScalar value = 0.0; -+ value += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; -+ value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; -+ value += -+ loc_node < (RSTR_ELEM_SIZE - 1) ? 
u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; -+ atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value); -+ } -+ } -+} -+ -+extern "C" __global__ void CurlOrientedTransposeDet(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, -+ const CeedInt *__restrict__ t_offsets, const CeedInt8 *__restrict__ curl_orients, -+ const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { -+ CeedScalar value[RSTR_NUM_COMP]; -+ -+ for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { -+ const CeedInt ind = l_vec_indices[i]; -+ const CeedInt range_1 = t_offsets[i]; -+ const CeedInt range_N = t_offsets[i + 1]; -+ -+ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; -+ -+ for (CeedInt j = range_1; j < range_N; j++) { -+ const CeedInt t_ind = t_indices[j]; -+ const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; -+ const CeedInt elem = t_ind / RSTR_ELEM_SIZE; -+ const CeedInt8 curl_orient_du = loc_node > 0 ? curl_orients[3 * t_ind - 1] : 0.0; -+ const CeedInt8 curl_orient_d = curl_orients[3 * t_ind + 1]; -+ const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? curl_orients[3 * t_ind + 3] : 0.0; -+ -+ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { -+ value[comp] += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; -+ value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; -+ value[comp] += -+ loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; -+ } -+ } -+ -+ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; -+ } -+} -+ -+//------------------------------------------------------------------------------ -+// E-vector -> L-vector, unsigned curl-oriented -+//------------------------------------------------------------------------------ -+extern "C" __global__ void CurlOrientedUnsignedTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, -+ const CeedInt8 *__restrict__ curl_orients, const CeedScalar *__restrict__ u, -+ CeedScalar *__restrict__ v) { -+ for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { -+ const CeedInt loc_node = node % RSTR_ELEM_SIZE; -+ const CeedInt elem = node / RSTR_ELEM_SIZE; -+ const CeedInt ind = indices[node]; -+ const CeedInt8 curl_orient_du = loc_node > 0 ? abs(curl_orients[3 * node - 1]) : 0.0; -+ const CeedInt8 curl_orient_d = abs(curl_orients[3 * node + 1]); -+ const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? abs(curl_orients[3 * node + 3]) : 0.0; -+ -+ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { -+ CeedScalar value = 0.0; -+ value += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; -+ value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; -+ value += -+ loc_node < (RSTR_ELEM_SIZE - 1) ? 
u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; -+ atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value); -+ } -+ } -+} -+ -+extern "C" __global__ void CurlOrientedUnsignedTransposeDet(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, -+ const CeedInt *__restrict__ t_offsets, const CeedInt8 *__restrict__ curl_orients, -+ const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { -+ CeedScalar value[RSTR_NUM_COMP]; -+ -+ for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { -+ const CeedInt ind = l_vec_indices[i]; -+ const CeedInt range_1 = t_offsets[i]; -+ const CeedInt range_N = t_offsets[i + 1]; -+ -+ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; -+ -+ for (CeedInt j = range_1; j < range_N; j++) { -+ const CeedInt t_ind = t_indices[j]; -+ const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; -+ const CeedInt elem = t_ind / RSTR_ELEM_SIZE; -+ const CeedInt8 curl_orient_du = loc_node > 0 ? abs(curl_orients[3 * t_ind - 1]) : 0.0; -+ const CeedInt8 curl_orient_d = abs(curl_orients[3 * t_ind + 1]); -+ const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? abs(curl_orients[3 * t_ind + 3]) : 0.0; -+ -+ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { -+ value[comp] += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; -+ value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; -+ value[comp] += -+ loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; -+ } -+ } -+ -+ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; -+ } -+} -+ - //------------------------------------------------------------------------------ - - #endif // CEED_CUDA_REF_RESTRICTION_H -diff --git a/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h b/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h -index 8270c73d..fcd8df29 100644 ---- a/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h -+++ b/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h -@@ -12,80 +12,106 @@ - - #include - --#if CEEDSIZE -+#if USE_CEEDSIZE - typedef CeedSize IndexType; - #else - typedef CeedInt IndexType; - #endif - - //------------------------------------------------------------------------------ --// Get Basis Emode Pointer -+// Get basis pointer - //------------------------------------------------------------------------------ --extern "C" __device__ void CeedOperatorGetBasisPointer_Hip(const CeedScalar **basisptr, CeedEvalMode emode, const CeedScalar *identity, -- const CeedScalar *interp, const CeedScalar *grad) { -- switch (emode) { -+static __device__ __inline__ void GetBasisPointer(const CeedScalar **basis_ptr, CeedEvalMode eval_modes, const CeedScalar *identity, -+ const CeedScalar *interp, const CeedScalar *grad, const CeedScalar *div, const CeedScalar *curl) { -+ switch (eval_modes) { - case CEED_EVAL_NONE: -- *basisptr = identity; -+ *basis_ptr = identity; - break; - case CEED_EVAL_INTERP: -- *basisptr = interp; -+ *basis_ptr = interp; - break; - case CEED_EVAL_GRAD: -- *basisptr = grad; -+ *basis_ptr = grad; - break; -- case CEED_EVAL_WEIGHT: - case CEED_EVAL_DIV: -+ *basis_ptr = div; -+ break; - case CEED_EVAL_CURL: -- break; // Caught by QF Assembly -+ 
*basis_ptr = curl; -+ break; -+ case CEED_EVAL_WEIGHT: -+ break; // Caught by QF assembly - } - } - - //------------------------------------------------------------------------------ - // Core code for diagonal assembly - //------------------------------------------------------------------------------ --__device__ void diagonalCore(const CeedInt nelem, const bool pointBlock, const CeedScalar *identity, const CeedScalar *interpin, -- const CeedScalar *gradin, const CeedScalar *interpout, const CeedScalar *gradout, const CeedEvalMode *emodein, -- const CeedEvalMode *emodeout, const CeedScalar *__restrict__ assembledqfarray, CeedScalar *__restrict__ elemdiagarray) { -- const int tid = threadIdx.x; // running with P threads, tid is evec node -- if (tid >= NNODES) return; -+static __device__ __inline__ void DiagonalCore(const CeedInt num_elem, const bool is_point_block, const CeedScalar *identity, -+ const CeedScalar *interp_in, const CeedScalar *grad_in, const CeedScalar *div_in, -+ const CeedScalar *curl_in, const CeedScalar *interp_out, const CeedScalar *grad_out, -+ const CeedScalar *div_out, const CeedScalar *curl_out, const CeedEvalMode *eval_modes_in, -+ const CeedEvalMode *eval_modes_out, const CeedScalar *__restrict__ assembled_qf_array, -+ CeedScalar *__restrict__ elem_diag_array) { -+ const int tid = threadIdx.x; // Running with P threads -+ -+ if (tid >= NUM_NODES) return; - - // Compute the diagonal of B^T D B - // Each element -- for (IndexType e = blockIdx.x * blockDim.z + threadIdx.z; e < nelem; e += gridDim.x * blockDim.z) { -- IndexType dout = -1; -+ for (IndexType e = blockIdx.x * blockDim.z + threadIdx.z; e < num_elem; e += gridDim.x * blockDim.z) { - // Each basis eval mode pair -- for (IndexType eout = 0; eout < NUMEMODEOUT; eout++) { -- const CeedScalar *bt = NULL; -- if (emodeout[eout] == CEED_EVAL_GRAD) dout += 1; -- CeedOperatorGetBasisPointer_Hip(&bt, emodeout[eout], identity, interpout, &gradout[dout * NQPTS * NNODES]); -- IndexType din = -1; -- for (IndexType ein = 0; ein < NUMEMODEIN; ein++) { -+ IndexType d_out = 0; -+ CeedEvalMode eval_modes_out_prev = CEED_EVAL_NONE; -+ -+ for (IndexType e_out = 0; e_out < NUM_EVAL_MODES_OUT; e_out++) { -+ IndexType d_in = 0; -+ CeedEvalMode eval_modes_in_prev = CEED_EVAL_NONE; -+ const CeedScalar *b_t = NULL; -+ -+ GetBasisPointer(&b_t, eval_modes_out[e_out], identity, interp_out, grad_out, div_out, curl_out); -+ if (e_out == 0 || eval_modes_out[e_out] != eval_modes_out_prev) d_out = 0; -+ else b_t = &b_t[(++d_out) * NUM_QPTS * NUM_NODES]; -+ eval_modes_out_prev = eval_modes_out[e_out]; -+ -+ for (IndexType e_in = 0; e_in < NUM_EVAL_MODES_IN; e_in++) { - const CeedScalar *b = NULL; -- if (emodein[ein] == CEED_EVAL_GRAD) din += 1; -- CeedOperatorGetBasisPointer_Hip(&b, emodein[ein], identity, interpin, &gradin[din * NQPTS * NNODES]); -+ -+ GetBasisPointer(&b, eval_modes_in[e_in], identity, interp_in, grad_in, div_in, curl_in); -+ if (e_in == 0 || eval_modes_in[e_in] != eval_modes_in_prev) d_in = 0; -+ else b = &b[(++d_in) * NUM_QPTS * NUM_NODES]; -+ eval_modes_in_prev = eval_modes_in[e_in]; -+ - // Each component -- for (IndexType compOut = 0; compOut < NCOMP; compOut++) { -+ for (IndexType comp_out = 0; comp_out < NUM_COMP; comp_out++) { - // Each qpoint/node pair -- if (pointBlock) { -- // Point Block Diagonal -- for (IndexType compIn = 0; compIn < NCOMP; compIn++) { -- CeedScalar evalue = 0.; -- for (IndexType q = 0; q < NQPTS; q++) { -- const CeedScalar qfvalue = -- assembledqfarray[((((ein * NCOMP + compIn) * NUMEMODEOUT + 
eout) * NCOMP + compOut) * nelem + e) * NQPTS + q]; -- evalue += bt[q * NNODES + tid] * qfvalue * b[q * NNODES + tid]; -+ if (is_point_block) { -+ // Point block diagonal -+ for (IndexType comp_in = 0; comp_in < NUM_COMP; comp_in++) { -+ CeedScalar e_value = 0.; -+ -+ for (IndexType q = 0; q < NUM_QPTS; q++) { -+ const CeedScalar qf_value = -+ assembled_qf_array[((((e_in * NUM_COMP + comp_in) * NUM_EVAL_MODES_OUT + e_out) * NUM_COMP + comp_out) * num_elem + e) * -+ NUM_QPTS + -+ q]; -+ -+ e_value += b_t[q * NUM_NODES + tid] * qf_value * b[q * NUM_NODES + tid]; - } -- elemdiagarray[((compOut * NCOMP + compIn) * nelem + e) * NNODES + tid] += evalue; -+ elem_diag_array[((comp_out * NUM_COMP + comp_in) * num_elem + e) * NUM_NODES + tid] += e_value; - } - } else { -- // Diagonal Only -- CeedScalar evalue = 0.; -- for (IndexType q = 0; q < NQPTS; q++) { -- const CeedScalar qfvalue = -- assembledqfarray[((((ein * NCOMP + compOut) * NUMEMODEOUT + eout) * NCOMP + compOut) * nelem + e) * NQPTS + q]; -- evalue += bt[q * NNODES + tid] * qfvalue * b[q * NNODES + tid]; -+ // Diagonal only -+ CeedScalar e_value = 0.; -+ -+ for (IndexType q = 0; q < NUM_QPTS; q++) { -+ const CeedScalar qf_value = -+ assembled_qf_array[((((e_in * NUM_COMP + comp_out) * NUM_EVAL_MODES_OUT + e_out) * NUM_COMP + comp_out) * num_elem + e) * NUM_QPTS + -+ q]; -+ -+ e_value += b_t[q * NUM_NODES + tid] * qf_value * b[q * NUM_NODES + tid]; - } -- elemdiagarray[(compOut * nelem + e) * NNODES + tid] += evalue; -+ elem_diag_array[(comp_out * num_elem + e) * NUM_NODES + tid] += e_value; - } - } - } -@@ -96,21 +122,25 @@ __device__ void diagonalCore(const CeedInt nelem, const bool pointBlock, const C - //------------------------------------------------------------------------------ - // Linear diagonal - //------------------------------------------------------------------------------ --extern "C" __global__ void linearDiagonal(const CeedInt nelem, const CeedScalar *identity, const CeedScalar *interpin, const CeedScalar *gradin, -- const CeedScalar *interpout, const CeedScalar *gradout, const CeedEvalMode *emodein, -- const CeedEvalMode *emodeout, const CeedScalar *__restrict__ assembledqfarray, -- CeedScalar *__restrict__ elemdiagarray) { -- diagonalCore(nelem, false, identity, interpin, gradin, interpout, gradout, emodein, emodeout, assembledqfarray, elemdiagarray); -+extern "C" __global__ void LinearDiagonal(const CeedInt num_elem, const CeedScalar *identity, const CeedScalar *interp_in, const CeedScalar *grad_in, -+ const CeedScalar *div_in, const CeedScalar *curl_in, const CeedScalar *interp_out, -+ const CeedScalar *grad_out, const CeedScalar *div_out, const CeedScalar *curl_out, -+ const CeedEvalMode *eval_modes_in, const CeedEvalMode *eval_modes_out, -+ const CeedScalar *__restrict__ assembled_qf_array, CeedScalar *__restrict__ elem_diag_array) { -+ DiagonalCore(num_elem, false, identity, interp_in, grad_in, div_in, curl_in, interp_out, grad_out, div_out, curl_out, eval_modes_in, eval_modes_out, -+ assembled_qf_array, elem_diag_array); - } - - //------------------------------------------------------------------------------ - // Linear point block diagonal - //------------------------------------------------------------------------------ --extern "C" __global__ void linearPointBlockDiagonal(const CeedInt nelem, const CeedScalar *identity, const CeedScalar *interpin, -- const CeedScalar *gradin, const CeedScalar *interpout, const CeedScalar *gradout, -- const CeedEvalMode *emodein, const CeedEvalMode *emodeout, -- const CeedScalar 
*__restrict__ assembledqfarray, CeedScalar *__restrict__ elemdiagarray) { -- diagonalCore(nelem, true, identity, interpin, gradin, interpout, gradout, emodein, emodeout, assembledqfarray, elemdiagarray); -+extern "C" __global__ void LinearPointBlockDiagonal(const CeedInt num_elem, const CeedScalar *identity, const CeedScalar *interp_in, -+ const CeedScalar *grad_in, const CeedScalar *div_in, const CeedScalar *curl_in, -+ const CeedScalar *interp_out, const CeedScalar *grad_out, const CeedScalar *div_out, -+ const CeedScalar *curl_out, const CeedEvalMode *eval_modes_in, const CeedEvalMode *eval_modes_out, -+ const CeedScalar *__restrict__ assembled_qf_array, CeedScalar *__restrict__ elem_diag_array) { -+ DiagonalCore(num_elem, true, identity, interp_in, grad_in, div_in, curl_in, interp_out, grad_out, div_out, curl_out, eval_modes_in, eval_modes_out, -+ assembled_qf_array, elem_diag_array); - } - - //------------------------------------------------------------------------------ -diff --git a/include/ceed/jit-source/hip/hip-ref-operator-assemble.h b/include/ceed/jit-source/hip/hip-ref-operator-assemble.h -index 005fa6f7..a0c21f9d 100644 ---- a/include/ceed/jit-source/hip/hip-ref-operator-assemble.h -+++ b/include/ceed/jit-source/hip/hip-ref-operator-assemble.h -@@ -12,107 +12,99 @@ - - #include - --#if CEEDSIZE -+#if USE_CEEDSIZE - typedef CeedSize IndexType; - #else - typedef CeedInt IndexType; - #endif - - //------------------------------------------------------------------------------ --// Matrix assembly kernel for low-order elements (2D thread block) -+// Matrix assembly kernel - //------------------------------------------------------------------------------ - extern "C" __launch_bounds__(BLOCK_SIZE) __global__ -- void linearAssemble(const CeedScalar *B_in, const CeedScalar *B_out, const CeedScalar *__restrict__ qf_array, -- CeedScalar *__restrict__ values_array) { -- // This kernel assumes B_in and B_out have the same number of quadrature points and basis points. 
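// ----------------------------------------------------------------------------
// Editorial sketch (illustrative only, not part of the patch). The assembly
// kernels touched above and below compute, for each element and each
// (comp_in, comp_out) pair, the dense element matrix C = B_out^T D B_in, where
// D is the assembled QFunction data (qf_array) and B_in/B_out stack one
// (num_qpts x num_nodes) block per evaluation mode. The serial C sketch below
// mirrors that loop nest for a single element and component pair; every name
// here (assemble_element_block, num_*, b_in, b_out, d, c) is an assumption made
// for illustration and is not the kernels' actual API.
#include <stddef.h>

void assemble_element_block(size_t num_nodes_out, size_t num_nodes_in, size_t num_qpts,
                            size_t num_modes_in, size_t num_modes_out,
                            const double *b_in,  /* [num_modes_in][num_qpts][num_nodes_in] */
                            const double *b_out, /* [num_modes_out][num_qpts][num_nodes_out] */
                            const double *d,     /* [num_modes_in][num_modes_out][num_qpts] diagonal blocks of D */
                            double *c            /* [num_nodes_out][num_nodes_in], C = B_out^T D B_in */) {
  for (size_t i = 0; i < num_nodes_out; i++) {
    for (size_t l = 0; l < num_nodes_in; l++) {
      double result = 0.0;
      // Accumulate every (input eval mode, output eval mode) pair, matching the
      // e_in/e_out loops in the kernels.
      for (size_t m_in = 0; m_in < num_modes_in; m_in++) {
        for (size_t m_out = 0; m_out < num_modes_out; m_out++) {
          const double *B_row = &b_out[m_out * num_qpts * num_nodes_out];
          const double *B_col = &b_in[m_in * num_qpts * num_nodes_in];
          const double *D_blk = &d[(m_in * num_modes_out + m_out) * num_qpts];
          for (size_t q = 0; q < num_qpts; q++) {
            result += B_row[q * num_nodes_out + i] * D_blk[q] * B_col[q * num_nodes_in + l];
          }
        }
      }
      c[i * num_nodes_in + l] = result;
    }
  }
}
// A signed (oriented) restriction additionally flips the sign of column l or
// row i when the corresponding orientation flag is set, and a curl-oriented
// restriction applies a tridiagonal transformation T on each side, giving
// T_out^T (B_out^T D B_in) T_in; that is what the shared-memory s_C/s_CT passes
// in the new kernels implement.
// ----------------------------------------------------------------------------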
-- // TODO: expand to more general cases -- const int i = threadIdx.x; // The output row index of each B^TDB operation -- const int l = threadIdx.y; // The output column index of each B^TDB operation -+ void LinearAssemble(const CeedInt num_elem, const CeedScalar *B_in, const CeedScalar *B_out, const bool *orients_in, -+ const CeedInt8 *curl_orients_in, const bool *orients_out, const CeedInt8 *curl_orients_out, -+ const CeedScalar *__restrict__ qf_array, CeedScalar *__restrict__ values_array) { -+ extern __shared__ CeedScalar s_CT[]; -+ CeedScalar *s_C = s_CT + NUM_NODES_OUT * NUM_NODES_IN; -+ -+ const int l = threadIdx.x; // The output column index of each B^T D B operation - // such that we have (Bout^T)_ij D_jk Bin_kl = C_il - -- // Strides for final output ordering, determined by the reference (interface) implementation of the symbolic assembly, slowest --> fastest: element, -+ // Strides for final output ordering, determined by the reference (interface) implementation of the symbolic assembly, slowest --> fastest: e, - // comp_in, comp_out, node_row, node_col -- const IndexType comp_out_stride = NNODES * NNODES; -- const IndexType comp_in_stride = comp_out_stride * NCOMP; -- const IndexType e_stride = comp_in_stride * NCOMP; -- // Strides for QF array, slowest --> fastest: emode_in, comp_in, emode_out, comp_out, elem, qpt -- const IndexType qe_stride = NQPTS; -- const IndexType qcomp_out_stride = NELEM * qe_stride; -- const IndexType qemode_out_stride = qcomp_out_stride * NCOMP; -- const IndexType qcomp_in_stride = qemode_out_stride * NUMEMODEOUT; -- const IndexType qemode_in_stride = qcomp_in_stride * NCOMP; -+ const IndexType comp_out_stride = NUM_NODES_OUT * NUM_NODES_IN; -+ const IndexType comp_in_stride = comp_out_stride * NUM_COMP_OUT; -+ const IndexType e_stride = comp_in_stride * NUM_COMP_IN; -+ -+ // Strides for QF array, slowest --> fastest: e_in, comp_in, e_out, comp_out, e, q -+ const IndexType q_e_stride = NUM_QPTS; -+ const IndexType q_comp_out_stride = num_elem * q_e_stride; -+ const IndexType q_eval_mode_out_stride = q_comp_out_stride * NUM_COMP_OUT; -+ const IndexType q_comp_in_stride = q_eval_mode_out_stride * NUM_EVAL_MODES_OUT; -+ const IndexType q_eval_mode_in_stride = q_comp_in_stride * NUM_COMP_IN; - - // Loop over each element (if necessary) -- for (IndexType e = blockIdx.x * blockDim.z + threadIdx.z; e < NELEM; e += gridDim.x * blockDim.z) { -- for (IndexType comp_in = 0; comp_in < NCOMP; comp_in++) { -- for (IndexType comp_out = 0; comp_out < NCOMP; comp_out++) { -- CeedScalar result = 0.0; -- IndexType qf_index_comp = qcomp_in_stride * comp_in + qcomp_out_stride * comp_out + qe_stride * e; -- for (IndexType emode_in = 0; emode_in < NUMEMODEIN; emode_in++) { -- IndexType b_in_index = emode_in * NQPTS * NNODES; -- for (IndexType emode_out = 0; emode_out < NUMEMODEOUT; emode_out++) { -- IndexType b_out_index = emode_out * NQPTS * NNODES; -- IndexType qf_index = qf_index_comp + qemode_out_stride * emode_out + qemode_in_stride * emode_in; -- // Perform the B^T D B operation for this 'chunk' of D (the qf_array) -- for (IndexType j = 0; j < NQPTS; j++) { -- result += B_out[b_out_index + j * NNODES + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NNODES + l]; -- } -- } // end of emode_out -- } // end of emode_in -- IndexType val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + NNODES * i + l; -- values_array[val_index] = result; -- } // end of out component -- } // end of in component -- } // end of element loop --} -+ for 
(IndexType e = blockIdx.x * blockDim.z + threadIdx.z; e < num_elem; e += gridDim.x * blockDim.z) { -+ for (IndexType comp_in = 0; comp_in < NUM_COMP_IN; comp_in++) { -+ for (IndexType comp_out = 0; comp_out < NUM_COMP_OUT; comp_out++) { -+ for (IndexType i = threadIdx.y; i < NUM_NODES_OUT; i += BLOCK_SIZE_Y) { -+ CeedScalar result = 0.0; -+ IndexType qf_index_comp = q_comp_in_stride * comp_in + q_comp_out_stride * comp_out + q_e_stride * e; - --//------------------------------------------------------------------------------ --// Fallback kernel for larger orders (1D thread block) --//------------------------------------------------------------------------------ --extern "C" __launch_bounds__(BLOCK_SIZE) __global__ -- void linearAssembleFallback(const CeedScalar *B_in, const CeedScalar *B_out, const CeedScalar *__restrict__ qf_array, -- CeedScalar *__restrict__ values_array) { -- // This kernel assumes B_in and B_out have the same number of quadrature points and basis points. -- // TODO: expand to more general cases -- const int l = threadIdx.x; // The output column index of each B^TDB operation -- // such that we have (Bout^T)_ij D_jk Bin_kl = C_il -+ for (IndexType e_in = 0; e_in < NUM_EVAL_MODES_IN; e_in++) { -+ IndexType b_in_index = e_in * NUM_QPTS * NUM_NODES_IN; - -- // Strides for final output ordering, determined by the reference (interface) implementation of the symbolic assembly, slowest --> fastest: element, -- // comp_in, comp_out, node_row, node_col -- const IndexType comp_out_stride = NNODES * NNODES; -- const IndexType comp_in_stride = comp_out_stride * NCOMP; -- const IndexType e_stride = comp_in_stride * NCOMP; -- // Strides for QF array, slowest --> fastest: emode_in, comp_in, emode_out, comp_out, elem, qpt -- const IndexType qe_stride = NQPTS; -- const IndexType qcomp_out_stride = NELEM * qe_stride; -- const IndexType qemode_out_stride = qcomp_out_stride * NCOMP; -- const IndexType qcomp_in_stride = qemode_out_stride * NUMEMODEOUT; -- const IndexType qemode_in_stride = qcomp_in_stride * NCOMP; -+ for (IndexType e_out = 0; e_out < NUM_EVAL_MODES_OUT; e_out++) { -+ IndexType b_out_index = e_out * NUM_QPTS * NUM_NODES_OUT; -+ IndexType qf_index = qf_index_comp + q_eval_mode_out_stride * e_out + q_eval_mode_in_stride * e_in; - -- // Loop over each element (if necessary) -- for (IndexType e = blockIdx.x * blockDim.z + threadIdx.z; e < NELEM; e += gridDim.x * blockDim.z) { -- for (IndexType comp_in = 0; comp_in < NCOMP; comp_in++) { -- for (IndexType comp_out = 0; comp_out < NCOMP; comp_out++) { -- for (IndexType i = 0; i < NNODES; i++) { -- CeedScalar result = 0.0; -- IndexType qf_index_comp = qcomp_in_stride * comp_in + qcomp_out_stride * comp_out + qe_stride * e; -- for (IndexType emode_in = 0; emode_in < NUMEMODEIN; emode_in++) { -- IndexType b_in_index = emode_in * NQPTS * NNODES; -- for (IndexType emode_out = 0; emode_out < NUMEMODEOUT; emode_out++) { -- IndexType b_out_index = emode_out * NQPTS * NNODES; -- IndexType qf_index = qf_index_comp + qemode_out_stride * emode_out + qemode_in_stride * emode_in; - // Perform the B^T D B operation for this 'chunk' of D (the qf_array) -- for (IndexType j = 0; j < NQPTS; j++) { -- result += B_out[b_out_index + j * NNODES + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NNODES + l]; -+ for (IndexType j = 0; j < NUM_QPTS; j++) { -+ result += B_out[b_out_index + j * NUM_NODES_OUT + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NUM_NODES_IN + l]; - } -- } // end of emode_out -- } // end of emode_in -- IndexType val_index = 
comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + NNODES * i + l; -- values_array[val_index] = result; -+ } // end of out eval mode -+ } // end of in eval mode -+ if (orients_in) { -+ result *= orients_in[NUM_NODES_IN * e + l] ? -1.0 : 1.0; -+ } -+ if (orients_out) { -+ result *= orients_out[NUM_NODES_OUT * e + i] ? -1.0 : 1.0; -+ } -+ if (!curl_orients_in && !curl_orients_out) { -+ IndexType val_index = e_stride * e + comp_in_stride * comp_in + comp_out_stride * comp_out + NUM_NODES_IN * i + l; -+ -+ values_array[val_index] = result; -+ } else if (curl_orients_in) { -+ s_C[NUM_NODES_IN * threadIdx.y + l] = result; -+ __syncthreads(); -+ s_CT[NUM_NODES_IN * i + l] = -+ (l > 0 ? s_C[NUM_NODES_IN * threadIdx.y + l - 1] * curl_orients_in[3 * NUM_NODES_IN * e + 3 * l - 1] : 0.0) + -+ s_C[NUM_NODES_IN * threadIdx.y + l] * curl_orients_in[3 * NUM_NODES_IN * e + 3 * l + 1] + -+ (l < (NUM_NODES_IN - 1) ? s_C[NUM_NODES_IN * threadIdx.y + l + 1] * curl_orients_in[3 * NUM_NODES_IN * e + 3 * l + 3] : 0.0); -+ } else { -+ s_CT[NUM_NODES_IN * i + l] = result; -+ } - } // end of loop over element node index, i -- } // end of out component -- } // end of in component -- } // end of element loop -+ if (curl_orients_in || curl_orients_out) { -+ // Compute and store the final T^T (B^T D B T) using the fully computed C T product in shared memory -+ if (curl_orients_out) __syncthreads(); -+ for (IndexType i = threadIdx.y; i < NUM_NODES_OUT; i += BLOCK_SIZE_Y) { -+ IndexType val_index = e_stride * e + comp_in_stride * comp_in + comp_out_stride * comp_out + NUM_NODES_IN * i + l; -+ -+ if (curl_orients_out) { -+ values_array[val_index] = -+ (i > 0 ? s_CT[NUM_NODES_IN * (i - 1) + l] * curl_orients_out[3 * NUM_NODES_OUT * e + 3 * i - 1] : 0.0) + -+ s_CT[NUM_NODES_IN * i + l] * curl_orients_out[3 * NUM_NODES_OUT * e + 3 * i + 1] + -+ (i < (NUM_NODES_OUT - 1) ? 
s_CT[NUM_NODES_IN * (i + 1) + l] * curl_orients_out[3 * NUM_NODES_OUT * e + 3 * i + 3] : 0.0); -+ } else { -+ values_array[val_index] = s_CT[NUM_NODES_IN * i + l]; -+ } -+ } -+ } -+ } // end of out component -+ } // end of in component -+ } // end of element loop - } - - //------------------------------------------------------------------------------ -diff --git a/include/ceed/jit-source/hip/hip-ref-restriction.h b/include/ceed/jit-source/hip/hip-ref-restriction.h -index c34aa980..e1de6d87 100644 ---- a/include/ceed/jit-source/hip/hip-ref-restriction.h -+++ b/include/ceed/jit-source/hip/hip-ref-restriction.h -@@ -16,86 +16,314 @@ - // L-vector -> E-vector, strided - //------------------------------------------------------------------------------ - extern "C" __global__ void StridedNoTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { -- for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RESTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { -- const CeedInt loc_node = node % RESTR_ELEM_SIZE; -- const CeedInt elem = node / RESTR_ELEM_SIZE; -+ for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { -+ const CeedInt loc_node = node % RSTR_ELEM_SIZE; -+ const CeedInt elem = node / RSTR_ELEM_SIZE; - -- for (CeedInt comp = 0; comp < RESTR_NUM_COMP; comp++) { -- v[loc_node + comp * RESTR_ELEM_SIZE * RESTR_NUM_ELEM + elem * RESTR_ELEM_SIZE] = -- u[loc_node * RESTR_STRIDE_NODES + comp * RESTR_STRIDE_COMP + elem * RESTR_STRIDE_ELEM]; -+ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { -+ v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = -+ u[loc_node * RSTR_STRIDE_NODES + comp * RSTR_STRIDE_COMP + elem * RSTR_STRIDE_ELEM]; - } - } - } - - //------------------------------------------------------------------------------ --// E-vector -> L-vector, strided -+// L-vector -> E-vector, standard (with offsets) - //------------------------------------------------------------------------------ --extern "C" __global__ void StridedTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { -- for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RESTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { -- const CeedInt loc_node = node % RESTR_ELEM_SIZE; -- const CeedInt elem = node / RESTR_ELEM_SIZE; -+extern "C" __global__ void OffsetNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ u, -+ CeedScalar *__restrict__ v) { -+ for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { -+ const CeedInt ind = indices[node]; -+ const CeedInt loc_node = node % RSTR_ELEM_SIZE; -+ const CeedInt elem = node / RSTR_ELEM_SIZE; - -- for (CeedInt comp = 0; comp < RESTR_NUM_COMP; comp++) { -- v[loc_node * RESTR_STRIDE_NODES + comp * RESTR_STRIDE_COMP + elem * RESTR_STRIDE_ELEM] += -- u[loc_node + comp * RESTR_ELEM_SIZE * RESTR_NUM_ELEM + elem * RESTR_ELEM_SIZE]; -+ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { -+ v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = u[ind + comp * RSTR_COMP_STRIDE]; - } - } - } - - //------------------------------------------------------------------------------ --// L-vector -> E-vector, offsets provided -+// L-vector -> E-vector, oriented - 
//------------------------------------------------------------------------------ --extern "C" __global__ void OffsetNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ u, -- CeedScalar *__restrict__ v) { -- for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RESTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { -+extern "C" __global__ void OrientedNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const bool *__restrict__ orients, -+ const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { -+ for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt ind = indices[node]; -- const CeedInt loc_node = node % RESTR_ELEM_SIZE; -- const CeedInt elem = node / RESTR_ELEM_SIZE; -+ const bool orient = orients[node]; -+ const CeedInt loc_node = node % RSTR_ELEM_SIZE; -+ const CeedInt elem = node / RSTR_ELEM_SIZE; -+ -+ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { -+ v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = u[ind + comp * RSTR_COMP_STRIDE] * (orient ? -1.0 : 1.0); -+ } -+ } -+} -+ -+//------------------------------------------------------------------------------ -+// L-vector -> E-vector, curl-oriented -+//------------------------------------------------------------------------------ -+extern "C" __global__ void CurlOrientedNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, -+ const CeedInt8 *__restrict__ curl_orients, const CeedScalar *__restrict__ u, -+ CeedScalar *__restrict__ v) { -+ for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { -+ const CeedInt loc_node = node % RSTR_ELEM_SIZE; -+ const CeedInt elem = node / RSTR_ELEM_SIZE; -+ const CeedInt ind_dl = loc_node > 0 ? indices[node - 1] : 0; -+ const CeedInt ind_d = indices[node]; -+ const CeedInt ind_du = loc_node < (RSTR_ELEM_SIZE - 1) ? indices[node + 1] : 0; -+ const CeedInt8 curl_orient_dl = curl_orients[3 * node + 0]; -+ const CeedInt8 curl_orient_d = curl_orients[3 * node + 1]; -+ const CeedInt8 curl_orient_du = curl_orients[3 * node + 2]; -+ -+ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { -+ CeedScalar value = 0.0; -+ value += loc_node > 0 ? u[ind_dl + comp * RSTR_COMP_STRIDE] * curl_orient_dl : 0.0; -+ value += u[ind_d + comp * RSTR_COMP_STRIDE] * curl_orient_d; -+ value += loc_node < (RSTR_ELEM_SIZE - 1) ? u[ind_du + comp * RSTR_COMP_STRIDE] * curl_orient_du : 0.0; -+ v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = value; -+ } -+ } -+} -+ -+//------------------------------------------------------------------------------ -+// L-vector -> E-vector, unsigned curl-oriented -+//------------------------------------------------------------------------------ -+extern "C" __global__ void CurlOrientedUnsignedNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, -+ const CeedInt8 *__restrict__ curl_orients, const CeedScalar *__restrict__ u, -+ CeedScalar *__restrict__ v) { -+ for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { -+ const CeedInt loc_node = node % RSTR_ELEM_SIZE; -+ const CeedInt elem = node / RSTR_ELEM_SIZE; -+ const CeedInt ind_dl = loc_node > 0 ? indices[node - 1] : 0; -+ const CeedInt ind_d = indices[node]; -+ const CeedInt ind_du = loc_node < (RSTR_ELEM_SIZE - 1) ? 
indices[node + 1] : 0; -+ const CeedInt8 curl_orient_dl = abs(curl_orients[3 * node + 0]); -+ const CeedInt8 curl_orient_d = abs(curl_orients[3 * node + 1]); -+ const CeedInt8 curl_orient_du = abs(curl_orients[3 * node + 2]); -+ -+ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { -+ CeedScalar value = 0.0; -+ value += loc_node > 0 ? u[ind_dl + comp * RSTR_COMP_STRIDE] * curl_orient_dl : 0.0; -+ value += u[ind_d + comp * RSTR_COMP_STRIDE] * curl_orient_d; -+ value += loc_node < (RSTR_ELEM_SIZE - 1) ? u[ind_du + comp * RSTR_COMP_STRIDE] * curl_orient_du : 0.0; -+ v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = value; -+ } -+ } -+} -+ -+//------------------------------------------------------------------------------ -+// E-vector -> L-vector, strided -+//------------------------------------------------------------------------------ -+extern "C" __global__ void StridedTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { -+ for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { -+ const CeedInt loc_node = node % RSTR_ELEM_SIZE; -+ const CeedInt elem = node / RSTR_ELEM_SIZE; - -- for (CeedInt comp = 0; comp < RESTR_NUM_COMP; comp++) { -- v[loc_node + comp * RESTR_ELEM_SIZE * RESTR_NUM_ELEM + elem * RESTR_ELEM_SIZE] = u[ind + comp * RESTR_COMP_STRIDE]; -+ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { -+ v[loc_node * RSTR_STRIDE_NODES + comp * RSTR_STRIDE_COMP + elem * RSTR_STRIDE_ELEM] += -+ u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]; - } - } - } - - //------------------------------------------------------------------------------ --// E-vector -> L-vector, offsets provided -+// E-vector -> L-vector, standard (with offsets) - //------------------------------------------------------------------------------ - extern "C" __global__ void OffsetTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ u, - CeedScalar *__restrict__ v) { -- for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RESTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { -+ for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { - const CeedInt ind = indices[node]; -- const CeedInt loc_node = node % RESTR_ELEM_SIZE; -- const CeedInt elem = node / RESTR_ELEM_SIZE; -+ const CeedInt loc_node = node % RSTR_ELEM_SIZE; -+ const CeedInt elem = node / RSTR_ELEM_SIZE; - -- for (CeedInt comp = 0; comp < RESTR_NUM_COMP; comp++) { -- atomicAdd(v + ind + comp * RESTR_COMP_STRIDE, u[loc_node + comp * RESTR_ELEM_SIZE * RESTR_NUM_ELEM + elem * RESTR_ELEM_SIZE]); -+ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { -+ atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]); - } - } - } - - extern "C" __global__ void OffsetTransposeDet(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, - const CeedInt *__restrict__ t_offsets, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { -- CeedScalar value[RESTR_NUM_COMP]; -+ CeedScalar value[RSTR_NUM_COMP]; - -- for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RESTR_NUM_NODES; i += blockDim.x * gridDim.x) { -+ for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { - const CeedInt ind = l_vec_indices[i]; - 
const CeedInt range_1 = t_offsets[i]; - const CeedInt range_N = t_offsets[i + 1]; - -- for (CeedInt comp = 0; comp < RESTR_NUM_COMP; comp++) value[comp] = 0.0; -+ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; - - for (CeedInt j = range_1; j < range_N; j++) { - const CeedInt t_ind = t_indices[j]; -- CeedInt loc_node = t_ind % RESTR_ELEM_SIZE; -- CeedInt elem = t_ind / RESTR_ELEM_SIZE; -+ const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; -+ const CeedInt elem = t_ind / RSTR_ELEM_SIZE; -+ -+ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { -+ value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]; -+ } -+ } -+ -+ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; -+ } -+} -+ -+//------------------------------------------------------------------------------ -+// E-vector -> L-vector, oriented -+//------------------------------------------------------------------------------ -+extern "C" __global__ void OrientedTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const bool *__restrict__ orients, -+ const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { -+ for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { -+ const CeedInt ind = indices[node]; -+ const bool orient = orients[node]; -+ const CeedInt loc_node = node % RSTR_ELEM_SIZE; -+ const CeedInt elem = node / RSTR_ELEM_SIZE; -+ -+ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { -+ atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, -+ u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * (orient ? -1.0 : 1.0)); -+ } -+ } -+} -+ -+extern "C" __global__ void OrientedTransposeDet(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, -+ const CeedInt *__restrict__ t_offsets, const bool *__restrict__ orients, -+ const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { -+ CeedScalar value[RSTR_NUM_COMP]; -+ -+ for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { -+ const CeedInt ind = l_vec_indices[i]; -+ const CeedInt range_1 = t_offsets[i]; -+ const CeedInt range_N = t_offsets[i + 1]; -+ -+ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; -+ -+ for (CeedInt j = range_1; j < range_N; j++) { -+ const CeedInt t_ind = t_indices[j]; -+ const bool orient = orients[t_ind]; -+ const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; -+ const CeedInt elem = t_ind / RSTR_ELEM_SIZE; -+ -+ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { -+ value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * (orient ? 
-1.0 : 1.0); -+ } -+ } -+ -+ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; -+ } -+} -+ -+//------------------------------------------------------------------------------ -+// E-vector -> L-vector, curl-oriented -+//------------------------------------------------------------------------------ -+extern "C" __global__ void CurlOrientedTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, -+ const CeedInt8 *__restrict__ curl_orients, const CeedScalar *__restrict__ u, -+ CeedScalar *__restrict__ v) { -+ for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { -+ const CeedInt ind = indices[node]; -+ const CeedInt loc_node = node % RSTR_ELEM_SIZE; -+ const CeedInt elem = node / RSTR_ELEM_SIZE; -+ const CeedInt8 curl_orient_du = loc_node > 0 ? curl_orients[3 * node - 1] : 0.0; -+ const CeedInt8 curl_orient_d = curl_orients[3 * node + 1]; -+ const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? curl_orients[3 * node + 3] : 0.0; -+ -+ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { -+ CeedScalar value = 0.0; -+ value += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; -+ value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; -+ value += -+ loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; -+ atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value); -+ } -+ } -+} -+ -+extern "C" __global__ void CurlOrientedTransposeDet(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, -+ const CeedInt *__restrict__ t_offsets, const CeedInt8 *__restrict__ curl_orients, -+ const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { -+ CeedScalar value[RSTR_NUM_COMP]; -+ -+ for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { -+ const CeedInt ind = l_vec_indices[i]; -+ const CeedInt range_1 = t_offsets[i]; -+ const CeedInt range_N = t_offsets[i + 1]; -+ -+ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; -+ -+ for (CeedInt j = range_1; j < range_N; j++) { -+ const CeedInt t_ind = t_indices[j]; -+ const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; -+ const CeedInt elem = t_ind / RSTR_ELEM_SIZE; -+ const CeedInt8 curl_orient_du = loc_node > 0 ? curl_orients[3 * t_ind - 1] : 0.0; -+ const CeedInt8 curl_orient_d = curl_orients[3 * t_ind + 1]; -+ const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? curl_orients[3 * t_ind + 3] : 0.0; -+ -+ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { -+ value[comp] += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; -+ value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; -+ value[comp] += -+ loc_node < (RSTR_ELEM_SIZE - 1) ? 
u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; -+ } -+ } -+ -+ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; -+ } -+} -+ -+//------------------------------------------------------------------------------ -+// E-vector -> L-vector, unsigned curl-oriented -+//------------------------------------------------------------------------------ -+extern "C" __global__ void CurlOrientedUnsignedTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, -+ const CeedInt8 *__restrict__ curl_orients, const CeedScalar *__restrict__ u, -+ CeedScalar *__restrict__ v) { -+ for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { -+ const CeedInt loc_node = node % RSTR_ELEM_SIZE; -+ const CeedInt elem = node / RSTR_ELEM_SIZE; -+ const CeedInt ind = indices[node]; -+ const CeedInt8 curl_orient_du = loc_node > 0 ? abs(curl_orients[3 * node - 1]) : 0.0; -+ const CeedInt8 curl_orient_d = abs(curl_orients[3 * node + 1]); -+ const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? abs(curl_orients[3 * node + 3]) : 0.0; -+ -+ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { -+ CeedScalar value = 0.0; -+ value += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; -+ value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; -+ value += -+ loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; -+ atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value); -+ } -+ } -+} -+ -+extern "C" __global__ void CurlOrientedUnsignedTransposeDet(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, -+ const CeedInt *__restrict__ t_offsets, const CeedInt8 *__restrict__ curl_orients, -+ const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { -+ CeedScalar value[RSTR_NUM_COMP]; -+ -+ for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { -+ const CeedInt ind = l_vec_indices[i]; -+ const CeedInt range_1 = t_offsets[i]; -+ const CeedInt range_N = t_offsets[i + 1]; -+ -+ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; -+ -+ for (CeedInt j = range_1; j < range_N; j++) { -+ const CeedInt t_ind = t_indices[j]; -+ const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; -+ const CeedInt elem = t_ind / RSTR_ELEM_SIZE; -+ const CeedInt8 curl_orient_du = loc_node > 0 ? abs(curl_orients[3 * t_ind - 1]) : 0.0; -+ const CeedInt8 curl_orient_d = abs(curl_orients[3 * t_ind + 1]); -+ const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? abs(curl_orients[3 * t_ind + 3]) : 0.0; - -- for (CeedInt comp = 0; comp < RESTR_NUM_COMP; comp++) { -- value[comp] += u[loc_node + comp * RESTR_ELEM_SIZE * RESTR_NUM_ELEM + elem * RESTR_ELEM_SIZE]; -+ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { -+ value[comp] += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; -+ value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; -+ value[comp] += -+ loc_node < (RSTR_ELEM_SIZE - 1) ? 
u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; - } - } - -- for (CeedInt comp = 0; comp < RESTR_NUM_COMP; comp++) v[ind + comp * RESTR_COMP_STRIDE] += value[comp]; -+ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; - } - } - -diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c -index 909d9d51..129f5017 100644 ---- a/interface/ceed-operator.c -+++ b/interface/ceed-operator.c -@@ -26,41 +26,41 @@ - - @param[in] ceed Ceed object for error handling - @param[in] qf_field QFunction Field matching Operator Field -- @param[in] r Operator Field ElemRestriction -- @param[in] b Operator Field Basis -+ @param[in] rstr Operator Field ElemRestriction -+ @param[in] basis Operator Field Basis - - @return An error code: 0 - success, otherwise - failure - - @ref Developer - **/ --static int CeedOperatorCheckField(Ceed ceed, CeedQFunctionField qf_field, CeedElemRestriction r, CeedBasis b) { -+static int CeedOperatorCheckField(Ceed ceed, CeedQFunctionField qf_field, CeedElemRestriction rstr, CeedBasis basis) { - CeedInt dim = 1, num_comp = 1, q_comp = 1, rstr_num_comp = 1, size = qf_field->size; - CeedEvalMode eval_mode = qf_field->eval_mode; - - // Restriction -- CeedCheck((r == CEED_ELEMRESTRICTION_NONE) == (eval_mode == CEED_EVAL_WEIGHT), ceed, CEED_ERROR_INCOMPATIBLE, -+ CeedCheck((rstr == CEED_ELEMRESTRICTION_NONE) == (eval_mode == CEED_EVAL_WEIGHT), ceed, CEED_ERROR_INCOMPATIBLE, - "CEED_ELEMRESTRICTION_NONE and CEED_EVAL_WEIGHT must be used together."); -- if (r != CEED_ELEMRESTRICTION_NONE) { -- CeedCall(CeedElemRestrictionGetNumComponents(r, &rstr_num_comp)); -+ if (rstr != CEED_ELEMRESTRICTION_NONE) { -+ CeedCall(CeedElemRestrictionGetNumComponents(rstr, &rstr_num_comp)); - } - // Basis -- CeedCheck((b == CEED_BASIS_NONE) == (eval_mode == CEED_EVAL_NONE), ceed, CEED_ERROR_INCOMPATIBLE, -+ CeedCheck((basis == CEED_BASIS_NONE) == (eval_mode == CEED_EVAL_NONE), ceed, CEED_ERROR_INCOMPATIBLE, - "CEED_BASIS_NONE and CEED_EVAL_NONE must be used together."); -- if (b != CEED_BASIS_NONE) { -- CeedCall(CeedBasisGetDimension(b, &dim)); -- CeedCall(CeedBasisGetNumComponents(b, &num_comp)); -- CeedCall(CeedBasisGetNumQuadratureComponents(b, eval_mode, &q_comp)); -- CeedCheck(r == CEED_ELEMRESTRICTION_NONE || rstr_num_comp == num_comp, ceed, CEED_ERROR_DIMENSION, -+ if (basis != CEED_BASIS_NONE) { -+ CeedCall(CeedBasisGetDimension(basis, &dim)); -+ CeedCall(CeedBasisGetNumComponents(basis, &num_comp)); -+ CeedCall(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); -+ CeedCheck(rstr == CEED_ELEMRESTRICTION_NONE || rstr_num_comp == num_comp, ceed, CEED_ERROR_DIMENSION, - "Field '%s' of size %" CeedInt_FMT " and EvalMode %s: ElemRestriction has %" CeedInt_FMT " components, but Basis has %" CeedInt_FMT - " components", -- qf_field->field_name, qf_field->size, CeedEvalModes[qf_field->eval_mode], rstr_num_comp, num_comp); -+ qf_field->field_name, size, CeedEvalModes[eval_mode], rstr_num_comp, num_comp); - } - // Field size - switch (eval_mode) { - case CEED_EVAL_NONE: - CeedCheck(size == rstr_num_comp, ceed, CEED_ERROR_DIMENSION, -- "Field '%s' of size %" CeedInt_FMT " and EvalMode %s: ElemRestriction has %" CeedInt_FMT " components", qf_field->field_name, -- qf_field->size, CeedEvalModes[qf_field->eval_mode], rstr_num_comp); -+ "Field '%s' of size %" CeedInt_FMT " and EvalMode %s: ElemRestriction has %" CeedInt_FMT " components", qf_field->field_name, size, -+ 
CeedEvalModes[eval_mode], rstr_num_comp); - break; - case CEED_EVAL_INTERP: - case CEED_EVAL_GRAD: -@@ -68,7 +68,7 @@ static int CeedOperatorCheckField(Ceed ceed, CeedQFunctionField qf_field, CeedEl - case CEED_EVAL_CURL: - CeedCheck(size == num_comp * q_comp, ceed, CEED_ERROR_DIMENSION, - "Field '%s' of size %" CeedInt_FMT " and EvalMode %s: ElemRestriction/Basis has %" CeedInt_FMT " components", qf_field->field_name, -- qf_field->size, CeedEvalModes[qf_field->eval_mode], num_comp * q_comp); -+ size, CeedEvalModes[eval_mode], num_comp * q_comp); - break; - case CEED_EVAL_WEIGHT: - // No additional checks required -@@ -672,12 +672,12 @@ int CeedOperatorReferenceCopy(CeedOperator op, CeedOperator *op_copy) { - There can be at most one active input CeedVector and at most one active output CeedVector passed to CeedOperatorApply(). - - The number of quadrature points must agree across all points. -- When using @ref CEED_BASIS_NONE, the number of quadrature points is determined by the element size of r. -+ When using @ref CEED_BASIS_NONE, the number of quadrature points is determined by the element size of rstr. - - @param[in,out] op CeedOperator on which to provide the field - @param[in] field_name Name of the field (to be matched with the name used by CeedQFunction) -- @param[in] r CeedElemRestriction -- @param[in] b CeedBasis in which the field resides or @ref CEED_BASIS_NONE if collocated with quadrature points -+ @param[in] rstr CeedElemRestriction -+ @param[in] basis CeedBasis in which the field resides or @ref CEED_BASIS_NONE if collocated with quadrature points - @param[in] v CeedVector to be used by CeedOperator or @ref CEED_VECTOR_ACTIVE if field is active or @ref CEED_VECTOR_NONE - if using @ref CEED_EVAL_WEIGHT in the QFunction - -@@ -685,35 +685,31 @@ int CeedOperatorReferenceCopy(CeedOperator op, CeedOperator *op_copy) { - - @ref User - **/ --int CeedOperatorSetField(CeedOperator op, const char *field_name, CeedElemRestriction r, CeedBasis b, CeedVector v) { -- bool is_input = true; -- CeedInt num_elem = 0, num_qpts = 0; -- CeedQFunctionField qf_field; -- CeedOperatorField *op_field; -+int CeedOperatorSetField(CeedOperator op, const char *field_name, CeedElemRestriction rstr, CeedBasis basis, CeedVector v) { -+ bool is_input = true; -+ CeedInt num_elem = 0, num_qpts = 0; -+ CeedRestrictionType rstr_type; -+ CeedQFunctionField qf_field; -+ CeedOperatorField *op_field; - - CeedCheck(!op->is_composite, op->ceed, CEED_ERROR_INCOMPATIBLE, "Cannot add field to composite operator."); - CeedCheck(!op->is_immutable, op->ceed, CEED_ERROR_MAJOR, "Operator cannot be changed after set as immutable"); -- CeedCheck(r, op->ceed, CEED_ERROR_INCOMPATIBLE, "ElemRestriction r for field \"%s\" must be non-NULL.", field_name); -- CeedCheck(b, op->ceed, CEED_ERROR_INCOMPATIBLE, "Basis b for field \"%s\" must be non-NULL.", field_name); -- CeedCheck(v, op->ceed, CEED_ERROR_INCOMPATIBLE, "Vector v for field \"%s\" must be non-NULL.", field_name); -+ CeedCheck(rstr, op->ceed, CEED_ERROR_INCOMPATIBLE, "ElemRestriction for field \"%s\" must be non-NULL.", field_name); -+ CeedCheck(basis, op->ceed, CEED_ERROR_INCOMPATIBLE, "Basis for field \"%s\" must be non-NULL.", field_name); -+ CeedCheck(v, op->ceed, CEED_ERROR_INCOMPATIBLE, "Vector for field \"%s\" must be non-NULL.", field_name); - -- CeedCall(CeedElemRestrictionGetNumElements(r, &num_elem)); -- CeedCheck(r == CEED_ELEMRESTRICTION_NONE || !op->has_restriction || op->num_elem == num_elem, op->ceed, CEED_ERROR_DIMENSION, -+ 
CeedCall(CeedElemRestrictionGetNumElements(rstr, &num_elem)); -+ CeedCheck(rstr == CEED_ELEMRESTRICTION_NONE || !op->has_restriction || op->num_elem == num_elem, op->ceed, CEED_ERROR_DIMENSION, - "ElemRestriction with %" CeedInt_FMT " elements incompatible with prior %" CeedInt_FMT " elements", num_elem, op->num_elem); -- { -- CeedRestrictionType rstr_type; -- -- CeedCall(CeedElemRestrictionGetType(r, &rstr_type)); -- CeedCheck(rstr_type != CEED_RESTRICTION_POINTS, op->ceed, CEED_ERROR_UNSUPPORTED, -- "CeedElemRestrictionAtPoints not supported for standard operator fields"); -- } -- -- if (b == CEED_BASIS_NONE) CeedCall(CeedElemRestrictionGetElementSize(r, &num_qpts)); -- else CeedCall(CeedBasisGetNumQuadraturePoints(b, &num_qpts)); -+ CeedCall(CeedElemRestrictionGetType(rstr, &rstr_type)); -+ CeedCheck(rstr_type != CEED_RESTRICTION_POINTS, op->ceed, CEED_ERROR_UNSUPPORTED, -+ "CeedElemRestrictionAtPoints not supported for standard operator fields"); -+ if (basis == CEED_BASIS_NONE) CeedCall(CeedElemRestrictionGetElementSize(rstr, &num_qpts)); -+ else CeedCall(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); - CeedCheck(op->num_qpts == 0 || op->num_qpts == num_qpts, op->ceed, CEED_ERROR_DIMENSION, - "%s must correspond to the same number of quadrature points as previously added Bases. Found %" CeedInt_FMT - " quadrature points but expected %" CeedInt_FMT " quadrature points.", -- b == CEED_BASIS_NONE ? "ElemRestriction" : "Basis", num_qpts, op->num_qpts); -+ basis == CEED_BASIS_NONE ? "ElemRestriction" : "Basis", num_qpts, op->num_qpts); - for (CeedInt i = 0; i < op->qf->num_input_fields; i++) { - if (!strcmp(field_name, (*op->qf->input_fields[i]).field_name)) { - qf_field = op->qf->input_fields[i]; -@@ -733,13 +729,13 @@ int CeedOperatorSetField(CeedOperator op, const char *field_name, CeedElemRestri - return CeedError(op->ceed, CEED_ERROR_INCOMPLETE, "QFunction has no knowledge of field '%s'", field_name); - // LCOV_EXCL_STOP - found: -- CeedCall(CeedOperatorCheckField(op->ceed, qf_field, r, b)); -+ CeedCall(CeedOperatorCheckField(op->ceed, qf_field, rstr, basis)); - CeedCall(CeedCalloc(1, op_field)); - - if (v == CEED_VECTOR_ACTIVE) { - CeedSize l_size; - -- CeedCall(CeedElemRestrictionGetLVectorSize(r, &l_size)); -+ CeedCall(CeedElemRestrictionGetLVectorSize(rstr, &l_size)); - if (is_input) { - if (op->input_size == -1) op->input_size = l_size; - CeedCheck(l_size == op->input_size, op->ceed, CEED_ERROR_INCOMPATIBLE, "LVector size %td does not match previous size %td", l_size, -@@ -752,12 +748,12 @@ found: - } - - CeedCall(CeedVectorReferenceCopy(v, &(*op_field)->vec)); -- CeedCall(CeedElemRestrictionReferenceCopy(r, &(*op_field)->elem_rstr)); -- if (r != CEED_ELEMRESTRICTION_NONE && !op->has_restriction) { -+ CeedCall(CeedElemRestrictionReferenceCopy(rstr, &(*op_field)->elem_rstr)); -+ if (rstr != CEED_ELEMRESTRICTION_NONE && !op->has_restriction) { - op->num_elem = num_elem; - op->has_restriction = true; // Restriction set, but num_elem may be 0 - } -- CeedCall(CeedBasisReferenceCopy(b, &(*op_field)->basis)); -+ CeedCall(CeedBasisReferenceCopy(basis, &(*op_field)->basis)); - if (op->num_qpts == 0) op->num_qpts = num_qpts; - op->num_fields += 1; - CeedCall(CeedStringAllocCopy(field_name, (char **)&(*op_field)->field_name)); -diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c -index 5a09a3b8..549c2432 100644 ---- a/interface/ceed-preconditioning.c -+++ b/interface/ceed-preconditioning.c -@@ -139,40 +139,6 @@ static int 
CeedOperatorCreateFallback(CeedOperator op) { - return CEED_ERROR_SUCCESS; - } - --/** -- @brief Select correct basis matrix pointer based on CeedEvalMode -- -- @param[in] basis CeedBasis from which to get the basis matrix -- @param[in] eval_mode Current basis evaluation mode -- @param[in] identity Pointer to identity matrix -- @param[out] basis_ptr Basis pointer to set -- -- @ref Developer --**/ --static inline int CeedOperatorGetBasisPointer(CeedBasis basis, CeedEvalMode eval_mode, const CeedScalar *identity, const CeedScalar **basis_ptr) { -- switch (eval_mode) { -- case CEED_EVAL_NONE: -- *basis_ptr = identity; -- break; -- case CEED_EVAL_INTERP: -- CeedCall(CeedBasisGetInterp(basis, basis_ptr)); -- break; -- case CEED_EVAL_GRAD: -- CeedCall(CeedBasisGetGrad(basis, basis_ptr)); -- break; -- case CEED_EVAL_DIV: -- CeedCall(CeedBasisGetDiv(basis, basis_ptr)); -- break; -- case CEED_EVAL_CURL: -- CeedCall(CeedBasisGetCurl(basis, basis_ptr)); -- break; -- case CEED_EVAL_WEIGHT: -- break; // Caught by QF Assembly -- } -- assert(*basis_ptr != NULL); -- return CEED_ERROR_SUCCESS; --} -- - /** - @brief Core logic for assembling operator diagonal or point block diagonal - -@@ -1000,6 +966,40 @@ CeedPragmaOptimizeOn - /// @addtogroup CeedOperatorBackend - /// @{ - -+/** -+ @brief Select correct basis matrix pointer based on CeedEvalMode -+ -+ @param[in] basis CeedBasis from which to get the basis matrix -+ @param[in] eval_mode Current basis evaluation mode -+ @param[in] identity Pointer to identity matrix -+ @param[out] basis_ptr Basis pointer to set -+ -+ @ref Backend -+**/ -+int CeedOperatorGetBasisPointer(CeedBasis basis, CeedEvalMode eval_mode, const CeedScalar *identity, const CeedScalar **basis_ptr) { -+ switch (eval_mode) { -+ case CEED_EVAL_NONE: -+ *basis_ptr = identity; -+ break; -+ case CEED_EVAL_INTERP: -+ CeedCall(CeedBasisGetInterp(basis, basis_ptr)); -+ break; -+ case CEED_EVAL_GRAD: -+ CeedCall(CeedBasisGetGrad(basis, basis_ptr)); -+ break; -+ case CEED_EVAL_DIV: -+ CeedCall(CeedBasisGetDiv(basis, basis_ptr)); -+ break; -+ case CEED_EVAL_CURL: -+ CeedCall(CeedBasisGetCurl(basis, basis_ptr)); -+ break; -+ case CEED_EVAL_WEIGHT: -+ break; // Caught by QF Assembly -+ } -+ assert(*basis_ptr != NULL); -+ return CEED_ERROR_SUCCESS; -+} -+ - /** - @brief Create point block restriction for active operator field - -@@ -1277,10 +1277,10 @@ int CeedOperatorAssemblyDataCreate(Ceed ceed, CeedOperator op, CeedOperatorAssem - - // Build OperatorAssembly data - CeedCall(CeedOperatorGetQFunction(op, &qf)); -- CeedCall(CeedQFunctionGetFields(qf, &num_input_fields, &qf_fields, NULL, NULL)); -- CeedCall(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL)); - - // Determine active input basis -+ CeedCall(CeedQFunctionGetFields(qf, &num_input_fields, &qf_fields, NULL, NULL)); -+ CeedCall(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL)); - for (CeedInt i = 0; i < num_input_fields; i++) { - CeedVector vec; - +diff --git a/backends/cuda-ref/ceed-cuda-ref-operator.c b/backends/cuda-ref/ceed-cuda-ref-operator.c +index f47c52b7..29d3b083 100644 +--- a/backends/cuda-ref/ceed-cuda-ref-operator.c ++++ b/backends/cuda-ref/ceed-cuda-ref-operator.c +@@ -54,15 +54,18 @@ static int CeedOperatorDestroy_Cuda(CeedOperator op) { + + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallCuda(ceed, cuModuleUnload(impl->diag->module)); +- CeedCallBackend(CeedFree(&impl->diag->h_e_mode_in)); +- CeedCallBackend(CeedFree(&impl->diag->h_e_mode_out)); +- CeedCallCuda(ceed, cudaFree(impl->diag->d_e_mode_in)); 
+- CeedCallCuda(ceed, cudaFree(impl->diag->d_e_mode_out)); ++ CeedCallCuda(ceed, cudaFree(impl->diag->d_eval_modes_in)); ++ CeedCallCuda(ceed, cudaFree(impl->diag->d_eval_modes_out)); + CeedCallCuda(ceed, cudaFree(impl->diag->d_identity)); + CeedCallCuda(ceed, cudaFree(impl->diag->d_interp_in)); + CeedCallCuda(ceed, cudaFree(impl->diag->d_interp_out)); + CeedCallCuda(ceed, cudaFree(impl->diag->d_grad_in)); + CeedCallCuda(ceed, cudaFree(impl->diag->d_grad_out)); ++ CeedCallCuda(ceed, cudaFree(impl->diag->d_div_in)); ++ CeedCallCuda(ceed, cudaFree(impl->diag->d_div_out)); ++ CeedCallCuda(ceed, cudaFree(impl->diag->d_curl_in)); ++ CeedCallCuda(ceed, cudaFree(impl->diag->d_curl_out)); ++ CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->diag_rstr)); + CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->point_block_diag_rstr)); + CeedCallBackend(CeedVectorDestroy(&impl->diag->elem_diag)); + CeedCallBackend(CeedVectorDestroy(&impl->diag->point_block_elem_diag)); +@@ -86,17 +89,13 @@ static int CeedOperatorDestroy_Cuda(CeedOperator op) { + //------------------------------------------------------------------------------ + // Setup infields or outfields + //------------------------------------------------------------------------------ +-static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool is_input, CeedVector *e_vecs, CeedVector *q_vecs, CeedInt e_start, ++static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool is_input, CeedVector *e_vecs, CeedVector *q_vecs, CeedInt start_e, + CeedInt num_fields, CeedInt Q, CeedInt num_elem) { + Ceed ceed; +- bool is_strided, skip_restriction; +- CeedSize q_size; +- CeedInt dim, size; + CeedQFunctionField *qf_fields; + CeedOperatorField *op_fields; + + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); +- + if (is_input) { + CeedCallBackend(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); +@@ -107,30 +106,29 @@ static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool + + // Loop over fields + for (CeedInt i = 0; i < num_fields; i++) { +- CeedEvalMode e_mode; ++ bool is_strided = false, skip_restriction = false; ++ CeedSize q_size; ++ CeedInt size; ++ CeedEvalMode eval_mode; + CeedBasis basis; + +- CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &e_mode)); +- +- is_strided = false; +- skip_restriction = false; +- if (e_mode != CEED_EVAL_WEIGHT) { ++ CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); ++ if (eval_mode != CEED_EVAL_WEIGHT) { + CeedElemRestriction elem_rstr; + +- CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr)); +- + // Check whether this field can skip the element restriction: +- // must be passive input, with e_mode NONE, and have a strided restriction with CEED_STRIDES_BACKEND. ++ // Must be passive input, with eval_mode NONE, and have a strided restriction with CEED_STRIDES_BACKEND. 
++ CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr)); + + // First, check whether the field is input or output: + if (is_input) { + CeedVector vec; + +- // Check for passive input: ++ // Check for passive input + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); + if (vec != CEED_VECTOR_ACTIVE) { +- // Check e_mode +- if (e_mode == CEED_EVAL_NONE) { ++ // Check eval_mode ++ if (eval_mode == CEED_EVAL_NONE) { + // Check for strided restriction + CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); + if (is_strided) { +@@ -142,27 +140,23 @@ static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool + } + if (skip_restriction) { + // We do not need an E-Vector, but will use the input field vector's data directly in the operator application. +- e_vecs[i + e_start] = NULL; ++ e_vecs[i + start_e] = NULL; + } else { +- CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs[i + e_start])); ++ CeedCallBackend(CeedElemRestrictionCreateVector(elem_rstr, NULL, &e_vecs[i + start_e])); + } + } + +- switch (e_mode) { ++ switch (eval_mode) { + case CEED_EVAL_NONE: + CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); + q_size = (CeedSize)num_elem * Q * size; + CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); + break; + case CEED_EVAL_INTERP: +- CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); +- q_size = (CeedSize)num_elem * Q * size; +- CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); +- break; + case CEED_EVAL_GRAD: +- CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); ++ case CEED_EVAL_DIV: ++ case CEED_EVAL_CURL: + CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); +- CeedCallBackend(CeedBasisGetDimension(basis, &dim)); + q_size = (CeedSize)num_elem * Q * size; + CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); + break; +@@ -172,10 +166,6 @@ static int CeedOperatorSetupFields_Cuda(CeedQFunction qf, CeedOperator op, bool + CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); + CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i])); + break; +- case CEED_EVAL_DIV: +- break; // TODO: Not implemented +- case CEED_EVAL_CURL: +- break; // TODO: Not implemented + } + } + return CEED_ERROR_SUCCESS; +@@ -206,10 +196,8 @@ static int CeedOperatorSetup_Cuda(CeedOperator op) { + + // Allocate + CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs)); +- + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out)); +- + impl->num_inputs = num_input_fields; + impl->num_outputs = num_output_fields; + +@@ -227,23 +215,25 @@ static int CeedOperatorSetup_Cuda(CeedOperator op) { + // Setup Operator Inputs + //------------------------------------------------------------------------------ + static inline int CeedOperatorSetupInputs_Cuda(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, +- CeedVector in_vec, const bool skip_active_in, CeedScalar *e_data[2 * CEED_FIELD_MAX], ++ CeedVector in_vec, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], + CeedOperator_Cuda *impl, CeedRequest *request) { + for (CeedInt i = 0; i < num_input_fields; i++) { +- CeedEvalMode e_mode; ++ CeedEvalMode eval_mode; + CeedVector vec; + CeedElemRestriction elem_rstr; + + // Get input vector + 
CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + if (vec == CEED_VECTOR_ACTIVE) { +- if (skip_active_in) continue; ++ if (skip_active) continue; + else vec = in_vec; + } + +- CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &e_mode)); +- if (e_mode == CEED_EVAL_WEIGHT) { // Skip ++ CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); ++ if (eval_mode == CEED_EVAL_WEIGHT) { // Skip + } else { ++ // Get input vector ++ CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + // Get input element restriction + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); + if (vec == CEED_VECTOR_ACTIVE) vec = in_vec; +@@ -265,45 +255,40 @@ static inline int CeedOperatorSetupInputs_Cuda(CeedInt num_input_fields, CeedQFu + // Input Basis Action + //------------------------------------------------------------------------------ + static inline int CeedOperatorInputBasis_Cuda(CeedInt num_elem, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, +- CeedInt num_input_fields, const bool skip_active_in, CeedScalar *e_data[2 * CEED_FIELD_MAX], ++ CeedInt num_input_fields, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], + CeedOperator_Cuda *impl) { + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedInt elem_size, size; +- CeedEvalMode e_mode; ++ CeedEvalMode eval_mode; + CeedElemRestriction elem_rstr; + CeedBasis basis; + + // Skip active input +- if (skip_active_in) { ++ if (skip_active) { + CeedVector vec; + + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + if (vec == CEED_VECTOR_ACTIVE) continue; + } +- // Get elem_size, e_mode, size ++ // Get elem_size, eval_mode, size + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); +- CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &e_mode)); ++ CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size)); + // Basis action +- switch (e_mode) { ++ switch (eval_mode) { + case CEED_EVAL_NONE: + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i])); + break; + case CEED_EVAL_INTERP: +- CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); +- CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, impl->e_vecs[i], impl->q_vecs_in[i])); +- break; + case CEED_EVAL_GRAD: ++ case CEED_EVAL_DIV: ++ case CEED_EVAL_CURL: + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); +- CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, impl->e_vecs[i], impl->q_vecs_in[i])); ++ CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, eval_mode, impl->e_vecs[i], impl->q_vecs_in[i])); + break; + case CEED_EVAL_WEIGHT: + break; // No action +- case CEED_EVAL_DIV: +- break; // TODO: Not implemented +- case CEED_EVAL_CURL: +- break; // TODO: Not implemented + } + } + return CEED_ERROR_SUCCESS; +@@ -313,18 +298,18 @@ static inline int CeedOperatorInputBasis_Cuda(CeedInt num_elem, CeedQFunctionFie + // Restore Input Vectors + //------------------------------------------------------------------------------ + static inline int CeedOperatorRestoreInputs_Cuda(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField 
*op_input_fields, +- const bool skip_active_in, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Cuda *impl) { ++ const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Cuda *impl) { + for (CeedInt i = 0; i < num_input_fields; i++) { +- CeedEvalMode e_mode; ++ CeedEvalMode eval_mode; + CeedVector vec; + + // Skip active input +- if (skip_active_in) { ++ if (skip_active) { + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + if (vec == CEED_VECTOR_ACTIVE) continue; + } +- CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &e_mode)); +- if (e_mode == CEED_EVAL_WEIGHT) { // Skip ++ CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); ++ if (eval_mode == CEED_EVAL_WEIGHT) { // Skip + } else { + if (!impl->e_vecs[i]) { // This was a skip_restriction case + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); +@@ -341,13 +326,12 @@ static inline int CeedOperatorRestoreInputs_Cuda(CeedInt num_input_fields, CeedQ + // Apply and add to output + //------------------------------------------------------------------------------ + static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVector out_vec, CeedRequest *request) { +- CeedOperator_Cuda *impl; + CeedInt Q, num_elem, elem_size, num_input_fields, num_output_fields, size; +- CeedEvalMode e_mode; + CeedScalar *e_data[2 * CEED_FIELD_MAX] = {NULL}; +- CeedOperatorField *op_input_fields, *op_output_fields; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction qf; ++ CeedOperatorField *op_input_fields, *op_output_fields; ++ CeedOperator_Cuda *impl; + + CeedCallBackend(CeedOperatorGetData(op, &impl)); + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); +@@ -359,7 +343,7 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec + // Setup + CeedCallBackend(CeedOperatorSetup_Cuda(op)); + +- // Input e_vecs and Restriction ++ // Input Evecs and Restriction + CeedCallBackend(CeedOperatorSetupInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, in_vec, false, e_data, impl, request)); + + // Input basis apply if needed +@@ -367,8 +351,10 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec + + // Output pointers, as necessary + for (CeedInt i = 0; i < num_output_fields; i++) { +- CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &e_mode)); +- if (e_mode == CEED_EVAL_NONE) { ++ CeedEvalMode eval_mode; ++ ++ CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); ++ if (eval_mode == CEED_EVAL_NONE) { + // Set the output Q-Vector to use the E-Vector data directly. 
+ CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[i + impl->num_inputs], CEED_MEM_DEVICE, &e_data[i + num_input_fields])); + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i + num_input_fields])); +@@ -380,49 +366,46 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec + + // Output basis apply if needed + for (CeedInt i = 0; i < num_output_fields; i++) { ++ CeedEvalMode eval_mode; + CeedElemRestriction elem_rstr; + CeedBasis basis; + +- // Get elem_size, e_mode, size ++ // Get elem_size, eval_mode, size + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); +- CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &e_mode)); ++ CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size)); + // Basis action +- switch (e_mode) { ++ switch (eval_mode) { + case CEED_EVAL_NONE: +- break; ++ break; // No action + case CEED_EVAL_INTERP: +- CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); +- CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, CEED_EVAL_INTERP, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs])); +- break; + case CEED_EVAL_GRAD: ++ case CEED_EVAL_DIV: ++ case CEED_EVAL_CURL: + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); +- CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, CEED_EVAL_GRAD, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs])); ++ CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs])); + break; + // LCOV_EXCL_START + case CEED_EVAL_WEIGHT: { + Ceed ceed; ++ + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); +- break; // Should not occur ++ // LCOV_EXCL_STOP + } +- case CEED_EVAL_DIV: +- break; // TODO: Not implemented +- case CEED_EVAL_CURL: +- break; // TODO: Not implemented +- // LCOV_EXCL_STOP + } + } + + // Output restriction + for (CeedInt i = 0; i < num_output_fields; i++) { ++ CeedEvalMode eval_mode; + CeedVector vec; + CeedElemRestriction elem_rstr; + + // Restore evec +- CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &e_mode)); +- if (e_mode == CEED_EVAL_NONE) { ++ CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); ++ if (eval_mode == CEED_EVAL_NONE) { + CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields])); + } + // Get output vector +@@ -441,13 +424,12 @@ static int CeedOperatorApplyAdd_Cuda(CeedOperator op, CeedVector in_vec, CeedVec + } + + //------------------------------------------------------------------------------ +-// Core code for assembling linear QFunction ++// Linear QFunction Assembly Core + //------------------------------------------------------------------------------ + static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op, bool build_objects, CeedVector *assembled, CeedElemRestriction *rstr, + CeedRequest *request) { + Ceed ceed, ceed_parent; + CeedInt num_active_in, num_active_out, Q, num_elem, num_input_fields, num_output_fields, size; +- CeedSize q_size; + CeedScalar *assembled_array, *e_data[2 * CEED_FIELD_MAX] = {NULL}; + CeedVector 
*active_inputs; + CeedQFunctionField *qf_input_fields, *qf_output_fields; +@@ -469,7 +451,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op, + // Setup + CeedCallBackend(CeedOperatorSetup_Cuda(op)); + +- // Input e_vecs and Restriction ++ // Input Evecs and Restriction + CeedCallBackend(CeedOperatorSetupInputs_Cuda(num_input_fields, qf_input_fields, op_input_fields, NULL, true, e_data, impl, request)); + + // Count number of active input fields +@@ -487,7 +469,8 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op, + CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, &q_vec_array)); + CeedCallBackend(CeedRealloc(num_active_in + size, &active_inputs)); + for (CeedInt field = 0; field < size; field++) { +- q_size = (CeedSize)Q * num_elem; ++ CeedSize q_size = (CeedSize)Q * num_elem; ++ + CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_inputs[num_active_in + field])); + CeedCallBackend( + CeedVectorSetArray(active_inputs[num_active_in + field], CEED_MEM_DEVICE, CEED_USE_POINTER, &q_vec_array[field * Q * num_elem])); +@@ -521,12 +504,13 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Cuda(CeedOperator op, + + // Build objects if needed + if (build_objects) { ++ CeedSize l_size = (CeedSize)num_elem * Q * num_active_in * num_active_out; ++ CeedInt strides[3] = {1, num_elem * Q, Q}; /* *NOPAD* */ ++ + // Create output restriction +- CeedInt strides[3] = {1, num_elem * Q, Q}; /* *NOPAD* */ + CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, num_active_in * num_active_out, + num_active_in * num_active_out * num_elem * Q, strides, rstr)); + // Create assembled vector +- CeedSize l_size = (CeedSize)num_elem * Q * num_active_in * num_active_out; + CeedCallBackend(CeedVectorCreate(ceed_parent, l_size, assembled)); + } + CeedCallBackend(CeedVectorSetValue(*assembled, 0.0)); +@@ -594,14 +578,14 @@ static int CeedOperatorLinearAssembleQFunctionUpdate_Cuda(CeedOperator op, CeedV + } + + //------------------------------------------------------------------------------ +-// Assemble diagonal setup ++// Assemble Diagonal Setup + //------------------------------------------------------------------------------ + static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op, CeedInt use_ceedsize_idx) { + Ceed ceed; + char *diagonal_kernel_path, *diagonal_kernel_source; +- CeedInt num_input_fields, num_output_fields, num_e_mode_in = 0, num_comp = 0, dim = 1, num_e_mode_out = 0, num_nodes, num_qpts; +- CeedEvalMode *e_mode_in = NULL, *e_mode_out = NULL; +- CeedElemRestriction rstr_in = NULL, rstr_out = NULL; ++ CeedInt num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0; ++ CeedInt num_comp, q_comp, num_nodes, num_qpts; ++ CeedEvalMode *eval_modes_in = NULL, *eval_modes_out = NULL; + CeedBasis basis_in = NULL, basis_out = NULL; + CeedQFunctionField *qf_fields; + CeedQFunction qf; +@@ -620,33 +604,20 @@ static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op, CeedIn + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); + if (vec == CEED_VECTOR_ACTIVE) { +- CeedEvalMode e_mode; +- CeedElemRestriction rstr; +- +- CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_in)); +- CeedCallBackend(CeedBasisGetNumComponents(basis_in, &num_comp)); +- CeedCallBackend(CeedBasisGetDimension(basis_in, &dim)); +- CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr)); +- CeedCheck(!rstr_in || rstr_in == 
rstr, ceed, CEED_ERROR_BACKEND, +- "Backend does not implement multi-field non-composite operator diagonal assembly"); +- rstr_in = rstr; +- CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &e_mode)); +- switch (e_mode) { +- case CEED_EVAL_NONE: +- case CEED_EVAL_INTERP: +- CeedCallBackend(CeedRealloc(num_e_mode_in + 1, &e_mode_in)); +- e_mode_in[num_e_mode_in] = e_mode; +- num_e_mode_in += 1; +- break; +- case CEED_EVAL_GRAD: +- CeedCallBackend(CeedRealloc(num_e_mode_in + dim, &e_mode_in)); +- for (CeedInt d = 0; d < dim; d++) e_mode_in[num_e_mode_in + d] = e_mode; +- num_e_mode_in += dim; +- break; +- case CEED_EVAL_WEIGHT: +- case CEED_EVAL_DIV: +- case CEED_EVAL_CURL: +- break; // Caught by QF Assembly ++ CeedBasis basis; ++ CeedEvalMode eval_mode; ++ ++ CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); ++ CeedCheck(!basis_in || basis_in == basis, ceed, CEED_ERROR_BACKEND, ++ "Backend does not implement operator diagonal assembly with multiple active bases"); ++ basis_in = basis; ++ CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); ++ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_mode, &q_comp)); ++ if (eval_mode != CEED_EVAL_WEIGHT) { ++ // q_comp = 1 if CEED_EVAL_NONE, CEED_EVAL_WEIGHT caught by QF assembly ++ CeedCallBackend(CeedRealloc(num_eval_modes_in + q_comp, &eval_modes_in)); ++ for (CeedInt d = 0; d < q_comp; d++) eval_modes_in[num_eval_modes_in + d] = eval_mode; ++ num_eval_modes_in += q_comp; + } + } + } +@@ -659,31 +630,20 @@ static inline int CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op, CeedIn + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); + if (vec == CEED_VECTOR_ACTIVE) { +- CeedEvalMode e_mode; +- CeedElemRestriction rstr; +- +- CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_out)); +- CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr)); +- CeedCheck(!rstr_out || rstr_out == rstr, ceed, CEED_ERROR_BACKEND, +- "Backend does not implement multi-field non-composite operator diagonal assembly"); +- rstr_out = rstr; +- CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &e_mode)); +- switch (e_mode) { +- case CEED_EVAL_NONE: +- case CEED_EVAL_INTERP: +- CeedCallBackend(CeedRealloc(num_e_mode_out + 1, &e_mode_out)); +- e_mode_out[num_e_mode_out] = e_mode; +- num_e_mode_out += 1; +- break; +- case CEED_EVAL_GRAD: +- CeedCallBackend(CeedRealloc(num_e_mode_out + dim, &e_mode_out)); +- for (CeedInt d = 0; d < dim; d++) e_mode_out[num_e_mode_out + d] = e_mode; +- num_e_mode_out += dim; +- break; +- case CEED_EVAL_WEIGHT: +- case CEED_EVAL_DIV: +- case CEED_EVAL_CURL: +- break; // Caught by QF Assembly ++ CeedBasis basis; ++ CeedEvalMode eval_mode; ++ ++ CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); ++ CeedCheck(!basis_out || basis_out == basis, ceed, CEED_ERROR_BACKEND, ++ "Backend does not implement operator diagonal assembly with multiple active bases"); ++ basis_out = basis; ++ CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); ++ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_mode, &q_comp)); ++ if (eval_mode != CEED_EVAL_WEIGHT) { ++ // q_comp = 1 if CEED_EVAL_NONE, CEED_EVAL_WEIGHT caught by QF assembly ++ CeedCallBackend(CeedRealloc(num_eval_modes_out + q_comp, &eval_modes_out)); ++ for (CeedInt d = 0; d < q_comp; d++) eval_modes_out[num_eval_modes_out + d] = eval_mode; ++ num_eval_modes_out += q_comp; + } + } + } +@@ -693,95 +653,147 @@ static inline int 
CeedOperatorAssembleDiagonalSetup_Cuda(CeedOperator op, CeedIn + CeedCallBackend(CeedCalloc(1, &impl->diag)); + CeedOperatorDiag_Cuda *diag = impl->diag; + +- diag->basis_in = basis_in; +- diag->basis_out = basis_out; +- diag->h_e_mode_in = e_mode_in; +- diag->h_e_mode_out = e_mode_out; +- diag->num_e_mode_in = num_e_mode_in; +- diag->num_e_mode_out = num_e_mode_out; +- + // Assemble kernel ++ CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_nodes)); ++ CeedCallBackend(CeedBasisGetNumComponents(basis_in, &num_comp)); ++ if (basis_in == CEED_BASIS_NONE) num_qpts = num_nodes; ++ else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts)); + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h", &diagonal_kernel_path)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Kernel Source -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, diagonal_kernel_path, &diagonal_kernel_source)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Source Complete! -----\n"); +- CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_nodes)); +- CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts)); +- diag->num_nodes = num_nodes; +- CeedCallCuda(ceed, +- CeedCompile_Cuda(ceed, diagonal_kernel_source, &diag->module, 6, "NUM_E_MODE_IN", num_e_mode_in, "NUM_E_MODE_OUT", num_e_mode_out, +- "NUM_NODES", num_nodes, "NUM_QPTS", num_qpts, "NUM_COMP", num_comp, "USE_CEEDSIZE", use_ceedsize_idx)); +- CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, diag->module, "linearDiagonal", &diag->linearDiagonal)); +- CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, diag->module, "linearPointBlockDiagonal", &diag->linearPointBlock)); ++ CeedCallCuda( ++ ceed, CeedCompile_Cuda(ceed, diagonal_kernel_source, &diag->module, 6, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT", ++ num_eval_modes_out, "NUM_COMP", num_comp, "NUM_NODES", num_nodes, "NUM_QPTS", num_qpts, "CEED_SIZE", use_ceedsize_idx)); ++ CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, diag->module, "LinearDiagonal", &diag->LinearDiagonal)); ++ CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, diag->module, "LinearPointBlockDiagonal", &diag->LinearPointBlock)); + CeedCallBackend(CeedFree(&diagonal_kernel_path)); + CeedCallBackend(CeedFree(&diagonal_kernel_source)); + + // Basis matrices +- const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); +- const CeedInt interp_bytes = q_bytes * num_nodes; +- const CeedInt grad_bytes = q_bytes * num_nodes * dim; +- const CeedInt e_mode_bytes = sizeof(CeedEvalMode); +- const CeedScalar *interp_in, *interp_out, *grad_in, *grad_out; ++ const CeedInt interp_bytes = num_nodes * num_qpts * sizeof(CeedScalar); ++ const CeedInt eval_modes_bytes = sizeof(CeedEvalMode); ++ bool has_eval_none = false; + + // CEED_EVAL_NONE +- CeedScalar *identity = NULL; +- bool is_eval_none = false; ++ for (CeedInt i = 0; i < num_eval_modes_in; i++) has_eval_none = has_eval_none || (eval_modes_in[i] == CEED_EVAL_NONE); ++ for (CeedInt i = 0; i < num_eval_modes_out; i++) has_eval_none = has_eval_none || (eval_modes_out[i] == CEED_EVAL_NONE); ++ if (has_eval_none) { ++ CeedScalar *identity = NULL; + +- for (CeedInt i = 0; i < num_e_mode_in; i++) is_eval_none = is_eval_none || (e_mode_in[i] == CEED_EVAL_NONE); +- for (CeedInt i = 0; i < num_e_mode_out; i++) is_eval_none = is_eval_none || (e_mode_out[i] == CEED_EVAL_NONE); +- if (is_eval_none) { +- CeedCallBackend(CeedCalloc(num_qpts * num_nodes, &identity)); ++ 
CeedCallBackend(CeedCalloc(num_nodes * num_qpts, &identity)); + for (CeedInt i = 0; i < (num_nodes < num_qpts ? num_nodes : num_qpts); i++) identity[i * num_nodes + i] = 1.0; + CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_identity, interp_bytes)); + CeedCallCuda(ceed, cudaMemcpy(diag->d_identity, identity, interp_bytes, cudaMemcpyHostToDevice)); ++ CeedCallBackend(CeedFree(&identity)); + } + +- // CEED_EVAL_INTERP +- CeedCallBackend(CeedBasisGetInterp(basis_in, &interp_in)); +- CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_interp_in, interp_bytes)); +- CeedCallCuda(ceed, cudaMemcpy(diag->d_interp_in, interp_in, interp_bytes, cudaMemcpyHostToDevice)); +- CeedCallBackend(CeedBasisGetInterp(basis_out, &interp_out)); +- CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_interp_out, interp_bytes)); +- CeedCallCuda(ceed, cudaMemcpy(diag->d_interp_out, interp_out, interp_bytes, cudaMemcpyHostToDevice)); +- +- // CEED_EVAL_GRAD +- CeedCallBackend(CeedBasisGetGrad(basis_in, &grad_in)); +- CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_grad_in, grad_bytes)); +- CeedCallCuda(ceed, cudaMemcpy(diag->d_grad_in, grad_in, grad_bytes, cudaMemcpyHostToDevice)); +- CeedCallBackend(CeedBasisGetGrad(basis_out, &grad_out)); +- CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_grad_out, grad_bytes)); +- CeedCallCuda(ceed, cudaMemcpy(diag->d_grad_out, grad_out, grad_bytes, cudaMemcpyHostToDevice)); +- +- // Arrays of e_modes +- CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_e_mode_in, num_e_mode_in * e_mode_bytes)); +- CeedCallCuda(ceed, cudaMemcpy(diag->d_e_mode_in, e_mode_in, num_e_mode_in * e_mode_bytes, cudaMemcpyHostToDevice)); +- CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_e_mode_out, num_e_mode_out * e_mode_bytes)); +- CeedCallCuda(ceed, cudaMemcpy(diag->d_e_mode_out, e_mode_out, num_e_mode_out * e_mode_bytes, cudaMemcpyHostToDevice)); +- +- // Restriction +- diag->diag_rstr = rstr_out; ++ // CEED_EVAL_INTERP, CEED_EVAL_GRAD, CEED_EVAL_DIV, and CEED_EVAL_CURL ++ for (CeedInt in = 0; in < 2; in++) { ++ CeedFESpace fespace; ++ CeedBasis basis = in ? 
basis_in : basis_out; ++ ++ CeedCallBackend(CeedBasisGetFESpace(basis, &fespace)); ++ switch (fespace) { ++ case CEED_FE_SPACE_H1: { ++ CeedInt q_comp_interp, q_comp_grad; ++ const CeedScalar *interp, *grad; ++ CeedScalar *d_interp, *d_grad; ++ ++ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); ++ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad)); ++ ++ CeedCallBackend(CeedBasisGetInterp(basis, &interp)); ++ CeedCallCuda(ceed, cudaMalloc((void **)&d_interp, interp_bytes * q_comp_interp)); ++ CeedCallCuda(ceed, cudaMemcpy(d_interp, interp, interp_bytes * q_comp_interp, cudaMemcpyHostToDevice)); ++ CeedCallBackend(CeedBasisGetGrad(basis, &grad)); ++ CeedCallCuda(ceed, cudaMalloc((void **)&d_grad, interp_bytes * q_comp_grad)); ++ CeedCallCuda(ceed, cudaMemcpy(d_grad, grad, interp_bytes * q_comp_grad, cudaMemcpyHostToDevice)); ++ if (in) { ++ diag->d_interp_in = d_interp; ++ diag->d_grad_in = d_grad; ++ } else { ++ diag->d_interp_out = d_interp; ++ diag->d_grad_out = d_grad; ++ } ++ } break; ++ case CEED_FE_SPACE_HDIV: { ++ CeedInt q_comp_interp, q_comp_div; ++ const CeedScalar *interp, *div; ++ CeedScalar *d_interp, *d_div; ++ ++ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); ++ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div)); ++ ++ CeedCallBackend(CeedBasisGetInterp(basis, &interp)); ++ CeedCallCuda(ceed, cudaMalloc((void **)&d_interp, interp_bytes * q_comp_interp)); ++ CeedCallCuda(ceed, cudaMemcpy(d_interp, interp, interp_bytes * q_comp_interp, cudaMemcpyHostToDevice)); ++ CeedCallBackend(CeedBasisGetDiv(basis, &div)); ++ CeedCallCuda(ceed, cudaMalloc((void **)&d_div, interp_bytes * q_comp_div)); ++ CeedCallCuda(ceed, cudaMemcpy(d_div, div, interp_bytes * q_comp_div, cudaMemcpyHostToDevice)); ++ if (in) { ++ diag->d_interp_in = d_interp; ++ diag->d_div_in = d_div; ++ } else { ++ diag->d_interp_out = d_interp; ++ diag->d_div_out = d_div; ++ } ++ } break; ++ case CEED_FE_SPACE_HCURL: { ++ CeedInt q_comp_interp, q_comp_curl; ++ const CeedScalar *interp, *curl; ++ CeedScalar *d_interp, *d_curl; ++ ++ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); ++ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl)); ++ ++ CeedCallBackend(CeedBasisGetInterp(basis, &interp)); ++ CeedCallCuda(ceed, cudaMalloc((void **)&d_interp, interp_bytes * q_comp_interp)); ++ CeedCallCuda(ceed, cudaMemcpy(d_interp, interp, interp_bytes * q_comp_interp, cudaMemcpyHostToDevice)); ++ CeedCallBackend(CeedBasisGetCurl(basis, &curl)); ++ CeedCallCuda(ceed, cudaMalloc((void **)&d_curl, interp_bytes * q_comp_curl)); ++ CeedCallCuda(ceed, cudaMemcpy(d_curl, curl, interp_bytes * q_comp_curl, cudaMemcpyHostToDevice)); ++ if (in) { ++ diag->d_interp_in = d_interp; ++ diag->d_curl_in = d_curl; ++ } else { ++ diag->d_interp_out = d_interp; ++ diag->d_curl_out = d_curl; ++ } ++ } break; ++ } ++ } ++ ++ // Arrays of eval_modes ++ CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_eval_modes_in, num_eval_modes_in * eval_modes_bytes)); ++ CeedCallCuda(ceed, cudaMemcpy(diag->d_eval_modes_in, eval_modes_in, num_eval_modes_in * eval_modes_bytes, cudaMemcpyHostToDevice)); ++ CeedCallCuda(ceed, cudaMalloc((void **)&diag->d_eval_modes_out, num_eval_modes_out * eval_modes_bytes)); ++ CeedCallCuda(ceed, cudaMemcpy(diag->d_eval_modes_out, eval_modes_out, num_eval_modes_out * eval_modes_bytes, 
cudaMemcpyHostToDevice)); ++ CeedCallBackend(CeedFree(&eval_modes_in)); ++ CeedCallBackend(CeedFree(&eval_modes_out)); + return CEED_ERROR_SUCCESS; + } + + //------------------------------------------------------------------------------ +-// Assemble diagonal common code ++// Assemble Diagonal Core + //------------------------------------------------------------------------------ + static inline int CeedOperatorAssembleDiagonalCore_Cuda(CeedOperator op, CeedVector assembled, CeedRequest *request, const bool is_point_block) { + Ceed ceed; +- CeedSize assembled_length = 0, assembled_qf_length = 0; +- CeedInt use_ceedsize_idx = 0, num_elem; ++ CeedSize assembled_length, assembled_qf_length; ++ CeedInt use_ceedsize_idx = 0, num_elem, num_nodes; + CeedScalar *elem_diag_array; + const CeedScalar *assembled_qf_array; +- CeedVector assembled_qf = NULL; +- CeedElemRestriction rstr = NULL; ++ CeedVector assembled_qf = NULL, elem_diag; ++ CeedElemRestriction assembled_rstr = NULL, rstr_in, rstr_out, diag_rstr; + CeedOperator_Cuda *impl; + + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallBackend(CeedOperatorGetData(op, &impl)); + + // Assemble QFunction +- CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &rstr, request)); +- CeedCallBackend(CeedElemRestrictionDestroy(&rstr)); ++ CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &assembled_rstr, request)); ++ CeedCallBackend(CeedElemRestrictionDestroy(&assembled_rstr)); ++ CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &assembled_qf_array)); + + CeedCallBackend(CeedVectorGetLength(assembled, &assembled_length)); + CeedCallBackend(CeedVectorGetLength(assembled_qf, &assembled_qf_length)); +@@ -793,36 +805,37 @@ static inline int CeedOperatorAssembleDiagonalCore_Cuda(CeedOperator op, CeedVec + + assert(diag != NULL); + +- // Restriction +- if (is_point_block && !diag->point_block_diag_rstr) { +- CeedCallBackend(CeedOperatorCreateActivePointBlockRestriction(diag->diag_rstr, &diag->point_block_diag_rstr)); +- } +- CeedElemRestriction diag_rstr = is_point_block ? diag->point_block_diag_rstr : diag->diag_rstr; +- +- // Create diagonal vector +- CeedVector elem_diag = is_point_block ? diag->point_block_elem_diag : diag->elem_diag; +- +- if (!elem_diag) { +- CeedCallBackend(CeedElemRestrictionCreateVector(diag_rstr, NULL, &elem_diag)); +- if (is_point_block) diag->point_block_elem_diag = elem_diag; +- else diag->elem_diag = elem_diag; ++ // Restriction and diagonal vector ++ CeedCallBackend(CeedOperatorGetActiveElemRestrictions(op, &rstr_in, &rstr_out)); ++ CeedCheck(rstr_in == rstr_out, ceed, CEED_ERROR_BACKEND, ++ "Cannot assemble operator diagonal with different input and output active element restrictions"); ++ if (!is_point_block && !diag->diag_rstr) { ++ CeedCallBackend(CeedElemRestrictionCreateUnsignedCopy(rstr_out, &diag->diag_rstr)); ++ CeedCallBackend(CeedElemRestrictionCreateVector(diag->diag_rstr, NULL, &diag->elem_diag)); ++ } else if (is_point_block && !diag->point_block_diag_rstr) { ++ CeedCallBackend(CeedOperatorCreateActivePointBlockRestriction(rstr_out, &diag->point_block_diag_rstr)); ++ CeedCallBackend(CeedElemRestrictionCreateVector(diag->point_block_diag_rstr, NULL, &diag->point_block_elem_diag)); + } ++ diag_rstr = is_point_block ? diag->point_block_diag_rstr : diag->diag_rstr; ++ elem_diag = is_point_block ? 
diag->point_block_elem_diag : diag->elem_diag; + CeedCallBackend(CeedVectorSetValue(elem_diag, 0.0)); + + // Assemble element operator diagonals + CeedCallBackend(CeedVectorGetArray(elem_diag, CEED_MEM_DEVICE, &elem_diag_array)); +- CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &assembled_qf_array)); + CeedCallBackend(CeedElemRestrictionGetNumElements(diag_rstr, &num_elem)); ++ CeedCallBackend(CeedElemRestrictionGetElementSize(diag_rstr, &num_nodes)); + + // Compute the diagonal of B^T D B +- int elem_per_block = 1; +- int grid = num_elem / elem_per_block + ((num_elem / elem_per_block * elem_per_block < num_elem) ? 1 : 0); +- void *args[] = {(void *)&num_elem, &diag->d_identity, &diag->d_interp_in, &diag->d_grad_in, &diag->d_interp_out, +- &diag->d_grad_out, &diag->d_e_mode_in, &diag->d_e_mode_out, &assembled_qf_array, &elem_diag_array}; ++ CeedInt elems_per_block = 1; ++ CeedInt grid = CeedDivUpInt(num_elem, elems_per_block); ++ void *args[] = {(void *)&num_elem, &diag->d_identity, &diag->d_interp_in, &diag->d_grad_in, &diag->d_div_in, ++ &diag->d_curl_in, &diag->d_interp_out, &diag->d_grad_out, &diag->d_div_out, &diag->d_curl_out, ++ &diag->d_eval_modes_in, &diag->d_eval_modes_out, &assembled_qf_array, &elem_diag_array}; ++ + if (is_point_block) { +- CeedCallBackend(CeedRunKernelDim_Cuda(ceed, diag->linearPointBlock, grid, diag->num_nodes, 1, elem_per_block, args)); ++ CeedCallBackend(CeedRunKernelDim_Cuda(ceed, diag->LinearPointBlock, grid, num_nodes, 1, elems_per_block, args)); + } else { +- CeedCallBackend(CeedRunKernelDim_Cuda(ceed, diag->linearDiagonal, grid, diag->num_nodes, 1, elem_per_block, args)); ++ CeedCallBackend(CeedRunKernelDim_Cuda(ceed, diag->LinearDiagonal, grid, num_nodes, 1, elems_per_block, args)); + } + + // Restore arrays +@@ -854,14 +867,15 @@ static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Cuda(CeedOperator op, + } + + //------------------------------------------------------------------------------ +-// Single operator assembly setup ++// Single Operator Assembly Setup + //------------------------------------------------------------------------------ + static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_ceedsize_idx) { +- Ceed ceed; +- char *assembly_kernel_path, *assembly_kernel_source; +- CeedInt num_input_fields, num_output_fields, num_e_mode_in = 0, dim = 1, num_B_in_mats_to_load = 0, size_B_in = 0, num_qpts = 0, elem_size = 0, +- num_e_mode_out = 0, num_B_out_mats_to_load = 0, size_B_out = 0, num_elem, num_comp; +- CeedEvalMode *eval_mode_in = NULL, *eval_mode_out = NULL; ++ Ceed ceed; ++ Ceed_Cuda *cuda_data; ++ char *assembly_kernel_path, *assembly_kernel_source; ++ CeedInt num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0; ++ CeedInt elem_size_in, num_qpts_in, num_comp_in, elem_size_out, num_qpts_out, num_comp_out, q_comp; ++ CeedEvalMode *eval_modes_in = NULL, *eval_modes_out = NULL; + CeedElemRestriction rstr_in = NULL, rstr_out = NULL; + CeedBasis basis_in = NULL, basis_out = NULL; + CeedQFunctionField *qf_fields; +@@ -878,34 +892,30 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee + // Determine active input basis eval mode + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); +- // Note that the kernel will treat each dimension of a gradient action separately; +- // i.e., when an active input has a CEED_EVAL_GRAD mode, num_e_mode_in will increment 
by dim. +- // However, for the purposes of loading the B matrices, it will be treated as one mode, and we will load/copy the entire gradient matrix at once, so +- // num_B_in_mats_to_load will be incremented by 1. + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedVector vec; + + CeedCallBackend(CeedOperatorFieldGetVector(input_fields[i], &vec)); + if (vec == CEED_VECTOR_ACTIVE) { ++ CeedBasis basis; + CeedEvalMode eval_mode; + +- CeedCallBackend(CeedOperatorFieldGetBasis(input_fields[i], &basis_in)); +- CeedCallBackend(CeedBasisGetDimension(basis_in, &dim)); +- CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts)); ++ CeedCallBackend(CeedOperatorFieldGetBasis(input_fields[i], &basis)); ++ CeedCheck(!basis_in || basis_in == basis, ceed, CEED_ERROR_BACKEND, "Backend does not implement operator assembly with multiple active bases"); ++ basis_in = basis; + CeedCallBackend(CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr_in)); +- CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size)); ++ CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size_in)); ++ if (basis_in == CEED_BASIS_NONE) num_qpts_in = elem_size_in; ++ else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts_in)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); +- if (eval_mode != CEED_EVAL_NONE) { +- CeedCallBackend(CeedRealloc(num_B_in_mats_to_load + 1, &eval_mode_in)); +- eval_mode_in[num_B_in_mats_to_load] = eval_mode; +- num_B_in_mats_to_load += 1; +- if (eval_mode == CEED_EVAL_GRAD) { +- num_e_mode_in += dim; +- size_B_in += dim * elem_size * num_qpts; +- } else { +- num_e_mode_in += 1; +- size_B_in += elem_size * num_qpts; ++ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_mode, &q_comp)); ++ if (eval_mode != CEED_EVAL_WEIGHT) { ++ // q_comp = 1 if CEED_EVAL_NONE, CEED_EVAL_WEIGHT caught by QF Assembly ++ CeedCallBackend(CeedRealloc(num_eval_modes_in + q_comp, &eval_modes_in)); ++ for (CeedInt d = 0; d < q_comp; d++) { ++ eval_modes_in[num_eval_modes_in + d] = eval_mode; + } ++ num_eval_modes_in += q_comp; + } + } + } +@@ -917,112 +927,134 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee + + CeedCallBackend(CeedOperatorFieldGetVector(output_fields[i], &vec)); + if (vec == CEED_VECTOR_ACTIVE) { ++ CeedBasis basis; + CeedEvalMode eval_mode; + +- CeedCallBackend(CeedOperatorFieldGetBasis(output_fields[i], &basis_out)); ++ CeedCallBackend(CeedOperatorFieldGetBasis(output_fields[i], &basis)); ++ CeedCheck(!basis_out || basis_out == basis, ceed, CEED_ERROR_BACKEND, ++ "Backend does not implement operator assembly with multiple active bases"); ++ basis_out = basis; + CeedCallBackend(CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr_out)); +- CeedCheck(!rstr_out || rstr_out == rstr_in, ceed, CEED_ERROR_BACKEND, "Backend does not implement multi-field non-composite operator assembly"); ++ CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_out, &elem_size_out)); ++ if (basis_out == CEED_BASIS_NONE) num_qpts_out = elem_size_out; ++ else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_out, &num_qpts_out)); ++ CeedCheck(num_qpts_in == num_qpts_out, ceed, CEED_ERROR_UNSUPPORTED, ++ "Active input and output bases must have the same number of quadrature points"); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); +- if (eval_mode != CEED_EVAL_NONE) { +- CeedCallBackend(CeedRealloc(num_B_out_mats_to_load + 1, &eval_mode_out)); +- 
eval_mode_out[num_B_out_mats_to_load] = eval_mode; +- num_B_out_mats_to_load += 1; +- if (eval_mode == CEED_EVAL_GRAD) { +- num_e_mode_out += dim; +- size_B_out += dim * elem_size * num_qpts; +- } else { +- num_e_mode_out += 1; +- size_B_out += elem_size * num_qpts; ++ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_mode, &q_comp)); ++ if (eval_mode != CEED_EVAL_WEIGHT) { ++ // q_comp = 1 if CEED_EVAL_NONE, CEED_EVAL_WEIGHT caught by QF Assembly ++ CeedCallBackend(CeedRealloc(num_eval_modes_out + q_comp, &eval_modes_out)); ++ for (CeedInt d = 0; d < q_comp; d++) { ++ eval_modes_out[num_eval_modes_out + d] = eval_mode; + } ++ num_eval_modes_out += q_comp; + } + } + } +- CeedCheck(num_e_mode_in > 0 && num_e_mode_out > 0, ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs"); +- +- CeedCallBackend(CeedElemRestrictionGetNumElements(rstr_in, &num_elem)); +- CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_in, &num_comp)); ++ CeedCheck(num_eval_modes_in > 0 && num_eval_modes_out > 0, ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs"); + + CeedCallBackend(CeedCalloc(1, &impl->asmb)); + CeedOperatorAssemble_Cuda *asmb = impl->asmb; +- asmb->num_elem = num_elem; +- +- // Compile kernels +- int elem_per_block = 1; +- asmb->elem_per_block = elem_per_block; +- CeedInt block_size = elem_size * elem_size * elem_per_block; +- Ceed_Cuda *cuda_data; ++ asmb->elems_per_block = 1; ++ asmb->block_size_x = elem_size_in; ++ asmb->block_size_y = elem_size_out; + + CeedCallBackend(CeedGetData(ceed, &cuda_data)); +- CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-operator-assemble.h", &assembly_kernel_path)); +- CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Kernel Source -----\n"); +- CeedCallBackend(CeedLoadSourceToBuffer(ceed, assembly_kernel_path, &assembly_kernel_source)); +- CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Source Complete! -----\n"); +- bool fallback = block_size > cuda_data->device_prop.maxThreadsPerBlock; ++ bool fallback = asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block > cuda_data->device_prop.maxThreadsPerBlock; + + if (fallback) { + // Use fallback kernel with 1D threadblock +- block_size = elem_size * elem_per_block; +- asmb->block_size_x = elem_size; + asmb->block_size_y = 1; +- } else { // Use kernel with 2D threadblock +- asmb->block_size_x = elem_size; +- asmb->block_size_y = elem_size; + } +- CeedCallBackend(CeedCompile_Cuda(ceed, assembly_kernel_source, &asmb->module, 8, "NUM_ELEM", num_elem, "NUM_E_MODE_IN", num_e_mode_in, +- "NUM_E_MODE_OUT", num_e_mode_out, "NUM_QPTS", num_qpts, "NUM_NODES", elem_size, "BLOCK_SIZE", block_size, +- "NUM_COMP", num_comp, "USE_CEEDSIZE", use_ceedsize_idx)); +- CeedCallBackend(CeedGetKernel_Cuda(ceed, asmb->module, fallback ? "linearAssembleFallback" : "linearAssemble", &asmb->linearAssemble)); ++ ++ // Compile kernels ++ CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_in, &num_comp_in)); ++ CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_out, &num_comp_out)); ++ CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-operator-assemble.h", &assembly_kernel_path)); ++ CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Kernel Source -----\n"); ++ CeedCallBackend(CeedLoadSourceToBuffer(ceed, assembly_kernel_path, &assembly_kernel_source)); ++ CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Source Complete! 
-----\n"); ++ CeedCallBackend(CeedCompile_Cuda(ceed, assembly_kernel_source, &asmb->module, 10, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT", ++ num_eval_modes_out, "NUM_COMP_IN", num_comp_in, "NUM_COMP_OUT", num_comp_out, "NUM_NODES_IN", elem_size_in, ++ "NUM_NODES_OUT", elem_size_out, "NUM_QPTS", num_qpts_in, "BLOCK_SIZE", ++ asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block, "BLOCK_SIZE_Y", asmb->block_size_y, "CEED_SIZE", ++ use_ceedsize_idx)); ++ CeedCallBackend(CeedGetKernel_Cuda(ceed, asmb->module, "LinearAssemble", &asmb->LinearAssemble)); + CeedCallBackend(CeedFree(&assembly_kernel_path)); + CeedCallBackend(CeedFree(&assembly_kernel_source)); + +- // Build 'full' B matrices (not 1D arrays used for tensor-product matrices) +- const CeedScalar *interp_in, *grad_in; ++ // Load into B_in, in order that they will be used in eval_modes_in ++ { ++ const CeedInt in_bytes = elem_size_in * num_qpts_in * num_eval_modes_in * sizeof(CeedScalar); ++ CeedInt d_in = 0; ++ CeedEvalMode eval_modes_in_prev = CEED_EVAL_NONE; ++ bool has_eval_none = false; ++ CeedScalar *identity = NULL; + +- CeedCallBackend(CeedBasisGetInterp(basis_in, &interp_in)); +- CeedCallBackend(CeedBasisGetGrad(basis_in, &grad_in)); ++ for (CeedInt i = 0; i < num_eval_modes_in; i++) { ++ has_eval_none = has_eval_none || (eval_modes_in[i] == CEED_EVAL_NONE); ++ } ++ if (has_eval_none) { ++ CeedCallBackend(CeedCalloc(elem_size_in * num_qpts_in, &identity)); ++ for (CeedInt i = 0; i < (elem_size_in < num_qpts_in ? elem_size_in : num_qpts_in); i++) identity[i * elem_size_in + i] = 1.0; ++ } ++ ++ CeedCallCuda(ceed, cudaMalloc((void **)&asmb->d_B_in, in_bytes)); ++ for (CeedInt i = 0; i < num_eval_modes_in; i++) { ++ const CeedScalar *h_B_in; + +- // Load into B_in, in order that they will be used in eval_mode +- const CeedInt inBytes = size_B_in * sizeof(CeedScalar); +- CeedInt mat_start = 0; ++ CeedCallBackend(CeedOperatorGetBasisPointer(basis_in, eval_modes_in[i], identity, &h_B_in)); ++ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_modes_in[i], &q_comp)); ++ if (q_comp > 1) { ++ if (i == 0 || eval_modes_in[i] != eval_modes_in_prev) d_in = 0; ++ else h_B_in = &h_B_in[(++d_in) * elem_size_in * num_qpts_in]; ++ } ++ eval_modes_in_prev = eval_modes_in[i]; + +- CeedCallCuda(ceed, cudaMalloc((void **)&asmb->d_B_in, inBytes)); +- for (int i = 0; i < num_B_in_mats_to_load; i++) { +- CeedEvalMode eval_mode = eval_mode_in[i]; ++ CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_in[i * elem_size_in * num_qpts_in], h_B_in, elem_size_in * num_qpts_in * sizeof(CeedScalar), ++ cudaMemcpyHostToDevice)); ++ } + +- if (eval_mode == CEED_EVAL_INTERP) { +- CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_in[mat_start], interp_in, elem_size * num_qpts * sizeof(CeedScalar), cudaMemcpyHostToDevice)); +- mat_start += elem_size * num_qpts; +- } else if (eval_mode == CEED_EVAL_GRAD) { +- CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_in[mat_start], grad_in, dim * elem_size * num_qpts * sizeof(CeedScalar), cudaMemcpyHostToDevice)); +- mat_start += dim * elem_size * num_qpts; ++ if (identity) { ++ CeedCallBackend(CeedFree(&identity)); + } + } + +- const CeedScalar *interp_out, *grad_out; ++ // Load into B_out, in order that they will be used in eval_modes_out ++ { ++ const CeedInt out_bytes = elem_size_out * num_qpts_out * num_eval_modes_out * sizeof(CeedScalar); ++ CeedInt d_out = 0; ++ CeedEvalMode eval_modes_out_prev = CEED_EVAL_NONE; ++ bool has_eval_none = false; ++ CeedScalar *identity = NULL; + +- // Note that this 
function currently assumes 1 basis, so this should always be true for now +- if (basis_out == basis_in) { +- interp_out = interp_in; +- grad_out = grad_in; +- } else { +- CeedCallBackend(CeedBasisGetInterp(basis_out, &interp_out)); +- CeedCallBackend(CeedBasisGetGrad(basis_out, &grad_out)); +- } ++ for (CeedInt i = 0; i < num_eval_modes_out; i++) { ++ has_eval_none = has_eval_none || (eval_modes_out[i] == CEED_EVAL_NONE); ++ } ++ if (has_eval_none) { ++ CeedCallBackend(CeedCalloc(elem_size_out * num_qpts_out, &identity)); ++ for (CeedInt i = 0; i < (elem_size_out < num_qpts_out ? elem_size_out : num_qpts_out); i++) identity[i * elem_size_out + i] = 1.0; ++ } ++ ++ CeedCallCuda(ceed, cudaMalloc((void **)&asmb->d_B_out, out_bytes)); ++ for (CeedInt i = 0; i < num_eval_modes_out; i++) { ++ const CeedScalar *h_B_out; + +- // Load into B_out, in order that they will be used in eval_mode +- const CeedInt outBytes = size_B_out * sizeof(CeedScalar); +- mat_start = 0; ++ CeedCallBackend(CeedOperatorGetBasisPointer(basis_out, eval_modes_out[i], identity, &h_B_out)); ++ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_modes_out[i], &q_comp)); ++ if (q_comp > 1) { ++ if (i == 0 || eval_modes_out[i] != eval_modes_out_prev) d_out = 0; ++ else h_B_out = &h_B_out[(++d_out) * elem_size_out * num_qpts_out]; ++ } ++ eval_modes_out_prev = eval_modes_out[i]; + +- CeedCallCuda(ceed, cudaMalloc((void **)&asmb->d_B_out, outBytes)); +- for (int i = 0; i < num_B_out_mats_to_load; i++) { +- CeedEvalMode eval_mode = eval_mode_out[i]; ++ CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_out[i * elem_size_out * num_qpts_out], h_B_out, elem_size_out * num_qpts_out * sizeof(CeedScalar), ++ cudaMemcpyHostToDevice)); ++ } + +- if (eval_mode == CEED_EVAL_INTERP) { +- CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_out[mat_start], interp_out, elem_size * num_qpts * sizeof(CeedScalar), cudaMemcpyHostToDevice)); +- mat_start += elem_size * num_qpts; +- } else if (eval_mode == CEED_EVAL_GRAD) { +- CeedCallCuda(ceed, cudaMemcpy(&asmb->d_B_out[mat_start], grad_out, dim * elem_size * num_qpts * sizeof(CeedScalar), cudaMemcpyHostToDevice)); +- mat_start += dim * elem_size * num_qpts; ++ if (identity) { ++ CeedCallBackend(CeedFree(&identity)); + } + } + return CEED_ERROR_SUCCESS; +@@ -1039,47 +1071,96 @@ static int CeedSingleOperatorAssembleSetup_Cuda(CeedOperator op, CeedInt use_cee + static int CeedSingleOperatorAssemble_Cuda(CeedOperator op, CeedInt offset, CeedVector values) { + Ceed ceed; + CeedSize values_length = 0, assembled_qf_length = 0; +- CeedInt use_ceedsize_idx = 0; ++ CeedInt use_ceedsize_idx = 0, num_elem_in, num_elem_out, elem_size_in, elem_size_out; + CeedScalar *values_array; +- const CeedScalar *qf_array; +- CeedVector assembled_qf = NULL; +- CeedElemRestriction rstr_q = NULL; ++ const CeedScalar *assembled_qf_array; ++ CeedVector assembled_qf = NULL; ++ CeedElemRestriction assembled_rstr = NULL, rstr_in, rstr_out; ++ CeedRestrictionType rstr_type_in, rstr_type_out; ++ const bool *orients_in = NULL, *orients_out = NULL; ++ const CeedInt8 *curl_orients_in = NULL, *curl_orients_out = NULL; + CeedOperator_Cuda *impl; + + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallBackend(CeedOperatorGetData(op, &impl)); + + // Assemble QFunction +- CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &rstr_q, CEED_REQUEST_IMMEDIATE)); +- CeedCallBackend(CeedElemRestrictionDestroy(&rstr_q)); +- CeedCallBackend(CeedVectorGetArray(values, CEED_MEM_DEVICE, &values_array)); +- 
values_array += offset; +- CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &qf_array)); ++ CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &assembled_rstr, CEED_REQUEST_IMMEDIATE)); ++ CeedCallBackend(CeedElemRestrictionDestroy(&assembled_rstr)); ++ CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &assembled_qf_array)); + + CeedCallBackend(CeedVectorGetLength(values, &values_length)); + CeedCallBackend(CeedVectorGetLength(assembled_qf, &assembled_qf_length)); + if ((values_length > INT_MAX) || (assembled_qf_length > INT_MAX)) use_ceedsize_idx = 1; ++ + // Setup +- if (!impl->asmb) { +- CeedCallBackend(CeedSingleOperatorAssembleSetup_Cuda(op, use_ceedsize_idx)); +- assert(impl->asmb != NULL); ++ if (!impl->asmb) CeedCallBackend(CeedSingleOperatorAssembleSetup_Cuda(op, use_ceedsize_idx)); ++ CeedOperatorAssemble_Cuda *asmb = impl->asmb; ++ ++ assert(asmb != NULL); ++ ++ // Assemble element operator ++ CeedCallBackend(CeedVectorGetArray(values, CEED_MEM_DEVICE, &values_array)); ++ values_array += offset; ++ ++ CeedCallBackend(CeedOperatorGetActiveElemRestrictions(op, &rstr_in, &rstr_out)); ++ CeedCallBackend(CeedElemRestrictionGetNumElements(rstr_in, &num_elem_in)); ++ CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size_in)); ++ ++ CeedCallBackend(CeedElemRestrictionGetType(rstr_in, &rstr_type_in)); ++ if (rstr_type_in == CEED_RESTRICTION_ORIENTED) { ++ CeedCallBackend(CeedElemRestrictionGetOrientations(rstr_in, CEED_MEM_DEVICE, &orients_in)); ++ } else if (rstr_type_in == CEED_RESTRICTION_CURL_ORIENTED) { ++ CeedCallBackend(CeedElemRestrictionGetCurlOrientations(rstr_in, CEED_MEM_DEVICE, &curl_orients_in)); ++ } ++ ++ if (rstr_in != rstr_out) { ++ CeedCallBackend(CeedElemRestrictionGetNumElements(rstr_out, &num_elem_out)); ++ CeedCheck(num_elem_in == num_elem_out, ceed, CEED_ERROR_UNSUPPORTED, ++ "Active input and output operator restrictions must have the same number of elements"); ++ CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_out, &elem_size_out)); ++ ++ CeedCallBackend(CeedElemRestrictionGetType(rstr_out, &rstr_type_out)); ++ if (rstr_type_out == CEED_RESTRICTION_ORIENTED) { ++ CeedCallBackend(CeedElemRestrictionGetOrientations(rstr_out, CEED_MEM_DEVICE, &orients_out)); ++ } else if (rstr_type_out == CEED_RESTRICTION_CURL_ORIENTED) { ++ CeedCallBackend(CeedElemRestrictionGetCurlOrientations(rstr_out, CEED_MEM_DEVICE, &curl_orients_out)); ++ } ++ } else { ++ elem_size_out = elem_size_in; ++ orients_out = orients_in; ++ curl_orients_out = curl_orients_in; + } + + // Compute B^T D B +- const CeedInt num_elem = impl->asmb->num_elem; +- const CeedInt elem_per_block = impl->asmb->elem_per_block; +- const CeedInt grid = num_elem / elem_per_block + ((num_elem / elem_per_block * elem_per_block < num_elem) ? 1 : 0); +- void *args[] = {&impl->asmb->d_B_in, &impl->asmb->d_B_out, &qf_array, &values_array}; ++ CeedInt shared_mem = ++ ((curl_orients_in || curl_orients_out ? elem_size_in * elem_size_out : 0) + (curl_orients_in ? 
elem_size_in * asmb->block_size_y : 0)) * ++ sizeof(CeedScalar); ++ CeedInt grid = CeedDivUpInt(num_elem_in, asmb->elems_per_block); ++ void *args[] = {(void *)&num_elem_in, &asmb->d_B_in, &asmb->d_B_out, &orients_in, &curl_orients_in, ++ &orients_out, &curl_orients_out, &assembled_qf_array, &values_array}; + + CeedCallBackend( +- CeedRunKernelDim_Cuda(ceed, impl->asmb->linearAssemble, grid, impl->asmb->block_size_x, impl->asmb->block_size_y, elem_per_block, args)); ++ CeedRunKernelDimShared_Cuda(ceed, asmb->LinearAssemble, grid, asmb->block_size_x, asmb->block_size_y, asmb->elems_per_block, shared_mem, args)); + + // Restore arrays + CeedCallBackend(CeedVectorRestoreArray(values, &values_array)); +- CeedCallBackend(CeedVectorRestoreArrayRead(assembled_qf, &qf_array)); ++ CeedCallBackend(CeedVectorRestoreArrayRead(assembled_qf, &assembled_qf_array)); + + // Cleanup + CeedCallBackend(CeedVectorDestroy(&assembled_qf)); ++ if (rstr_type_in == CEED_RESTRICTION_ORIENTED) { ++ CeedCallBackend(CeedElemRestrictionRestoreOrientations(rstr_in, &orients_in)); ++ } else if (rstr_type_in == CEED_RESTRICTION_CURL_ORIENTED) { ++ CeedCallBackend(CeedElemRestrictionRestoreCurlOrientations(rstr_in, &curl_orients_in)); ++ } ++ if (rstr_in != rstr_out) { ++ if (rstr_type_out == CEED_RESTRICTION_ORIENTED) { ++ CeedCallBackend(CeedElemRestrictionRestoreOrientations(rstr_out, &orients_out)); ++ } else if (rstr_type_out == CEED_RESTRICTION_CURL_ORIENTED) { ++ CeedCallBackend(CeedElemRestrictionRestoreCurlOrientations(rstr_out, &curl_orients_out)); ++ } ++ } + return CEED_ERROR_SUCCESS; + } + +diff --git a/backends/cuda-ref/ceed-cuda-ref-restriction.c b/backends/cuda-ref/ceed-cuda-ref-restriction.c +index 71ed2821..f2b190e5 100644 +--- a/backends/cuda-ref/ceed-cuda-ref-restriction.c ++++ b/backends/cuda-ref/ceed-cuda-ref-restriction.c +@@ -19,22 +19,23 @@ + #include "ceed-cuda-ref.h" + + //------------------------------------------------------------------------------ +-// Apply restriction ++// Core apply restriction code + //------------------------------------------------------------------------------ +-static int CeedElemRestrictionApply_Cuda(CeedElemRestriction r, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { ++static inline int CeedElemRestrictionApply_Cuda_Core(CeedElemRestriction rstr, CeedTransposeMode t_mode, bool use_signs, bool use_orients, ++ CeedVector u, CeedVector v, CeedRequest *request) { + Ceed ceed; +- Ceed_Cuda *data; +- CUfunction kernel; + CeedInt num_elem, elem_size; ++ CeedRestrictionType rstr_type; + const CeedScalar *d_u; + CeedScalar *d_v; + CeedElemRestriction_Cuda *impl; ++ CUfunction kernel; + +- CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); +- CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); +- CeedCallBackend(CeedGetData(ceed, &data)); +- CeedElemRestrictionGetNumElements(r, &num_elem); +- CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); ++ CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); ++ CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); ++ CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); ++ CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); ++ CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); + const CeedInt num_nodes = impl->num_nodes; + + // Get vectors +@@ -50,45 +51,155 @@ static int CeedElemRestrictionApply_Cuda(CeedElemRestriction r, CeedTransposeMod + // Restrict + if (t_mode == CEED_NOTRANSPOSE) { + // L-vector -> E-vector +- if 
(impl->d_ind) { +- // -- Offsets provided +- kernel = impl->OffsetNoTranspose; +- void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; +- CeedInt block_size = elem_size < 1024 ? (elem_size > 32 ? elem_size : 32) : 1024; +- +- CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); +- } else { +- // -- Strided restriction +- kernel = impl->StridedNoTranspose; +- void *args[] = {&num_elem, &d_u, &d_v}; +- CeedInt block_size = elem_size < 1024 ? (elem_size > 32 ? elem_size : 32) : 1024; +- +- CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); ++ const CeedInt block_size = elem_size < 1024 ? (elem_size > 32 ? elem_size : 32) : 1024; ++ const CeedInt grid = CeedDivUpInt(num_nodes, block_size); ++ ++ switch (rstr_type) { ++ case CEED_RESTRICTION_STRIDED: { ++ kernel = impl->StridedNoTranspose; ++ void *args[] = {&num_elem, &d_u, &d_v}; ++ ++ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); ++ } break; ++ case CEED_RESTRICTION_STANDARD: { ++ kernel = impl->OffsetNoTranspose; ++ void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; ++ ++ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); ++ } break; ++ case CEED_RESTRICTION_ORIENTED: { ++ if (use_signs) { ++ kernel = impl->OrientedNoTranspose; ++ void *args[] = {&num_elem, &impl->d_ind, &impl->d_orients, &d_u, &d_v}; ++ ++ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); ++ } else { ++ kernel = impl->OffsetNoTranspose; ++ void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; ++ ++ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); ++ } ++ } break; ++ case CEED_RESTRICTION_CURL_ORIENTED: { ++ if (use_signs && use_orients) { ++ kernel = impl->CurlOrientedNoTranspose; ++ void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; ++ ++ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); ++ } else if (use_orients) { ++ kernel = impl->CurlOrientedUnsignedNoTranspose; ++ void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; ++ ++ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); ++ } else { ++ kernel = impl->OffsetNoTranspose; ++ void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; ++ ++ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); ++ } ++ } break; ++ case CEED_RESTRICTION_POINTS: { ++ // LCOV_EXCL_START ++ return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement restriction CeedElemRestrictionAtPoints"); ++ // LCOV_EXCL_STOP ++ } break; + } + } else { + // E-vector -> L-vector +- if (impl->d_ind) { +- // -- Offsets provided +- CeedInt block_size = 32; ++ const CeedInt block_size = 32; ++ const CeedInt grid = CeedDivUpInt(num_nodes, block_size); ++ ++ switch (rstr_type) { ++ case CEED_RESTRICTION_STRIDED: { ++ kernel = impl->StridedTranspose; ++ void *args[] = {&num_elem, &d_u, &d_v}; ++ ++ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); ++ } break; ++ case CEED_RESTRICTION_STANDARD: { ++ if (impl->OffsetTranspose) { ++ kernel = impl->OffsetTranspose; ++ void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; ++ ++ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); ++ } else { ++ kernel = impl->OffsetTransposeDet; ++ void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; ++ ++ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, 
block_size, args)); ++ } ++ } break; ++ case CEED_RESTRICTION_ORIENTED: { ++ if (use_signs) { ++ if (impl->OrientedTranspose) { ++ kernel = impl->OrientedTranspose; ++ void *args[] = {&num_elem, &impl->d_ind, &impl->d_orients, &d_u, &d_v}; ++ ++ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); ++ } else { ++ kernel = impl->OrientedTransposeDet; ++ void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &impl->d_orients, &d_u, &d_v}; ++ ++ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); ++ } ++ } else { ++ if (impl->OffsetTranspose) { ++ kernel = impl->OffsetTranspose; ++ void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; + +- if (impl->OffsetTranspose) { +- kernel = impl->OffsetTranspose; +- void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; ++ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); ++ } else { ++ kernel = impl->OffsetTransposeDet; ++ void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; + +- CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); +- } else { +- kernel = impl->OffsetTransposeDet; +- void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; ++ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); ++ } ++ } ++ } break; ++ case CEED_RESTRICTION_CURL_ORIENTED: { ++ if (use_signs && use_orients) { ++ if (impl->CurlOrientedTranspose) { ++ kernel = impl->CurlOrientedTranspose; ++ void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; ++ ++ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); ++ } else { ++ kernel = impl->CurlOrientedTransposeDet; ++ void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &impl->d_curl_orients, &d_u, &d_v}; ++ ++ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); ++ } ++ } else if (use_orients) { ++ if (impl->CurlOrientedUnsignedTranspose) { ++ kernel = impl->CurlOrientedUnsignedTranspose; ++ void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; + +- CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); +- } +- } else { +- // -- Strided restriction +- kernel = impl->StridedTranspose; +- void *args[] = {&num_elem, &d_u, &d_v}; +- CeedInt block_size = 32; ++ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); ++ } else { ++ kernel = impl->CurlOrientedUnsignedTransposeDet; ++ void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &impl->d_curl_orients, &d_u, &d_v}; ++ ++ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); ++ } ++ } else { ++ if (impl->OffsetTranspose) { ++ kernel = impl->OffsetTranspose; ++ void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; + +- CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); ++ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); ++ } else { ++ kernel = impl->OffsetTransposeDet; ++ void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; ++ ++ CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, grid, block_size, args)); ++ } ++ } ++ } break; ++ case CEED_RESTRICTION_POINTS: { ++ // LCOV_EXCL_START ++ return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement restriction 
CeedElemRestrictionAtPoints"); ++ // LCOV_EXCL_STOP ++ } break; + } + } + +@@ -100,6 +211,29 @@ static int CeedElemRestrictionApply_Cuda(CeedElemRestriction r, CeedTransposeMod + return CEED_ERROR_SUCCESS; + } + ++//------------------------------------------------------------------------------ ++// Apply restriction ++//------------------------------------------------------------------------------ ++static int CeedElemRestrictionApply_Cuda(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { ++ return CeedElemRestrictionApply_Cuda_Core(rstr, t_mode, true, true, u, v, request); ++} ++ ++//------------------------------------------------------------------------------ ++// Apply unsigned restriction ++//------------------------------------------------------------------------------ ++static int CeedElemRestrictionApplyUnsigned_Cuda(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v, ++ CeedRequest *request) { ++ return CeedElemRestrictionApply_Cuda_Core(rstr, t_mode, false, true, u, v, request); ++} ++ ++//------------------------------------------------------------------------------ ++// Apply unoriented restriction ++//------------------------------------------------------------------------------ ++static int CeedElemRestrictionApplyUnoriented_Cuda(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v, ++ CeedRequest *request) { ++ return CeedElemRestrictionApply_Cuda_Core(rstr, t_mode, false, false, u, v, request); ++} ++ + //------------------------------------------------------------------------------ + // Get offsets + //------------------------------------------------------------------------------ +@@ -118,21 +252,61 @@ static int CeedElemRestrictionGetOffsets_Cuda(CeedElemRestriction rstr, CeedMemT + return CEED_ERROR_SUCCESS; + } + ++//------------------------------------------------------------------------------ ++// Get orientations ++//------------------------------------------------------------------------------ ++static int CeedElemRestrictionGetOrientations_Cuda(CeedElemRestriction rstr, CeedMemType mem_type, const bool **orients) { ++ CeedElemRestriction_Cuda *impl; ++ CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); ++ ++ switch (mem_type) { ++ case CEED_MEM_HOST: ++ *orients = impl->h_orients; ++ break; ++ case CEED_MEM_DEVICE: ++ *orients = impl->d_orients; ++ break; ++ } ++ return CEED_ERROR_SUCCESS; ++} ++ ++//------------------------------------------------------------------------------ ++// Get curl-conforming orientations ++//------------------------------------------------------------------------------ ++static int CeedElemRestrictionGetCurlOrientations_Cuda(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt8 **curl_orients) { ++ CeedElemRestriction_Cuda *impl; ++ CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); ++ ++ switch (mem_type) { ++ case CEED_MEM_HOST: ++ *curl_orients = impl->h_curl_orients; ++ break; ++ case CEED_MEM_DEVICE: ++ *curl_orients = impl->d_curl_orients; ++ break; ++ } ++ return CEED_ERROR_SUCCESS; ++} ++ + //------------------------------------------------------------------------------ + // Destroy restriction + //------------------------------------------------------------------------------ +-static int CeedElemRestrictionDestroy_Cuda(CeedElemRestriction r) { ++static int CeedElemRestrictionDestroy_Cuda(CeedElemRestriction rstr) { + Ceed ceed; + CeedElemRestriction_Cuda *impl; + +- 
CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); +- CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); ++ CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); ++ CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); + CeedCallCuda(ceed, cuModuleUnload(impl->module)); + CeedCallBackend(CeedFree(&impl->h_ind_allocated)); + CeedCallCuda(ceed, cudaFree(impl->d_ind_allocated)); + CeedCallCuda(ceed, cudaFree(impl->d_t_offsets)); + CeedCallCuda(ceed, cudaFree(impl->d_t_indices)); + CeedCallCuda(ceed, cudaFree(impl->d_l_vec_indices)); ++ CeedCallBackend(CeedFree(&impl->h_orients_allocated)); ++ CeedCallCuda(ceed, cudaFree(impl->d_orients_allocated)); ++ CeedCallBackend(CeedFree(&impl->h_curl_orients_allocated)); ++ CeedCallCuda(ceed, cudaFree(impl->d_curl_orients_allocated)); + CeedCallBackend(CeedFree(&impl)); + return CEED_ERROR_SUCCESS; + } +@@ -140,7 +314,7 @@ static int CeedElemRestrictionDestroy_Cuda(CeedElemRestriction r) { + //------------------------------------------------------------------------------ + // Create transpose offsets and indices + //------------------------------------------------------------------------------ +-static int CeedElemRestrictionOffset_Cuda(const CeedElemRestriction r, const CeedInt *indices) { ++static int CeedElemRestrictionOffset_Cuda(const CeedElemRestriction rstr, const CeedInt *indices) { + Ceed ceed; + bool *is_node; + CeedSize l_size; +@@ -148,12 +322,12 @@ static int CeedElemRestrictionOffset_Cuda(const CeedElemRestriction r, const Cee + CeedInt *ind_to_offset, *l_vec_indices, *t_offsets, *t_indices; + CeedElemRestriction_Cuda *impl; + +- CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); +- CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); +- CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem)); +- CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); +- CeedCallBackend(CeedElemRestrictionGetLVectorSize(r, &l_size)); +- CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); ++ CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); ++ CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); ++ CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); ++ CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); ++ CeedCallBackend(CeedElemRestrictionGetLVectorSize(rstr, &l_size)); ++ CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); + const CeedInt size_indices = num_elem * elem_size; + + // Count num_nodes +@@ -221,153 +395,223 @@ static int CeedElemRestrictionOffset_Cuda(const CeedElemRestriction r, const Cee + // Create restriction + //------------------------------------------------------------------------------ + int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *indices, const bool *orients, +- const CeedInt8 *curl_orients, CeedElemRestriction r) { ++ const CeedInt8 *curl_orients, CeedElemRestriction rstr) { + Ceed ceed, ceed_parent; +- bool is_deterministic, is_strided; ++ bool is_deterministic; + CeedInt num_elem, num_comp, elem_size, comp_stride = 1; + CeedRestrictionType rstr_type; ++ char *restriction_kernel_path, *restriction_kernel_source; + CeedElemRestriction_Cuda *impl; + +- CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); +- CeedCallBackend(CeedCalloc(1, &impl)); ++ CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); + CeedCallBackend(CeedGetParent(ceed, &ceed_parent)); + CeedCallBackend(CeedIsDeterministic(ceed_parent, &is_deterministic)); +- 
CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem)); +- CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); +- CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); ++ CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); ++ CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); ++ CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); + const CeedInt size = num_elem * elem_size; + CeedInt strides[3] = {1, size, elem_size}; + CeedInt layout[3] = {1, elem_size * num_elem, elem_size}; + +- CeedCallBackend(CeedElemRestrictionGetType(r, &rstr_type)); +- CeedCheck(rstr_type != CEED_RESTRICTION_ORIENTED && rstr_type != CEED_RESTRICTION_CURL_ORIENTED, ceed, CEED_ERROR_BACKEND, +- "Backend does not implement CeedElemRestrictionCreateOriented or CeedElemRestrictionCreateCurlOriented"); +- + // Stride data +- CeedCallBackend(CeedElemRestrictionIsStrided(r, &is_strided)); +- if (is_strided) { ++ CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); ++ if (rstr_type == CEED_RESTRICTION_STRIDED) { + bool has_backend_strides; + +- CeedCallBackend(CeedElemRestrictionHasBackendStrides(r, &has_backend_strides)); ++ CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides)); + if (!has_backend_strides) { +- CeedCallBackend(CeedElemRestrictionGetStrides(r, &strides)); ++ CeedCallBackend(CeedElemRestrictionGetStrides(rstr, &strides)); + } + } else { +- CeedCallBackend(CeedElemRestrictionGetCompStride(r, &comp_stride)); ++ CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride)); + } + +- impl->h_ind = NULL; +- impl->h_ind_allocated = NULL; +- impl->d_ind = NULL; +- impl->d_ind_allocated = NULL; +- impl->d_t_indices = NULL; +- impl->d_t_offsets = NULL; +- impl->num_nodes = size; +- CeedCallBackend(CeedElemRestrictionSetData(r, impl)); +- CeedCallBackend(CeedElemRestrictionSetELayout(r, layout)); +- +- // Set up device indices/offset arrays +- switch (mem_type) { +- case CEED_MEM_HOST: { +- switch (copy_mode) { +- case CEED_OWN_POINTER: +- impl->h_ind_allocated = (CeedInt *)indices; +- impl->h_ind = (CeedInt *)indices; +- break; +- case CEED_USE_POINTER: +- impl->h_ind = (CeedInt *)indices; +- break; +- case CEED_COPY_VALUES: +- if (indices != NULL) { +- CeedCallBackend(CeedMalloc(elem_size * num_elem, &impl->h_ind_allocated)); +- memcpy(impl->h_ind_allocated, indices, elem_size * num_elem * sizeof(CeedInt)); ++ CeedCallBackend(CeedCalloc(1, &impl)); ++ impl->num_nodes = size; ++ impl->h_ind = NULL; ++ impl->h_ind_allocated = NULL; ++ impl->d_ind = NULL; ++ impl->d_ind_allocated = NULL; ++ impl->d_t_indices = NULL; ++ impl->d_t_offsets = NULL; ++ impl->h_orients = NULL; ++ impl->h_orients_allocated = NULL; ++ impl->d_orients = NULL; ++ impl->d_orients_allocated = NULL; ++ impl->h_curl_orients = NULL; ++ impl->h_curl_orients_allocated = NULL; ++ impl->d_curl_orients = NULL; ++ impl->d_curl_orients_allocated = NULL; ++ CeedCallBackend(CeedElemRestrictionSetData(rstr, impl)); ++ CeedCallBackend(CeedElemRestrictionSetELayout(rstr, layout)); ++ ++ // Set up device offset/orientation arrays ++ if (rstr_type != CEED_RESTRICTION_STRIDED) { ++ switch (mem_type) { ++ case CEED_MEM_HOST: { ++ switch (copy_mode) { ++ case CEED_OWN_POINTER: ++ impl->h_ind_allocated = (CeedInt *)indices; ++ impl->h_ind = (CeedInt *)indices; ++ break; ++ case CEED_USE_POINTER: ++ impl->h_ind = (CeedInt *)indices; ++ break; ++ case CEED_COPY_VALUES: ++ CeedCallBackend(CeedMalloc(size, &impl->h_ind_allocated)); ++ 
memcpy(impl->h_ind_allocated, indices, size * sizeof(CeedInt)); + impl->h_ind = impl->h_ind_allocated; +- } +- break; +- } +- if (indices != NULL) { ++ break; ++ } + CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_ind, size * sizeof(CeedInt))); + impl->d_ind_allocated = impl->d_ind; // We own the device memory + CeedCallCuda(ceed, cudaMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), cudaMemcpyHostToDevice)); +- if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(r, indices)); +- } +- break; +- } +- case CEED_MEM_DEVICE: { +- switch (copy_mode) { +- case CEED_COPY_VALUES: +- if (indices != NULL) { ++ if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(rstr, indices)); ++ } break; ++ case CEED_MEM_DEVICE: { ++ switch (copy_mode) { ++ case CEED_COPY_VALUES: + CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_ind, size * sizeof(CeedInt))); + impl->d_ind_allocated = impl->d_ind; // We own the device memory + CeedCallCuda(ceed, cudaMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), cudaMemcpyDeviceToDevice)); ++ break; ++ case CEED_OWN_POINTER: ++ impl->d_ind = (CeedInt *)indices; ++ impl->d_ind_allocated = impl->d_ind; ++ break; ++ case CEED_USE_POINTER: ++ impl->d_ind = (CeedInt *)indices; ++ break; ++ } ++ CeedCallBackend(CeedMalloc(size, &impl->h_ind_allocated)); ++ CeedCallCuda(ceed, cudaMemcpy(impl->h_ind_allocated, impl->d_ind, size * sizeof(CeedInt), cudaMemcpyDeviceToHost)); ++ impl->h_ind = impl->h_ind_allocated; ++ if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(rstr, indices)); ++ } break; ++ } ++ ++ // Orientation data ++ if (rstr_type == CEED_RESTRICTION_ORIENTED) { ++ switch (mem_type) { ++ case CEED_MEM_HOST: { ++ switch (copy_mode) { ++ case CEED_OWN_POINTER: ++ impl->h_orients_allocated = (bool *)orients; ++ impl->h_orients = (bool *)orients; ++ break; ++ case CEED_USE_POINTER: ++ impl->h_orients = (bool *)orients; ++ break; ++ case CEED_COPY_VALUES: ++ CeedCallBackend(CeedMalloc(size, &impl->h_orients_allocated)); ++ memcpy(impl->h_orients_allocated, orients, size * sizeof(bool)); ++ impl->h_orients = impl->h_orients_allocated; ++ break; + } +- break; +- case CEED_OWN_POINTER: +- impl->d_ind = (CeedInt *)indices; +- impl->d_ind_allocated = impl->d_ind; +- break; +- case CEED_USE_POINTER: +- impl->d_ind = (CeedInt *)indices; ++ CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_orients, size * sizeof(bool))); ++ impl->d_orients_allocated = impl->d_orients; // We own the device memory ++ CeedCallCuda(ceed, cudaMemcpy(impl->d_orients, orients, size * sizeof(bool), cudaMemcpyHostToDevice)); ++ } break; ++ case CEED_MEM_DEVICE: { ++ switch (copy_mode) { ++ case CEED_COPY_VALUES: ++ CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_orients, size * sizeof(bool))); ++ impl->d_orients_allocated = impl->d_orients; // We own the device memory ++ CeedCallCuda(ceed, cudaMemcpy(impl->d_orients, orients, size * sizeof(bool), cudaMemcpyDeviceToDevice)); ++ break; ++ case CEED_OWN_POINTER: ++ impl->d_orients = (bool *)orients; ++ impl->d_orients_allocated = impl->d_orients; ++ break; ++ case CEED_USE_POINTER: ++ impl->d_orients = (bool *)orients; ++ break; ++ } ++ CeedCallBackend(CeedMalloc(size, &impl->h_orients_allocated)); ++ CeedCallCuda(ceed, cudaMemcpy(impl->h_orients_allocated, impl->d_orients, size * sizeof(bool), cudaMemcpyDeviceToHost)); ++ impl->h_orients = impl->h_orients_allocated; ++ } break; + } +- if (indices != NULL) { +- CeedCallBackend(CeedMalloc(elem_size * num_elem, &impl->h_ind_allocated)); +- CeedCallCuda(ceed, 
cudaMemcpy(impl->h_ind_allocated, impl->d_ind, elem_size * num_elem * sizeof(CeedInt), cudaMemcpyDeviceToHost)); +- impl->h_ind = impl->h_ind_allocated; +- if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Cuda(r, indices)); ++ } else if (rstr_type == CEED_RESTRICTION_CURL_ORIENTED) { ++ switch (mem_type) { ++ case CEED_MEM_HOST: { ++ switch (copy_mode) { ++ case CEED_OWN_POINTER: ++ impl->h_curl_orients_allocated = (CeedInt8 *)curl_orients; ++ impl->h_curl_orients = (CeedInt8 *)curl_orients; ++ break; ++ case CEED_USE_POINTER: ++ impl->h_curl_orients = (CeedInt8 *)curl_orients; ++ break; ++ case CEED_COPY_VALUES: ++ CeedCallBackend(CeedMalloc(3 * size, &impl->h_curl_orients_allocated)); ++ memcpy(impl->h_curl_orients_allocated, curl_orients, 3 * size * sizeof(CeedInt8)); ++ impl->h_curl_orients = impl->h_curl_orients_allocated; ++ break; ++ } ++ CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_curl_orients, 3 * size * sizeof(CeedInt8))); ++ impl->d_curl_orients_allocated = impl->d_curl_orients; // We own the device memory ++ CeedCallCuda(ceed, cudaMemcpy(impl->d_curl_orients, curl_orients, 3 * size * sizeof(CeedInt8), cudaMemcpyHostToDevice)); ++ } break; ++ case CEED_MEM_DEVICE: { ++ switch (copy_mode) { ++ case CEED_COPY_VALUES: ++ CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_curl_orients, 3 * size * sizeof(CeedInt8))); ++ impl->d_curl_orients_allocated = impl->d_curl_orients; // We own the device memory ++ CeedCallCuda(ceed, cudaMemcpy(impl->d_curl_orients, curl_orients, 3 * size * sizeof(CeedInt8), cudaMemcpyDeviceToDevice)); ++ break; ++ case CEED_OWN_POINTER: ++ impl->d_curl_orients = (CeedInt8 *)curl_orients; ++ impl->d_curl_orients_allocated = impl->d_curl_orients; ++ break; ++ case CEED_USE_POINTER: ++ impl->d_curl_orients = (CeedInt8 *)curl_orients; ++ break; ++ } ++ CeedCallBackend(CeedMalloc(3 * size, &impl->h_curl_orients_allocated)); ++ CeedCallCuda(ceed, cudaMemcpy(impl->h_curl_orients_allocated, impl->d_curl_orients, 3 * size * sizeof(CeedInt8), cudaMemcpyDeviceToHost)); ++ impl->h_curl_orients = impl->h_curl_orients_allocated; ++ } break; + } +- break; + } +- // LCOV_EXCL_START +- default: +- return CeedError(ceed, CEED_ERROR_BACKEND, "Only MemType = HOST or DEVICE supported"); +- // LCOV_EXCL_STOP + } + +- // Compile CUDA kernels (add atomicAdd function for old NVidia architectures) +- CeedInt num_nodes = impl->num_nodes; +- char *restriction_kernel_path, *restriction_kernel_source = NULL; +- ++ // Compile CUDA kernels + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-restriction.h", &restriction_kernel_path)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); +- if (!is_deterministic) { +- struct cudaDeviceProp prop; +- Ceed_Cuda *ceed_data; +- +- CeedCallBackend(CeedGetData(ceed, &ceed_data)); +- CeedCallBackend(cudaGetDeviceProperties(&prop, ceed_data->device_id)); +- if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) { +- char *atomic_add_path; +- +- CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-atomic-add-fallback.h", &atomic_add_path)); +- CeedCallBackend(CeedLoadSourceToBuffer(ceed, atomic_add_path, &restriction_kernel_source)); +- CeedCallBackend(CeedLoadSourceToInitializedBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); +- CeedCallBackend(CeedFree(&atomic_add_path)); +- } +- } +- if (!restriction_kernel_source) { +- CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); 
+- } ++ CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! -----\n"); + CeedCallBackend(CeedCompile_Cuda(ceed, restriction_kernel_source, &impl->module, 8, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, +- "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", num_nodes, "RSTR_COMP_STRIDE", comp_stride, "RSTR_STRIDE_NODES", ++ "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride, "RSTR_STRIDE_NODES", + strides[0], "RSTR_STRIDE_COMP", strides[1], "RSTR_STRIDE_ELEM", strides[2])); + CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedNoTranspose", &impl->StridedNoTranspose)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "StridedTranspose", &impl->StridedTranspose)); + CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetNoTranspose", &impl->OffsetNoTranspose)); ++ CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OrientedNoTranspose", &impl->OrientedNoTranspose)); ++ CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedNoTranspose", &impl->CurlOrientedNoTranspose)); ++ CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedUnsignedNoTranspose", &impl->CurlOrientedUnsignedNoTranspose)); + if (!is_deterministic) { + CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTranspose", &impl->OffsetTranspose)); ++ CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OrientedTranspose", &impl->OrientedTranspose)); ++ CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedTranspose", &impl->CurlOrientedTranspose)); ++ CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedUnsignedTranspose", &impl->CurlOrientedUnsignedTranspose)); + } else { + CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OffsetTransposeDet", &impl->OffsetTransposeDet)); ++ CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "OrientedTransposeDet", &impl->OrientedTransposeDet)); ++ CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedTransposeDet", &impl->CurlOrientedTransposeDet)); ++ CeedCallBackend(CeedGetKernel_Cuda(ceed, impl->module, "CurlOrientedUnsignedTransposeDet", &impl->CurlOrientedUnsignedTransposeDet)); + } + CeedCallBackend(CeedFree(&restriction_kernel_path)); + CeedCallBackend(CeedFree(&restriction_kernel_source)); + + // Register backend functions +- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "Apply", CeedElemRestrictionApply_Cuda)); +- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "ApplyUnsigned", CeedElemRestrictionApply_Cuda)); +- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "ApplyUnoriented", CeedElemRestrictionApply_Cuda)); +- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "GetOffsets", CeedElemRestrictionGetOffsets_Cuda)); +- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "Destroy", CeedElemRestrictionDestroy_Cuda)); ++ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Apply", CeedElemRestrictionApply_Cuda)); ++ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnsigned", CeedElemRestrictionApplyUnsigned_Cuda)); ++ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnoriented", CeedElemRestrictionApplyUnoriented_Cuda)); ++ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOffsets", 
CeedElemRestrictionGetOffsets_Cuda)); ++ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOrientations", CeedElemRestrictionGetOrientations_Cuda)); ++ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetCurlOrientations", CeedElemRestrictionGetCurlOrientations_Cuda)); ++ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Cuda)); + return CEED_ERROR_SUCCESS; + } + +diff --git a/backends/cuda-ref/ceed-cuda-ref.h b/backends/cuda-ref/ceed-cuda-ref.h +index 309c1056..93008817 100644 +--- a/backends/cuda-ref/ceed-cuda-ref.h ++++ b/backends/cuda-ref/ceed-cuda-ref.h +@@ -30,6 +30,15 @@ typedef struct { + CUfunction OffsetNoTranspose; + CUfunction OffsetTranspose; + CUfunction OffsetTransposeDet; ++ CUfunction OrientedNoTranspose; ++ CUfunction OrientedTranspose; ++ CUfunction OrientedTransposeDet; ++ CUfunction CurlOrientedNoTranspose; ++ CUfunction CurlOrientedTranspose; ++ CUfunction CurlOrientedTransposeDet; ++ CUfunction CurlOrientedUnsignedNoTranspose; ++ CUfunction CurlOrientedUnsignedTranspose; ++ CUfunction CurlOrientedUnsignedTransposeDet; + CeedInt num_nodes; + CeedInt *h_ind; + CeedInt *h_ind_allocated; +@@ -37,7 +46,15 @@ typedef struct { + CeedInt *d_ind_allocated; + CeedInt *d_t_offsets; + CeedInt *d_t_indices; + CeedInt *d_l_vec_indices; ++ bool *h_orients; ++ bool *h_orients_allocated; ++ bool *d_orients; ++ bool *d_orients_allocated; ++ CeedInt8 *h_curl_orients; ++ CeedInt8 *h_curl_orients_allocated; ++ CeedInt8 *d_curl_orients; ++ CeedInt8 *d_curl_orients_allocated; + } CeedElemRestriction_Cuda; + + typedef struct { +@@ -80,21 +97,19 @@ typedef struct { + + typedef struct { + CUmodule module; +- CUfunction linearDiagonal; +- CUfunction linearPointBlock; +- CeedBasis basis_in, basis_out; ++ CUfunction LinearDiagonal; ++ CUfunction LinearPointBlock; + CeedElemRestriction diag_rstr, point_block_diag_rstr; + CeedVector elem_diag, point_block_elem_diag; +- CeedInt num_e_mode_in, num_e_mode_out, num_nodes; +- CeedEvalMode *h_e_mode_in, *h_e_mode_out; +- CeedEvalMode *d_e_mode_in, *d_e_mode_out; +- CeedScalar *d_identity, *d_interp_in, *d_interp_out, *d_grad_in, *d_grad_out; ++ CeedEvalMode *d_eval_modes_in, *d_eval_modes_out; ++ CeedScalar *d_identity, *d_interp_in, *d_grad_in, *d_div_in, *d_curl_in; ++ CeedScalar *d_interp_out, *d_grad_out, *d_div_out, *d_curl_out; + } CeedOperatorDiag_Cuda; + + typedef struct { + CUmodule module; +- CUfunction linearAssemble; +- CeedInt num_elem, block_size_x, block_size_y, elem_per_block; ++ CUfunction LinearAssemble; ++ CeedInt block_size_x, block_size_y, elems_per_block; + CeedScalar *d_B_in, *d_B_out; + } CeedOperatorAssemble_Cuda; + +diff --git a/backends/hip-ref/ceed-hip-ref-operator.c b/backends/hip-ref/ceed-hip-ref-operator.c +index 486269bb..023b3d8a 100644 +--- a/backends/hip-ref/ceed-hip-ref-operator.c ++++ b/backends/hip-ref/ceed-hip-ref-operator.c +@@ -53,15 +53,18 @@ static int CeedOperatorDestroy_Hip(CeedOperator op) { + + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallHip(ceed, hipModuleUnload(impl->diag->module)); +- CeedCallBackend(CeedFree(&impl->diag->h_e_mode_in)); +- CeedCallBackend(CeedFree(&impl->diag->h_e_mode_out)); +- CeedCallHip(ceed, hipFree(impl->diag->d_e_mode_in)); +- CeedCallHip(ceed, hipFree(impl->diag->d_e_mode_out)); ++ CeedCallHip(ceed, hipFree(impl->diag->d_eval_modes_in)); ++ CeedCallHip(ceed, hipFree(impl->diag->d_eval_modes_out)); + CeedCallHip(ceed, 
hipFree(impl->diag->d_identity)); + CeedCallHip(ceed, hipFree(impl->diag->d_interp_in)); + CeedCallHip(ceed, hipFree(impl->diag->d_interp_out)); + CeedCallHip(ceed, hipFree(impl->diag->d_grad_in)); + CeedCallHip(ceed, hipFree(impl->diag->d_grad_out)); ++ CeedCallHip(ceed, hipFree(impl->diag->d_div_in)); ++ CeedCallHip(ceed, hipFree(impl->diag->d_div_out)); ++ CeedCallHip(ceed, hipFree(impl->diag->d_curl_in)); ++ CeedCallHip(ceed, hipFree(impl->diag->d_curl_out)); ++ CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->diag_rstr)); + CeedCallBackend(CeedElemRestrictionDestroy(&impl->diag->point_block_diag_rstr)); + CeedCallBackend(CeedVectorDestroy(&impl->diag->elem_diag)); + CeedCallBackend(CeedVectorDestroy(&impl->diag->point_block_elem_diag)); +@@ -102,30 +105,29 @@ static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool i + + // Loop over fields + for (CeedInt i = 0; i < num_fields; i++) { +- bool is_strided, skip_restriction; +- CeedSize q_size; +- CeedInt dim, size; +- CeedEvalMode e_mode; +- CeedVector vec; +- CeedElemRestriction elem_rstr; +- CeedBasis basis; ++ bool is_strided = false, skip_restriction = false; ++ CeedSize q_size; ++ CeedInt size; ++ CeedEvalMode eval_mode; ++ CeedBasis basis; + +- CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &e_mode)); +- is_strided = false; +- skip_restriction = false; +- if (e_mode != CEED_EVAL_WEIGHT) { +- CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr)); ++ CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); ++ if (eval_mode != CEED_EVAL_WEIGHT) { ++ CeedElemRestriction elem_rstr; + + // Check whether this field can skip the element restriction: +- // must be passive input, with e_mode NONE, and have a strided restriction with CEED_STRIDES_BACKEND. ++ // Must be passive input, with eval_mode NONE, and have a strided restriction with CEED_STRIDES_BACKEND. 
++ CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &elem_rstr)); + + // First, check whether the field is input or output: + if (is_input) { +- // Check for passive input: ++ CeedVector vec; ++ ++ // Check for passive input + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); + if (vec != CEED_VECTOR_ACTIVE) { +- // Check e_mode +- if (e_mode == CEED_EVAL_NONE) { ++ // Check eval_mode ++ if (eval_mode == CEED_EVAL_NONE) { + // Check for strided restriction + CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); + if (is_strided) { +@@ -143,21 +145,17 @@ static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool i + } + } + +- switch (e_mode) { ++ switch (eval_mode) { + case CEED_EVAL_NONE: + CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); + q_size = (CeedSize)num_elem * Q * size; + CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); + break; + case CEED_EVAL_INTERP: +- CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); +- q_size = (CeedSize)num_elem * Q * size; +- CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); +- break; + case CEED_EVAL_GRAD: +- CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); ++ case CEED_EVAL_DIV: ++ case CEED_EVAL_CURL: + CeedCallBackend(CeedQFunctionFieldGetSize(qf_fields[i], &size)); +- CeedCallBackend(CeedBasisGetDimension(basis, &dim)); + q_size = (CeedSize)num_elem * Q * size; + CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); + break; +@@ -167,10 +165,6 @@ static int CeedOperatorSetupFields_Hip(CeedQFunction qf, CeedOperator op, bool i + CeedCallBackend(CeedVectorCreate(ceed, q_size, &q_vecs[i])); + CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_WEIGHT, CEED_VECTOR_NONE, q_vecs[i])); + break; +- case CEED_EVAL_DIV: +- break; // TODO: Not implemented +- case CEED_EVAL_CURL: +- break; // TODO: Not implemented + } + } + return CEED_ERROR_SUCCESS; +@@ -201,17 +195,14 @@ static int CeedOperatorSetup_Hip(CeedOperator op) { + + // Allocate + CeedCallBackend(CeedCalloc(num_input_fields + num_output_fields, &impl->e_vecs)); +- + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_in)); + CeedCallBackend(CeedCalloc(CEED_FIELD_MAX, &impl->q_vecs_out)); +- + impl->num_inputs = num_input_fields; + impl->num_outputs = num_output_fields; + + // Set up infield and outfield e_vecs and q_vecs + // Infields + CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, true, impl->e_vecs, impl->q_vecs_in, 0, num_input_fields, Q, num_elem)); +- + // Outfields + CeedCallBackend(CeedOperatorSetupFields_Hip(qf, op, false, impl->e_vecs, impl->q_vecs_out, num_input_fields, num_output_fields, Q, num_elem)); + +@@ -226,7 +217,7 @@ static inline int CeedOperatorSetupInputs_Hip(CeedInt num_input_fields, CeedQFun + CeedVector in_vec, const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], + CeedOperator_Hip *impl, CeedRequest *request) { + for (CeedInt i = 0; i < num_input_fields; i++) { +- CeedEvalMode e_mode; ++ CeedEvalMode eval_mode; + CeedVector vec; + CeedElemRestriction elem_rstr; + +@@ -237,8 +228,8 @@ static inline int CeedOperatorSetupInputs_Hip(CeedInt num_input_fields, CeedQFun + else vec = in_vec; + } + +- CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &e_mode)); +- if (e_mode == CEED_EVAL_WEIGHT) { // Skip ++ CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); ++ if (eval_mode == CEED_EVAL_WEIGHT) { // Skip + } else { + // Get input vector + 
CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); +@@ -267,7 +258,7 @@ static inline int CeedOperatorInputBasis_Hip(CeedInt num_elem, CeedQFunctionFiel + CeedOperator_Hip *impl) { + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedInt elem_size, size; +- CeedEvalMode e_mode; ++ CeedEvalMode eval_mode; + CeedElemRestriction elem_rstr; + CeedBasis basis; + +@@ -278,30 +269,25 @@ static inline int CeedOperatorInputBasis_Hip(CeedInt num_elem, CeedQFunctionFiel + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + if (vec == CEED_VECTOR_ACTIVE) continue; + } +- // Get elem_size, e_mode, size ++ // Get elem_size, eval_mode, size + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); +- CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &e_mode)); ++ CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); + CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size)); + // Basis action +- switch (e_mode) { ++ switch (eval_mode) { + case CEED_EVAL_NONE: + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i])); + break; + case CEED_EVAL_INTERP: +- CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); +- CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_INTERP, impl->e_vecs[i], impl->q_vecs_in[i])); +- break; + case CEED_EVAL_GRAD: ++ case CEED_EVAL_DIV: ++ case CEED_EVAL_CURL: + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); +- CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, CEED_EVAL_GRAD, impl->e_vecs[i], impl->q_vecs_in[i])); ++ CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_NOTRANSPOSE, eval_mode, impl->e_vecs[i], impl->q_vecs_in[i])); + break; + case CEED_EVAL_WEIGHT: + break; // No action +- case CEED_EVAL_DIV: +- break; // TODO: Not implemented +- case CEED_EVAL_CURL: +- break; // TODO: Not implemented + } + } + return CEED_ERROR_SUCCESS; +@@ -313,15 +299,16 @@ static inline int CeedOperatorInputBasis_Hip(CeedInt num_elem, CeedQFunctionFiel + static inline int CeedOperatorRestoreInputs_Hip(CeedInt num_input_fields, CeedQFunctionField *qf_input_fields, CeedOperatorField *op_input_fields, + const bool skip_active, CeedScalar *e_data[2 * CEED_FIELD_MAX], CeedOperator_Hip *impl) { + for (CeedInt i = 0; i < num_input_fields; i++) { +- CeedEvalMode e_mode; ++ CeedEvalMode eval_mode; + CeedVector vec; ++ + // Skip active input + if (skip_active) { + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); + if (vec == CEED_VECTOR_ACTIVE) continue; + } +- CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &e_mode)); +- if (e_mode == CEED_EVAL_WEIGHT) { // Skip ++ CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); ++ if (eval_mode == CEED_EVAL_WEIGHT) { // Skip + } else { + if (!impl->e_vecs[i]) { // This was a skip_restriction case + CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec)); +@@ -363,10 +350,10 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect + + // Output pointers, as necessary + for (CeedInt i = 0; i < num_output_fields; i++) { +- CeedEvalMode e_mode; ++ CeedEvalMode eval_mode; + +- CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &e_mode)); +- if (e_mode == CEED_EVAL_NONE) { ++ 
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); ++ if (eval_mode == CEED_EVAL_NONE) { + // Set the output Q-Vector to use the E-Vector data directly. + CeedCallBackend(CeedVectorGetArrayWrite(impl->e_vecs[i + impl->num_inputs], CEED_MEM_DEVICE, &e_data[i + num_input_fields])); + CeedCallBackend(CeedVectorSetArray(impl->q_vecs_out[i], CEED_MEM_DEVICE, CEED_USE_POINTER, e_data[i + num_input_fields])); +@@ -378,26 +365,25 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect + + // Output basis apply if needed + for (CeedInt i = 0; i < num_output_fields; i++) { +- CeedEvalMode e_mode; ++ CeedEvalMode eval_mode; + CeedElemRestriction elem_rstr; + CeedBasis basis; + +- // Get elem_size, e_mode, size ++ // Get elem_size, eval_mode, size + CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); + CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); +- CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &e_mode)); ++ CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); + CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &size)); + // Basis action +- switch (e_mode) { ++ switch (eval_mode) { + case CEED_EVAL_NONE: +- break; ++ break; // No action + case CEED_EVAL_INTERP: +- CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); +- CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, CEED_EVAL_INTERP, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs])); +- break; + case CEED_EVAL_GRAD: ++ case CEED_EVAL_DIV: ++ case CEED_EVAL_CURL: + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); +- CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, CEED_EVAL_GRAD, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs])); ++ CeedCallBackend(CeedBasisApply(basis, num_elem, CEED_TRANSPOSE, eval_mode, impl->q_vecs_out[i], impl->e_vecs[i + impl->num_inputs])); + break; + // LCOV_EXCL_START + case CEED_EVAL_WEIGHT: { +@@ -405,25 +391,20 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect + + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); +- break; // Should not occur ++ // LCOV_EXCL_STOP + } +- case CEED_EVAL_DIV: +- break; // TODO: Not implemented +- case CEED_EVAL_CURL: +- break; // TODO: Not implemented +- // LCOV_EXCL_STOP + } + } + + // Output restriction + for (CeedInt i = 0; i < num_output_fields; i++) { +- CeedEvalMode e_mode; ++ CeedEvalMode eval_mode; + CeedVector vec; + CeedElemRestriction elem_rstr; + + // Restore evec +- CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &e_mode)); +- if (e_mode == CEED_EVAL_NONE) { ++ CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); ++ if (eval_mode == CEED_EVAL_NONE) { + CeedCallBackend(CeedVectorRestoreArray(impl->e_vecs[i + impl->num_inputs], &e_data[i + num_input_fields])); + } + // Get output vector +@@ -442,15 +423,14 @@ static int CeedOperatorApplyAdd_Hip(CeedOperator op, CeedVector in_vec, CeedVect + } + + //------------------------------------------------------------------------------ +-// Core code for assembling linear QFunction ++// Linear QFunction Assembly Core + //------------------------------------------------------------------------------ + static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, bool 
build_objects, CeedVector *assembled, CeedElemRestriction *rstr, + CeedRequest *request) { + Ceed ceed, ceed_parent; +- CeedSize q_size; + CeedInt num_active_in, num_active_out, Q, num_elem, num_input_fields, num_output_fields, size; + CeedScalar *assembled_array, *e_data[2 * CEED_FIELD_MAX] = {NULL}; +- CeedVector *active_in; ++ CeedVector *active_inputs; + CeedQFunctionField *qf_input_fields, *qf_output_fields; + CeedQFunction qf; + CeedOperatorField *op_input_fields, *op_output_fields; +@@ -459,14 +439,13 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallBackend(CeedOperatorGetFallbackParentCeed(op, &ceed_parent)); + CeedCallBackend(CeedOperatorGetData(op, &impl)); +- CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); + CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); ++ CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); +- active_in = impl->qf_active_in; +- num_active_in = impl->num_active_in; +- num_active_out = impl->num_active_out; ++ active_inputs = impl->qf_active_in; ++ num_active_in = impl->num_active_in, num_active_out = impl->num_active_out; + + // Setup + CeedCallBackend(CeedOperatorSetup_Hip(op)); +@@ -487,19 +466,20 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b + CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[i], &size)); + CeedCallBackend(CeedVectorSetValue(impl->q_vecs_in[i], 0.0)); + CeedCallBackend(CeedVectorGetArray(impl->q_vecs_in[i], CEED_MEM_DEVICE, &q_vec_array)); +- CeedCallBackend(CeedRealloc(num_active_in + size, &active_in)); ++ CeedCallBackend(CeedRealloc(num_active_in + size, &active_inputs)); + for (CeedInt field = 0; field < size; field++) { +- q_size = (CeedSize)Q * num_elem; +- CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_in[num_active_in + field])); ++ CeedSize q_size = (CeedSize)Q * num_elem; ++ ++ CeedCallBackend(CeedVectorCreate(ceed, q_size, &active_inputs[num_active_in + field])); + CeedCallBackend( +- CeedVectorSetArray(active_in[num_active_in + field], CEED_MEM_DEVICE, CEED_USE_POINTER, &q_vec_array[field * Q * num_elem])); ++ CeedVectorSetArray(active_inputs[num_active_in + field], CEED_MEM_DEVICE, CEED_USE_POINTER, &q_vec_array[field * Q * num_elem])); + } + num_active_in += size; + CeedCallBackend(CeedVectorRestoreArray(impl->q_vecs_in[i], &q_vec_array)); + } + } + impl->num_active_in = num_active_in; +- impl->qf_active_in = active_in; ++ impl->qf_active_in = active_inputs; + } + + // Count number of active output fields +@@ -523,10 +503,10 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b + + // Build objects if needed + if (build_objects) { +- // Create output restriction + CeedSize l_size = (CeedSize)num_elem * Q * num_active_in * num_active_out; + CeedInt strides[3] = {1, num_elem * Q, Q}; /* *NOPAD* */ + ++ // Create output restriction + CeedCallBackend(CeedElemRestrictionCreateStrided(ceed_parent, num_elem, Q, num_active_in * num_active_out, + num_active_in * num_active_out * num_elem * Q, strides, rstr)); + // Create assembled vector +@@ -541,9 +521,9 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b + // Assemble QFunction + for 
(CeedInt in = 0; in < num_active_in; in++) { + // Set Inputs +- CeedCallBackend(CeedVectorSetValue(active_in[in], 1.0)); ++ CeedCallBackend(CeedVectorSetValue(active_inputs[in], 1.0)); + if (num_active_in > 1) { +- CeedCallBackend(CeedVectorSetValue(active_in[(in + num_active_in - 1) % num_active_in], 0.0)); ++ CeedCallBackend(CeedVectorSetValue(active_inputs[(in + num_active_in - 1) % num_active_in], 0.0)); + } + // Set Outputs + for (CeedInt out = 0; out < num_output_fields; out++) { +@@ -562,7 +542,7 @@ static inline int CeedOperatorLinearAssembleQFunctionCore_Hip(CeedOperator op, b + CeedCallBackend(CeedQFunctionApply(qf, Q * num_elem, impl->q_vecs_in, impl->q_vecs_out)); + } + +- // Un-set output Qvecs to prevent accidental overwrite of Assembled ++ // Un-set output q_vecs to prevent accidental overwrite of Assembled + for (CeedInt out = 0; out < num_output_fields; out++) { + CeedVector vec; + +@@ -597,14 +577,14 @@ static int CeedOperatorLinearAssembleQFunctionUpdate_Hip(CeedOperator op, CeedVe + } + + //------------------------------------------------------------------------------ +-// Assemble diagonal setup ++// Assemble Diagonal Setup + //------------------------------------------------------------------------------ + static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op, CeedInt use_ceedsize_idx) { + Ceed ceed; + char *diagonal_kernel_path, *diagonal_kernel_source; +- CeedInt num_input_fields, num_output_fields, num_e_mode_in = 0, num_comp = 0, dim = 1, num_e_mode_out = 0; +- CeedEvalMode *e_mode_in = NULL, *e_mode_out = NULL; +- CeedElemRestriction rstr_in = NULL, rstr_out = NULL; ++ CeedInt num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0; ++ CeedInt num_comp, q_comp, num_nodes, num_qpts; ++ CeedEvalMode *eval_modes_in = NULL, *eval_modes_out = NULL; + CeedBasis basis_in = NULL, basis_out = NULL; + CeedQFunctionField *qf_fields; + CeedQFunction qf; +@@ -623,33 +603,20 @@ static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op, CeedInt + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); + if (vec == CEED_VECTOR_ACTIVE) { +- CeedEvalMode e_mode; +- CeedElemRestriction rstr; +- +- CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_in)); +- CeedCallBackend(CeedBasisGetNumComponents(basis_in, &num_comp)); +- CeedCallBackend(CeedBasisGetDimension(basis_in, &dim)); +- CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr)); +- CeedCheck(!rstr_in || rstr_in == rstr, ceed, CEED_ERROR_BACKEND, +- "Backend does not implement multi-field non-composite operator diagonal assembly"); +- rstr_in = rstr; +- CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &e_mode)); +- switch (e_mode) { +- case CEED_EVAL_NONE: +- case CEED_EVAL_INTERP: +- CeedCallBackend(CeedRealloc(num_e_mode_in + 1, &e_mode_in)); +- e_mode_in[num_e_mode_in] = e_mode; +- num_e_mode_in += 1; +- break; +- case CEED_EVAL_GRAD: +- CeedCallBackend(CeedRealloc(num_e_mode_in + dim, &e_mode_in)); +- for (CeedInt d = 0; d < dim; d++) e_mode_in[num_e_mode_in + d] = e_mode; +- num_e_mode_in += dim; +- break; +- case CEED_EVAL_WEIGHT: +- case CEED_EVAL_DIV: +- case CEED_EVAL_CURL: +- break; // Caught by QF Assembly ++ CeedBasis basis; ++ CeedEvalMode eval_mode; ++ ++ CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); ++ CeedCheck(!basis_in || basis_in == basis, ceed, CEED_ERROR_BACKEND, ++ "Backend does not implement operator diagonal assembly with multiple active bases"); ++ basis_in = basis; ++ 
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); ++ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_mode, &q_comp)); ++ if (eval_mode != CEED_EVAL_WEIGHT) { ++ // q_comp = 1 if CEED_EVAL_NONE, CEED_EVAL_WEIGHT caught by QF assembly ++ CeedCallBackend(CeedRealloc(num_eval_modes_in + q_comp, &eval_modes_in)); ++ for (CeedInt d = 0; d < q_comp; d++) eval_modes_in[num_eval_modes_in + d] = eval_mode; ++ num_eval_modes_in += q_comp; + } + } + } +@@ -662,31 +629,20 @@ static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op, CeedInt + + CeedCallBackend(CeedOperatorFieldGetVector(op_fields[i], &vec)); + if (vec == CEED_VECTOR_ACTIVE) { +- CeedEvalMode e_mode; +- CeedElemRestriction rstr; +- +- CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis_out)); +- CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_fields[i], &rstr)); +- CeedCheck(!rstr_out || rstr_out == rstr, ceed, CEED_ERROR_BACKEND, +- "Backend does not implement multi-field non-composite operator diagonal assembly"); +- rstr_out = rstr; +- CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &e_mode)); +- switch (e_mode) { +- case CEED_EVAL_NONE: +- case CEED_EVAL_INTERP: +- CeedCallBackend(CeedRealloc(num_e_mode_out + 1, &e_mode_out)); +- e_mode_out[num_e_mode_out] = e_mode; +- num_e_mode_out += 1; +- break; +- case CEED_EVAL_GRAD: +- CeedCallBackend(CeedRealloc(num_e_mode_out + dim, &e_mode_out)); +- for (CeedInt d = 0; d < dim; d++) e_mode_out[num_e_mode_out + d] = e_mode; +- num_e_mode_out += dim; +- break; +- case CEED_EVAL_WEIGHT: +- case CEED_EVAL_DIV: +- case CEED_EVAL_CURL: +- break; // Caught by QF Assembly ++ CeedBasis basis; ++ CeedEvalMode eval_mode; ++ ++ CeedCallBackend(CeedOperatorFieldGetBasis(op_fields[i], &basis)); ++ CeedCheck(!basis_out || basis_out == basis, ceed, CEED_ERROR_BACKEND, ++ "Backend does not implement operator diagonal assembly with multiple active bases"); ++ basis_out = basis; ++ CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); ++ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_mode, &q_comp)); ++ if (eval_mode != CEED_EVAL_WEIGHT) { ++ // q_comp = 1 if CEED_EVAL_NONE, CEED_EVAL_WEIGHT caught by QF assembly ++ CeedCallBackend(CeedRealloc(num_eval_modes_out + q_comp, &eval_modes_out)); ++ for (CeedInt d = 0; d < q_comp; d++) eval_modes_out[num_eval_modes_out + d] = eval_mode; ++ num_eval_modes_out += q_comp; + } + } + } +@@ -696,95 +652,147 @@ static inline int CeedOperatorAssembleDiagonalSetup_Hip(CeedOperator op, CeedInt + CeedCallBackend(CeedCalloc(1, &impl->diag)); + CeedOperatorDiag_Hip *diag = impl->diag; + +- diag->basis_in = basis_in; +- diag->basis_out = basis_out; +- diag->h_e_mode_in = e_mode_in; +- diag->h_e_mode_out = e_mode_out; +- diag->num_e_mode_in = num_e_mode_in; +- diag->num_e_mode_out = num_e_mode_out; +- + // Assemble kernel ++ CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_nodes)); ++ CeedCallBackend(CeedBasisGetNumComponents(basis_in, &num_comp)); ++ if (basis_in == CEED_BASIS_NONE) num_qpts = num_nodes; ++ else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts)); + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h", &diagonal_kernel_path)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Kernel Source -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, diagonal_kernel_path, &diagonal_kernel_source)); + CeedDebug256(ceed, 
CEED_DEBUG_COLOR_SUCCESS, "----- Loading Diagonal Assembly Source Complete! -----\n"); +- CeedInt num_modes, num_qpts; +- CeedCallBackend(CeedBasisGetNumNodes(basis_in, &num_modes)); +- CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts)); +- diag->num_modes = num_modes; +- CeedCallBackend(CeedCompile_Hip(ceed, diagonal_kernel_source, &diag->module, 6, "NUMEMODEIN", num_e_mode_in, "NUMEMODEOUT", num_e_mode_out, +- "NNODES", num_modes, "NQPTS", num_qpts, "NCOMP", num_comp, "CEEDSIZE", use_ceedsize_idx)); +- CeedCallBackend(CeedGetKernel_Hip(ceed, diag->module, "linearDiagonal", &diag->linearDiagonal)); +- CeedCallBackend(CeedGetKernel_Hip(ceed, diag->module, "linearPointBlockDiagonal", &diag->linearPointBlock)); ++ CeedCallHip(ceed, ++ CeedCompile_Hip(ceed, diagonal_kernel_source, &diag->module, 6, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT", ++ num_eval_modes_out, "NUM_COMP", num_comp, "NUM_NODES", num_nodes, "NUM_QPTS", num_qpts, "CEED_SIZE", use_ceedsize_idx)); ++ CeedCallHip(ceed, CeedGetKernel_Hip(ceed, diag->module, "LinearDiagonal", &diag->LinearDiagonal)); ++ CeedCallHip(ceed, CeedGetKernel_Hip(ceed, diag->module, "LinearPointBlockDiagonal", &diag->LinearPointBlock)); + CeedCallBackend(CeedFree(&diagonal_kernel_path)); + CeedCallBackend(CeedFree(&diagonal_kernel_source)); + + // Basis matrices +- const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); +- const CeedInt interp_bytes = q_bytes * num_modes; +- const CeedInt grad_bytes = q_bytes * num_modes * dim; +- const CeedInt e_mode_bytes = sizeof(CeedEvalMode); +- const CeedScalar *interp_in, *interp_out, *grad_in, *grad_out; ++ const CeedInt interp_bytes = num_nodes * num_qpts * sizeof(CeedScalar); ++ const CeedInt eval_modes_bytes = sizeof(CeedEvalMode); ++ bool has_eval_none = false; + + // CEED_EVAL_NONE +- CeedScalar *identity = NULL; +- bool is_eval_none = false; +- +- for (CeedInt i = 0; i < num_e_mode_in; i++) is_eval_none = is_eval_none || (e_mode_in[i] == CEED_EVAL_NONE); +- for (CeedInt i = 0; i < num_e_mode_out; i++) is_eval_none = is_eval_none || (e_mode_out[i] == CEED_EVAL_NONE); +- if (is_eval_none) { +- CeedCallBackend(CeedCalloc(num_qpts * num_modes, &identity)); +- for (CeedInt i = 0; i < (num_modes < num_qpts ? num_modes : num_qpts); i++) identity[i * num_modes + i] = 1.0; ++ for (CeedInt i = 0; i < num_eval_modes_in; i++) has_eval_none = has_eval_none || (eval_modes_in[i] == CEED_EVAL_NONE); ++ for (CeedInt i = 0; i < num_eval_modes_out; i++) has_eval_none = has_eval_none || (eval_modes_out[i] == CEED_EVAL_NONE); ++ if (has_eval_none) { ++ CeedScalar *identity = NULL; ++ ++ CeedCallBackend(CeedCalloc(num_nodes * num_qpts, &identity)); ++ for (CeedInt i = 0; i < (num_nodes < num_qpts ? num_nodes : num_qpts); i++) identity[i * num_nodes + i] = 1.0; + CeedCallHip(ceed, hipMalloc((void **)&diag->d_identity, interp_bytes)); + CeedCallHip(ceed, hipMemcpy(diag->d_identity, identity, interp_bytes, hipMemcpyHostToDevice)); ++ CeedCallBackend(CeedFree(&identity)); ++ } ++ ++ // CEED_EVAL_INTERP, CEED_EVAL_GRAD, CEED_EVAL_DIV, and CEED_EVAL_CURL ++ for (CeedInt in = 0; in < 2; in++) { ++ CeedFESpace fespace; ++ CeedBasis basis = in ? 
basis_in : basis_out; ++ ++ CeedCallBackend(CeedBasisGetFESpace(basis, &fespace)); ++ switch (fespace) { ++ case CEED_FE_SPACE_H1: { ++ CeedInt q_comp_interp, q_comp_grad; ++ const CeedScalar *interp, *grad; ++ CeedScalar *d_interp, *d_grad; ++ ++ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); ++ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad)); ++ ++ CeedCallBackend(CeedBasisGetInterp(basis, &interp)); ++ CeedCallHip(ceed, hipMalloc((void **)&d_interp, interp_bytes * q_comp_interp)); ++ CeedCallHip(ceed, hipMemcpy(d_interp, interp, interp_bytes * q_comp_interp, hipMemcpyHostToDevice)); ++ CeedCallBackend(CeedBasisGetGrad(basis, &grad)); ++ CeedCallHip(ceed, hipMalloc((void **)&d_grad, interp_bytes * q_comp_grad)); ++ CeedCallHip(ceed, hipMemcpy(d_grad, grad, interp_bytes * q_comp_grad, hipMemcpyHostToDevice)); ++ if (in) { ++ diag->d_interp_in = d_interp; ++ diag->d_grad_in = d_grad; ++ } else { ++ diag->d_interp_out = d_interp; ++ diag->d_grad_out = d_grad; ++ } ++ } break; ++ case CEED_FE_SPACE_HDIV: { ++ CeedInt q_comp_interp, q_comp_div; ++ const CeedScalar *interp, *div; ++ CeedScalar *d_interp, *d_div; ++ ++ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); ++ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div)); ++ ++ CeedCallBackend(CeedBasisGetInterp(basis, &interp)); ++ CeedCallHip(ceed, hipMalloc((void **)&d_interp, interp_bytes * q_comp_interp)); ++ CeedCallHip(ceed, hipMemcpy(d_interp, interp, interp_bytes * q_comp_interp, hipMemcpyHostToDevice)); ++ CeedCallBackend(CeedBasisGetDiv(basis, &div)); ++ CeedCallHip(ceed, hipMalloc((void **)&d_div, interp_bytes * q_comp_div)); ++ CeedCallHip(ceed, hipMemcpy(d_div, div, interp_bytes * q_comp_div, hipMemcpyHostToDevice)); ++ if (in) { ++ diag->d_interp_in = d_interp; ++ diag->d_div_in = d_div; ++ } else { ++ diag->d_interp_out = d_interp; ++ diag->d_div_out = d_div; ++ } ++ } break; ++ case CEED_FE_SPACE_HCURL: { ++ CeedInt q_comp_interp, q_comp_curl; ++ const CeedScalar *interp, *curl; ++ CeedScalar *d_interp, *d_curl; ++ ++ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); ++ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl)); ++ ++ CeedCallBackend(CeedBasisGetInterp(basis, &interp)); ++ CeedCallHip(ceed, hipMalloc((void **)&d_interp, interp_bytes * q_comp_interp)); ++ CeedCallHip(ceed, hipMemcpy(d_interp, interp, interp_bytes * q_comp_interp, hipMemcpyHostToDevice)); ++ CeedCallBackend(CeedBasisGetCurl(basis, &curl)); ++ CeedCallHip(ceed, hipMalloc((void **)&d_curl, interp_bytes * q_comp_curl)); ++ CeedCallHip(ceed, hipMemcpy(d_curl, curl, interp_bytes * q_comp_curl, hipMemcpyHostToDevice)); ++ if (in) { ++ diag->d_interp_in = d_interp; ++ diag->d_curl_in = d_curl; ++ } else { ++ diag->d_interp_out = d_interp; ++ diag->d_curl_out = d_curl; ++ } ++ } break; ++ } + } + +- // CEED_EVAL_INTERP +- CeedCallBackend(CeedBasisGetInterp(basis_in, &interp_in)); +- CeedCallHip(ceed, hipMalloc((void **)&diag->d_interp_in, interp_bytes)); +- CeedCallHip(ceed, hipMemcpy(diag->d_interp_in, interp_in, interp_bytes, hipMemcpyHostToDevice)); +- CeedCallBackend(CeedBasisGetInterp(basis_out, &interp_out)); +- CeedCallHip(ceed, hipMalloc((void **)&diag->d_interp_out, interp_bytes)); +- CeedCallHip(ceed, hipMemcpy(diag->d_interp_out, interp_out, interp_bytes, hipMemcpyHostToDevice)); +- +- // 
CEED_EVAL_GRAD +- CeedCallBackend(CeedBasisGetGrad(basis_in, &grad_in)); +- CeedCallHip(ceed, hipMalloc((void **)&diag->d_grad_in, grad_bytes)); +- CeedCallHip(ceed, hipMemcpy(diag->d_grad_in, grad_in, grad_bytes, hipMemcpyHostToDevice)); +- CeedCallBackend(CeedBasisGetGrad(basis_out, &grad_out)); +- CeedCallHip(ceed, hipMalloc((void **)&diag->d_grad_out, grad_bytes)); +- CeedCallHip(ceed, hipMemcpy(diag->d_grad_out, grad_out, grad_bytes, hipMemcpyHostToDevice)); +- +- // Arrays of e_modes +- CeedCallHip(ceed, hipMalloc((void **)&diag->d_e_mode_in, num_e_mode_in * e_mode_bytes)); +- CeedCallHip(ceed, hipMemcpy(diag->d_e_mode_in, e_mode_in, num_e_mode_in * e_mode_bytes, hipMemcpyHostToDevice)); +- CeedCallHip(ceed, hipMalloc((void **)&diag->d_e_mode_out, num_e_mode_out * e_mode_bytes)); +- CeedCallHip(ceed, hipMemcpy(diag->d_e_mode_out, e_mode_out, num_e_mode_out * e_mode_bytes, hipMemcpyHostToDevice)); +- +- // Restriction +- diag->diag_rstr = rstr_out; ++ // Arrays of eval_modes ++ CeedCallHip(ceed, hipMalloc((void **)&diag->d_eval_modes_in, num_eval_modes_in * eval_modes_bytes)); ++ CeedCallHip(ceed, hipMemcpy(diag->d_eval_modes_in, eval_modes_in, num_eval_modes_in * eval_modes_bytes, hipMemcpyHostToDevice)); ++ CeedCallHip(ceed, hipMalloc((void **)&diag->d_eval_modes_out, num_eval_modes_out * eval_modes_bytes)); ++ CeedCallHip(ceed, hipMemcpy(diag->d_eval_modes_out, eval_modes_out, num_eval_modes_out * eval_modes_bytes, hipMemcpyHostToDevice)); ++ CeedCallBackend(CeedFree(&eval_modes_in)); ++ CeedCallBackend(CeedFree(&eval_modes_out)); + return CEED_ERROR_SUCCESS; + } + + //------------------------------------------------------------------------------ +-// Assemble diagonal common code ++// Assemble Diagonal Core + //------------------------------------------------------------------------------ + static inline int CeedOperatorAssembleDiagonalCore_Hip(CeedOperator op, CeedVector assembled, CeedRequest *request, const bool is_point_block) { + Ceed ceed; +- CeedSize assembled_length = 0, assembled_qf_length = 0; +- CeedInt use_ceedsize_idx = 0, num_elem; ++ CeedSize assembled_length, assembled_qf_length; ++ CeedInt use_ceedsize_idx = 0, num_elem, num_nodes; + CeedScalar *elem_diag_array; + const CeedScalar *assembled_qf_array; +- CeedVector assembled_qf = NULL; +- CeedElemRestriction rstr = NULL; ++ CeedVector assembled_qf = NULL, elem_diag; ++ CeedElemRestriction assembled_rstr = NULL, rstr_in, rstr_out, diag_rstr; + CeedOperator_Hip *impl; + + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallBackend(CeedOperatorGetData(op, &impl)); + + // Assemble QFunction +- CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &rstr, request)); +- CeedCallBackend(CeedElemRestrictionDestroy(&rstr)); ++ CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &assembled_rstr, request)); ++ CeedCallBackend(CeedElemRestrictionDestroy(&assembled_rstr)); ++ CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &assembled_qf_array)); + + CeedCallBackend(CeedVectorGetLength(assembled, &assembled_length)); + CeedCallBackend(CeedVectorGetLength(assembled_qf, &assembled_qf_length)); +@@ -796,37 +804,37 @@ static inline int CeedOperatorAssembleDiagonalCore_Hip(CeedOperator op, CeedVect + + assert(diag != NULL); + +- // Restriction +- if (is_point_block && !diag->point_block_diag_rstr) { +- CeedCallBackend(CeedOperatorCreateActivePointBlockRestriction(diag->diag_rstr, &diag->point_block_diag_rstr)); +- } +- CeedElemRestriction 
diag_rstr = is_point_block ? diag->point_block_diag_rstr : diag->diag_rstr; +- +- // Create diagonal vector +- CeedVector elem_diag = is_point_block ? diag->point_block_elem_diag : diag->elem_diag; +- +- if (!elem_diag) { +- CeedCallBackend(CeedElemRestrictionCreateVector(diag_rstr, NULL, &elem_diag)); +- if (is_point_block) diag->point_block_elem_diag = elem_diag; +- else diag->elem_diag = elem_diag; ++ // Restriction and diagonal vector ++ CeedCallBackend(CeedOperatorGetActiveElemRestrictions(op, &rstr_in, &rstr_out)); ++ CeedCheck(rstr_in == rstr_out, ceed, CEED_ERROR_BACKEND, ++ "Cannot assemble operator diagonal with different input and output active element restrictions"); ++ if (!is_point_block && !diag->diag_rstr) { ++ CeedCallBackend(CeedElemRestrictionCreateUnsignedCopy(rstr_out, &diag->diag_rstr)); ++ CeedCallBackend(CeedElemRestrictionCreateVector(diag->diag_rstr, NULL, &diag->elem_diag)); ++ } else if (is_point_block && !diag->point_block_diag_rstr) { ++ CeedCallBackend(CeedOperatorCreateActivePointBlockRestriction(rstr_out, &diag->point_block_diag_rstr)); ++ CeedCallBackend(CeedElemRestrictionCreateVector(diag->point_block_diag_rstr, NULL, &diag->point_block_elem_diag)); + } ++ diag_rstr = is_point_block ? diag->point_block_diag_rstr : diag->diag_rstr; ++ elem_diag = is_point_block ? diag->point_block_elem_diag : diag->elem_diag; + CeedCallBackend(CeedVectorSetValue(elem_diag, 0.0)); + + // Assemble element operator diagonals + CeedCallBackend(CeedVectorGetArray(elem_diag, CEED_MEM_DEVICE, &elem_diag_array)); +- CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &assembled_qf_array)); + CeedCallBackend(CeedElemRestrictionGetNumElements(diag_rstr, &num_elem)); ++ CeedCallBackend(CeedElemRestrictionGetElementSize(diag_rstr, &num_nodes)); + + // Compute the diagonal of B^T D B +- int elem_per_block = 1; +- int grid = num_elem / elem_per_block + ((num_elem / elem_per_block * elem_per_block < num_elem) ? 
1 : 0); +- void *args[] = {(void *)&num_elem, &diag->d_identity, &diag->d_interp_in, &diag->d_grad_in, &diag->d_interp_out, +- &diag->d_grad_out, &diag->d_e_mode_in, &diag->d_e_mode_out, &assembled_qf_array, &elem_diag_array}; ++ CeedInt elems_per_block = 1; ++ CeedInt grid = CeedDivUpInt(num_elem, elems_per_block); ++ void *args[] = {(void *)&num_elem, &diag->d_identity, &diag->d_interp_in, &diag->d_grad_in, &diag->d_div_in, ++ &diag->d_curl_in, &diag->d_interp_out, &diag->d_grad_out, &diag->d_div_out, &diag->d_curl_out, ++ &diag->d_eval_modes_in, &diag->d_eval_modes_out, &assembled_qf_array, &elem_diag_array}; + + if (is_point_block) { +- CeedCallBackend(CeedRunKernelDim_Hip(ceed, diag->linearPointBlock, grid, diag->num_modes, 1, elem_per_block, args)); ++ CeedCallBackend(CeedRunKernelDim_Hip(ceed, diag->LinearPointBlock, grid, num_nodes, 1, elems_per_block, args)); + } else { +- CeedCallBackend(CeedRunKernelDim_Hip(ceed, diag->linearDiagonal, grid, diag->num_modes, 1, elem_per_block, args)); ++ CeedCallBackend(CeedRunKernelDim_Hip(ceed, diag->LinearDiagonal, grid, num_nodes, 1, elems_per_block, args)); + } + + // Restore arrays +@@ -858,13 +866,14 @@ static int CeedOperatorLinearAssembleAddPointBlockDiagonal_Hip(CeedOperator op, + } + + //------------------------------------------------------------------------------ +-// Single operator assembly setup ++// Single Operator Assembly Setup + //------------------------------------------------------------------------------ + static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceedsize_idx) { +- Ceed ceed; +- CeedInt num_input_fields, num_output_fields, num_e_mode_in = 0, dim = 1, num_B_in_mats_to_load = 0, size_B_in = 0, num_qpts = 0, elem_size = 0, +- num_e_mode_out = 0, num_B_out_mats_to_load = 0, size_B_out = 0, num_elem, num_comp; +- CeedEvalMode *eval_mode_in = NULL, *eval_mode_out = NULL; ++ Ceed ceed; ++ char *assembly_kernel_path, *assembly_kernel_source; ++ CeedInt num_input_fields, num_output_fields, num_eval_modes_in = 0, num_eval_modes_out = 0; ++ CeedInt elem_size_in, num_qpts_in, num_comp_in, elem_size_out, num_qpts_out, num_comp_out, q_comp; ++ CeedEvalMode *eval_modes_in = NULL, *eval_modes_out = NULL; + CeedElemRestriction rstr_in = NULL, rstr_out = NULL; + CeedBasis basis_in = NULL, basis_out = NULL; + CeedQFunctionField *qf_fields; +@@ -881,34 +890,30 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed + // Determine active input basis eval mode + CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); + CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_fields, NULL, NULL)); +- // Note that the kernel will treat each dimension of a gradient action separately; +- // i.e., when an active input has a CEED_EVAL_GRAD mode, num_e_mode_in will increment by dim. +- // However, for the purposes of loading the B matrices, it will be treated as one mode, and we will load/copy the entire gradient matrix at once, so +- // num_B_in_mats_to_load will be incremented by 1. 
+ for (CeedInt i = 0; i < num_input_fields; i++) { + CeedVector vec; + + CeedCallBackend(CeedOperatorFieldGetVector(input_fields[i], &vec)); + if (vec == CEED_VECTOR_ACTIVE) { ++ CeedBasis basis; + CeedEvalMode eval_mode; + +- CeedCallBackend(CeedOperatorFieldGetBasis(input_fields[i], &basis_in)); +- CeedCallBackend(CeedBasisGetDimension(basis_in, &dim)); +- CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts)); ++ CeedCallBackend(CeedOperatorFieldGetBasis(input_fields[i], &basis)); ++ CeedCheck(!basis_in || basis_in == basis, ceed, CEED_ERROR_BACKEND, "Backend does not implement operator assembly with multiple active bases"); ++ basis_in = basis; + CeedCallBackend(CeedOperatorFieldGetElemRestriction(input_fields[i], &rstr_in)); +- CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size)); ++ CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size_in)); ++ if (basis_in == CEED_BASIS_NONE) num_qpts_in = elem_size_in; ++ else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_in, &num_qpts_in)); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); +- if (eval_mode != CEED_EVAL_NONE) { +- CeedCallBackend(CeedRealloc(num_B_in_mats_to_load + 1, &eval_mode_in)); +- eval_mode_in[num_B_in_mats_to_load] = eval_mode; +- num_B_in_mats_to_load += 1; +- if (eval_mode == CEED_EVAL_GRAD) { +- num_e_mode_in += dim; +- size_B_in += dim * elem_size * num_qpts; +- } else { +- num_e_mode_in += 1; +- size_B_in += elem_size * num_qpts; ++ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_mode, &q_comp)); ++ if (eval_mode != CEED_EVAL_WEIGHT) { ++ // q_comp = 1 if CEED_EVAL_NONE, CEED_EVAL_WEIGHT caught by QF Assembly ++ CeedCallBackend(CeedRealloc(num_eval_modes_in + q_comp, &eval_modes_in)); ++ for (CeedInt d = 0; d < q_comp; d++) { ++ eval_modes_in[num_eval_modes_in + d] = eval_mode; + } ++ num_eval_modes_in += q_comp; + } + } + } +@@ -920,106 +925,133 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed + + CeedCallBackend(CeedOperatorFieldGetVector(output_fields[i], &vec)); + if (vec == CEED_VECTOR_ACTIVE) { ++ CeedBasis basis; + CeedEvalMode eval_mode; + +- CeedCallBackend(CeedOperatorFieldGetBasis(output_fields[i], &basis_out)); ++ CeedCallBackend(CeedOperatorFieldGetBasis(output_fields[i], &basis)); ++ CeedCheck(!basis_out || basis_out == basis, ceed, CEED_ERROR_BACKEND, ++ "Backend does not implement operator assembly with multiple active bases"); ++ basis_out = basis; + CeedCallBackend(CeedOperatorFieldGetElemRestriction(output_fields[i], &rstr_out)); +- CeedCheck(!rstr_out || rstr_out == rstr_in, ceed, CEED_ERROR_BACKEND, "Backend does not implement multi-field non-composite operator assembly"); ++ CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_out, &elem_size_out)); ++ if (basis_out == CEED_BASIS_NONE) num_qpts_out = elem_size_out; ++ else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis_out, &num_qpts_out)); ++ CeedCheck(num_qpts_in == num_qpts_out, ceed, CEED_ERROR_UNSUPPORTED, ++ "Active input and output bases must have the same number of quadrature points"); + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_fields[i], &eval_mode)); +- if (eval_mode != CEED_EVAL_NONE) { +- CeedCallBackend(CeedRealloc(num_B_out_mats_to_load + 1, &eval_mode_out)); +- eval_mode_out[num_B_out_mats_to_load] = eval_mode; +- num_B_out_mats_to_load += 1; +- if (eval_mode == CEED_EVAL_GRAD) { +- num_e_mode_out += dim; +- size_B_out += dim * elem_size * num_qpts; +- } else { +- num_e_mode_out 
+= 1; +- size_B_out += elem_size * num_qpts; ++ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_mode, &q_comp)); ++ if (eval_mode != CEED_EVAL_WEIGHT) { ++ // q_comp = 1 if CEED_EVAL_NONE, CEED_EVAL_WEIGHT caught by QF Assembly ++ CeedCallBackend(CeedRealloc(num_eval_modes_out + q_comp, &eval_modes_out)); ++ for (CeedInt d = 0; d < q_comp; d++) { ++ eval_modes_out[num_eval_modes_out + d] = eval_mode; + } ++ num_eval_modes_out += q_comp; + } + } + } +- +- CeedCheck(num_e_mode_in > 0 && num_e_mode_out > 0, ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs"); +- +- CeedCallBackend(CeedElemRestrictionGetNumElements(rstr_in, &num_elem)); +- CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_in, &num_comp)); ++ CeedCheck(num_eval_modes_in > 0 && num_eval_modes_out > 0, ceed, CEED_ERROR_UNSUPPORTED, "Cannot assemble operator without inputs/outputs"); + + CeedCallBackend(CeedCalloc(1, &impl->asmb)); + CeedOperatorAssemble_Hip *asmb = impl->asmb; +- asmb->num_elem = num_elem; ++ asmb->elems_per_block = 1; ++ asmb->block_size_x = elem_size_in; ++ asmb->block_size_y = elem_size_out; ++ ++ bool fallback = asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block > 1024; ++ ++ if (fallback) { ++ // Use fallback kernel with 1D threadblock ++ asmb->block_size_y = 1; ++ } + + // Compile kernels +- int elem_per_block = 1; +- asmb->elem_per_block = elem_per_block; +- CeedInt block_size = elem_size * elem_size * elem_per_block; +- char *assembly_kernel_path, *assembly_kernel_source; ++ CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_in, &num_comp_in)); ++ CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_out, &num_comp_out)); + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-operator-assemble.h", &assembly_kernel_path)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Kernel Source -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, assembly_kernel_path, &assembly_kernel_source)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Assembly Source Complete! -----\n"); +- bool fallback = block_size > 1024; +- if (fallback) { // Use fallback kernel with 1D threadblock +- block_size = elem_size * elem_per_block; +- asmb->block_size_x = elem_size; +- asmb->block_size_y = 1; +- } else { // Use kernel with 2D threadblock +- asmb->block_size_x = elem_size; +- asmb->block_size_y = elem_size; +- } +- CeedCallBackend(CeedCompile_Hip(ceed, assembly_kernel_source, &asmb->module, 8, "NELEM", num_elem, "NUMEMODEIN", num_e_mode_in, "NUMEMODEOUT", +- num_e_mode_out, "NQPTS", num_qpts, "NNODES", elem_size, "BLOCK_SIZE", block_size, "NCOMP", num_comp, "CEEDSIZE", ++ CeedCallBackend(CeedCompile_Hip(ceed, assembly_kernel_source, &asmb->module, 10, "NUM_EVAL_MODES_IN", num_eval_modes_in, "NUM_EVAL_MODES_OUT", ++ num_eval_modes_out, "NUM_COMP_IN", num_comp_in, "NUM_COMP_OUT", num_comp_out, "NUM_NODES_IN", elem_size_in, ++ "NUM_NODES_OUT", elem_size_out, "NUM_QPTS", num_qpts_in, "BLOCK_SIZE", ++ asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block, "BLOCK_SIZE_Y", asmb->block_size_y, "CEED_SIZE", + use_ceedsize_idx)); +- CeedCallBackend(CeedGetKernel_Hip(ceed, asmb->module, fallback ? 
"linearAssembleFallback" : "linearAssemble", &asmb->linearAssemble)); ++ CeedCallBackend(CeedGetKernel_Hip(ceed, asmb->module, "LinearAssemble", &asmb->LinearAssemble)); + CeedCallBackend(CeedFree(&assembly_kernel_path)); + CeedCallBackend(CeedFree(&assembly_kernel_source)); + +- // Build 'full' B matrices (not 1D arrays used for tensor-product matrices) +- const CeedScalar *interp_in, *grad_in; +- CeedCallBackend(CeedBasisGetInterp(basis_in, &interp_in)); +- CeedCallBackend(CeedBasisGetGrad(basis_in, &grad_in)); +- +- // Load into B_in, in order that they will be used in eval_mode +- const CeedInt in_bytes = size_B_in * sizeof(CeedScalar); +- CeedInt mat_start = 0; +- +- CeedCallHip(ceed, hipMalloc((void **)&asmb->d_B_in, in_bytes)); +- for (int i = 0; i < num_B_in_mats_to_load; i++) { +- CeedEvalMode eval_mode = eval_mode_in[i]; +- if (eval_mode == CEED_EVAL_INTERP) { +- CeedCallHip(ceed, hipMemcpy(&asmb->d_B_in[mat_start], interp_in, elem_size * num_qpts * sizeof(CeedScalar), hipMemcpyHostToDevice)); +- mat_start += elem_size * num_qpts; +- } else if (eval_mode == CEED_EVAL_GRAD) { +- CeedCallHip(ceed, hipMemcpy(&asmb->d_B_in[mat_start], grad_in, dim * elem_size * num_qpts * sizeof(CeedScalar), hipMemcpyHostToDevice)); +- mat_start += dim * elem_size * num_qpts; ++ // Load into B_in, in order that they will be used in eval_modes_in ++ { ++ const CeedInt in_bytes = elem_size_in * num_qpts_in * num_eval_modes_in * sizeof(CeedScalar); ++ CeedInt d_in = 0; ++ CeedEvalMode eval_modes_in_prev = CEED_EVAL_NONE; ++ bool has_eval_none = false; ++ CeedScalar *identity = NULL; ++ ++ for (CeedInt i = 0; i < num_eval_modes_in; i++) { ++ has_eval_none = has_eval_none || (eval_modes_in[i] == CEED_EVAL_NONE); ++ } ++ if (has_eval_none) { ++ CeedCallBackend(CeedCalloc(elem_size_in * num_qpts_in, &identity)); ++ for (CeedInt i = 0; i < (elem_size_in < num_qpts_in ? 
elem_size_in : num_qpts_in); i++) identity[i * elem_size_in + i] = 1.0; + } +- } + +- const CeedScalar *interp_out, *grad_out; ++ CeedCallHip(ceed, hipMalloc((void **)&asmb->d_B_in, in_bytes)); ++ for (CeedInt i = 0; i < num_eval_modes_in; i++) { ++ const CeedScalar *h_B_in; + +- // Note that this function currently assumes 1 basis, so this should always be true for now +- if (basis_out == basis_in) { +- interp_out = interp_in; +- grad_out = grad_in; +- } else { +- CeedCallBackend(CeedBasisGetInterp(basis_out, &interp_out)); +- CeedCallBackend(CeedBasisGetGrad(basis_out, &grad_out)); ++ CeedCallBackend(CeedOperatorGetBasisPointer(basis_in, eval_modes_in[i], identity, &h_B_in)); ++ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_in, eval_modes_in[i], &q_comp)); ++ if (q_comp > 1) { ++ if (i == 0 || eval_modes_in[i] != eval_modes_in_prev) d_in = 0; ++ else h_B_in = &h_B_in[(++d_in) * elem_size_in * num_qpts_in]; ++ } ++ eval_modes_in_prev = eval_modes_in[i]; ++ ++ CeedCallHip(ceed, hipMemcpy(&asmb->d_B_in[i * elem_size_in * num_qpts_in], h_B_in, elem_size_in * num_qpts_in * sizeof(CeedScalar), ++ hipMemcpyHostToDevice)); ++ } ++ ++ if (identity) { ++ CeedCallBackend(CeedFree(&identity)); ++ } + } + +- // Load into B_out, in order that they will be used in eval_mode +- const CeedInt out_bytes = size_B_out * sizeof(CeedScalar); +- +- mat_start = 0; +- CeedCallHip(ceed, hipMalloc((void **)&asmb->d_B_out, out_bytes)); +- for (int i = 0; i < num_B_out_mats_to_load; i++) { +- CeedEvalMode eval_mode = eval_mode_out[i]; +- if (eval_mode == CEED_EVAL_INTERP) { +- CeedCallHip(ceed, hipMemcpy(&asmb->d_B_out[mat_start], interp_out, elem_size * num_qpts * sizeof(CeedScalar), hipMemcpyHostToDevice)); +- mat_start += elem_size * num_qpts; +- } else if (eval_mode == CEED_EVAL_GRAD) { +- CeedCallHip(ceed, hipMemcpy(&asmb->d_B_out[mat_start], grad_out, dim * elem_size * num_qpts * sizeof(CeedScalar), hipMemcpyHostToDevice)); +- mat_start += dim * elem_size * num_qpts; ++ // Load into B_out, in order that they will be used in eval_modes_out ++ { ++ const CeedInt out_bytes = elem_size_out * num_qpts_out * num_eval_modes_out * sizeof(CeedScalar); ++ CeedInt d_out = 0; ++ CeedEvalMode eval_modes_out_prev = CEED_EVAL_NONE; ++ bool has_eval_none = false; ++ CeedScalar *identity = NULL; ++ ++ for (CeedInt i = 0; i < num_eval_modes_out; i++) { ++ has_eval_none = has_eval_none || (eval_modes_out[i] == CEED_EVAL_NONE); ++ } ++ if (has_eval_none) { ++ CeedCallBackend(CeedCalloc(elem_size_out * num_qpts_out, &identity)); ++ for (CeedInt i = 0; i < (elem_size_out < num_qpts_out ? 
elem_size_out : num_qpts_out); i++) identity[i * elem_size_out + i] = 1.0; ++ } ++ ++ CeedCallHip(ceed, hipMalloc((void **)&asmb->d_B_out, out_bytes)); ++ for (CeedInt i = 0; i < num_eval_modes_out; i++) { ++ const CeedScalar *h_B_out; ++ ++ CeedCallBackend(CeedOperatorGetBasisPointer(basis_out, eval_modes_out[i], identity, &h_B_out)); ++ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis_out, eval_modes_out[i], &q_comp)); ++ if (q_comp > 1) { ++ if (i == 0 || eval_modes_out[i] != eval_modes_out_prev) d_out = 0; ++ else h_B_out = &h_B_out[(++d_out) * elem_size_out * num_qpts_out]; ++ } ++ eval_modes_out_prev = eval_modes_out[i]; ++ ++ CeedCallHip(ceed, hipMemcpy(&asmb->d_B_out[i * elem_size_out * num_qpts_out], h_B_out, elem_size_out * num_qpts_out * sizeof(CeedScalar), ++ hipMemcpyHostToDevice)); ++ } ++ ++ if (identity) { ++ CeedCallBackend(CeedFree(&identity)); + } + } + return CEED_ERROR_SUCCESS; +@@ -1036,47 +1068,96 @@ static int CeedSingleOperatorAssembleSetup_Hip(CeedOperator op, CeedInt use_ceed + static int CeedSingleOperatorAssemble_Hip(CeedOperator op, CeedInt offset, CeedVector values) { + Ceed ceed; + CeedSize values_length = 0, assembled_qf_length = 0; +- CeedInt use_ceedsize_idx = 0; ++ CeedInt use_ceedsize_idx = 0, num_elem_in, num_elem_out, elem_size_in, elem_size_out; + CeedScalar *values_array; +- const CeedScalar *qf_array; +- CeedVector assembled_qf = NULL; +- CeedElemRestriction rstr_q = NULL; ++ const CeedScalar *assembled_qf_array; ++ CeedVector assembled_qf = NULL; ++ CeedElemRestriction assembled_rstr = NULL, rstr_in, rstr_out; ++ CeedRestrictionType rstr_type_in, rstr_type_out; ++ const bool *orients_in = NULL, *orients_out = NULL; ++ const CeedInt8 *curl_orients_in = NULL, *curl_orients_out = NULL; + CeedOperator_Hip *impl; + + CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); + CeedCallBackend(CeedOperatorGetData(op, &impl)); + + // Assemble QFunction +- CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &rstr_q, CEED_REQUEST_IMMEDIATE)); +- CeedCallBackend(CeedElemRestrictionDestroy(&rstr_q)); +- CeedCallBackend(CeedVectorGetArray(values, CEED_MEM_DEVICE, &values_array)); +- values_array += offset; +- CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &qf_array)); ++ CeedCallBackend(CeedOperatorLinearAssembleQFunctionBuildOrUpdate(op, &assembled_qf, &assembled_rstr, CEED_REQUEST_IMMEDIATE)); ++ CeedCallBackend(CeedElemRestrictionDestroy(&assembled_rstr)); ++ CeedCallBackend(CeedVectorGetArrayRead(assembled_qf, CEED_MEM_DEVICE, &assembled_qf_array)); + + CeedCallBackend(CeedVectorGetLength(values, &values_length)); + CeedCallBackend(CeedVectorGetLength(assembled_qf, &assembled_qf_length)); + if ((values_length > INT_MAX) || (assembled_qf_length > INT_MAX)) use_ceedsize_idx = 1; ++ + // Setup +- if (!impl->asmb) { +- CeedCallBackend(CeedSingleOperatorAssembleSetup_Hip(op, use_ceedsize_idx)); +- assert(impl->asmb != NULL); ++ if (!impl->asmb) CeedCallBackend(CeedSingleOperatorAssembleSetup_Hip(op, use_ceedsize_idx)); ++ CeedOperatorAssemble_Hip *asmb = impl->asmb; ++ ++ assert(asmb != NULL); ++ ++ // Assemble element operator ++ CeedCallBackend(CeedVectorGetArray(values, CEED_MEM_DEVICE, &values_array)); ++ values_array += offset; ++ ++ CeedCallBackend(CeedOperatorGetActiveElemRestrictions(op, &rstr_in, &rstr_out)); ++ CeedCallBackend(CeedElemRestrictionGetNumElements(rstr_in, &num_elem_in)); ++ CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_in, &elem_size_in)); ++ ++ 
CeedCallBackend(CeedElemRestrictionGetType(rstr_in, &rstr_type_in)); ++ if (rstr_type_in == CEED_RESTRICTION_ORIENTED) { ++ CeedCallBackend(CeedElemRestrictionGetOrientations(rstr_in, CEED_MEM_DEVICE, &orients_in)); ++ } else if (rstr_type_in == CEED_RESTRICTION_CURL_ORIENTED) { ++ CeedCallBackend(CeedElemRestrictionGetCurlOrientations(rstr_in, CEED_MEM_DEVICE, &curl_orients_in)); ++ } ++ ++ if (rstr_in != rstr_out) { ++ CeedCallBackend(CeedElemRestrictionGetNumElements(rstr_out, &num_elem_out)); ++ CeedCheck(num_elem_in == num_elem_out, ceed, CEED_ERROR_UNSUPPORTED, ++ "Active input and output operator restrictions must have the same number of elements"); ++ CeedCallBackend(CeedElemRestrictionGetElementSize(rstr_out, &elem_size_out)); ++ ++ CeedCallBackend(CeedElemRestrictionGetType(rstr_out, &rstr_type_out)); ++ if (rstr_type_out == CEED_RESTRICTION_ORIENTED) { ++ CeedCallBackend(CeedElemRestrictionGetOrientations(rstr_out, CEED_MEM_DEVICE, &orients_out)); ++ } else if (rstr_type_out == CEED_RESTRICTION_CURL_ORIENTED) { ++ CeedCallBackend(CeedElemRestrictionGetCurlOrientations(rstr_out, CEED_MEM_DEVICE, &curl_orients_out)); ++ } ++ } else { ++ elem_size_out = elem_size_in; ++ orients_out = orients_in; ++ curl_orients_out = curl_orients_in; + } + + // Compute B^T D B +- const CeedInt num_elem = impl->asmb->num_elem; +- const CeedInt elem_per_block = impl->asmb->elem_per_block; +- const CeedInt grid = num_elem / elem_per_block + ((num_elem / elem_per_block * elem_per_block < num_elem) ? 1 : 0); +- void *args[] = {&impl->asmb->d_B_in, &impl->asmb->d_B_out, &qf_array, &values_array}; ++ CeedInt shared_mem = ++ ((curl_orients_in || curl_orients_out ? elem_size_in * elem_size_out : 0) + (curl_orients_in ? elem_size_in * asmb->block_size_y : 0)) * ++ sizeof(CeedScalar); ++ CeedInt grid = CeedDivUpInt(num_elem_in, asmb->elems_per_block); ++ void *args[] = {(void *)&num_elem_in, &asmb->d_B_in, &asmb->d_B_out, &orients_in, &curl_orients_in, ++ &orients_out, &curl_orients_out, &assembled_qf_array, &values_array}; + + CeedCallBackend( +- CeedRunKernelDim_Hip(ceed, impl->asmb->linearAssemble, grid, impl->asmb->block_size_x, impl->asmb->block_size_y, elem_per_block, args)); ++ CeedRunKernelDimShared_Hip(ceed, asmb->LinearAssemble, grid, asmb->block_size_x, asmb->block_size_y, asmb->elems_per_block, shared_mem, args)); + + // Restore arrays + CeedCallBackend(CeedVectorRestoreArray(values, &values_array)); +- CeedCallBackend(CeedVectorRestoreArrayRead(assembled_qf, &qf_array)); ++ CeedCallBackend(CeedVectorRestoreArrayRead(assembled_qf, &assembled_qf_array)); + + // Cleanup + CeedCallBackend(CeedVectorDestroy(&assembled_qf)); ++ if (rstr_type_in == CEED_RESTRICTION_ORIENTED) { ++ CeedCallBackend(CeedElemRestrictionRestoreOrientations(rstr_in, &orients_in)); ++ } else if (rstr_type_in == CEED_RESTRICTION_CURL_ORIENTED) { ++ CeedCallBackend(CeedElemRestrictionRestoreCurlOrientations(rstr_in, &curl_orients_in)); ++ } ++ if (rstr_in != rstr_out) { ++ if (rstr_type_out == CEED_RESTRICTION_ORIENTED) { ++ CeedCallBackend(CeedElemRestrictionRestoreOrientations(rstr_out, &orients_out)); ++ } else if (rstr_type_out == CEED_RESTRICTION_CURL_ORIENTED) { ++ CeedCallBackend(CeedElemRestrictionRestoreCurlOrientations(rstr_out, &curl_orients_out)); ++ } ++ } + return CEED_ERROR_SUCCESS; + } + +diff --git a/backends/hip-ref/ceed-hip-ref-restriction.c b/backends/hip-ref/ceed-hip-ref-restriction.c +index 0dd11b16..5824fa48 100644 +--- a/backends/hip-ref/ceed-hip-ref-restriction.c ++++ 
b/backends/hip-ref/ceed-hip-ref-restriction.c +@@ -18,22 +18,23 @@ + #include "ceed-hip-ref.h" + + //------------------------------------------------------------------------------ +-// Apply restriction ++// Core apply restriction code + //------------------------------------------------------------------------------ +-static int CeedElemRestrictionApply_Hip(CeedElemRestriction r, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { ++static inline int CeedElemRestrictionApply_Hip_Core(CeedElemRestriction rstr, CeedTransposeMode t_mode, bool use_signs, bool use_orients, ++ CeedVector u, CeedVector v, CeedRequest *request) { + Ceed ceed; +- Ceed_Hip *data; + CeedInt num_elem, elem_size; ++ CeedRestrictionType rstr_type; + const CeedScalar *d_u; + CeedScalar *d_v; + CeedElemRestriction_Hip *impl; + hipFunction_t kernel; + +- CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); +- CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); +- CeedCallBackend(CeedGetData(ceed, &data)); +- CeedElemRestrictionGetNumElements(r, &num_elem); +- CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); ++ CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); ++ CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); ++ CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); ++ CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); ++ CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); + const CeedInt num_nodes = impl->num_nodes; + + // Get vectors +@@ -49,45 +50,155 @@ static int CeedElemRestrictionApply_Hip(CeedElemRestriction r, CeedTransposeMode + // Restrict + if (t_mode == CEED_NOTRANSPOSE) { + // L-vector -> E-vector +- if (impl->d_ind) { +- // -- Offsets provided +- kernel = impl->OffsetNoTranspose; +- void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; +- CeedInt block_size = elem_size < 256 ? (elem_size > 64 ? elem_size : 64) : 256; +- +- CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); +- } else { +- // -- Strided restriction +- kernel = impl->StridedNoTranspose; +- void *args[] = {&num_elem, &d_u, &d_v}; +- CeedInt block_size = elem_size < 256 ? (elem_size > 64 ? elem_size : 64) : 256; +- +- CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); ++ const CeedInt block_size = elem_size < 256 ? (elem_size > 64 ? 
elem_size : 64) : 256; ++ const CeedInt grid = CeedDivUpInt(num_nodes, block_size); ++ ++ switch (rstr_type) { ++ case CEED_RESTRICTION_STRIDED: { ++ kernel = impl->StridedNoTranspose; ++ void *args[] = {&num_elem, &d_u, &d_v}; ++ ++ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); ++ } break; ++ case CEED_RESTRICTION_STANDARD: { ++ kernel = impl->OffsetNoTranspose; ++ void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; ++ ++ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); ++ } break; ++ case CEED_RESTRICTION_ORIENTED: { ++ if (use_signs) { ++ kernel = impl->OrientedNoTranspose; ++ void *args[] = {&num_elem, &impl->d_ind, &impl->d_orients, &d_u, &d_v}; ++ ++ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); ++ } else { ++ kernel = impl->OffsetNoTranspose; ++ void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; ++ ++ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); ++ } ++ } break; ++ case CEED_RESTRICTION_CURL_ORIENTED: { ++ if (use_signs && use_orients) { ++ kernel = impl->CurlOrientedNoTranspose; ++ void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; ++ ++ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); ++ } else if (use_orients) { ++ kernel = impl->CurlOrientedUnsignedNoTranspose; ++ void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; ++ ++ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); ++ } else { ++ kernel = impl->OffsetNoTranspose; ++ void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; ++ ++ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); ++ } ++ } break; ++ case CEED_RESTRICTION_POINTS: { ++ // LCOV_EXCL_START ++ return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement restriction CeedElemRestrictionAtPoints"); ++ // LCOV_EXCL_STOP ++ } break; + } + } else { + // E-vector -> L-vector +- if (impl->d_ind) { +- // -- Offsets provided +- CeedInt block_size = 64; ++ const CeedInt block_size = 64; ++ const CeedInt grid = CeedDivUpInt(num_nodes, block_size); ++ ++ switch (rstr_type) { ++ case CEED_RESTRICTION_STRIDED: { ++ kernel = impl->StridedTranspose; ++ void *args[] = {&num_elem, &d_u, &d_v}; ++ ++ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); ++ } break; ++ case CEED_RESTRICTION_STANDARD: { ++ if (impl->OffsetTranspose) { ++ kernel = impl->OffsetTranspose; ++ void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; ++ ++ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); ++ } else { ++ kernel = impl->OffsetTransposeDet; ++ void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; ++ ++ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); ++ } ++ } break; ++ case CEED_RESTRICTION_ORIENTED: { ++ if (use_signs) { ++ if (impl->OrientedTranspose) { ++ kernel = impl->OrientedTranspose; ++ void *args[] = {&num_elem, &impl->d_ind, &impl->d_orients, &d_u, &d_v}; ++ ++ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); ++ } else { ++ kernel = impl->OrientedTransposeDet; ++ void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &impl->d_orients, &d_u, &d_v}; ++ ++ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); ++ } ++ } else { ++ if (impl->OffsetTranspose) { ++ kernel = impl->OffsetTranspose; ++ void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; + +- if 
(impl->OffsetTranspose) { +- kernel = impl->OffsetTranspose; +- void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; ++ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); ++ } else { ++ kernel = impl->OffsetTransposeDet; ++ void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; + +- CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); +- } else { +- kernel = impl->OffsetTransposeDet; +- void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; ++ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); ++ } ++ } ++ } break; ++ case CEED_RESTRICTION_CURL_ORIENTED: { ++ if (use_signs && use_orients) { ++ if (impl->CurlOrientedTranspose) { ++ kernel = impl->CurlOrientedTranspose; ++ void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; ++ ++ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); ++ } else { ++ kernel = impl->CurlOrientedTransposeDet; ++ void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &impl->d_curl_orients, &d_u, &d_v}; ++ ++ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); ++ } ++ } else if (use_orients) { ++ if (impl->CurlOrientedUnsignedTranspose) { ++ kernel = impl->CurlOrientedUnsignedTranspose; ++ void *args[] = {&num_elem, &impl->d_ind, &impl->d_curl_orients, &d_u, &d_v}; + +- CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); +- } +- } else { +- // -- Strided restriction +- kernel = impl->StridedTranspose; +- void *args[] = {&num_elem, &d_u, &d_v}; +- CeedInt block_size = 64; ++ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); ++ } else { ++ kernel = impl->CurlOrientedUnsignedTransposeDet; ++ void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &impl->d_curl_orients, &d_u, &d_v}; + +- CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, CeedDivUpInt(num_nodes, block_size), block_size, args)); ++ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); ++ } ++ } else { ++ if (impl->OffsetTranspose) { ++ kernel = impl->OffsetTranspose; ++ void *args[] = {&num_elem, &impl->d_ind, &d_u, &d_v}; ++ ++ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); ++ } else { ++ kernel = impl->OffsetTransposeDet; ++ void *args[] = {&impl->d_l_vec_indices, &impl->d_t_indices, &impl->d_t_offsets, &d_u, &d_v}; ++ ++ CeedCallBackend(CeedRunKernel_Hip(ceed, kernel, grid, block_size, args)); ++ } ++ } ++ } break; ++ case CEED_RESTRICTION_POINTS: { ++ // LCOV_EXCL_START ++ return CeedError(ceed, CEED_ERROR_UNSUPPORTED, "Backend does not implement restriction CeedElemRestrictionAtPoints"); ++ // LCOV_EXCL_STOP ++ } break; + } + } + +@@ -99,6 +210,29 @@ static int CeedElemRestrictionApply_Hip(CeedElemRestriction r, CeedTransposeMode + return CEED_ERROR_SUCCESS; + } + ++//------------------------------------------------------------------------------ ++// Apply restriction ++//------------------------------------------------------------------------------ ++static int CeedElemRestrictionApply_Hip(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { ++ return CeedElemRestrictionApply_Hip_Core(rstr, t_mode, true, true, u, v, request); ++} ++ ++//------------------------------------------------------------------------------ ++// Apply unsigned 
restriction ++//------------------------------------------------------------------------------ ++static int CeedElemRestrictionApplyUnsigned_Hip(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v, ++ CeedRequest *request) { ++ return CeedElemRestrictionApply_Hip_Core(rstr, t_mode, false, true, u, v, request); ++} ++ ++//------------------------------------------------------------------------------ ++// Apply unoriented restriction ++//------------------------------------------------------------------------------ ++static int CeedElemRestrictionApplyUnoriented_Hip(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v, ++ CeedRequest *request) { ++ return CeedElemRestrictionApply_Hip_Core(rstr, t_mode, false, false, u, v, request); ++} ++ + //------------------------------------------------------------------------------ + // Get offsets + //------------------------------------------------------------------------------ +@@ -117,21 +251,61 @@ static int CeedElemRestrictionGetOffsets_Hip(CeedElemRestriction rstr, CeedMemTy + return CEED_ERROR_SUCCESS; + } + ++//------------------------------------------------------------------------------ ++// Get orientations ++//------------------------------------------------------------------------------ ++static int CeedElemRestrictionGetOrientations_Hip(CeedElemRestriction rstr, CeedMemType mem_type, const bool **orients) { ++ CeedElemRestriction_Hip *impl; ++ CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); ++ ++ switch (mem_type) { ++ case CEED_MEM_HOST: ++ *orients = impl->h_orients; ++ break; ++ case CEED_MEM_DEVICE: ++ *orients = impl->d_orients; ++ break; ++ } ++ return CEED_ERROR_SUCCESS; ++} ++ ++//------------------------------------------------------------------------------ ++// Get curl-conforming orientations ++//------------------------------------------------------------------------------ ++static int CeedElemRestrictionGetCurlOrientations_Hip(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt8 **curl_orients) { ++ CeedElemRestriction_Hip *impl; ++ CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); ++ ++ switch (mem_type) { ++ case CEED_MEM_HOST: ++ *curl_orients = impl->h_curl_orients; ++ break; ++ case CEED_MEM_DEVICE: ++ *curl_orients = impl->d_curl_orients; ++ break; ++ } ++ return CEED_ERROR_SUCCESS; ++} ++ + //------------------------------------------------------------------------------ + // Destroy restriction + //------------------------------------------------------------------------------ +-static int CeedElemRestrictionDestroy_Hip(CeedElemRestriction r) { ++static int CeedElemRestrictionDestroy_Hip(CeedElemRestriction rstr) { + Ceed ceed; + CeedElemRestriction_Hip *impl; + +- CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); +- CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); ++ CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); ++ CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); + CeedCallHip(ceed, hipModuleUnload(impl->module)); + CeedCallBackend(CeedFree(&impl->h_ind_allocated)); + CeedCallHip(ceed, hipFree(impl->d_ind_allocated)); + CeedCallHip(ceed, hipFree(impl->d_t_offsets)); + CeedCallHip(ceed, hipFree(impl->d_t_indices)); + CeedCallHip(ceed, hipFree(impl->d_l_vec_indices)); ++ CeedCallBackend(CeedFree(&impl->h_orients_allocated)); ++ CeedCallHip(ceed, hipFree(impl->d_orients_allocated)); ++ CeedCallBackend(CeedFree(&impl->h_curl_orients_allocated)); ++ CeedCallHip(ceed, hipFree(impl->d_curl_orients_allocated)); + 
CeedCallBackend(CeedFree(&impl)); + return CEED_ERROR_SUCCESS; + } +@@ -139,23 +313,25 @@ static int CeedElemRestrictionDestroy_Hip(CeedElemRestriction r) { + //------------------------------------------------------------------------------ + // Create transpose offsets and indices + //------------------------------------------------------------------------------ +-static int CeedElemRestrictionOffset_Hip(const CeedElemRestriction r, const CeedInt *indices) { ++static int CeedElemRestrictionOffset_Hip(const CeedElemRestriction rstr, const CeedInt *indices) { + Ceed ceed; + bool *is_node; + CeedSize l_size; +- CeedInt num_elem, elem_size, num_comp, num_nodes = 0, *ind_to_offset, *l_vec_indices, *t_offsets, *t_indices; ++ CeedInt num_elem, elem_size, num_comp, num_nodes = 0; ++ CeedInt *ind_to_offset, *l_vec_indices, *t_offsets, *t_indices; + CeedElemRestriction_Hip *impl; + +- CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); +- CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); +- CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem)); +- CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); +- CeedCallBackend(CeedElemRestrictionGetLVectorSize(r, &l_size)); +- CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); ++ CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); ++ CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); ++ CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); ++ CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); ++ CeedCallBackend(CeedElemRestrictionGetLVectorSize(rstr, &l_size)); ++ CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); + const CeedInt size_indices = num_elem * elem_size; + + // Count num_nodes + CeedCallBackend(CeedCalloc(l_size, &is_node)); ++ + for (CeedInt i = 0; i < size_indices; i++) is_node[indices[i]] = 1; + for (CeedInt i = 0; i < l_size; i++) num_nodes += is_node[i]; + impl->num_nodes = num_nodes; +@@ -218,136 +394,223 @@ static int CeedElemRestrictionOffset_Hip(const CeedElemRestriction r, const Ceed + // Create restriction + //------------------------------------------------------------------------------ + int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *indices, const bool *orients, +- const CeedInt8 *curl_orients, CeedElemRestriction r) { ++ const CeedInt8 *curl_orients, CeedElemRestriction rstr) { + Ceed ceed, ceed_parent; +- bool is_deterministic, is_strided; +- char *restriction_kernel_path, *restriction_kernel_source; ++ bool is_deterministic; + CeedInt num_elem, num_comp, elem_size, comp_stride = 1; + CeedRestrictionType rstr_type; ++ char *restriction_kernel_path, *restriction_kernel_source; + CeedElemRestriction_Hip *impl; + +- CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); +- CeedCallBackend(CeedCalloc(1, &impl)); ++ CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); + CeedCallBackend(CeedGetParent(ceed, &ceed_parent)); + CeedCallBackend(CeedIsDeterministic(ceed_parent, &is_deterministic)); +- CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem)); +- CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); +- CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); +- CeedInt size = num_elem * elem_size; +- CeedInt strides[3] = {1, size, elem_size}; +- CeedInt layout[3] = {1, elem_size * num_elem, elem_size}; +- +- CeedCallBackend(CeedElemRestrictionGetType(r, &rstr_type)); +- CeedCheck(rstr_type != CEED_RESTRICTION_ORIENTED && 
rstr_type != CEED_RESTRICTION_CURL_ORIENTED, ceed, CEED_ERROR_BACKEND, +- "Backend does not implement CeedElemRestrictionCreateOriented or CeedElemRestrictionCreateCurlOriented"); ++ CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); ++ CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); ++ CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); ++ const CeedInt size = num_elem * elem_size; ++ CeedInt strides[3] = {1, size, elem_size}; ++ CeedInt layout[3] = {1, elem_size * num_elem, elem_size}; + + // Stride data +- CeedCallBackend(CeedElemRestrictionIsStrided(r, &is_strided)); +- if (is_strided) { ++ CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); ++ if (rstr_type == CEED_RESTRICTION_STRIDED) { + bool has_backend_strides; + +- CeedCallBackend(CeedElemRestrictionHasBackendStrides(r, &has_backend_strides)); ++ CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides)); + if (!has_backend_strides) { +- CeedCallBackend(CeedElemRestrictionGetStrides(r, &strides)); ++ CeedCallBackend(CeedElemRestrictionGetStrides(rstr, &strides)); + } + } else { +- CeedCallBackend(CeedElemRestrictionGetCompStride(r, &comp_stride)); ++ CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride)); + } + +- impl->h_ind = NULL; +- impl->h_ind_allocated = NULL; +- impl->d_ind = NULL; +- impl->d_ind_allocated = NULL; +- impl->d_t_indices = NULL; +- impl->d_t_offsets = NULL; +- impl->num_nodes = size; +- CeedCallBackend(CeedElemRestrictionSetData(r, impl)); +- CeedCallBackend(CeedElemRestrictionSetELayout(r, layout)); +- +- // Set up device indices/offset arrays +- switch (mem_type) { +- case CEED_MEM_HOST: { +- switch (copy_mode) { +- case CEED_OWN_POINTER: +- impl->h_ind_allocated = (CeedInt *)indices; +- impl->h_ind = (CeedInt *)indices; +- break; +- case CEED_USE_POINTER: +- impl->h_ind = (CeedInt *)indices; +- break; +- case CEED_COPY_VALUES: +- if (indices != NULL) { +- CeedCallBackend(CeedMalloc(elem_size * num_elem, &impl->h_ind_allocated)); +- memcpy(impl->h_ind_allocated, indices, elem_size * num_elem * sizeof(CeedInt)); ++ CeedCallBackend(CeedCalloc(1, &impl)); ++ impl->num_nodes = size; ++ impl->h_ind = NULL; ++ impl->h_ind_allocated = NULL; ++ impl->d_ind = NULL; ++ impl->d_ind_allocated = NULL; ++ impl->d_t_indices = NULL; ++ impl->d_t_offsets = NULL; ++ impl->h_orients = NULL; ++ impl->h_orients_allocated = NULL; ++ impl->d_orients = NULL; ++ impl->d_orients_allocated = NULL; ++ impl->h_curl_orients = NULL; ++ impl->h_curl_orients_allocated = NULL; ++ impl->d_curl_orients = NULL; ++ impl->d_curl_orients_allocated = NULL; ++ CeedCallBackend(CeedElemRestrictionSetData(rstr, impl)); ++ CeedCallBackend(CeedElemRestrictionSetELayout(rstr, layout)); ++ ++ // Set up device offset/orientation arrays ++ if (rstr_type != CEED_RESTRICTION_STRIDED) { ++ switch (mem_type) { ++ case CEED_MEM_HOST: { ++ switch (copy_mode) { ++ case CEED_OWN_POINTER: ++ impl->h_ind_allocated = (CeedInt *)indices; ++ impl->h_ind = (CeedInt *)indices; ++ break; ++ case CEED_USE_POINTER: ++ impl->h_ind = (CeedInt *)indices; ++ break; ++ case CEED_COPY_VALUES: ++ CeedCallBackend(CeedMalloc(size, &impl->h_ind_allocated)); ++ memcpy(impl->h_ind_allocated, indices, size * sizeof(CeedInt)); + impl->h_ind = impl->h_ind_allocated; +- } +- break; +- } +- if (indices != NULL) { ++ break; ++ } + CeedCallHip(ceed, hipMalloc((void **)&impl->d_ind, size * sizeof(CeedInt))); + impl->d_ind_allocated = impl->d_ind; // We own the device memory + 
CeedCallHip(ceed, hipMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), hipMemcpyHostToDevice)); +- if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Hip(r, indices)); +- } +- break; +- } +- case CEED_MEM_DEVICE: { +- switch (copy_mode) { +- case CEED_COPY_VALUES: +- if (indices != NULL) { ++ if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Hip(rstr, indices)); ++ } break; ++ case CEED_MEM_DEVICE: { ++ switch (copy_mode) { ++ case CEED_COPY_VALUES: + CeedCallHip(ceed, hipMalloc((void **)&impl->d_ind, size * sizeof(CeedInt))); + impl->d_ind_allocated = impl->d_ind; // We own the device memory + CeedCallHip(ceed, hipMemcpy(impl->d_ind, indices, size * sizeof(CeedInt), hipMemcpyDeviceToDevice)); ++ break; ++ case CEED_OWN_POINTER: ++ impl->d_ind = (CeedInt *)indices; ++ impl->d_ind_allocated = impl->d_ind; ++ break; ++ case CEED_USE_POINTER: ++ impl->d_ind = (CeedInt *)indices; ++ break; ++ } ++ CeedCallBackend(CeedMalloc(size, &impl->h_ind_allocated)); ++ CeedCallHip(ceed, hipMemcpy(impl->h_ind_allocated, impl->d_ind, size * sizeof(CeedInt), hipMemcpyDeviceToHost)); ++ impl->h_ind = impl->h_ind_allocated; ++ if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Hip(rstr, indices)); ++ } break; ++ } ++ ++ // Orientation data ++ if (rstr_type == CEED_RESTRICTION_ORIENTED) { ++ switch (mem_type) { ++ case CEED_MEM_HOST: { ++ switch (copy_mode) { ++ case CEED_OWN_POINTER: ++ impl->h_orients_allocated = (bool *)orients; ++ impl->h_orients = (bool *)orients; ++ break; ++ case CEED_USE_POINTER: ++ impl->h_orients = (bool *)orients; ++ break; ++ case CEED_COPY_VALUES: ++ CeedCallBackend(CeedMalloc(size, &impl->h_orients_allocated)); ++ memcpy(impl->h_orients_allocated, orients, size * sizeof(bool)); ++ impl->h_orients = impl->h_orients_allocated; ++ break; + } +- break; +- case CEED_OWN_POINTER: +- impl->d_ind = (CeedInt *)indices; +- impl->d_ind_allocated = impl->d_ind; +- break; +- case CEED_USE_POINTER: +- impl->d_ind = (CeedInt *)indices; ++ CeedCallHip(ceed, hipMalloc((void **)&impl->d_orients, size * sizeof(bool))); ++ impl->d_orients_allocated = impl->d_orients; // We own the device memory ++ CeedCallHip(ceed, hipMemcpy(impl->d_orients, orients, size * sizeof(bool), hipMemcpyHostToDevice)); ++ } break; ++ case CEED_MEM_DEVICE: { ++ switch (copy_mode) { ++ case CEED_COPY_VALUES: ++ CeedCallHip(ceed, hipMalloc((void **)&impl->d_orients, size * sizeof(bool))); ++ impl->d_orients_allocated = impl->d_orients; // We own the device memory ++ CeedCallHip(ceed, hipMemcpy(impl->d_orients, orients, size * sizeof(bool), hipMemcpyDeviceToDevice)); ++ break; ++ case CEED_OWN_POINTER: ++ impl->d_orients = (bool *)orients; ++ impl->d_orients_allocated = impl->d_orients; ++ break; ++ case CEED_USE_POINTER: ++ impl->d_orients = (bool *)orients; ++ break; ++ } ++ CeedCallBackend(CeedMalloc(size, &impl->h_orients_allocated)); ++ CeedCallHip(ceed, hipMemcpy(impl->h_orients_allocated, impl->d_orients, size * sizeof(bool), hipMemcpyDeviceToHost)); ++ impl->h_orients = impl->h_orients_allocated; ++ } break; + } +- if (indices != NULL) { +- CeedCallBackend(CeedMalloc(elem_size * num_elem, &impl->h_ind_allocated)); +- CeedCallHip(ceed, hipMemcpy(impl->h_ind_allocated, impl->d_ind, elem_size * num_elem * sizeof(CeedInt), hipMemcpyDeviceToHost)); +- impl->h_ind = impl->h_ind_allocated; +- if (is_deterministic) CeedCallBackend(CeedElemRestrictionOffset_Hip(r, indices)); ++ } else if (rstr_type == CEED_RESTRICTION_CURL_ORIENTED) { ++ switch (mem_type) { ++ case CEED_MEM_HOST: { ++ 
switch (copy_mode) { ++ case CEED_OWN_POINTER: ++ impl->h_curl_orients_allocated = (CeedInt8 *)curl_orients; ++ impl->h_curl_orients = (CeedInt8 *)curl_orients; ++ break; ++ case CEED_USE_POINTER: ++ impl->h_curl_orients = (CeedInt8 *)curl_orients; ++ break; ++ case CEED_COPY_VALUES: ++ CeedCallBackend(CeedMalloc(3 * size, &impl->h_curl_orients_allocated)); ++ memcpy(impl->h_curl_orients_allocated, curl_orients, 3 * size * sizeof(CeedInt8)); ++ impl->h_curl_orients = impl->h_curl_orients_allocated; ++ break; ++ } ++ CeedCallHip(ceed, hipMalloc((void **)&impl->d_curl_orients, 3 * size * sizeof(CeedInt8))); ++ impl->d_curl_orients_allocated = impl->d_curl_orients; // We own the device memory ++ CeedCallHip(ceed, hipMemcpy(impl->d_curl_orients, curl_orients, 3 * size * sizeof(CeedInt8), hipMemcpyHostToDevice)); ++ } break; ++ case CEED_MEM_DEVICE: { ++ switch (copy_mode) { ++ case CEED_COPY_VALUES: ++ CeedCallHip(ceed, hipMalloc((void **)&impl->d_curl_orients, 3 * size * sizeof(CeedInt8))); ++ impl->d_curl_orients_allocated = impl->d_curl_orients; // We own the device memory ++ CeedCallHip(ceed, hipMemcpy(impl->d_curl_orients, curl_orients, 3 * size * sizeof(CeedInt8), hipMemcpyDeviceToDevice)); ++ break; ++ case CEED_OWN_POINTER: ++ impl->d_curl_orients = (CeedInt8 *)curl_orients; ++ impl->d_curl_orients_allocated = impl->d_curl_orients; ++ break; ++ case CEED_USE_POINTER: ++ impl->d_curl_orients = (CeedInt8 *)curl_orients; ++ break; ++ } ++ CeedCallBackend(CeedMalloc(3 * size, &impl->h_curl_orients_allocated)); ++ CeedCallHip(ceed, hipMemcpy(impl->h_curl_orients_allocated, impl->d_curl_orients, 3 * size * sizeof(CeedInt8), hipMemcpyDeviceToHost)); ++ impl->h_curl_orients = impl->h_curl_orients_allocated; ++ } break; + } +- break; + } +- // LCOV_EXCL_START +- default: +- return CeedError(ceed, CEED_ERROR_BACKEND, "Only MemType = HOST or DEVICE supported"); +- // LCOV_EXCL_STOP + } + + // Compile HIP kernels +- CeedInt num_nodes = impl->num_nodes; +- + CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-restriction.h", &restriction_kernel_path)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, restriction_kernel_path, &restriction_kernel_source)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Restriction Kernel Source Complete! 
-----\n"); +- CeedCallBackend(CeedCompile_Hip(ceed, restriction_kernel_source, &impl->module, 8, "RESTR_ELEM_SIZE", elem_size, "RESTR_NUM_ELEM", num_elem, +- "RESTR_NUM_COMP", num_comp, "RESTR_NUM_NODES", num_nodes, "RESTR_COMP_STRIDE", comp_stride, "RESTR_STRIDE_NODES", +- strides[0], "RESTR_STRIDE_COMP", strides[1], "RESTR_STRIDE_ELEM", strides[2])); ++ CeedCallBackend(CeedCompile_Hip(ceed, restriction_kernel_source, &impl->module, 8, "RSTR_ELEM_SIZE", elem_size, "RSTR_NUM_ELEM", num_elem, ++ "RSTR_NUM_COMP", num_comp, "RSTR_NUM_NODES", impl->num_nodes, "RSTR_COMP_STRIDE", comp_stride, "RSTR_STRIDE_NODES", ++ strides[0], "RSTR_STRIDE_COMP", strides[1], "RSTR_STRIDE_ELEM", strides[2])); + CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "StridedNoTranspose", &impl->StridedNoTranspose)); + CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "StridedTranspose", &impl->StridedTranspose)); + CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetNoTranspose", &impl->OffsetNoTranspose)); ++ CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OrientedNoTranspose", &impl->OrientedNoTranspose)); ++ CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedNoTranspose", &impl->CurlOrientedNoTranspose)); ++ CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedUnsignedNoTranspose", &impl->CurlOrientedUnsignedNoTranspose)); + if (!is_deterministic) { + CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetTranspose", &impl->OffsetTranspose)); ++ CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OrientedTranspose", &impl->OrientedTranspose)); ++ CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedTranspose", &impl->CurlOrientedTranspose)); ++ CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedUnsignedTranspose", &impl->CurlOrientedUnsignedTranspose)); + } else { + CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OffsetTransposeDet", &impl->OffsetTransposeDet)); ++ CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "OrientedTransposeDet", &impl->OrientedTransposeDet)); ++ CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedTransposeDet", &impl->CurlOrientedTransposeDet)); ++ CeedCallBackend(CeedGetKernel_Hip(ceed, impl->module, "CurlOrientedUnsignedTransposeDet", &impl->CurlOrientedUnsignedTransposeDet)); + } + CeedCallBackend(CeedFree(&restriction_kernel_path)); + CeedCallBackend(CeedFree(&restriction_kernel_source)); + + // Register backend functions +- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "Apply", CeedElemRestrictionApply_Hip)); +- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "ApplyUnsigned", CeedElemRestrictionApply_Hip)); +- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "ApplyUnoriented", CeedElemRestrictionApply_Hip)); +- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "GetOffsets", CeedElemRestrictionGetOffsets_Hip)); +- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", r, "Destroy", CeedElemRestrictionDestroy_Hip)); ++ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Apply", CeedElemRestrictionApply_Hip)); ++ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnsigned", CeedElemRestrictionApplyUnsigned_Hip)); ++ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnoriented", CeedElemRestrictionApplyUnoriented_Hip)); ++ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOffsets", 
CeedElemRestrictionGetOffsets_Hip)); ++ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOrientations", CeedElemRestrictionGetOrientations_Hip)); ++ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetCurlOrientations", CeedElemRestrictionGetCurlOrientations_Hip)); ++ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Hip)); + return CEED_ERROR_SUCCESS; + } + +diff --git a/backends/hip-ref/ceed-hip-ref.h b/backends/hip-ref/ceed-hip-ref.h +index 634bb68d..ecfa8874 100644 +--- a/backends/hip-ref/ceed-hip-ref.h ++++ b/backends/hip-ref/ceed-hip-ref.h +@@ -34,6 +34,15 @@ typedef struct { + hipFunction_t OffsetNoTranspose; + hipFunction_t OffsetTranspose; + hipFunction_t OffsetTransposeDet; ++ hipFunction_t OrientedNoTranspose; ++ hipFunction_t OrientedTranspose; ++ hipFunction_t OrientedTransposeDet; ++ hipFunction_t CurlOrientedNoTranspose; ++ hipFunction_t CurlOrientedTranspose; ++ hipFunction_t CurlOrientedTransposeDet; ++ hipFunction_t CurlOrientedUnsignedNoTranspose; ++ hipFunction_t CurlOrientedUnsignedTranspose; ++ hipFunction_t CurlOrientedUnsignedTransposeDet; + CeedInt num_nodes; + CeedInt *h_ind; + CeedInt *h_ind_allocated; +@@ -42,6 +51,14 @@ typedef struct { + CeedInt *d_t_offsets; + CeedInt *d_t_indices; + CeedInt *d_l_vec_indices; ++ bool *h_orients; ++ bool *h_orients_allocated; ++ bool *d_orients; ++ bool *d_orients_allocated; ++ CeedInt8 *h_curl_orients; ++ CeedInt8 *h_curl_orients_allocated; ++ CeedInt8 *d_curl_orients; ++ CeedInt8 *d_curl_orients_allocated; + } CeedElemRestriction_Hip; + + typedef struct { +@@ -84,21 +101,19 @@ typedef struct { + + typedef struct { + hipModule_t module; +- hipFunction_t linearDiagonal; +- hipFunction_t linearPointBlock; +- CeedBasis basis_in, basis_out; ++ hipFunction_t LinearDiagonal; ++ hipFunction_t LinearPointBlock; + CeedElemRestriction diag_rstr, point_block_diag_rstr; + CeedVector elem_diag, point_block_elem_diag; +- CeedInt num_e_mode_in, num_e_mode_out, num_modes; +- CeedEvalMode *h_e_mode_in, *h_e_mode_out; +- CeedEvalMode *d_e_mode_in, *d_e_mode_out; +- CeedScalar *d_identity, *d_interp_in, *d_interp_out, *d_grad_in, *d_grad_out; ++ CeedEvalMode *d_eval_modes_in, *d_eval_modes_out; ++ CeedScalar *d_identity, *d_interp_in, *d_grad_in, *d_div_in, *d_curl_in; ++ CeedScalar *d_interp_out, *d_grad_out, *d_div_out, *d_curl_out; + } CeedOperatorDiag_Hip; + + typedef struct { + hipModule_t module; +- hipFunction_t linearAssemble; +- CeedInt num_elem, block_size_x, block_size_y, elem_per_block; ++ hipFunction_t LinearAssemble; ++ CeedInt block_size_x, block_size_y, elems_per_block; + CeedScalar *d_B_in, *d_B_out; + } CeedOperatorAssemble_Hip; + +diff --git a/backends/ref/ceed-ref-restriction.c b/backends/ref/ceed-ref-restriction.c +index 1bf604c4..8deebe3f 100644 +--- a/backends/ref/ceed-ref-restriction.c ++++ b/backends/ref/ceed-ref-restriction.c +@@ -55,9 +55,9 @@ static inline int CeedElemRestrictionApplyStridedNoTranspose_Ref_Core(CeedElemRe + return CEED_ERROR_SUCCESS; + } + +-static inline int CeedElemRestrictionApplyStandardNoTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, +- const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem, +- CeedInt elem_size, CeedInt v_offset, const CeedScalar *uu, CeedScalar *vv) { ++static inline int CeedElemRestrictionApplyOffsetNoTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, ++ const 
CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem, ++ CeedInt elem_size, CeedInt v_offset, const CeedScalar *uu, CeedScalar *vv) { + // Default restriction with offsets + CeedElemRestriction_Ref *impl; + +@@ -216,9 +216,9 @@ static inline int CeedElemRestrictionApplyStridedTranspose_Ref_Core(CeedElemRest + return CEED_ERROR_SUCCESS; + } + +-static inline int CeedElemRestrictionApplyStandardTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, +- const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem, +- CeedInt elem_size, CeedInt v_offset, const CeedScalar *uu, CeedScalar *vv) { ++static inline int CeedElemRestrictionApplyOffsetTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, ++ const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem, ++ CeedInt elem_size, CeedInt v_offset, const CeedScalar *uu, CeedScalar *vv) { + // Default restriction with offsets + CeedElemRestriction_Ref *impl; + +@@ -367,7 +367,6 @@ static inline int CeedElemRestrictionApplyAtPointsInElement_Ref_Core(CeedElemRes + CeedElemRestriction_Ref *impl; + + CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); +- + for (CeedInt e = start; e < stop; e++) { + l_vec_offset = impl->offsets[e]; + CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr, e, &num_points)); +@@ -418,16 +417,16 @@ static inline int CeedElemRestrictionApply_Ref_Core(CeedElemRestriction rstr, co + CeedElemRestrictionApplyStridedTranspose_Ref_Core(rstr, num_comp, block_size, start, stop, num_elem, elem_size, v_offset, uu, vv)); + break; + case CEED_RESTRICTION_STANDARD: +- CeedCallBackend(CeedElemRestrictionApplyStandardTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, elem_size, +- v_offset, uu, vv)); ++ CeedCallBackend(CeedElemRestrictionApplyOffsetTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, elem_size, ++ v_offset, uu, vv)); + break; + case CEED_RESTRICTION_ORIENTED: + if (use_signs) { + CeedCallBackend(CeedElemRestrictionApplyOrientedTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, + elem_size, v_offset, uu, vv)); + } else { +- CeedCallBackend(CeedElemRestrictionApplyStandardTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, +- elem_size, v_offset, uu, vv)); ++ CeedCallBackend(CeedElemRestrictionApplyOffsetTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, elem_size, ++ v_offset, uu, vv)); + } + break; + case CEED_RESTRICTION_CURL_ORIENTED: +@@ -438,8 +437,8 @@ static inline int CeedElemRestrictionApply_Ref_Core(CeedElemRestriction rstr, co + CeedCallBackend(CeedElemRestrictionApplyCurlOrientedUnsignedTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, + num_elem, elem_size, v_offset, uu, vv)); + } else { +- CeedCallBackend(CeedElemRestrictionApplyStandardTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, +- elem_size, v_offset, uu, vv)); ++ CeedCallBackend(CeedElemRestrictionApplyOffsetTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, elem_size, ++ v_offset, uu, vv)); + } + break; + case CEED_RESTRICTION_POINTS: +@@ -458,16 +457,16 @@ static inline int CeedElemRestrictionApply_Ref_Core(CeedElemRestriction rstr, co + CeedElemRestrictionApplyStridedNoTranspose_Ref_Core(rstr, num_comp, block_size, start, stop, num_elem, elem_size, v_offset, uu, vv)); + 
break; + case CEED_RESTRICTION_STANDARD: +- CeedCallBackend(CeedElemRestrictionApplyStandardNoTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, +- elem_size, v_offset, uu, vv)); ++ CeedCallBackend(CeedElemRestrictionApplyOffsetNoTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, elem_size, ++ v_offset, uu, vv)); + break; + case CEED_RESTRICTION_ORIENTED: + if (use_signs) { + CeedCallBackend(CeedElemRestrictionApplyOrientedNoTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, + elem_size, v_offset, uu, vv)); + } else { +- CeedCallBackend(CeedElemRestrictionApplyStandardNoTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, +- elem_size, v_offset, uu, vv)); ++ CeedCallBackend(CeedElemRestrictionApplyOffsetNoTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, ++ elem_size, v_offset, uu, vv)); + } + break; + case CEED_RESTRICTION_CURL_ORIENTED: +@@ -478,8 +477,8 @@ static inline int CeedElemRestrictionApply_Ref_Core(CeedElemRestriction rstr, co + CeedCallBackend(CeedElemRestrictionApplyCurlOrientedUnsignedNoTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, + num_elem, elem_size, v_offset, uu, vv)); + } else { +- CeedCallBackend(CeedElemRestrictionApplyStandardNoTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, +- elem_size, v_offset, uu, vv)); ++ CeedCallBackend(CeedElemRestrictionApplyOffsetNoTranspose_Ref_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, ++ elem_size, v_offset, uu, vv)); + } + break; + case CEED_RESTRICTION_POINTS: +@@ -625,14 +624,14 @@ static int CeedElemRestrictionApplyUnoriented_Ref(CeedElemRestriction rstr, Ceed + //------------------------------------------------------------------------------ + // ElemRestriction Apply Points + //------------------------------------------------------------------------------ +-static int CeedElemRestrictionApplyAtPointsInElement_Ref(CeedElemRestriction r, CeedInt elem, CeedTransposeMode t_mode, CeedVector u, CeedVector v, ++static int CeedElemRestrictionApplyAtPointsInElement_Ref(CeedElemRestriction rstr, CeedInt elem, CeedTransposeMode t_mode, CeedVector u, CeedVector v, + CeedRequest *request) { + CeedInt num_comp; + CeedElemRestriction_Ref *impl; + +- CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); +- CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); +- return impl->Apply(r, num_comp, 0, 1, elem, elem + 1, t_mode, false, false, u, v, request); ++ CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); ++ CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); ++ return impl->Apply(rstr, num_comp, 0, 1, elem, elem + 1, t_mode, false, false, u, v, request); + } + + //------------------------------------------------------------------------------ +@@ -733,7 +732,10 @@ int CeedElemRestrictionCreate_Ref(CeedMemType mem_type, CeedCopyMode copy_mode, + CeedInt layout[3] = {1, elem_size, elem_size * num_comp}; + + CeedCheck(mem_type == CEED_MEM_HOST, ceed, CEED_ERROR_BACKEND, "Only MemType = HOST supported"); ++ + CeedCallBackend(CeedCalloc(1, &impl)); ++ CeedCallBackend(CeedElemRestrictionSetData(rstr, impl)); ++ CeedCallBackend(CeedElemRestrictionSetELayout(rstr, layout)); + + // Offsets data + CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); +@@ -813,20 +815,6 @@ int CeedElemRestrictionCreate_Ref(CeedMemType mem_type, CeedCopyMode copy_mode, + } + } + +- 
CeedCallBackend(CeedElemRestrictionSetData(rstr, impl)); +- CeedCallBackend(CeedElemRestrictionSetELayout(rstr, layout)); +- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Apply", CeedElemRestrictionApply_Ref)); +- if (rstr_type == CEED_RESTRICTION_POINTS) { +- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyAtPointsInElement", CeedElemRestrictionApplyAtPointsInElement_Ref)); +- } +- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnsigned", CeedElemRestrictionApplyUnsigned_Ref)); +- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnoriented", CeedElemRestrictionApplyUnoriented_Ref)); +- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyBlock", CeedElemRestrictionApplyBlock_Ref)); +- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOrientations", CeedElemRestrictionGetOrientations_Ref)); +- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetCurlOrientations", CeedElemRestrictionGetCurlOrientations_Ref)); +- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOffsets", CeedElemRestrictionGetOffsets_Ref)); +- CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Ref)); +- + // Set apply function based upon num_comp, block_size, and comp_stride + CeedInt index = -1; + +@@ -876,6 +864,19 @@ int CeedElemRestrictionCreate_Ref(CeedMemType mem_type, CeedCopyMode copy_mode, + impl->Apply = CeedElemRestrictionApply_Ref_Core; + break; + } ++ ++ // Register backend functions ++ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Apply", CeedElemRestrictionApply_Ref)); ++ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnsigned", CeedElemRestrictionApplyUnsigned_Ref)); ++ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnoriented", CeedElemRestrictionApplyUnoriented_Ref)); ++ if (rstr_type == CEED_RESTRICTION_POINTS) { ++ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyAtPointsInElement", CeedElemRestrictionApplyAtPointsInElement_Ref)); ++ } ++ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyBlock", CeedElemRestrictionApplyBlock_Ref)); ++ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOffsets", CeedElemRestrictionGetOffsets_Ref)); ++ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOrientations", CeedElemRestrictionGetOrientations_Ref)); ++ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetCurlOrientations", CeedElemRestrictionGetCurlOrientations_Ref)); ++ CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Ref)); + return CEED_ERROR_SUCCESS; + } + +diff --git a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp +index 78c71681..867960bb 100644 +--- a/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp ++++ b/backends/sycl-ref/ceed-sycl-ref-operator.sycl.cpp +@@ -1071,16 +1071,16 @@ static int CeedSingleOperatorAssembleSetup_Sycl(CeedOperator op) { + CeedCallBackend(CeedGetData(ceed, &sycl_data)); + + // Kernel setup +- int elem_per_block = 1; +- asmb->elem_per_block = elem_per_block; +- CeedInt block_size = elem_size * elem_size * elem_per_block; ++ int elems_per_block = 1; ++ asmb->elems_per_block = elems_per_block; ++ CeedInt block_size = 
elem_size * elem_size * elems_per_block; + + /* CeedInt maxThreadsPerBlock = sycl_data->sycl_device.get_info(); + bool fallback = block_size > maxThreadsPerBlock; + asmb->fallback = fallback; + if (fallback) { + // Use fallback kernel with 1D threadblock +- block_size = elem_size * elem_per_block; ++ block_size = elem_size * elems_per_block; + asmb->block_size_x = elem_size; + asmb->block_size_y = 1; + } else { // Use kernel with 2D threadblock +@@ -1250,13 +1250,13 @@ static int CeedOperatorLinearAssembleFallback_Sycl(sycl::queue &sycl_queue, cons + CeedScalar *B_in, *B_out; + B_in = asmb->d_B_in; + B_out = asmb->d_B_out; +- const CeedInt elem_per_block = asmb->elem_per_block; ++ const CeedInt elems_per_block = asmb->elems_per_block; + const CeedInt block_size_x = asmb->block_size_x; + const CeedInt block_size_y = asmb->block_size_y; // This will be 1 for the fallback kernel + +- const CeedInt grid = num_elem / elem_per_block + ((num_elem / elem_per_block * elem_per_block < num_elem) ? 1 : 0); +- sycl::range<3> local_range(block_size_x, block_size_y, elem_per_block); +- sycl::range<3> global_range(grid * block_size_x, block_size_y, elem_per_block); ++ const CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); ++ sycl::range<3> local_range(block_size_x, block_size_y, elems_per_block); ++ sycl::range<3> global_range(grid * block_size_x, block_size_y, elems_per_block); + sycl::nd_range<3> kernel_range(global_range, local_range); + + sycl_queue.parallel_for(kernel_range, [=](sycl::nd_item<3> work_item) { +diff --git a/backends/sycl-ref/ceed-sycl-ref.hpp b/backends/sycl-ref/ceed-sycl-ref.hpp +index 56544c38..fc7bc775 100644 +--- a/backends/sycl-ref/ceed-sycl-ref.hpp ++++ b/backends/sycl-ref/ceed-sycl-ref.hpp +@@ -94,7 +94,7 @@ typedef struct { + } CeedOperatorDiag_Sycl; + + typedef struct { +- CeedInt num_elem, block_size_x, block_size_y, elem_per_block; ++ CeedInt num_elem, block_size_x, block_size_y, elems_per_block; + CeedInt num_e_mode_in, num_e_mode_out, num_qpts, num_nodes, block_size, num_comp; // Kernel parameters + bool fallback; + CeedScalar *d_B_in, *d_B_out; +diff --git a/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp b/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp +index 4ace9976..8aac0678 100644 +--- a/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp ++++ b/backends/sycl-ref/ceed-sycl-restriction.sycl.cpp +@@ -142,15 +142,15 @@ static int CeedElemRestrictionOffsetTranspose_Sycl(sycl::queue &sycl_queue, cons + //------------------------------------------------------------------------------ + // Apply restriction + //------------------------------------------------------------------------------ +-static int CeedElemRestrictionApply_Sycl(CeedElemRestriction r, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { ++static int CeedElemRestrictionApply_Sycl(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { + Ceed ceed; + Ceed_Sycl *data; + const CeedScalar *d_u; + CeedScalar *d_v; + CeedElemRestriction_Sycl *impl; + +- CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); +- CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); ++ CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); ++ CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); + CeedCallBackend(CeedGetData(ceed, &data)); + + // Get vectors +@@ -197,12 +197,12 @@ static int CeedElemRestrictionApply_Sycl(CeedElemRestriction r, CeedTransposeMod + 
//------------------------------------------------------------------------------ + // Get offsets + //------------------------------------------------------------------------------ +-static int CeedElemRestrictionGetOffsets_Sycl(CeedElemRestriction r, CeedMemType m_type, const CeedInt **offsets) { ++static int CeedElemRestrictionGetOffsets_Sycl(CeedElemRestriction rstr, CeedMemType m_type, const CeedInt **offsets) { + Ceed ceed; + CeedElemRestriction_Sycl *impl; + +- CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); +- CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); ++ CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); ++ CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); + + switch (m_type) { + case CEED_MEM_HOST: +@@ -218,13 +218,13 @@ static int CeedElemRestrictionGetOffsets_Sycl(CeedElemRestriction r, CeedMemType + //------------------------------------------------------------------------------ + // Destroy restriction + //------------------------------------------------------------------------------ +-static int CeedElemRestrictionDestroy_Sycl(CeedElemRestriction r) { ++static int CeedElemRestrictionDestroy_Sycl(CeedElemRestriction rstr) { + Ceed ceed; + Ceed_Sycl *data; + CeedElemRestriction_Sycl *impl; + +- CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); +- CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); ++ CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); ++ CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); + CeedCallBackend(CeedGetData(ceed, &data)); + + // Wait for all work to finish before freeing memory +@@ -242,7 +242,7 @@ static int CeedElemRestrictionDestroy_Sycl(CeedElemRestriction r) { + //------------------------------------------------------------------------------ + // Create transpose offsets and indices + //------------------------------------------------------------------------------ +-static int CeedElemRestrictionOffset_Sycl(const CeedElemRestriction r, const CeedInt *indices) { ++static int CeedElemRestrictionOffset_Sycl(const CeedElemRestriction rstr, const CeedInt *indices) { + Ceed ceed; + Ceed_Sycl *data; + bool *is_node; +@@ -250,12 +250,12 @@ static int CeedElemRestrictionOffset_Sycl(const CeedElemRestriction r, const Cee + CeedInt num_elem, elem_size, num_comp, num_nodes = 0, *ind_to_offset, *l_vec_indices, *t_offsets, *t_indices; + CeedElemRestriction_Sycl *impl; + +- CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); +- CeedCallBackend(CeedElemRestrictionGetData(r, &impl)); +- CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem)); +- CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); +- CeedCallBackend(CeedElemRestrictionGetLVectorSize(r, &l_size)); +- CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); ++ CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); ++ CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); ++ CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); ++ CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); ++ CeedCallBackend(CeedElemRestrictionGetLVectorSize(rstr, &l_size)); ++ CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); + + // Count num_nodes + CeedCallBackend(CeedCalloc(l_size, &is_node)); +@@ -330,7 +330,7 @@ static int CeedElemRestrictionOffset_Sycl(const CeedElemRestriction r, const Cee + // Create restriction + //------------------------------------------------------------------------------ + int CeedElemRestrictionCreate_Sycl(CeedMemType mem_type, CeedCopyMode 
copy_mode, const CeedInt *indices, const bool *orients, +- const CeedInt8 *curl_orients, CeedElemRestriction r) { ++ const CeedInt8 *curl_orients, CeedElemRestriction rstr) { + Ceed ceed; + Ceed_Sycl *data; + bool is_strided; +@@ -338,32 +338,33 @@ int CeedElemRestrictionCreate_Sycl(CeedMemType mem_type, CeedCopyMode copy_mode, + CeedRestrictionType rstr_type; + CeedElemRestriction_Sycl *impl; + +- CeedCallBackend(CeedElemRestrictionGetCeed(r, &ceed)); ++ CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); + CeedCallBackend(CeedGetData(ceed, &data)); +- CeedCallBackend(CeedCalloc(1, &impl)); +- CeedCallBackend(CeedElemRestrictionGetNumElements(r, &num_elem)); +- CeedCallBackend(CeedElemRestrictionGetNumComponents(r, &num_comp)); +- CeedCallBackend(CeedElemRestrictionGetElementSize(r, &elem_size)); +- CeedInt size = num_elem * elem_size; +- CeedInt strides[3] = {1, size, elem_size}; +- +- CeedCallBackend(CeedElemRestrictionGetType(r, &rstr_type)); ++ CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); ++ CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); ++ CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); ++ const CeedInt size = num_elem * elem_size; ++ CeedInt strides[3] = {1, size, elem_size}; ++ CeedInt layout[3] = {1, elem_size * num_elem, elem_size}; ++ ++ CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); + CeedCheck(rstr_type != CEED_RESTRICTION_ORIENTED && rstr_type != CEED_RESTRICTION_CURL_ORIENTED, ceed, CEED_ERROR_BACKEND, + "Backend does not implement CeedElemRestrictionCreateOriented or CeedElemRestrictionCreateCurlOriented"); + + // Stride data +- CeedCallBackend(CeedElemRestrictionIsStrided(r, &is_strided)); ++ CeedCallBackend(CeedElemRestrictionIsStrided(rstr, &is_strided)); + if (is_strided) { + bool has_backend_strides; + +- CeedCallBackend(CeedElemRestrictionHasBackendStrides(r, &has_backend_strides)); ++ CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides)); + if (!has_backend_strides) { +- CeedCallBackend(CeedElemRestrictionGetStrides(r, &strides)); ++ CeedCallBackend(CeedElemRestrictionGetStrides(rstr, &strides)); + } + } else { +- CeedCallBackend(CeedElemRestrictionGetCompStride(r, &comp_stride)); ++ CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride)); + } + ++ CeedCallBackend(CeedCalloc(1, &impl)); + impl->h_ind = NULL; + impl->h_ind_allocated = NULL; + impl->d_ind = NULL; +@@ -378,9 +379,8 @@ int CeedElemRestrictionCreate_Sycl(CeedMemType mem_type, CeedCopyMode copy_mode, + impl->strides[0] = strides[0]; + impl->strides[1] = strides[1]; + impl->strides[2] = strides[2]; +- CeedCallBackend(CeedElemRestrictionSetData(r, impl)); +- CeedInt layout[3] = {1, elem_size * num_elem, elem_size}; +- CeedCallBackend(CeedElemRestrictionSetELayout(r, layout)); ++ CeedCallBackend(CeedElemRestrictionSetData(rstr, impl)); ++ CeedCallBackend(CeedElemRestrictionSetELayout(rstr, layout)); + + // Set up device indices/offset arrays + if (mem_type == CEED_MEM_HOST) { +@@ -409,7 +409,7 @@ int CeedElemRestrictionCreate_Sycl(CeedMemType mem_type, CeedCopyMode copy_mode, + sycl::event copy_event = data->sycl_queue.copy(indices, impl->d_ind, size, {e}); + // Wait for copy to finish and handle exceptions + CeedCallSycl(ceed, copy_event.wait_and_throw()); +- CeedCallBackend(CeedElemRestrictionOffset_Sycl(r, indices)); ++ CeedCallBackend(CeedElemRestrictionOffset_Sycl(rstr, indices)); + } + } else if (mem_type == CEED_MEM_DEVICE) { + switch (copy_mode) { +@@ -440,7 +440,7 @@ 
int CeedElemRestrictionCreate_Sycl(CeedMemType mem_type, CeedCopyMode copy_mode, + sycl::event copy_event = data->sycl_queue.copy(impl->d_ind, impl->h_ind_allocated, elem_size * num_elem, {e}); + CeedCallSycl(ceed, copy_event.wait_and_throw()); + impl->h_ind = impl->h_ind_allocated; +- CeedCallBackend(CeedElemRestrictionOffset_Sycl(r, indices)); ++ CeedCallBackend(CeedElemRestrictionOffset_Sycl(rstr, indices)); + } + } else { + // LCOV_EXCL_START +@@ -449,10 +449,10 @@ int CeedElemRestrictionCreate_Sycl(CeedMemType mem_type, CeedCopyMode copy_mode, + } + + // Register backend functions +- CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", r, "Apply", CeedElemRestrictionApply_Sycl)); +- CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", r, "ApplyUnsigned", CeedElemRestrictionApply_Sycl)); +- CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", r, "ApplyUnoriented", CeedElemRestrictionApply_Sycl)); +- CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", r, "GetOffsets", CeedElemRestrictionGetOffsets_Sycl)); +- CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", r, "Destroy", CeedElemRestrictionDestroy_Sycl)); ++ CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", rstr, "Apply", CeedElemRestrictionApply_Sycl)); ++ CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", rstr, "ApplyUnsigned", CeedElemRestrictionApply_Sycl)); ++ CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", rstr, "ApplyUnoriented", CeedElemRestrictionApply_Sycl)); ++ CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", rstr, "GetOffsets", CeedElemRestrictionGetOffsets_Sycl)); ++ CeedCallBackend(CeedSetBackendFunctionCpp(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Sycl)); + return CEED_ERROR_SUCCESS; + } +diff --git a/include/ceed/backend.h b/include/ceed/backend.h +index b3d2f97e..ff1f82c1 100644 +--- a/include/ceed/backend.h ++++ b/include/ceed/backend.h +@@ -368,6 +368,7 @@ CEED_EXTERN int CeedQFunctionContextRestoreInt32Read(CeedQFunctionContext ctx, C + CEED_EXTERN int CeedQFunctionContextGetDataDestroy(CeedQFunctionContext ctx, CeedMemType *f_mem_type, CeedQFunctionContextDataDestroyUser *f); + CEED_EXTERN int CeedQFunctionContextReference(CeedQFunctionContext ctx); + ++CEED_EXTERN int CeedOperatorGetBasisPointer(CeedBasis basis, CeedEvalMode eval_mode, const CeedScalar *identity, const CeedScalar **basis_ptr); + CEED_EXTERN int CeedOperatorCreateActivePointBlockRestriction(CeedElemRestriction rstr, CeedElemRestriction *pointblock_rstr); + CEED_EXTERN int CeedQFunctionAssemblyDataCreate(Ceed ceed, CeedQFunctionAssemblyData *data); + CEED_EXTERN int CeedQFunctionAssemblyDataReference(CeedQFunctionAssemblyData data); +diff --git a/include/ceed/ceed.h b/include/ceed/ceed.h +index f75c31de..6535c733 100644 +--- a/include/ceed/ceed.h ++++ b/include/ceed/ceed.h +@@ -392,7 +392,7 @@ CEED_EXTERN int CeedQFunctionContextDestroy(CeedQFunctionContext *ctx); + CEED_EXTERN int CeedOperatorCreate(Ceed ceed, CeedQFunction qf, CeedQFunction dqf, CeedQFunction dqfT, CeedOperator *op); + CEED_EXTERN int CeedCompositeOperatorCreate(Ceed ceed, CeedOperator *op); + CEED_EXTERN int CeedOperatorReferenceCopy(CeedOperator op, CeedOperator *op_copy); +-CEED_EXTERN int CeedOperatorSetField(CeedOperator op, const char *field_name, CeedElemRestriction r, CeedBasis b, CeedVector v); ++CEED_EXTERN int CeedOperatorSetField(CeedOperator op, const char *field_name, CeedElemRestriction rstr, 
CeedBasis basis, CeedVector v); + CEED_EXTERN int CeedOperatorGetFields(CeedOperator op, CeedInt *num_input_fields, CeedOperatorField **input_fields, CeedInt *num_output_fields, + CeedOperatorField **output_fields); + CEED_EXTERN int CeedCompositeOperatorAddSub(CeedOperator composite_op, CeedOperator sub_op); +diff --git a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h +index 7c6f8789..ab366e79 100644 +--- a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h ++++ b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble-diagonal.h +@@ -19,11 +19,11 @@ typedef CeedInt IndexType; + #endif + + //------------------------------------------------------------------------------ +-// Get Basis Emode Pointer ++// Get basis pointer + //------------------------------------------------------------------------------ +-extern "C" __device__ void CeedOperatorGetBasisPointer_Cuda(const CeedScalar **basis_ptr, CeedEvalMode e_mode, const CeedScalar *identity, +- const CeedScalar *interp, const CeedScalar *grad) { +- switch (e_mode) { ++static __device__ __inline__ void GetBasisPointer(const CeedScalar **basis_ptr, CeedEvalMode eval_modes, const CeedScalar *identity, ++ const CeedScalar *interp, const CeedScalar *grad, const CeedScalar *div, const CeedScalar *curl) { ++ switch (eval_modes) { + case CEED_EVAL_NONE: + *basis_ptr = identity; + break; +@@ -33,52 +33,67 @@ extern "C" __device__ void CeedOperatorGetBasisPointer_Cuda(const CeedScalar **b + case CEED_EVAL_GRAD: + *basis_ptr = grad; + break; +- case CEED_EVAL_WEIGHT: + case CEED_EVAL_DIV: ++ *basis_ptr = div; ++ break; + case CEED_EVAL_CURL: +- break; // Caught by QF Assembly ++ *basis_ptr = curl; ++ break; ++ case CEED_EVAL_WEIGHT: ++ break; // Caught by QF assembly + } + } + + //------------------------------------------------------------------------------ + // Core code for diagonal assembly + //------------------------------------------------------------------------------ +-__device__ void diagonalCore(const CeedInt num_elem, const bool is_point_block, const CeedScalar *identity, const CeedScalar *interp_in, +- const CeedScalar *grad_in, const CeedScalar *interp_out, const CeedScalar *grad_out, const CeedEvalMode *e_mode_in, +- const CeedEvalMode *e_mode_out, const CeedScalar *__restrict__ assembled_qf_array, +- CeedScalar *__restrict__ elem_diag_array) { +- const int tid = threadIdx.x; // running with P threads, tid is evec node ++static __device__ __inline__ void DiagonalCore(const CeedInt num_elem, const bool is_point_block, const CeedScalar *identity, ++ const CeedScalar *interp_in, const CeedScalar *grad_in, const CeedScalar *div_in, ++ const CeedScalar *curl_in, const CeedScalar *interp_out, const CeedScalar *grad_out, ++ const CeedScalar *div_out, const CeedScalar *curl_out, const CeedEvalMode *eval_modes_in, ++ const CeedEvalMode *eval_modes_out, const CeedScalar *__restrict__ assembled_qf_array, ++ CeedScalar *__restrict__ elem_diag_array) { ++ const int tid = threadIdx.x; // Running with P threads ++ + if (tid >= NUM_NODES) return; + + // Compute the diagonal of B^T D B + // Each element + for (IndexType e = blockIdx.x * blockDim.z + threadIdx.z; e < num_elem; e += gridDim.x * blockDim.z) { +- IndexType d_out = -1; +- + // Each basis eval mode pair +- for (IndexType e_out = 0; e_out < NUM_E_MODE_OUT; e_out++) { +- const CeedScalar *b_t = NULL; ++ IndexType d_out = 0; ++ CeedEvalMode eval_modes_out_prev = CEED_EVAL_NONE; + +- if 
(e_mode_out[e_out] == CEED_EVAL_GRAD) d_out += 1; +- CeedOperatorGetBasisPointer_Cuda(&b_t, e_mode_out[e_out], identity, interp_out, &grad_out[d_out * NUM_QPTS * NUM_NODES]); +- IndexType d_in = -1; ++ for (IndexType e_out = 0; e_out < NUM_EVAL_MODES_OUT; e_out++) { ++ IndexType d_in = 0; ++ CeedEvalMode eval_modes_in_prev = CEED_EVAL_NONE; ++ const CeedScalar *b_t = NULL; + +- for (IndexType e_in = 0; e_in < NUM_E_MODE_IN; e_in++) { ++ GetBasisPointer(&b_t, eval_modes_out[e_out], identity, interp_out, grad_out, div_out, curl_out); ++ if (e_out == 0 || eval_modes_out[e_out] != eval_modes_out_prev) d_out = 0; ++ else b_t = &b_t[(++d_out) * NUM_QPTS * NUM_NODES]; ++ eval_modes_out_prev = eval_modes_out[e_out]; ++ ++ for (IndexType e_in = 0; e_in < NUM_EVAL_MODES_IN; e_in++) { + const CeedScalar *b = NULL; + +- if (e_mode_in[e_in] == CEED_EVAL_GRAD) d_in += 1; +- CeedOperatorGetBasisPointer_Cuda(&b, e_mode_in[e_in], identity, interp_in, &grad_in[d_in * NUM_QPTS * NUM_NODES]); ++ GetBasisPointer(&b, eval_modes_in[e_in], identity, interp_in, grad_in, div_in, curl_in); ++ if (e_in == 0 || eval_modes_in[e_in] != eval_modes_in_prev) d_in = 0; ++ else b = &b[(++d_in) * NUM_QPTS * NUM_NODES]; ++ eval_modes_in_prev = eval_modes_in[e_in]; ++ + // Each component + for (IndexType comp_out = 0; comp_out < NUM_COMP; comp_out++) { + // Each qpoint/node pair + if (is_point_block) { +- // Point Block Diagonal ++ // Point block diagonal + for (IndexType comp_in = 0; comp_in < NUM_COMP; comp_in++) { + CeedScalar e_value = 0.; + + for (IndexType q = 0; q < NUM_QPTS; q++) { + const CeedScalar qf_value = +- assembled_qf_array[((((e_in * NUM_COMP + comp_in) * NUM_E_MODE_OUT + e_out) * NUM_COMP + comp_out) * num_elem + e) * NUM_QPTS + ++ assembled_qf_array[((((e_in * NUM_COMP + comp_in) * NUM_EVAL_MODES_OUT + e_out) * NUM_COMP + comp_out) * num_elem + e) * ++ NUM_QPTS + + q]; + + e_value += b_t[q * NUM_NODES + tid] * qf_value * b[q * NUM_NODES + tid]; +@@ -86,12 +101,13 @@ __device__ void diagonalCore(const CeedInt num_elem, const bool is_point_block, + elem_diag_array[((comp_out * NUM_COMP + comp_in) * num_elem + e) * NUM_NODES + tid] += e_value; + } + } else { +- // Diagonal Only ++ // Diagonal only + CeedScalar e_value = 0.; + + for (IndexType q = 0; q < NUM_QPTS; q++) { + const CeedScalar qf_value = +- assembled_qf_array[((((e_in * NUM_COMP + comp_out) * NUM_E_MODE_OUT + e_out) * NUM_COMP + comp_out) * num_elem + e) * NUM_QPTS + q]; ++ assembled_qf_array[((((e_in * NUM_COMP + comp_out) * NUM_EVAL_MODES_OUT + e_out) * NUM_COMP + comp_out) * num_elem + e) * NUM_QPTS + ++ q]; + + e_value += b_t[q * NUM_NODES + tid] * qf_value * b[q * NUM_NODES + tid]; + } +@@ -106,21 +122,25 @@ __device__ void diagonalCore(const CeedInt num_elem, const bool is_point_block, + //------------------------------------------------------------------------------ + // Linear diagonal + //------------------------------------------------------------------------------ +-extern "C" __global__ void linearDiagonal(const CeedInt num_elem, const CeedScalar *identity, const CeedScalar *interp_in, const CeedScalar *grad_in, +- const CeedScalar *interp_out, const CeedScalar *grad_out, const CeedEvalMode *e_mode_in, +- const CeedEvalMode *e_mode_out, const CeedScalar *__restrict__ assembled_qf_array, +- CeedScalar *__restrict__ elem_diag_array) { +- diagonalCore(num_elem, false, identity, interp_in, grad_in, interp_out, grad_out, e_mode_in, e_mode_out, assembled_qf_array, elem_diag_array); ++extern "C" __global__ void LinearDiagonal(const CeedInt 
num_elem, const CeedScalar *identity, const CeedScalar *interp_in, const CeedScalar *grad_in, ++ const CeedScalar *div_in, const CeedScalar *curl_in, const CeedScalar *interp_out, ++ const CeedScalar *grad_out, const CeedScalar *div_out, const CeedScalar *curl_out, ++ const CeedEvalMode *eval_modes_in, const CeedEvalMode *eval_modes_out, ++ const CeedScalar *__restrict__ assembled_qf_array, CeedScalar *__restrict__ elem_diag_array) { ++ DiagonalCore(num_elem, false, identity, interp_in, grad_in, div_in, curl_in, interp_out, grad_out, div_out, curl_out, eval_modes_in, eval_modes_out, ++ assembled_qf_array, elem_diag_array); + } + + //------------------------------------------------------------------------------ + // Linear point block diagonal + //------------------------------------------------------------------------------ +-extern "C" __global__ void linearPointBlockDiagonal(const CeedInt num_elem, const CeedScalar *identity, const CeedScalar *interp_in, +- const CeedScalar *grad_in, const CeedScalar *interp_out, const CeedScalar *grad_out, +- const CeedEvalMode *e_mode_in, const CeedEvalMode *e_mode_out, ++extern "C" __global__ void LinearPointBlockDiagonal(const CeedInt num_elem, const CeedScalar *identity, const CeedScalar *interp_in, ++ const CeedScalar *grad_in, const CeedScalar *div_in, const CeedScalar *curl_in, ++ const CeedScalar *interp_out, const CeedScalar *grad_out, const CeedScalar *div_out, ++ const CeedScalar *curl_out, const CeedEvalMode *eval_modes_in, const CeedEvalMode *eval_modes_out, + const CeedScalar *__restrict__ assembled_qf_array, CeedScalar *__restrict__ elem_diag_array) { +- diagonalCore(num_elem, true, identity, interp_in, grad_in, interp_out, grad_out, e_mode_in, e_mode_out, assembled_qf_array, elem_diag_array); ++ DiagonalCore(num_elem, true, identity, interp_in, grad_in, div_in, curl_in, interp_out, grad_out, div_out, curl_out, eval_modes_in, eval_modes_out, ++ assembled_qf_array, elem_diag_array); + } + + //------------------------------------------------------------------------------ +diff --git a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h +index eeb256fe..60d641ed 100644 +--- a/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h ++++ b/include/ceed/jit-source/cuda/cuda-ref-operator-assemble.h +@@ -19,108 +19,92 @@ typedef CeedInt IndexType; + #endif + + //------------------------------------------------------------------------------ +-// Matrix assembly kernel for low-order elements (2D thread block) ++// Matrix assembly kernel + //------------------------------------------------------------------------------ + extern "C" __launch_bounds__(BLOCK_SIZE) __global__ +- void linearAssemble(const CeedScalar *B_in, const CeedScalar *B_out, const CeedScalar *__restrict__ qf_array, +- CeedScalar *__restrict__ values_array) { +- // This kernel assumes B_in and B_out have the same number of quadrature points and basis points. 
+- // TODO: expand to more general cases +- const int i = threadIdx.x; // The output row index of each B^TDB operation +- const int l = threadIdx.y; // The output column index of each B^TDB operation +- // such that we have (Bout^T)_ij D_jk Bin_kl = C_il +- +- // Strides for final output ordering, determined by the reference (interface) implementation of the symbolic assembly, slowest --> fastest: element, +- // comp_in, comp_out, node_row, node_col +- const IndexType comp_out_stride = NUM_NODES * NUM_NODES; +- const IndexType comp_in_stride = comp_out_stride * NUM_COMP; +- const IndexType e_stride = comp_in_stride * NUM_COMP; +- // Strides for QF array, slowest --> fastest: e_mode_in, comp_in, e_mode_out, comp_out, elem, qpt +- const IndexType q_e_stride = NUM_QPTS; +- const IndexType q_comp_out_stride = NUM_ELEM * q_e_stride; +- const IndexType q_e_mode_out_stride = q_comp_out_stride * NUM_COMP; +- const IndexType q_comp_in_stride = q_e_mode_out_stride * NUM_E_MODE_OUT; +- const IndexType q_e_mode_in_stride = q_comp_in_stride * NUM_COMP; +- +- // Loop over each element (if necessary) +- for (IndexType e = blockIdx.x * blockDim.z + threadIdx.z; e < NUM_ELEM; e += gridDim.x * blockDim.z) { +- for (IndexType comp_in = 0; comp_in < NUM_COMP; comp_in++) { +- for (IndexType comp_out = 0; comp_out < NUM_COMP; comp_out++) { +- CeedScalar result = 0.0; +- IndexType qf_index_comp = q_comp_in_stride * comp_in + q_comp_out_stride * comp_out + q_e_stride * e; +- +- for (IndexType e_mode_in = 0; e_mode_in < NUM_E_MODE_IN; e_mode_in++) { +- IndexType b_in_index = e_mode_in * NUM_QPTS * NUM_NODES; ++ void LinearAssemble(const CeedInt num_elem, const CeedScalar *B_in, const CeedScalar *B_out, const bool *orients_in, ++ const CeedInt8 *curl_orients_in, const bool *orients_out, const CeedInt8 *curl_orients_out, ++ const CeedScalar *__restrict__ qf_array, CeedScalar *__restrict__ values_array) { ++ extern __shared__ CeedScalar s_CT[]; ++ CeedScalar *s_C = s_CT + NUM_NODES_OUT * NUM_NODES_IN; + +- for (IndexType e_mode_out = 0; e_mode_out < NUM_E_MODE_OUT; e_mode_out++) { +- IndexType b_out_index = e_mode_out * NUM_QPTS * NUM_NODES; +- IndexType qf_index = qf_index_comp + q_e_mode_out_stride * e_mode_out + q_e_mode_in_stride * e_mode_in; +- +- // Perform the B^T D B operation for this 'chunk' of D (the qf_array) +- for (IndexType j = 0; j < NUM_QPTS; j++) { +- result += B_out[b_out_index + j * NUM_NODES + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NUM_NODES + l]; +- } +- } // end of e_mode_out +- } // end of e_mode_in +- IndexType val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + NUM_NODES * i + l; +- +- values_array[val_index] = result; +- } // end of out component +- } // end of in component +- } // end of element loop +-} +- +-//------------------------------------------------------------------------------ +-// Fallback kernel for larger orders (1D thread block) +-//------------------------------------------------------------------------------ +-extern "C" __launch_bounds__(BLOCK_SIZE) __global__ +- void linearAssembleFallback(const CeedScalar *B_in, const CeedScalar *B_out, const CeedScalar *__restrict__ qf_array, +- CeedScalar *__restrict__ values_array) { +- // This kernel assumes B_in and B_out have the same number of quadrature points and basis points. 
+- // TODO: expand to more general cases +- const int l = threadIdx.x; // The output column index of each B^TDB operation ++ const int l = threadIdx.x; // The output column index of each B^T D B operation + // such that we have (Bout^T)_ij D_jk Bin_kl = C_il + +- // Strides for final output ordering, determined by the reference (interface) implementation of the symbolic assembly, slowest --> fastest: element, ++ // Strides for final output ordering, determined by the reference (interface) implementation of the symbolic assembly, slowest --> fastest: e, + // comp_in, comp_out, node_row, node_col +- const IndexType comp_out_stride = NUM_NODES * NUM_NODES; +- const IndexType comp_in_stride = comp_out_stride * NUM_COMP; +- const IndexType e_stride = comp_in_stride * NUM_COMP; +- // Strides for QF array, slowest --> fastest: e_mode_in, comp_in, e_mode_out, comp_out, elem, qpt +- const IndexType q_e_stride = NUM_QPTS; +- const IndexType q_comp_out_stride = NUM_ELEM * q_e_stride; +- const IndexType q_e_mode_out_stride = q_comp_out_stride * NUM_COMP; +- const IndexType q_comp_in_stride = q_e_mode_out_stride * NUM_E_MODE_OUT; +- const IndexType q_e_mode_in_stride = q_comp_in_stride * NUM_COMP; ++ const IndexType comp_out_stride = NUM_NODES_OUT * NUM_NODES_IN; ++ const IndexType comp_in_stride = comp_out_stride * NUM_COMP_OUT; ++ const IndexType e_stride = comp_in_stride * NUM_COMP_IN; ++ ++ // Strides for QF array, slowest --> fastest: e_in, comp_in, e_out, comp_out, e, q ++ const IndexType q_e_stride = NUM_QPTS; ++ const IndexType q_comp_out_stride = num_elem * q_e_stride; ++ const IndexType q_eval_mode_out_stride = q_comp_out_stride * NUM_COMP_OUT; ++ const IndexType q_comp_in_stride = q_eval_mode_out_stride * NUM_EVAL_MODES_OUT; ++ const IndexType q_eval_mode_in_stride = q_comp_in_stride * NUM_COMP_IN; + + // Loop over each element (if necessary) +- for (IndexType e = blockIdx.x * blockDim.z + threadIdx.z; e < NUM_ELEM; e += gridDim.x * blockDim.z) { +- for (IndexType comp_in = 0; comp_in < NUM_COMP; comp_in++) { +- for (IndexType comp_out = 0; comp_out < NUM_COMP; comp_out++) { +- for (IndexType i = 0; i < NUM_NODES; i++) { ++ for (IndexType e = blockIdx.x * blockDim.z + threadIdx.z; e < num_elem; e += gridDim.x * blockDim.z) { ++ for (IndexType comp_in = 0; comp_in < NUM_COMP_IN; comp_in++) { ++ for (IndexType comp_out = 0; comp_out < NUM_COMP_OUT; comp_out++) { ++ for (IndexType i = threadIdx.y; i < NUM_NODES_OUT; i += BLOCK_SIZE_Y) { + CeedScalar result = 0.0; + IndexType qf_index_comp = q_comp_in_stride * comp_in + q_comp_out_stride * comp_out + q_e_stride * e; + +- for (IndexType e_mode_in = 0; e_mode_in < NUM_E_MODE_IN; e_mode_in++) { +- IndexType b_in_index = e_mode_in * NUM_QPTS * NUM_NODES; ++ for (IndexType e_in = 0; e_in < NUM_EVAL_MODES_IN; e_in++) { ++ IndexType b_in_index = e_in * NUM_QPTS * NUM_NODES_IN; + +- for (IndexType e_mode_out = 0; e_mode_out < NUM_E_MODE_OUT; e_mode_out++) { +- IndexType b_out_index = e_mode_out * NUM_QPTS * NUM_NODES; +- IndexType qf_index = qf_index_comp + q_e_mode_out_stride * e_mode_out + q_e_mode_in_stride * e_mode_in; ++ for (IndexType e_out = 0; e_out < NUM_EVAL_MODES_OUT; e_out++) { ++ IndexType b_out_index = e_out * NUM_QPTS * NUM_NODES_OUT; ++ IndexType qf_index = qf_index_comp + q_eval_mode_out_stride * e_out + q_eval_mode_in_stride * e_in; + + // Perform the B^T D B operation for this 'chunk' of D (the qf_array) + for (IndexType j = 0; j < NUM_QPTS; j++) { +- result += B_out[b_out_index + j * NUM_NODES + i] * qf_array[qf_index + j] * 
B_in[b_in_index + j * NUM_NODES + l]; ++ result += B_out[b_out_index + j * NUM_NODES_OUT + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NUM_NODES_IN + l]; + } +- } // end of e_mode_out +- } // end of e_mode_in +- IndexType val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + NUM_NODES * i + l; +- +- values_array[val_index] = result; ++ } // end of out eval mode ++ } // end of in eval mode ++ if (orients_in) { ++ result *= orients_in[NUM_NODES_IN * e + l] ? -1.0 : 1.0; ++ } ++ if (orients_out) { ++ result *= orients_out[NUM_NODES_OUT * e + i] ? -1.0 : 1.0; ++ } ++ if (!curl_orients_in && !curl_orients_out) { ++ IndexType val_index = e_stride * e + comp_in_stride * comp_in + comp_out_stride * comp_out + NUM_NODES_IN * i + l; ++ ++ values_array[val_index] = result; ++ } else if (curl_orients_in) { ++ s_C[NUM_NODES_IN * threadIdx.y + l] = result; ++ __syncthreads(); ++ s_CT[NUM_NODES_IN * i + l] = ++ (l > 0 ? s_C[NUM_NODES_IN * threadIdx.y + l - 1] * curl_orients_in[3 * NUM_NODES_IN * e + 3 * l - 1] : 0.0) + ++ s_C[NUM_NODES_IN * threadIdx.y + l] * curl_orients_in[3 * NUM_NODES_IN * e + 3 * l + 1] + ++ (l < (NUM_NODES_IN - 1) ? s_C[NUM_NODES_IN * threadIdx.y + l + 1] * curl_orients_in[3 * NUM_NODES_IN * e + 3 * l + 3] : 0.0); ++ } else { ++ s_CT[NUM_NODES_IN * i + l] = result; ++ } + } // end of loop over element node index, i +- } // end of out component +- } // end of in component +- } // end of element loop ++ if (curl_orients_in || curl_orients_out) { ++ // Compute and store the final T^T (B^T D B T) using the fully computed C T product in shared memory ++ if (curl_orients_out) __syncthreads(); ++ for (IndexType i = threadIdx.y; i < NUM_NODES_OUT; i += BLOCK_SIZE_Y) { ++ IndexType val_index = e_stride * e + comp_in_stride * comp_in + comp_out_stride * comp_out + NUM_NODES_IN * i + l; ++ ++ if (curl_orients_out) { ++ values_array[val_index] = ++ (i > 0 ? s_CT[NUM_NODES_IN * (i - 1) + l] * curl_orients_out[3 * NUM_NODES_OUT * e + 3 * i - 1] : 0.0) + ++ s_CT[NUM_NODES_IN * i + l] * curl_orients_out[3 * NUM_NODES_OUT * e + 3 * i + 1] + ++ (i < (NUM_NODES_OUT - 1) ? 
s_CT[NUM_NODES_IN * (i + 1) + l] * curl_orients_out[3 * NUM_NODES_OUT * e + 3 * i + 3] : 0.0); ++ } else { ++ values_array[val_index] = s_CT[NUM_NODES_IN * i + l]; ++ } ++ } ++ } ++ } // end of out component ++ } // end of in component ++ } // end of element loop + } + + //------------------------------------------------------------------------------ +diff --git a/include/ceed/jit-source/cuda/cuda-ref-restriction.h b/include/ceed/jit-source/cuda/cuda-ref-restriction.h +index 1df6f049..80011148 100644 +--- a/include/ceed/jit-source/cuda/cuda-ref-restriction.h ++++ b/include/ceed/jit-source/cuda/cuda-ref-restriction.h +@@ -28,38 +28,107 @@ extern "C" __global__ void StridedNoTranspose(const CeedInt num_elem, const Ceed + } + + //------------------------------------------------------------------------------ +-// E-vector -> L-vector, strided ++// L-vector -> E-vector, standard (with offsets) + //------------------------------------------------------------------------------ +-extern "C" __global__ void StridedTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { ++extern "C" __global__ void OffsetNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ u, ++ CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { ++ const CeedInt ind = indices[node]; + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { +- v[loc_node * RSTR_STRIDE_NODES + comp * RSTR_STRIDE_COMP + elem * RSTR_STRIDE_ELEM] += +- u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]; ++ v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = u[ind + comp * RSTR_COMP_STRIDE]; + } + } + } + + //------------------------------------------------------------------------------ +-// L-vector -> E-vector, offsets provided ++// L-vector -> E-vector, oriented + //------------------------------------------------------------------------------ +-extern "C" __global__ void OffsetNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ u, +- CeedScalar *__restrict__ v) { ++extern "C" __global__ void OrientedNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const bool *__restrict__ orients, ++ const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { + for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt ind = indices[node]; ++ const bool orient = orients[node]; + const CeedInt loc_node = node % RSTR_ELEM_SIZE; + const CeedInt elem = node / RSTR_ELEM_SIZE; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { +- v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = u[ind + comp * RSTR_COMP_STRIDE]; ++ v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = u[ind + comp * RSTR_COMP_STRIDE] * (orient ? 
-1.0 : 1.0); ++ } ++ } ++} ++ ++//------------------------------------------------------------------------------ ++// L-vector -> E-vector, curl-oriented ++//------------------------------------------------------------------------------ ++extern "C" __global__ void CurlOrientedNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, ++ const CeedInt8 *__restrict__ curl_orients, const CeedScalar *__restrict__ u, ++ CeedScalar *__restrict__ v) { ++ for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { ++ const CeedInt loc_node = node % RSTR_ELEM_SIZE; ++ const CeedInt elem = node / RSTR_ELEM_SIZE; ++ const CeedInt ind_dl = loc_node > 0 ? indices[node - 1] : 0; ++ const CeedInt ind_d = indices[node]; ++ const CeedInt ind_du = loc_node < (RSTR_ELEM_SIZE - 1) ? indices[node + 1] : 0; ++ const CeedInt8 curl_orient_dl = curl_orients[3 * node + 0]; ++ const CeedInt8 curl_orient_d = curl_orients[3 * node + 1]; ++ const CeedInt8 curl_orient_du = curl_orients[3 * node + 2]; ++ ++ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { ++ CeedScalar value = 0.0; ++ value += loc_node > 0 ? u[ind_dl + comp * RSTR_COMP_STRIDE] * curl_orient_dl : 0.0; ++ value += u[ind_d + comp * RSTR_COMP_STRIDE] * curl_orient_d; ++ value += loc_node < (RSTR_ELEM_SIZE - 1) ? u[ind_du + comp * RSTR_COMP_STRIDE] * curl_orient_du : 0.0; ++ v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = value; ++ } ++ } ++} ++ ++//------------------------------------------------------------------------------ ++// L-vector -> E-vector, unsigned curl-oriented ++//------------------------------------------------------------------------------ ++extern "C" __global__ void CurlOrientedUnsignedNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, ++ const CeedInt8 *__restrict__ curl_orients, const CeedScalar *__restrict__ u, ++ CeedScalar *__restrict__ v) { ++ for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { ++ const CeedInt loc_node = node % RSTR_ELEM_SIZE; ++ const CeedInt elem = node / RSTR_ELEM_SIZE; ++ const CeedInt ind_dl = loc_node > 0 ? indices[node - 1] : 0; ++ const CeedInt ind_d = indices[node]; ++ const CeedInt ind_du = loc_node < (RSTR_ELEM_SIZE - 1) ? indices[node + 1] : 0; ++ const CeedInt8 curl_orient_dl = abs(curl_orients[3 * node + 0]); ++ const CeedInt8 curl_orient_d = abs(curl_orients[3 * node + 1]); ++ const CeedInt8 curl_orient_du = abs(curl_orients[3 * node + 2]); ++ ++ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { ++ CeedScalar value = 0.0; ++ value += loc_node > 0 ? u[ind_dl + comp * RSTR_COMP_STRIDE] * curl_orient_dl : 0.0; ++ value += u[ind_d + comp * RSTR_COMP_STRIDE] * curl_orient_d; ++ value += loc_node < (RSTR_ELEM_SIZE - 1) ? 
u[ind_du + comp * RSTR_COMP_STRIDE] * curl_orient_du : 0.0; ++ v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = value; ++ } ++ } ++} ++ ++//------------------------------------------------------------------------------ ++// E-vector -> L-vector, strided ++//------------------------------------------------------------------------------ ++extern "C" __global__ void StridedTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { ++ for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { ++ const CeedInt loc_node = node % RSTR_ELEM_SIZE; ++ const CeedInt elem = node / RSTR_ELEM_SIZE; ++ ++ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { ++ v[loc_node * RSTR_STRIDE_NODES + comp * RSTR_STRIDE_COMP + elem * RSTR_STRIDE_ELEM] += ++ u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]; + } + } + } + + //------------------------------------------------------------------------------ +-// E-vector -> L-vector, offsets provided ++// E-vector -> L-vector, standard (with offsets) + //------------------------------------------------------------------------------ + extern "C" __global__ void OffsetTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ u, + CeedScalar *__restrict__ v) { +@@ -87,8 +156,8 @@ extern "C" __global__ void OffsetTransposeDet(const CeedInt *__restrict__ l_vec_ + + for (CeedInt j = range_1; j < range_N; j++) { + const CeedInt t_ind = t_indices[j]; +- CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; +- CeedInt elem = t_ind / RSTR_ELEM_SIZE; ++ const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; ++ const CeedInt elem = t_ind / RSTR_ELEM_SIZE; + + for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { + value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]; +@@ -99,6 +168,165 @@ extern "C" __global__ void OffsetTransposeDet(const CeedInt *__restrict__ l_vec_ + } + } + ++//------------------------------------------------------------------------------ ++// E-vector -> L-vector, oriented ++//------------------------------------------------------------------------------ ++extern "C" __global__ void OrientedTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const bool *__restrict__ orients, ++ const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { ++ for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { ++ const CeedInt ind = indices[node]; ++ const bool orient = orients[node]; ++ const CeedInt loc_node = node % RSTR_ELEM_SIZE; ++ const CeedInt elem = node / RSTR_ELEM_SIZE; ++ ++ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { ++ atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, ++ u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * (orient ? 
-1.0 : 1.0)); ++ } ++ } ++} ++ ++extern "C" __global__ void OrientedTransposeDet(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, ++ const CeedInt *__restrict__ t_offsets, const bool *__restrict__ orients, ++ const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { ++ CeedScalar value[RSTR_NUM_COMP]; ++ ++ for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { ++ const CeedInt ind = l_vec_indices[i]; ++ const CeedInt range_1 = t_offsets[i]; ++ const CeedInt range_N = t_offsets[i + 1]; ++ ++ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; ++ ++ for (CeedInt j = range_1; j < range_N; j++) { ++ const CeedInt t_ind = t_indices[j]; ++ const bool orient = orients[t_ind]; ++ const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; ++ const CeedInt elem = t_ind / RSTR_ELEM_SIZE; ++ ++ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { ++ value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * (orient ? -1.0 : 1.0); ++ } ++ } ++ ++ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; ++ } ++} ++ ++//------------------------------------------------------------------------------ ++// E-vector -> L-vector, curl-oriented ++//------------------------------------------------------------------------------ ++extern "C" __global__ void CurlOrientedTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, ++ const CeedInt8 *__restrict__ curl_orients, const CeedScalar *__restrict__ u, ++ CeedScalar *__restrict__ v) { ++ for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { ++ const CeedInt ind = indices[node]; ++ const CeedInt loc_node = node % RSTR_ELEM_SIZE; ++ const CeedInt elem = node / RSTR_ELEM_SIZE; ++ const CeedInt8 curl_orient_du = loc_node > 0 ? curl_orients[3 * node - 1] : 0.0; ++ const CeedInt8 curl_orient_d = curl_orients[3 * node + 1]; ++ const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? curl_orients[3 * node + 3] : 0.0; ++ ++ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { ++ CeedScalar value = 0.0; ++ value += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; ++ value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; ++ value += ++ loc_node < (RSTR_ELEM_SIZE - 1) ? 
u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; ++ atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value); ++ } ++ } ++} ++ ++extern "C" __global__ void CurlOrientedTransposeDet(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, ++ const CeedInt *__restrict__ t_offsets, const CeedInt8 *__restrict__ curl_orients, ++ const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { ++ CeedScalar value[RSTR_NUM_COMP]; ++ ++ for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { ++ const CeedInt ind = l_vec_indices[i]; ++ const CeedInt range_1 = t_offsets[i]; ++ const CeedInt range_N = t_offsets[i + 1]; ++ ++ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; ++ ++ for (CeedInt j = range_1; j < range_N; j++) { ++ const CeedInt t_ind = t_indices[j]; ++ const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; ++ const CeedInt elem = t_ind / RSTR_ELEM_SIZE; ++ const CeedInt8 curl_orient_du = loc_node > 0 ? curl_orients[3 * t_ind - 1] : 0.0; ++ const CeedInt8 curl_orient_d = curl_orients[3 * t_ind + 1]; ++ const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? curl_orients[3 * t_ind + 3] : 0.0; ++ ++ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { ++ value[comp] += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; ++ value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; ++ value[comp] += ++ loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; ++ } ++ } ++ ++ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; ++ } ++} ++ ++//------------------------------------------------------------------------------ ++// E-vector -> L-vector, unsigned curl-oriented ++//------------------------------------------------------------------------------ ++extern "C" __global__ void CurlOrientedUnsignedTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, ++ const CeedInt8 *__restrict__ curl_orients, const CeedScalar *__restrict__ u, ++ CeedScalar *__restrict__ v) { ++ for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { ++ const CeedInt loc_node = node % RSTR_ELEM_SIZE; ++ const CeedInt elem = node / RSTR_ELEM_SIZE; ++ const CeedInt ind = indices[node]; ++ const CeedInt8 curl_orient_du = loc_node > 0 ? abs(curl_orients[3 * node - 1]) : 0.0; ++ const CeedInt8 curl_orient_d = abs(curl_orients[3 * node + 1]); ++ const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? abs(curl_orients[3 * node + 3]) : 0.0; ++ ++ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { ++ CeedScalar value = 0.0; ++ value += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; ++ value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; ++ value += ++ loc_node < (RSTR_ELEM_SIZE - 1) ? 
u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; ++ atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value); ++ } ++ } ++} ++ ++extern "C" __global__ void CurlOrientedUnsignedTransposeDet(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, ++ const CeedInt *__restrict__ t_offsets, const CeedInt8 *__restrict__ curl_orients, ++ const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { ++ CeedScalar value[RSTR_NUM_COMP]; ++ ++ for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { ++ const CeedInt ind = l_vec_indices[i]; ++ const CeedInt range_1 = t_offsets[i]; ++ const CeedInt range_N = t_offsets[i + 1]; ++ ++ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; ++ ++ for (CeedInt j = range_1; j < range_N; j++) { ++ const CeedInt t_ind = t_indices[j]; ++ const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; ++ const CeedInt elem = t_ind / RSTR_ELEM_SIZE; ++ const CeedInt8 curl_orient_du = loc_node > 0 ? abs(curl_orients[3 * t_ind - 1]) : 0.0; ++ const CeedInt8 curl_orient_d = abs(curl_orients[3 * t_ind + 1]); ++ const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? abs(curl_orients[3 * t_ind + 3]) : 0.0; ++ ++ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { ++ value[comp] += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; ++ value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; ++ value[comp] += ++ loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; ++ } ++ } ++ ++ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; ++ } ++} ++ + //------------------------------------------------------------------------------ + + #endif // CEED_CUDA_REF_RESTRICTION_H +diff --git a/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h b/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h +index 8270c73d..fcd8df29 100644 +--- a/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h ++++ b/include/ceed/jit-source/hip/hip-ref-operator-assemble-diagonal.h +@@ -12,80 +12,106 @@ + + #include + +-#if CEEDSIZE ++#if USE_CEEDSIZE + typedef CeedSize IndexType; + #else + typedef CeedInt IndexType; + #endif + + //------------------------------------------------------------------------------ +-// Get Basis Emode Pointer ++// Get basis pointer + //------------------------------------------------------------------------------ +-extern "C" __device__ void CeedOperatorGetBasisPointer_Hip(const CeedScalar **basisptr, CeedEvalMode emode, const CeedScalar *identity, +- const CeedScalar *interp, const CeedScalar *grad) { +- switch (emode) { ++static __device__ __inline__ void GetBasisPointer(const CeedScalar **basis_ptr, CeedEvalMode eval_modes, const CeedScalar *identity, ++ const CeedScalar *interp, const CeedScalar *grad, const CeedScalar *div, const CeedScalar *curl) { ++ switch (eval_modes) { + case CEED_EVAL_NONE: +- *basisptr = identity; ++ *basis_ptr = identity; + break; + case CEED_EVAL_INTERP: +- *basisptr = interp; ++ *basis_ptr = interp; + break; + case CEED_EVAL_GRAD: +- *basisptr = grad; ++ *basis_ptr = grad; + break; +- case CEED_EVAL_WEIGHT: + case CEED_EVAL_DIV: ++ *basis_ptr = div; ++ break; + case CEED_EVAL_CURL: +- break; // Caught by QF Assembly ++ 
*basis_ptr = curl; ++ break; ++ case CEED_EVAL_WEIGHT: ++ break; // Caught by QF assembly + } + } + + //------------------------------------------------------------------------------ + // Core code for diagonal assembly + //------------------------------------------------------------------------------ +-__device__ void diagonalCore(const CeedInt nelem, const bool pointBlock, const CeedScalar *identity, const CeedScalar *interpin, +- const CeedScalar *gradin, const CeedScalar *interpout, const CeedScalar *gradout, const CeedEvalMode *emodein, +- const CeedEvalMode *emodeout, const CeedScalar *__restrict__ assembledqfarray, CeedScalar *__restrict__ elemdiagarray) { +- const int tid = threadIdx.x; // running with P threads, tid is evec node +- if (tid >= NNODES) return; ++static __device__ __inline__ void DiagonalCore(const CeedInt num_elem, const bool is_point_block, const CeedScalar *identity, ++ const CeedScalar *interp_in, const CeedScalar *grad_in, const CeedScalar *div_in, ++ const CeedScalar *curl_in, const CeedScalar *interp_out, const CeedScalar *grad_out, ++ const CeedScalar *div_out, const CeedScalar *curl_out, const CeedEvalMode *eval_modes_in, ++ const CeedEvalMode *eval_modes_out, const CeedScalar *__restrict__ assembled_qf_array, ++ CeedScalar *__restrict__ elem_diag_array) { ++ const int tid = threadIdx.x; // Running with P threads ++ ++ if (tid >= NUM_NODES) return; + + // Compute the diagonal of B^T D B + // Each element +- for (IndexType e = blockIdx.x * blockDim.z + threadIdx.z; e < nelem; e += gridDim.x * blockDim.z) { +- IndexType dout = -1; ++ for (IndexType e = blockIdx.x * blockDim.z + threadIdx.z; e < num_elem; e += gridDim.x * blockDim.z) { + // Each basis eval mode pair +- for (IndexType eout = 0; eout < NUMEMODEOUT; eout++) { +- const CeedScalar *bt = NULL; +- if (emodeout[eout] == CEED_EVAL_GRAD) dout += 1; +- CeedOperatorGetBasisPointer_Hip(&bt, emodeout[eout], identity, interpout, &gradout[dout * NQPTS * NNODES]); +- IndexType din = -1; +- for (IndexType ein = 0; ein < NUMEMODEIN; ein++) { ++ IndexType d_out = 0; ++ CeedEvalMode eval_modes_out_prev = CEED_EVAL_NONE; ++ ++ for (IndexType e_out = 0; e_out < NUM_EVAL_MODES_OUT; e_out++) { ++ IndexType d_in = 0; ++ CeedEvalMode eval_modes_in_prev = CEED_EVAL_NONE; ++ const CeedScalar *b_t = NULL; ++ ++ GetBasisPointer(&b_t, eval_modes_out[e_out], identity, interp_out, grad_out, div_out, curl_out); ++ if (e_out == 0 || eval_modes_out[e_out] != eval_modes_out_prev) d_out = 0; ++ else b_t = &b_t[(++d_out) * NUM_QPTS * NUM_NODES]; ++ eval_modes_out_prev = eval_modes_out[e_out]; ++ ++ for (IndexType e_in = 0; e_in < NUM_EVAL_MODES_IN; e_in++) { + const CeedScalar *b = NULL; +- if (emodein[ein] == CEED_EVAL_GRAD) din += 1; +- CeedOperatorGetBasisPointer_Hip(&b, emodein[ein], identity, interpin, &gradin[din * NQPTS * NNODES]); ++ ++ GetBasisPointer(&b, eval_modes_in[e_in], identity, interp_in, grad_in, div_in, curl_in); ++ if (e_in == 0 || eval_modes_in[e_in] != eval_modes_in_prev) d_in = 0; ++ else b = &b[(++d_in) * NUM_QPTS * NUM_NODES]; ++ eval_modes_in_prev = eval_modes_in[e_in]; ++ + // Each component +- for (IndexType compOut = 0; compOut < NCOMP; compOut++) { ++ for (IndexType comp_out = 0; comp_out < NUM_COMP; comp_out++) { + // Each qpoint/node pair +- if (pointBlock) { +- // Point Block Diagonal +- for (IndexType compIn = 0; compIn < NCOMP; compIn++) { +- CeedScalar evalue = 0.; +- for (IndexType q = 0; q < NQPTS; q++) { +- const CeedScalar qfvalue = +- assembledqfarray[((((ein * NCOMP + compIn) * NUMEMODEOUT + 
eout) * NCOMP + compOut) * nelem + e) * NQPTS + q]; +- evalue += bt[q * NNODES + tid] * qfvalue * b[q * NNODES + tid]; ++ if (is_point_block) { ++ // Point block diagonal ++ for (IndexType comp_in = 0; comp_in < NUM_COMP; comp_in++) { ++ CeedScalar e_value = 0.; ++ ++ for (IndexType q = 0; q < NUM_QPTS; q++) { ++ const CeedScalar qf_value = ++ assembled_qf_array[((((e_in * NUM_COMP + comp_in) * NUM_EVAL_MODES_OUT + e_out) * NUM_COMP + comp_out) * num_elem + e) * ++ NUM_QPTS + ++ q]; ++ ++ e_value += b_t[q * NUM_NODES + tid] * qf_value * b[q * NUM_NODES + tid]; + } +- elemdiagarray[((compOut * NCOMP + compIn) * nelem + e) * NNODES + tid] += evalue; ++ elem_diag_array[((comp_out * NUM_COMP + comp_in) * num_elem + e) * NUM_NODES + tid] += e_value; + } + } else { +- // Diagonal Only +- CeedScalar evalue = 0.; +- for (IndexType q = 0; q < NQPTS; q++) { +- const CeedScalar qfvalue = +- assembledqfarray[((((ein * NCOMP + compOut) * NUMEMODEOUT + eout) * NCOMP + compOut) * nelem + e) * NQPTS + q]; +- evalue += bt[q * NNODES + tid] * qfvalue * b[q * NNODES + tid]; ++ // Diagonal only ++ CeedScalar e_value = 0.; ++ ++ for (IndexType q = 0; q < NUM_QPTS; q++) { ++ const CeedScalar qf_value = ++ assembled_qf_array[((((e_in * NUM_COMP + comp_out) * NUM_EVAL_MODES_OUT + e_out) * NUM_COMP + comp_out) * num_elem + e) * NUM_QPTS + ++ q]; ++ ++ e_value += b_t[q * NUM_NODES + tid] * qf_value * b[q * NUM_NODES + tid]; + } +- elemdiagarray[(compOut * nelem + e) * NNODES + tid] += evalue; ++ elem_diag_array[(comp_out * num_elem + e) * NUM_NODES + tid] += e_value; + } + } + } +@@ -96,21 +122,25 @@ __device__ void diagonalCore(const CeedInt nelem, const bool pointBlock, const C + //------------------------------------------------------------------------------ + // Linear diagonal + //------------------------------------------------------------------------------ +-extern "C" __global__ void linearDiagonal(const CeedInt nelem, const CeedScalar *identity, const CeedScalar *interpin, const CeedScalar *gradin, +- const CeedScalar *interpout, const CeedScalar *gradout, const CeedEvalMode *emodein, +- const CeedEvalMode *emodeout, const CeedScalar *__restrict__ assembledqfarray, +- CeedScalar *__restrict__ elemdiagarray) { +- diagonalCore(nelem, false, identity, interpin, gradin, interpout, gradout, emodein, emodeout, assembledqfarray, elemdiagarray); ++extern "C" __global__ void LinearDiagonal(const CeedInt num_elem, const CeedScalar *identity, const CeedScalar *interp_in, const CeedScalar *grad_in, ++ const CeedScalar *div_in, const CeedScalar *curl_in, const CeedScalar *interp_out, ++ const CeedScalar *grad_out, const CeedScalar *div_out, const CeedScalar *curl_out, ++ const CeedEvalMode *eval_modes_in, const CeedEvalMode *eval_modes_out, ++ const CeedScalar *__restrict__ assembled_qf_array, CeedScalar *__restrict__ elem_diag_array) { ++ DiagonalCore(num_elem, false, identity, interp_in, grad_in, div_in, curl_in, interp_out, grad_out, div_out, curl_out, eval_modes_in, eval_modes_out, ++ assembled_qf_array, elem_diag_array); + } + + //------------------------------------------------------------------------------ + // Linear point block diagonal + //------------------------------------------------------------------------------ +-extern "C" __global__ void linearPointBlockDiagonal(const CeedInt nelem, const CeedScalar *identity, const CeedScalar *interpin, +- const CeedScalar *gradin, const CeedScalar *interpout, const CeedScalar *gradout, +- const CeedEvalMode *emodein, const CeedEvalMode *emodeout, +- const CeedScalar 
*__restrict__ assembledqfarray, CeedScalar *__restrict__ elemdiagarray) { +- diagonalCore(nelem, true, identity, interpin, gradin, interpout, gradout, emodein, emodeout, assembledqfarray, elemdiagarray); ++extern "C" __global__ void LinearPointBlockDiagonal(const CeedInt num_elem, const CeedScalar *identity, const CeedScalar *interp_in, ++ const CeedScalar *grad_in, const CeedScalar *div_in, const CeedScalar *curl_in, ++ const CeedScalar *interp_out, const CeedScalar *grad_out, const CeedScalar *div_out, ++ const CeedScalar *curl_out, const CeedEvalMode *eval_modes_in, const CeedEvalMode *eval_modes_out, ++ const CeedScalar *__restrict__ assembled_qf_array, CeedScalar *__restrict__ elem_diag_array) { ++ DiagonalCore(num_elem, true, identity, interp_in, grad_in, div_in, curl_in, interp_out, grad_out, div_out, curl_out, eval_modes_in, eval_modes_out, ++ assembled_qf_array, elem_diag_array); + } + + //------------------------------------------------------------------------------ +diff --git a/include/ceed/jit-source/hip/hip-ref-operator-assemble.h b/include/ceed/jit-source/hip/hip-ref-operator-assemble.h +index 005fa6f7..a0c21f9d 100644 +--- a/include/ceed/jit-source/hip/hip-ref-operator-assemble.h ++++ b/include/ceed/jit-source/hip/hip-ref-operator-assemble.h +@@ -12,107 +12,99 @@ + + #include + +-#if CEEDSIZE ++#if USE_CEEDSIZE + typedef CeedSize IndexType; + #else + typedef CeedInt IndexType; + #endif + + //------------------------------------------------------------------------------ +-// Matrix assembly kernel for low-order elements (2D thread block) ++// Matrix assembly kernel + //------------------------------------------------------------------------------ + extern "C" __launch_bounds__(BLOCK_SIZE) __global__ +- void linearAssemble(const CeedScalar *B_in, const CeedScalar *B_out, const CeedScalar *__restrict__ qf_array, +- CeedScalar *__restrict__ values_array) { +- // This kernel assumes B_in and B_out have the same number of quadrature points and basis points. 
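// Illustration only (not part of the patch): the CurlOrientedNoTranspose restriction
// kernels added to the CUDA restriction header above (and to the HIP header below) mix up
// to three L-vector entries into each E-vector entry using the (sub-, main-, super-
// diagonal) triples stored in curl_orients. A host-side sketch for a single element and
// component follows; the names indices, curl_orients, u, and v_elem mirror the kernel
// arguments but this function is only an illustration of the tridiagonal gather.
#include <cstdint>
#include <vector>

static void CurlOrientedGather(const std::vector<int> &indices,              // elem_size L-vector offsets
                               const std::vector<std::int8_t> &curl_orients, // 3 * elem_size (dl, d, du) triples
                               const std::vector<double> &u,                 // L-vector (single component)
                               std::vector<double> &v_elem) {                // E-vector slice, elem_size entries
  const int elem_size = (int)indices.size();
  for (int i = 0; i < elem_size; i++) {
    double value = 0.0;
    if (i > 0)             value += u[indices[i - 1]] * curl_orients[3 * i + 0];  // sub-diagonal
    value                        += u[indices[i]]     * curl_orients[3 * i + 1];  // diagonal
    if (i < elem_size - 1) value += u[indices[i + 1]] * curl_orients[3 * i + 2];  // super-diagonal
    v_elem[i] = value;
  }
}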
+- // TODO: expand to more general cases +- const int i = threadIdx.x; // The output row index of each B^TDB operation +- const int l = threadIdx.y; // The output column index of each B^TDB operation ++ void LinearAssemble(const CeedInt num_elem, const CeedScalar *B_in, const CeedScalar *B_out, const bool *orients_in, ++ const CeedInt8 *curl_orients_in, const bool *orients_out, const CeedInt8 *curl_orients_out, ++ const CeedScalar *__restrict__ qf_array, CeedScalar *__restrict__ values_array) { ++ extern __shared__ CeedScalar s_CT[]; ++ CeedScalar *s_C = s_CT + NUM_NODES_OUT * NUM_NODES_IN; ++ ++ const int l = threadIdx.x; // The output column index of each B^T D B operation + // such that we have (Bout^T)_ij D_jk Bin_kl = C_il + +- // Strides for final output ordering, determined by the reference (interface) implementation of the symbolic assembly, slowest --> fastest: element, ++ // Strides for final output ordering, determined by the reference (interface) implementation of the symbolic assembly, slowest --> fastest: e, + // comp_in, comp_out, node_row, node_col +- const IndexType comp_out_stride = NNODES * NNODES; +- const IndexType comp_in_stride = comp_out_stride * NCOMP; +- const IndexType e_stride = comp_in_stride * NCOMP; +- // Strides for QF array, slowest --> fastest: emode_in, comp_in, emode_out, comp_out, elem, qpt +- const IndexType qe_stride = NQPTS; +- const IndexType qcomp_out_stride = NELEM * qe_stride; +- const IndexType qemode_out_stride = qcomp_out_stride * NCOMP; +- const IndexType qcomp_in_stride = qemode_out_stride * NUMEMODEOUT; +- const IndexType qemode_in_stride = qcomp_in_stride * NCOMP; ++ const IndexType comp_out_stride = NUM_NODES_OUT * NUM_NODES_IN; ++ const IndexType comp_in_stride = comp_out_stride * NUM_COMP_OUT; ++ const IndexType e_stride = comp_in_stride * NUM_COMP_IN; ++ ++ // Strides for QF array, slowest --> fastest: e_in, comp_in, e_out, comp_out, e, q ++ const IndexType q_e_stride = NUM_QPTS; ++ const IndexType q_comp_out_stride = num_elem * q_e_stride; ++ const IndexType q_eval_mode_out_stride = q_comp_out_stride * NUM_COMP_OUT; ++ const IndexType q_comp_in_stride = q_eval_mode_out_stride * NUM_EVAL_MODES_OUT; ++ const IndexType q_eval_mode_in_stride = q_comp_in_stride * NUM_COMP_IN; + + // Loop over each element (if necessary) +- for (IndexType e = blockIdx.x * blockDim.z + threadIdx.z; e < NELEM; e += gridDim.x * blockDim.z) { +- for (IndexType comp_in = 0; comp_in < NCOMP; comp_in++) { +- for (IndexType comp_out = 0; comp_out < NCOMP; comp_out++) { +- CeedScalar result = 0.0; +- IndexType qf_index_comp = qcomp_in_stride * comp_in + qcomp_out_stride * comp_out + qe_stride * e; +- for (IndexType emode_in = 0; emode_in < NUMEMODEIN; emode_in++) { +- IndexType b_in_index = emode_in * NQPTS * NNODES; +- for (IndexType emode_out = 0; emode_out < NUMEMODEOUT; emode_out++) { +- IndexType b_out_index = emode_out * NQPTS * NNODES; +- IndexType qf_index = qf_index_comp + qemode_out_stride * emode_out + qemode_in_stride * emode_in; +- // Perform the B^T D B operation for this 'chunk' of D (the qf_array) +- for (IndexType j = 0; j < NQPTS; j++) { +- result += B_out[b_out_index + j * NNODES + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NNODES + l]; +- } +- } // end of emode_out +- } // end of emode_in +- IndexType val_index = comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + NNODES * i + l; +- values_array[val_index] = result; +- } // end of out component +- } // end of in component +- } // end of element loop +-} ++ for 
(IndexType e = blockIdx.x * blockDim.z + threadIdx.z; e < num_elem; e += gridDim.x * blockDim.z) { ++ for (IndexType comp_in = 0; comp_in < NUM_COMP_IN; comp_in++) { ++ for (IndexType comp_out = 0; comp_out < NUM_COMP_OUT; comp_out++) { ++ for (IndexType i = threadIdx.y; i < NUM_NODES_OUT; i += BLOCK_SIZE_Y) { ++ CeedScalar result = 0.0; ++ IndexType qf_index_comp = q_comp_in_stride * comp_in + q_comp_out_stride * comp_out + q_e_stride * e; + +-//------------------------------------------------------------------------------ +-// Fallback kernel for larger orders (1D thread block) +-//------------------------------------------------------------------------------ +-extern "C" __launch_bounds__(BLOCK_SIZE) __global__ +- void linearAssembleFallback(const CeedScalar *B_in, const CeedScalar *B_out, const CeedScalar *__restrict__ qf_array, +- CeedScalar *__restrict__ values_array) { +- // This kernel assumes B_in and B_out have the same number of quadrature points and basis points. +- // TODO: expand to more general cases +- const int l = threadIdx.x; // The output column index of each B^TDB operation +- // such that we have (Bout^T)_ij D_jk Bin_kl = C_il ++ for (IndexType e_in = 0; e_in < NUM_EVAL_MODES_IN; e_in++) { ++ IndexType b_in_index = e_in * NUM_QPTS * NUM_NODES_IN; + +- // Strides for final output ordering, determined by the reference (interface) implementation of the symbolic assembly, slowest --> fastest: element, +- // comp_in, comp_out, node_row, node_col +- const IndexType comp_out_stride = NNODES * NNODES; +- const IndexType comp_in_stride = comp_out_stride * NCOMP; +- const IndexType e_stride = comp_in_stride * NCOMP; +- // Strides for QF array, slowest --> fastest: emode_in, comp_in, emode_out, comp_out, elem, qpt +- const IndexType qe_stride = NQPTS; +- const IndexType qcomp_out_stride = NELEM * qe_stride; +- const IndexType qemode_out_stride = qcomp_out_stride * NCOMP; +- const IndexType qcomp_in_stride = qemode_out_stride * NUMEMODEOUT; +- const IndexType qemode_in_stride = qcomp_in_stride * NCOMP; ++ for (IndexType e_out = 0; e_out < NUM_EVAL_MODES_OUT; e_out++) { ++ IndexType b_out_index = e_out * NUM_QPTS * NUM_NODES_OUT; ++ IndexType qf_index = qf_index_comp + q_eval_mode_out_stride * e_out + q_eval_mode_in_stride * e_in; + +- // Loop over each element (if necessary) +- for (IndexType e = blockIdx.x * blockDim.z + threadIdx.z; e < NELEM; e += gridDim.x * blockDim.z) { +- for (IndexType comp_in = 0; comp_in < NCOMP; comp_in++) { +- for (IndexType comp_out = 0; comp_out < NCOMP; comp_out++) { +- for (IndexType i = 0; i < NNODES; i++) { +- CeedScalar result = 0.0; +- IndexType qf_index_comp = qcomp_in_stride * comp_in + qcomp_out_stride * comp_out + qe_stride * e; +- for (IndexType emode_in = 0; emode_in < NUMEMODEIN; emode_in++) { +- IndexType b_in_index = emode_in * NQPTS * NNODES; +- for (IndexType emode_out = 0; emode_out < NUMEMODEOUT; emode_out++) { +- IndexType b_out_index = emode_out * NQPTS * NNODES; +- IndexType qf_index = qf_index_comp + qemode_out_stride * emode_out + qemode_in_stride * emode_in; + // Perform the B^T D B operation for this 'chunk' of D (the qf_array) +- for (IndexType j = 0; j < NQPTS; j++) { +- result += B_out[b_out_index + j * NNODES + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NNODES + l]; ++ for (IndexType j = 0; j < NUM_QPTS; j++) { ++ result += B_out[b_out_index + j * NUM_NODES_OUT + i] * qf_array[qf_index + j] * B_in[b_in_index + j * NUM_NODES_IN + l]; + } +- } // end of emode_out +- } // end of emode_in +- IndexType val_index = 
comp_in_stride * comp_in + comp_out_stride * comp_out + e_stride * e + NNODES * i + l; +- values_array[val_index] = result; ++ } // end of out eval mode ++ } // end of in eval mode ++ if (orients_in) { ++ result *= orients_in[NUM_NODES_IN * e + l] ? -1.0 : 1.0; ++ } ++ if (orients_out) { ++ result *= orients_out[NUM_NODES_OUT * e + i] ? -1.0 : 1.0; ++ } ++ if (!curl_orients_in && !curl_orients_out) { ++ IndexType val_index = e_stride * e + comp_in_stride * comp_in + comp_out_stride * comp_out + NUM_NODES_IN * i + l; ++ ++ values_array[val_index] = result; ++ } else if (curl_orients_in) { ++ s_C[NUM_NODES_IN * threadIdx.y + l] = result; ++ __syncthreads(); ++ s_CT[NUM_NODES_IN * i + l] = ++ (l > 0 ? s_C[NUM_NODES_IN * threadIdx.y + l - 1] * curl_orients_in[3 * NUM_NODES_IN * e + 3 * l - 1] : 0.0) + ++ s_C[NUM_NODES_IN * threadIdx.y + l] * curl_orients_in[3 * NUM_NODES_IN * e + 3 * l + 1] + ++ (l < (NUM_NODES_IN - 1) ? s_C[NUM_NODES_IN * threadIdx.y + l + 1] * curl_orients_in[3 * NUM_NODES_IN * e + 3 * l + 3] : 0.0); ++ } else { ++ s_CT[NUM_NODES_IN * i + l] = result; ++ } + } // end of loop over element node index, i +- } // end of out component +- } // end of in component +- } // end of element loop ++ if (curl_orients_in || curl_orients_out) { ++ // Compute and store the final T^T (B^T D B T) using the fully computed C T product in shared memory ++ if (curl_orients_out) __syncthreads(); ++ for (IndexType i = threadIdx.y; i < NUM_NODES_OUT; i += BLOCK_SIZE_Y) { ++ IndexType val_index = e_stride * e + comp_in_stride * comp_in + comp_out_stride * comp_out + NUM_NODES_IN * i + l; ++ ++ if (curl_orients_out) { ++ values_array[val_index] = ++ (i > 0 ? s_CT[NUM_NODES_IN * (i - 1) + l] * curl_orients_out[3 * NUM_NODES_OUT * e + 3 * i - 1] : 0.0) + ++ s_CT[NUM_NODES_IN * i + l] * curl_orients_out[3 * NUM_NODES_OUT * e + 3 * i + 1] + ++ (i < (NUM_NODES_OUT - 1) ? 
s_CT[NUM_NODES_IN * (i + 1) + l] * curl_orients_out[3 * NUM_NODES_OUT * e + 3 * i + 3] : 0.0); ++ } else { ++ values_array[val_index] = s_CT[NUM_NODES_IN * i + l]; ++ } ++ } ++ } ++ } // end of out component ++ } // end of in component ++ } // end of element loop + } + + //------------------------------------------------------------------------------ +diff --git a/include/ceed/jit-source/hip/hip-ref-restriction.h b/include/ceed/jit-source/hip/hip-ref-restriction.h +index c34aa980..e1de6d87 100644 +--- a/include/ceed/jit-source/hip/hip-ref-restriction.h ++++ b/include/ceed/jit-source/hip/hip-ref-restriction.h +@@ -16,86 +16,314 @@ + // L-vector -> E-vector, strided + //------------------------------------------------------------------------------ + extern "C" __global__ void StridedNoTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { +- for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RESTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { +- const CeedInt loc_node = node % RESTR_ELEM_SIZE; +- const CeedInt elem = node / RESTR_ELEM_SIZE; ++ for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { ++ const CeedInt loc_node = node % RSTR_ELEM_SIZE; ++ const CeedInt elem = node / RSTR_ELEM_SIZE; + +- for (CeedInt comp = 0; comp < RESTR_NUM_COMP; comp++) { +- v[loc_node + comp * RESTR_ELEM_SIZE * RESTR_NUM_ELEM + elem * RESTR_ELEM_SIZE] = +- u[loc_node * RESTR_STRIDE_NODES + comp * RESTR_STRIDE_COMP + elem * RESTR_STRIDE_ELEM]; ++ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { ++ v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = ++ u[loc_node * RSTR_STRIDE_NODES + comp * RSTR_STRIDE_COMP + elem * RSTR_STRIDE_ELEM]; + } + } + } + + //------------------------------------------------------------------------------ +-// E-vector -> L-vector, strided ++// L-vector -> E-vector, standard (with offsets) + //------------------------------------------------------------------------------ +-extern "C" __global__ void StridedTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { +- for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RESTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { +- const CeedInt loc_node = node % RESTR_ELEM_SIZE; +- const CeedInt elem = node / RESTR_ELEM_SIZE; ++extern "C" __global__ void OffsetNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ u, ++ CeedScalar *__restrict__ v) { ++ for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { ++ const CeedInt ind = indices[node]; ++ const CeedInt loc_node = node % RSTR_ELEM_SIZE; ++ const CeedInt elem = node / RSTR_ELEM_SIZE; + +- for (CeedInt comp = 0; comp < RESTR_NUM_COMP; comp++) { +- v[loc_node * RESTR_STRIDE_NODES + comp * RESTR_STRIDE_COMP + elem * RESTR_STRIDE_ELEM] += +- u[loc_node + comp * RESTR_ELEM_SIZE * RESTR_NUM_ELEM + elem * RESTR_ELEM_SIZE]; ++ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { ++ v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = u[ind + comp * RSTR_COMP_STRIDE]; + } + } + } + + //------------------------------------------------------------------------------ +-// L-vector -> E-vector, offsets provided ++// L-vector -> E-vector, oriented + 
//------------------------------------------------------------------------------ +-extern "C" __global__ void OffsetNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ u, +- CeedScalar *__restrict__ v) { +- for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RESTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { ++extern "C" __global__ void OrientedNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const bool *__restrict__ orients, ++ const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { ++ for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt ind = indices[node]; +- const CeedInt loc_node = node % RESTR_ELEM_SIZE; +- const CeedInt elem = node / RESTR_ELEM_SIZE; ++ const bool orient = orients[node]; ++ const CeedInt loc_node = node % RSTR_ELEM_SIZE; ++ const CeedInt elem = node / RSTR_ELEM_SIZE; ++ ++ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { ++ v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = u[ind + comp * RSTR_COMP_STRIDE] * (orient ? -1.0 : 1.0); ++ } ++ } ++} ++ ++//------------------------------------------------------------------------------ ++// L-vector -> E-vector, curl-oriented ++//------------------------------------------------------------------------------ ++extern "C" __global__ void CurlOrientedNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, ++ const CeedInt8 *__restrict__ curl_orients, const CeedScalar *__restrict__ u, ++ CeedScalar *__restrict__ v) { ++ for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { ++ const CeedInt loc_node = node % RSTR_ELEM_SIZE; ++ const CeedInt elem = node / RSTR_ELEM_SIZE; ++ const CeedInt ind_dl = loc_node > 0 ? indices[node - 1] : 0; ++ const CeedInt ind_d = indices[node]; ++ const CeedInt ind_du = loc_node < (RSTR_ELEM_SIZE - 1) ? indices[node + 1] : 0; ++ const CeedInt8 curl_orient_dl = curl_orients[3 * node + 0]; ++ const CeedInt8 curl_orient_d = curl_orients[3 * node + 1]; ++ const CeedInt8 curl_orient_du = curl_orients[3 * node + 2]; ++ ++ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { ++ CeedScalar value = 0.0; ++ value += loc_node > 0 ? u[ind_dl + comp * RSTR_COMP_STRIDE] * curl_orient_dl : 0.0; ++ value += u[ind_d + comp * RSTR_COMP_STRIDE] * curl_orient_d; ++ value += loc_node < (RSTR_ELEM_SIZE - 1) ? u[ind_du + comp * RSTR_COMP_STRIDE] * curl_orient_du : 0.0; ++ v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = value; ++ } ++ } ++} ++ ++//------------------------------------------------------------------------------ ++// L-vector -> E-vector, unsigned curl-oriented ++//------------------------------------------------------------------------------ ++extern "C" __global__ void CurlOrientedUnsignedNoTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, ++ const CeedInt8 *__restrict__ curl_orients, const CeedScalar *__restrict__ u, ++ CeedScalar *__restrict__ v) { ++ for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { ++ const CeedInt loc_node = node % RSTR_ELEM_SIZE; ++ const CeedInt elem = node / RSTR_ELEM_SIZE; ++ const CeedInt ind_dl = loc_node > 0 ? indices[node - 1] : 0; ++ const CeedInt ind_d = indices[node]; ++ const CeedInt ind_du = loc_node < (RSTR_ELEM_SIZE - 1) ? 
indices[node + 1] : 0; ++ const CeedInt8 curl_orient_dl = abs(curl_orients[3 * node + 0]); ++ const CeedInt8 curl_orient_d = abs(curl_orients[3 * node + 1]); ++ const CeedInt8 curl_orient_du = abs(curl_orients[3 * node + 2]); ++ ++ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { ++ CeedScalar value = 0.0; ++ value += loc_node > 0 ? u[ind_dl + comp * RSTR_COMP_STRIDE] * curl_orient_dl : 0.0; ++ value += u[ind_d + comp * RSTR_COMP_STRIDE] * curl_orient_d; ++ value += loc_node < (RSTR_ELEM_SIZE - 1) ? u[ind_du + comp * RSTR_COMP_STRIDE] * curl_orient_du : 0.0; ++ v[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] = value; ++ } ++ } ++} ++ ++//------------------------------------------------------------------------------ ++// E-vector -> L-vector, strided ++//------------------------------------------------------------------------------ ++extern "C" __global__ void StridedTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { ++ for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { ++ const CeedInt loc_node = node % RSTR_ELEM_SIZE; ++ const CeedInt elem = node / RSTR_ELEM_SIZE; + +- for (CeedInt comp = 0; comp < RESTR_NUM_COMP; comp++) { +- v[loc_node + comp * RESTR_ELEM_SIZE * RESTR_NUM_ELEM + elem * RESTR_ELEM_SIZE] = u[ind + comp * RESTR_COMP_STRIDE]; ++ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { ++ v[loc_node * RSTR_STRIDE_NODES + comp * RSTR_STRIDE_COMP + elem * RSTR_STRIDE_ELEM] += ++ u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]; + } + } + } + + //------------------------------------------------------------------------------ +-// E-vector -> L-vector, offsets provided ++// E-vector -> L-vector, standard (with offsets) + //------------------------------------------------------------------------------ + extern "C" __global__ void OffsetTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ u, + CeedScalar *__restrict__ v) { +- for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RESTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { ++ for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { + const CeedInt ind = indices[node]; +- const CeedInt loc_node = node % RESTR_ELEM_SIZE; +- const CeedInt elem = node / RESTR_ELEM_SIZE; ++ const CeedInt loc_node = node % RSTR_ELEM_SIZE; ++ const CeedInt elem = node / RSTR_ELEM_SIZE; + +- for (CeedInt comp = 0; comp < RESTR_NUM_COMP; comp++) { +- atomicAdd(v + ind + comp * RESTR_COMP_STRIDE, u[loc_node + comp * RESTR_ELEM_SIZE * RESTR_NUM_ELEM + elem * RESTR_ELEM_SIZE]); ++ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { ++ atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]); + } + } + } + + extern "C" __global__ void OffsetTransposeDet(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, + const CeedInt *__restrict__ t_offsets, const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { +- CeedScalar value[RESTR_NUM_COMP]; ++ CeedScalar value[RSTR_NUM_COMP]; + +- for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RESTR_NUM_NODES; i += blockDim.x * gridDim.x) { ++ for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { + const CeedInt ind = l_vec_indices[i]; + 
const CeedInt range_1 = t_offsets[i]; + const CeedInt range_N = t_offsets[i + 1]; + +- for (CeedInt comp = 0; comp < RESTR_NUM_COMP; comp++) value[comp] = 0.0; ++ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; + + for (CeedInt j = range_1; j < range_N; j++) { + const CeedInt t_ind = t_indices[j]; +- CeedInt loc_node = t_ind % RESTR_ELEM_SIZE; +- CeedInt elem = t_ind / RESTR_ELEM_SIZE; ++ const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; ++ const CeedInt elem = t_ind / RSTR_ELEM_SIZE; ++ ++ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { ++ value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE]; ++ } ++ } ++ ++ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; ++ } ++} ++ ++//------------------------------------------------------------------------------ ++// E-vector -> L-vector, oriented ++//------------------------------------------------------------------------------ ++extern "C" __global__ void OrientedTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, const bool *__restrict__ orients, ++ const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { ++ for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { ++ const CeedInt ind = indices[node]; ++ const bool orient = orients[node]; ++ const CeedInt loc_node = node % RSTR_ELEM_SIZE; ++ const CeedInt elem = node / RSTR_ELEM_SIZE; ++ ++ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { ++ atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, ++ u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * (orient ? -1.0 : 1.0)); ++ } ++ } ++} ++ ++extern "C" __global__ void OrientedTransposeDet(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, ++ const CeedInt *__restrict__ t_offsets, const bool *__restrict__ orients, ++ const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { ++ CeedScalar value[RSTR_NUM_COMP]; ++ ++ for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { ++ const CeedInt ind = l_vec_indices[i]; ++ const CeedInt range_1 = t_offsets[i]; ++ const CeedInt range_N = t_offsets[i + 1]; ++ ++ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; ++ ++ for (CeedInt j = range_1; j < range_N; j++) { ++ const CeedInt t_ind = t_indices[j]; ++ const bool orient = orients[t_ind]; ++ const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; ++ const CeedInt elem = t_ind / RSTR_ELEM_SIZE; ++ ++ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { ++ value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * (orient ? 
-1.0 : 1.0); ++ } ++ } ++ ++ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; ++ } ++} ++ ++//------------------------------------------------------------------------------ ++// E-vector -> L-vector, curl-oriented ++//------------------------------------------------------------------------------ ++extern "C" __global__ void CurlOrientedTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, ++ const CeedInt8 *__restrict__ curl_orients, const CeedScalar *__restrict__ u, ++ CeedScalar *__restrict__ v) { ++ for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { ++ const CeedInt ind = indices[node]; ++ const CeedInt loc_node = node % RSTR_ELEM_SIZE; ++ const CeedInt elem = node / RSTR_ELEM_SIZE; ++ const CeedInt8 curl_orient_du = loc_node > 0 ? curl_orients[3 * node - 1] : 0.0; ++ const CeedInt8 curl_orient_d = curl_orients[3 * node + 1]; ++ const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? curl_orients[3 * node + 3] : 0.0; ++ ++ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { ++ CeedScalar value = 0.0; ++ value += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; ++ value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; ++ value += ++ loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; ++ atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value); ++ } ++ } ++} ++ ++extern "C" __global__ void CurlOrientedTransposeDet(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, ++ const CeedInt *__restrict__ t_offsets, const CeedInt8 *__restrict__ curl_orients, ++ const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { ++ CeedScalar value[RSTR_NUM_COMP]; ++ ++ for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { ++ const CeedInt ind = l_vec_indices[i]; ++ const CeedInt range_1 = t_offsets[i]; ++ const CeedInt range_N = t_offsets[i + 1]; ++ ++ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; ++ ++ for (CeedInt j = range_1; j < range_N; j++) { ++ const CeedInt t_ind = t_indices[j]; ++ const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; ++ const CeedInt elem = t_ind / RSTR_ELEM_SIZE; ++ const CeedInt8 curl_orient_du = loc_node > 0 ? curl_orients[3 * t_ind - 1] : 0.0; ++ const CeedInt8 curl_orient_d = curl_orients[3 * t_ind + 1]; ++ const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? curl_orients[3 * t_ind + 3] : 0.0; ++ ++ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { ++ value[comp] += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; ++ value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; ++ value[comp] += ++ loc_node < (RSTR_ELEM_SIZE - 1) ? 
u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; ++ } ++ } ++ ++ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; ++ } ++} ++ ++//------------------------------------------------------------------------------ ++// E-vector -> L-vector, unsigned curl-oriented ++//------------------------------------------------------------------------------ ++extern "C" __global__ void CurlOrientedUnsignedTranspose(const CeedInt num_elem, const CeedInt *__restrict__ indices, ++ const CeedInt8 *__restrict__ curl_orients, const CeedScalar *__restrict__ u, ++ CeedScalar *__restrict__ v) { ++ for (CeedInt node = blockIdx.x * blockDim.x + threadIdx.x; node < num_elem * RSTR_ELEM_SIZE; node += blockDim.x * gridDim.x) { ++ const CeedInt loc_node = node % RSTR_ELEM_SIZE; ++ const CeedInt elem = node / RSTR_ELEM_SIZE; ++ const CeedInt ind = indices[node]; ++ const CeedInt8 curl_orient_du = loc_node > 0 ? abs(curl_orients[3 * node - 1]) : 0.0; ++ const CeedInt8 curl_orient_d = abs(curl_orients[3 * node + 1]); ++ const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? abs(curl_orients[3 * node + 3]) : 0.0; ++ ++ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { ++ CeedScalar value = 0.0; ++ value += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; ++ value += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; ++ value += ++ loc_node < (RSTR_ELEM_SIZE - 1) ? u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; ++ atomicAdd(v + ind + comp * RSTR_COMP_STRIDE, value); ++ } ++ } ++} ++ ++extern "C" __global__ void CurlOrientedUnsignedTransposeDet(const CeedInt *__restrict__ l_vec_indices, const CeedInt *__restrict__ t_indices, ++ const CeedInt *__restrict__ t_offsets, const CeedInt8 *__restrict__ curl_orients, ++ const CeedScalar *__restrict__ u, CeedScalar *__restrict__ v) { ++ CeedScalar value[RSTR_NUM_COMP]; ++ ++ for (CeedInt i = blockIdx.x * blockDim.x + threadIdx.x; i < RSTR_NUM_NODES; i += blockDim.x * gridDim.x) { ++ const CeedInt ind = l_vec_indices[i]; ++ const CeedInt range_1 = t_offsets[i]; ++ const CeedInt range_N = t_offsets[i + 1]; ++ ++ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) value[comp] = 0.0; ++ ++ for (CeedInt j = range_1; j < range_N; j++) { ++ const CeedInt t_ind = t_indices[j]; ++ const CeedInt loc_node = t_ind % RSTR_ELEM_SIZE; ++ const CeedInt elem = t_ind / RSTR_ELEM_SIZE; ++ const CeedInt8 curl_orient_du = loc_node > 0 ? abs(curl_orients[3 * t_ind - 1]) : 0.0; ++ const CeedInt8 curl_orient_d = abs(curl_orients[3 * t_ind + 1]); ++ const CeedInt8 curl_orient_dl = loc_node < (RSTR_ELEM_SIZE - 1) ? abs(curl_orients[3 * t_ind + 3]) : 0.0; + +- for (CeedInt comp = 0; comp < RESTR_NUM_COMP; comp++) { +- value[comp] += u[loc_node + comp * RESTR_ELEM_SIZE * RESTR_NUM_ELEM + elem * RESTR_ELEM_SIZE]; ++ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) { ++ value[comp] += loc_node > 0 ? u[loc_node - 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_du : 0.0; ++ value[comp] += u[loc_node + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_d; ++ value[comp] += ++ loc_node < (RSTR_ELEM_SIZE - 1) ? 
u[loc_node + 1 + comp * RSTR_ELEM_SIZE * RSTR_NUM_ELEM + elem * RSTR_ELEM_SIZE] * curl_orient_dl : 0.0; + } + } + +- for (CeedInt comp = 0; comp < RESTR_NUM_COMP; comp++) v[ind + comp * RESTR_COMP_STRIDE] += value[comp]; ++ for (CeedInt comp = 0; comp < RSTR_NUM_COMP; comp++) v[ind + comp * RSTR_COMP_STRIDE] += value[comp]; + } + } + +diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c +index 909d9d51..129f5017 100644 +--- a/interface/ceed-operator.c ++++ b/interface/ceed-operator.c +@@ -26,41 +26,41 @@ + + @param[in] ceed Ceed object for error handling + @param[in] qf_field QFunction Field matching Operator Field +- @param[in] r Operator Field ElemRestriction +- @param[in] b Operator Field Basis ++ @param[in] rstr Operator Field ElemRestriction ++ @param[in] basis Operator Field Basis + + @return An error code: 0 - success, otherwise - failure + + @ref Developer + **/ +-static int CeedOperatorCheckField(Ceed ceed, CeedQFunctionField qf_field, CeedElemRestriction r, CeedBasis b) { ++static int CeedOperatorCheckField(Ceed ceed, CeedQFunctionField qf_field, CeedElemRestriction rstr, CeedBasis basis) { + CeedInt dim = 1, num_comp = 1, q_comp = 1, rstr_num_comp = 1, size = qf_field->size; + CeedEvalMode eval_mode = qf_field->eval_mode; + + // Restriction +- CeedCheck((r == CEED_ELEMRESTRICTION_NONE) == (eval_mode == CEED_EVAL_WEIGHT), ceed, CEED_ERROR_INCOMPATIBLE, ++ CeedCheck((rstr == CEED_ELEMRESTRICTION_NONE) == (eval_mode == CEED_EVAL_WEIGHT), ceed, CEED_ERROR_INCOMPATIBLE, + "CEED_ELEMRESTRICTION_NONE and CEED_EVAL_WEIGHT must be used together."); +- if (r != CEED_ELEMRESTRICTION_NONE) { +- CeedCall(CeedElemRestrictionGetNumComponents(r, &rstr_num_comp)); ++ if (rstr != CEED_ELEMRESTRICTION_NONE) { ++ CeedCall(CeedElemRestrictionGetNumComponents(rstr, &rstr_num_comp)); + } + // Basis +- CeedCheck((b == CEED_BASIS_NONE) == (eval_mode == CEED_EVAL_NONE), ceed, CEED_ERROR_INCOMPATIBLE, ++ CeedCheck((basis == CEED_BASIS_NONE) == (eval_mode == CEED_EVAL_NONE), ceed, CEED_ERROR_INCOMPATIBLE, + "CEED_BASIS_NONE and CEED_EVAL_NONE must be used together."); +- if (b != CEED_BASIS_NONE) { +- CeedCall(CeedBasisGetDimension(b, &dim)); +- CeedCall(CeedBasisGetNumComponents(b, &num_comp)); +- CeedCall(CeedBasisGetNumQuadratureComponents(b, eval_mode, &q_comp)); +- CeedCheck(r == CEED_ELEMRESTRICTION_NONE || rstr_num_comp == num_comp, ceed, CEED_ERROR_DIMENSION, ++ if (basis != CEED_BASIS_NONE) { ++ CeedCall(CeedBasisGetDimension(basis, &dim)); ++ CeedCall(CeedBasisGetNumComponents(basis, &num_comp)); ++ CeedCall(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); ++ CeedCheck(rstr == CEED_ELEMRESTRICTION_NONE || rstr_num_comp == num_comp, ceed, CEED_ERROR_DIMENSION, + "Field '%s' of size %" CeedInt_FMT " and EvalMode %s: ElemRestriction has %" CeedInt_FMT " components, but Basis has %" CeedInt_FMT + " components", +- qf_field->field_name, qf_field->size, CeedEvalModes[qf_field->eval_mode], rstr_num_comp, num_comp); ++ qf_field->field_name, size, CeedEvalModes[eval_mode], rstr_num_comp, num_comp); + } + // Field size + switch (eval_mode) { + case CEED_EVAL_NONE: + CeedCheck(size == rstr_num_comp, ceed, CEED_ERROR_DIMENSION, +- "Field '%s' of size %" CeedInt_FMT " and EvalMode %s: ElemRestriction has %" CeedInt_FMT " components", qf_field->field_name, +- qf_field->size, CeedEvalModes[qf_field->eval_mode], rstr_num_comp); ++ "Field '%s' of size %" CeedInt_FMT " and EvalMode %s: ElemRestriction has %" CeedInt_FMT " components", qf_field->field_name, size, ++ 
CeedEvalModes[eval_mode], rstr_num_comp); + break; + case CEED_EVAL_INTERP: + case CEED_EVAL_GRAD: +@@ -68,7 +68,7 @@ static int CeedOperatorCheckField(Ceed ceed, CeedQFunctionField qf_field, CeedEl + case CEED_EVAL_CURL: + CeedCheck(size == num_comp * q_comp, ceed, CEED_ERROR_DIMENSION, + "Field '%s' of size %" CeedInt_FMT " and EvalMode %s: ElemRestriction/Basis has %" CeedInt_FMT " components", qf_field->field_name, +- qf_field->size, CeedEvalModes[qf_field->eval_mode], num_comp * q_comp); ++ size, CeedEvalModes[eval_mode], num_comp * q_comp); + break; + case CEED_EVAL_WEIGHT: + // No additional checks required +@@ -672,12 +672,12 @@ int CeedOperatorReferenceCopy(CeedOperator op, CeedOperator *op_copy) { + There can be at most one active input CeedVector and at most one active output CeedVector passed to CeedOperatorApply(). + + The number of quadrature points must agree across all points. +- When using @ref CEED_BASIS_NONE, the number of quadrature points is determined by the element size of r. ++ When using @ref CEED_BASIS_NONE, the number of quadrature points is determined by the element size of rstr. + + @param[in,out] op CeedOperator on which to provide the field + @param[in] field_name Name of the field (to be matched with the name used by CeedQFunction) +- @param[in] r CeedElemRestriction +- @param[in] b CeedBasis in which the field resides or @ref CEED_BASIS_NONE if collocated with quadrature points ++ @param[in] rstr CeedElemRestriction ++ @param[in] basis CeedBasis in which the field resides or @ref CEED_BASIS_NONE if collocated with quadrature points + @param[in] v CeedVector to be used by CeedOperator or @ref CEED_VECTOR_ACTIVE if field is active or @ref CEED_VECTOR_NONE + if using @ref CEED_EVAL_WEIGHT in the QFunction + +@@ -685,35 +685,31 @@ int CeedOperatorReferenceCopy(CeedOperator op, CeedOperator *op_copy) { + + @ref User + **/ +-int CeedOperatorSetField(CeedOperator op, const char *field_name, CeedElemRestriction r, CeedBasis b, CeedVector v) { +- bool is_input = true; +- CeedInt num_elem = 0, num_qpts = 0; +- CeedQFunctionField qf_field; +- CeedOperatorField *op_field; ++int CeedOperatorSetField(CeedOperator op, const char *field_name, CeedElemRestriction rstr, CeedBasis basis, CeedVector v) { ++ bool is_input = true; ++ CeedInt num_elem = 0, num_qpts = 0; ++ CeedRestrictionType rstr_type; ++ CeedQFunctionField qf_field; ++ CeedOperatorField *op_field; + + CeedCheck(!op->is_composite, op->ceed, CEED_ERROR_INCOMPATIBLE, "Cannot add field to composite operator."); + CeedCheck(!op->is_immutable, op->ceed, CEED_ERROR_MAJOR, "Operator cannot be changed after set as immutable"); +- CeedCheck(r, op->ceed, CEED_ERROR_INCOMPATIBLE, "ElemRestriction r for field \"%s\" must be non-NULL.", field_name); +- CeedCheck(b, op->ceed, CEED_ERROR_INCOMPATIBLE, "Basis b for field \"%s\" must be non-NULL.", field_name); +- CeedCheck(v, op->ceed, CEED_ERROR_INCOMPATIBLE, "Vector v for field \"%s\" must be non-NULL.", field_name); ++ CeedCheck(rstr, op->ceed, CEED_ERROR_INCOMPATIBLE, "ElemRestriction for field \"%s\" must be non-NULL.", field_name); ++ CeedCheck(basis, op->ceed, CEED_ERROR_INCOMPATIBLE, "Basis for field \"%s\" must be non-NULL.", field_name); ++ CeedCheck(v, op->ceed, CEED_ERROR_INCOMPATIBLE, "Vector for field \"%s\" must be non-NULL.", field_name); + +- CeedCall(CeedElemRestrictionGetNumElements(r, &num_elem)); +- CeedCheck(r == CEED_ELEMRESTRICTION_NONE || !op->has_restriction || op->num_elem == num_elem, op->ceed, CEED_ERROR_DIMENSION, ++ 
CeedCall(CeedElemRestrictionGetNumElements(rstr, &num_elem)); ++ CeedCheck(rstr == CEED_ELEMRESTRICTION_NONE || !op->has_restriction || op->num_elem == num_elem, op->ceed, CEED_ERROR_DIMENSION, + "ElemRestriction with %" CeedInt_FMT " elements incompatible with prior %" CeedInt_FMT " elements", num_elem, op->num_elem); +- { +- CeedRestrictionType rstr_type; +- +- CeedCall(CeedElemRestrictionGetType(r, &rstr_type)); +- CeedCheck(rstr_type != CEED_RESTRICTION_POINTS, op->ceed, CEED_ERROR_UNSUPPORTED, +- "CeedElemRestrictionAtPoints not supported for standard operator fields"); +- } +- +- if (b == CEED_BASIS_NONE) CeedCall(CeedElemRestrictionGetElementSize(r, &num_qpts)); +- else CeedCall(CeedBasisGetNumQuadraturePoints(b, &num_qpts)); ++ CeedCall(CeedElemRestrictionGetType(rstr, &rstr_type)); ++ CeedCheck(rstr_type != CEED_RESTRICTION_POINTS, op->ceed, CEED_ERROR_UNSUPPORTED, ++ "CeedElemRestrictionAtPoints not supported for standard operator fields"); ++ if (basis == CEED_BASIS_NONE) CeedCall(CeedElemRestrictionGetElementSize(rstr, &num_qpts)); ++ else CeedCall(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); + CeedCheck(op->num_qpts == 0 || op->num_qpts == num_qpts, op->ceed, CEED_ERROR_DIMENSION, + "%s must correspond to the same number of quadrature points as previously added Bases. Found %" CeedInt_FMT + " quadrature points but expected %" CeedInt_FMT " quadrature points.", +- b == CEED_BASIS_NONE ? "ElemRestriction" : "Basis", num_qpts, op->num_qpts); ++ basis == CEED_BASIS_NONE ? "ElemRestriction" : "Basis", num_qpts, op->num_qpts); + for (CeedInt i = 0; i < op->qf->num_input_fields; i++) { + if (!strcmp(field_name, (*op->qf->input_fields[i]).field_name)) { + qf_field = op->qf->input_fields[i]; +@@ -733,13 +729,13 @@ int CeedOperatorSetField(CeedOperator op, const char *field_name, CeedElemRestri + return CeedError(op->ceed, CEED_ERROR_INCOMPLETE, "QFunction has no knowledge of field '%s'", field_name); + // LCOV_EXCL_STOP + found: +- CeedCall(CeedOperatorCheckField(op->ceed, qf_field, r, b)); ++ CeedCall(CeedOperatorCheckField(op->ceed, qf_field, rstr, basis)); + CeedCall(CeedCalloc(1, op_field)); + + if (v == CEED_VECTOR_ACTIVE) { + CeedSize l_size; + +- CeedCall(CeedElemRestrictionGetLVectorSize(r, &l_size)); ++ CeedCall(CeedElemRestrictionGetLVectorSize(rstr, &l_size)); + if (is_input) { + if (op->input_size == -1) op->input_size = l_size; + CeedCheck(l_size == op->input_size, op->ceed, CEED_ERROR_INCOMPATIBLE, "LVector size %td does not match previous size %td", l_size, +@@ -752,12 +748,12 @@ found: + } + + CeedCall(CeedVectorReferenceCopy(v, &(*op_field)->vec)); +- CeedCall(CeedElemRestrictionReferenceCopy(r, &(*op_field)->elem_rstr)); +- if (r != CEED_ELEMRESTRICTION_NONE && !op->has_restriction) { ++ CeedCall(CeedElemRestrictionReferenceCopy(rstr, &(*op_field)->elem_rstr)); ++ if (rstr != CEED_ELEMRESTRICTION_NONE && !op->has_restriction) { + op->num_elem = num_elem; + op->has_restriction = true; // Restriction set, but num_elem may be 0 + } +- CeedCall(CeedBasisReferenceCopy(b, &(*op_field)->basis)); ++ CeedCall(CeedBasisReferenceCopy(basis, &(*op_field)->basis)); + if (op->num_qpts == 0) op->num_qpts = num_qpts; + op->num_fields += 1; + CeedCall(CeedStringAllocCopy(field_name, (char **)&(*op_field)->field_name)); +diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c +index 5a09a3b8..549c2432 100644 +--- a/interface/ceed-preconditioning.c ++++ b/interface/ceed-preconditioning.c +@@ -139,40 +139,6 @@ static int 
CeedOperatorCreateFallback(CeedOperator op) { + return CEED_ERROR_SUCCESS; + } + +-/** +- @brief Select correct basis matrix pointer based on CeedEvalMode +- +- @param[in] basis CeedBasis from which to get the basis matrix +- @param[in] eval_mode Current basis evaluation mode +- @param[in] identity Pointer to identity matrix +- @param[out] basis_ptr Basis pointer to set +- +- @ref Developer +-**/ +-static inline int CeedOperatorGetBasisPointer(CeedBasis basis, CeedEvalMode eval_mode, const CeedScalar *identity, const CeedScalar **basis_ptr) { +- switch (eval_mode) { +- case CEED_EVAL_NONE: +- *basis_ptr = identity; +- break; +- case CEED_EVAL_INTERP: +- CeedCall(CeedBasisGetInterp(basis, basis_ptr)); +- break; +- case CEED_EVAL_GRAD: +- CeedCall(CeedBasisGetGrad(basis, basis_ptr)); +- break; +- case CEED_EVAL_DIV: +- CeedCall(CeedBasisGetDiv(basis, basis_ptr)); +- break; +- case CEED_EVAL_CURL: +- CeedCall(CeedBasisGetCurl(basis, basis_ptr)); +- break; +- case CEED_EVAL_WEIGHT: +- break; // Caught by QF Assembly +- } +- assert(*basis_ptr != NULL); +- return CEED_ERROR_SUCCESS; +-} +- + /** + @brief Core logic for assembling operator diagonal or point block diagonal + +@@ -1000,6 +966,40 @@ CeedPragmaOptimizeOn + /// @addtogroup CeedOperatorBackend + /// @{ + ++/** ++ @brief Select correct basis matrix pointer based on CeedEvalMode ++ ++ @param[in] basis CeedBasis from which to get the basis matrix ++ @param[in] eval_mode Current basis evaluation mode ++ @param[in] identity Pointer to identity matrix ++ @param[out] basis_ptr Basis pointer to set ++ ++ @ref Backend ++**/ ++int CeedOperatorGetBasisPointer(CeedBasis basis, CeedEvalMode eval_mode, const CeedScalar *identity, const CeedScalar **basis_ptr) { ++ switch (eval_mode) { ++ case CEED_EVAL_NONE: ++ *basis_ptr = identity; ++ break; ++ case CEED_EVAL_INTERP: ++ CeedCall(CeedBasisGetInterp(basis, basis_ptr)); ++ break; ++ case CEED_EVAL_GRAD: ++ CeedCall(CeedBasisGetGrad(basis, basis_ptr)); ++ break; ++ case CEED_EVAL_DIV: ++ CeedCall(CeedBasisGetDiv(basis, basis_ptr)); ++ break; ++ case CEED_EVAL_CURL: ++ CeedCall(CeedBasisGetCurl(basis, basis_ptr)); ++ break; ++ case CEED_EVAL_WEIGHT: ++ break; // Caught by QF Assembly ++ } ++ assert(*basis_ptr != NULL); ++ return CEED_ERROR_SUCCESS; ++} ++ + /** + @brief Create point block restriction for active operator field + +@@ -1277,10 +1277,10 @@ int CeedOperatorAssemblyDataCreate(Ceed ceed, CeedOperator op, CeedOperatorAssem + + // Build OperatorAssembly data + CeedCall(CeedOperatorGetQFunction(op, &qf)); +- CeedCall(CeedQFunctionGetFields(qf, &num_input_fields, &qf_fields, NULL, NULL)); +- CeedCall(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL)); + + // Determine active input basis ++ CeedCall(CeedQFunctionGetFields(qf, &num_input_fields, &qf_fields, NULL, NULL)); ++ CeedCall(CeedOperatorGetFields(op, NULL, &op_fields, NULL, NULL)); + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedVector vec; + diff --git a/extern/patch/libCEED/patch_hcurl_hdiv_basis_cuda_hip.diff b/extern/patch/libCEED/patch_hcurl_hdiv_basis_cuda_hip.diff index 2cc731abed..5c14a943ba 100644 --- a/extern/patch/libCEED/patch_hcurl_hdiv_basis_cuda_hip.diff +++ b/extern/patch/libCEED/patch_hcurl_hdiv_basis_cuda_hip.diff @@ -1,1113 +1,1113 @@ -diff --git a/backends/cuda-ref/ceed-cuda-ref-basis.c b/backends/cuda-ref/ceed-cuda-ref-basis.c -index 3ec0d47b..f7f07784 100644 ---- a/backends/cuda-ref/ceed-cuda-ref-basis.c -+++ b/backends/cuda-ref/ceed-cuda-ref-basis.c -@@ -20,7 +20,6 @@ - 
//------------------------------------------------------------------------------ - int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { - Ceed ceed; -- Ceed_Cuda *ceed_Cuda; - CeedInt Q_1d, dim; - const CeedInt transpose = t_mode == CEED_TRANSPOSE; - const int max_block_size = 32; -@@ -29,7 +28,6 @@ int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMo - CeedBasis_Cuda *data; - - CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); -- CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); - CeedCallBackend(CeedBasisGetData(basis, &data)); - - // Read vectors -@@ -38,7 +36,7 @@ int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMo - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); - - // Clear v for transpose operation -- if (t_mode == CEED_TRANSPOSE) { -+ if (transpose) { - CeedSize length; - - CeedCallBackend(CeedVectorGetLength(v, &length)); -@@ -95,17 +93,15 @@ int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMo - int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, - CeedVector v) { - Ceed ceed; -- Ceed_Cuda *ceed_Cuda; - CeedInt num_nodes, num_qpts; - const CeedInt transpose = t_mode == CEED_TRANSPOSE; -- int elems_per_block = 1; -- int grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); -+ const int elems_per_block = 1; -+ const int grid = CeedDivUpInt(num_elem, elems_per_block); - const CeedScalar *d_u; - CeedScalar *d_v; - CeedBasisNonTensor_Cuda *data; - - CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); -- CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); - CeedCallBackend(CeedBasisGetData(basis, &data)); - CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); - CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); -@@ -117,7 +113,7 @@ int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTr - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); - - // Clear v for transpose operation -- if (t_mode == CEED_TRANSPOSE) { -+ if (transpose) { - CeedSize length; - - CeedCallBackend(CeedVectorGetLength(v, &length)); -@@ -127,16 +123,44 @@ int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTr - // Apply basis operation - switch (eval_mode) { - case CEED_EVAL_INTERP: { -- void *interp_args[] = {(void *)&num_elem, (void *)&transpose, &data->d_interp, &d_u, &d_v}; -+ void *interp_args[] = {(void *)&num_elem, &data->d_interp, &d_u, &d_v}; - const int block_size_x = transpose ? num_nodes : num_qpts; - -- CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Interp, grid, block_size_x, 1, elems_per_block, interp_args)); -+ if (transpose) { -+ CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->InterpTranspose, grid, block_size_x, 1, elems_per_block, interp_args)); -+ } else { -+ CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Interp, grid, block_size_x, 1, elems_per_block, interp_args)); -+ } - } break; - case CEED_EVAL_GRAD: { -- void *grad_args[] = {(void *)&num_elem, (void *)&transpose, &data->d_grad, &d_u, &d_v}; -+ void *grad_args[] = {(void *)&num_elem, &data->d_grad, &d_u, &d_v}; - const int block_size_x = transpose ? 
num_nodes : num_qpts; - -- CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Grad, grid, block_size_x, 1, elems_per_block, grad_args)); -+ if (transpose) { -+ CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, grad_args)); -+ } else { -+ CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, grad_args)); -+ } -+ } break; -+ case CEED_EVAL_DIV: { -+ void *div_args[] = {(void *)&num_elem, &data->d_div, &d_u, &d_v}; -+ const int block_size_x = transpose ? num_nodes : num_qpts; -+ -+ if (transpose) { -+ CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, div_args)); -+ } else { -+ CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, div_args)); -+ } -+ } break; -+ case CEED_EVAL_CURL: { -+ void *curl_args[] = {(void *)&num_elem, &data->d_curl, &d_u, &d_v}; -+ const int block_size_x = transpose ? num_nodes : num_qpts; -+ -+ if (transpose) { -+ CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, curl_args)); -+ } else { -+ CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, curl_args)); -+ } - } break; - case CEED_EVAL_WEIGHT: { - void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight, &d_v}; -@@ -144,12 +168,6 @@ int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTr - CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, num_qpts, 1, elems_per_block, weight_args)); - } break; - // LCOV_EXCL_START -- // Evaluate the divergence to/from the quadrature points -- case CEED_EVAL_DIV: -- return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); -- // Evaluate the curl to/from the quadrature points -- case CEED_EVAL_CURL: -- return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); - // Take no action, BasisApply should not have been called - case CEED_EVAL_NONE: - return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); -@@ -194,6 +212,8 @@ static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) { - CeedCallCuda(ceed, cudaFree(data->d_q_weight)); - CeedCallCuda(ceed, cudaFree(data->d_interp)); - CeedCallCuda(ceed, cudaFree(data->d_grad)); -+ CeedCallCuda(ceed, cudaFree(data->d_div)); -+ CeedCallCuda(ceed, cudaFree(data->d_curl)); - CeedCallBackend(CeedFree(&data)); - return CEED_ERROR_SUCCESS; - } -@@ -238,34 +258,43 @@ int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const - - CeedCallBackend(CeedBasisSetData(basis, data)); - -+ // Register backend functions - CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Cuda)); - CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda)); - return CEED_ERROR_SUCCESS; - } - - //------------------------------------------------------------------------------ --// Create non-tensor -+// Create non-tensor H^1 - //------------------------------------------------------------------------------ - int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad, - const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { - Ceed ceed; - char *basis_kernel_path, *basis_kernel_source; -- CeedInt num_comp; -- const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); -- 
const CeedInt interp_bytes = q_bytes * num_nodes; -- const CeedInt grad_bytes = q_bytes * num_nodes * dim; -+ CeedInt num_comp, q_comp_interp, q_comp_grad; -+ const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); - CeedBasisNonTensor_Cuda *data; - - CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); - CeedCallBackend(CeedCalloc(1, &data)); - - // Copy basis data to GPU -+ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); -+ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad)); - CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); - CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); -- CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); -- CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); -- CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad, grad_bytes)); -- CeedCallCuda(ceed, cudaMemcpy(data->d_grad, grad, grad_bytes, cudaMemcpyHostToDevice)); -+ if (interp) { -+ const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; -+ -+ CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); -+ CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); -+ } -+ if (grad) { -+ const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad; -+ -+ CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad, grad_bytes)); -+ CeedCallCuda(ceed, cudaMemcpy(data->d_grad, grad, grad_bytes, cudaMemcpyHostToDevice)); -+ } - - // Compile basis kernels - CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); -@@ -273,11 +302,125 @@ int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! 
-----\n"); -- CeedCallCuda(ceed, CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 4, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_DIM", dim, -- "BASIS_NUM_COMP", num_comp)); -- CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); -- CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad)); -- CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); -+ CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", -+ q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_grad, "BASIS_NUM_COMP", num_comp)); -+ CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); -+ CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); -+ CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv)); -+ CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); -+ CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); -+ CeedCallBackend(CeedFree(&basis_kernel_path)); -+ CeedCallBackend(CeedFree(&basis_kernel_source)); -+ -+ CeedCallBackend(CeedBasisSetData(basis, data)); -+ -+ // Register backend functions -+ CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); -+ CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); -+ return CEED_ERROR_SUCCESS; -+} -+ -+//------------------------------------------------------------------------------ -+// Create non-tensor H(div) -+//------------------------------------------------------------------------------ -+int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *div, -+ const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { -+ Ceed ceed; -+ char *basis_kernel_path, *basis_kernel_source; -+ CeedInt num_comp, q_comp_interp, q_comp_div; -+ const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); -+ CeedBasisNonTensor_Cuda *data; -+ -+ CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); -+ CeedCallBackend(CeedCalloc(1, &data)); -+ -+ // Copy basis data to GPU -+ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); -+ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div)); -+ CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); -+ CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); -+ if (interp) { -+ const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; -+ -+ CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); -+ CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); -+ } -+ if (div) { -+ const CeedInt div_bytes = q_bytes * num_nodes * q_comp_div; -+ -+ CeedCallCuda(ceed, cudaMalloc((void **)&data->d_div, div_bytes)); -+ CeedCallCuda(ceed, cudaMemcpy(data->d_div, div, div_bytes, cudaMemcpyHostToDevice)); -+ } -+ -+ // Compile basis kernels -+ CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); -+ CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path)); -+ CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); -+ 
CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); -+ CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); -+ CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", -+ q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_div, "BASIS_NUM_COMP", num_comp)); -+ CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); -+ CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); -+ CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv)); -+ CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); -+ CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); -+ CeedCallBackend(CeedFree(&basis_kernel_path)); -+ CeedCallBackend(CeedFree(&basis_kernel_source)); -+ -+ CeedCallBackend(CeedBasisSetData(basis, data)); -+ -+ // Register backend functions -+ CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); -+ CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); -+ return CEED_ERROR_SUCCESS; -+} -+ -+//------------------------------------------------------------------------------ -+// Create non-tensor H(curl) -+//------------------------------------------------------------------------------ -+int CeedBasisCreateHcurl_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, -+ const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { -+ Ceed ceed; -+ char *basis_kernel_path, *basis_kernel_source; -+ CeedInt num_comp, q_comp_interp, q_comp_curl; -+ const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); -+ CeedBasisNonTensor_Cuda *data; -+ -+ CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); -+ CeedCallBackend(CeedCalloc(1, &data)); -+ -+ // Copy basis data to GPU -+ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); -+ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl)); -+ CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); -+ CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); -+ if (interp) { -+ const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; -+ -+ CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); -+ CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); -+ } -+ if (curl) { -+ const CeedInt curl_bytes = q_bytes * num_nodes * q_comp_curl; -+ -+ CeedCallCuda(ceed, cudaMalloc((void **)&data->d_curl, curl_bytes)); -+ CeedCallCuda(ceed, cudaMemcpy(data->d_curl, curl, curl_bytes, cudaMemcpyHostToDevice)); -+ } -+ -+ // Compile basis kernels -+ CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); -+ CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path)); -+ CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); -+ CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); -+ CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! 
-----\n"); -+ CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", -+ q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_curl, "BASIS_NUM_COMP", num_comp)); -+ CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); -+ CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); -+ CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv)); -+ CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); -+ CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); - CeedCallBackend(CeedFree(&basis_kernel_path)); - CeedCallBackend(CeedFree(&basis_kernel_source)); - -diff --git a/backends/cuda-ref/ceed-cuda-ref.c b/backends/cuda-ref/ceed-cuda-ref.c -index a008cbc4..42922fe6 100644 ---- a/backends/cuda-ref/ceed-cuda-ref.c -+++ b/backends/cuda-ref/ceed-cuda-ref.c -@@ -54,6 +54,8 @@ static int CeedInit_Cuda_ref(const char *resource, Ceed ceed) { - CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "VectorCreate", CeedVectorCreate_Cuda)); - CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateTensorH1", CeedBasisCreateTensorH1_Cuda)); - CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateH1", CeedBasisCreateH1_Cuda)); -+ CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateHdiv", CeedBasisCreateHdiv_Cuda)); -+ CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateHcurl", CeedBasisCreateHcurl_Cuda)); - CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "ElemRestrictionCreate", CeedElemRestrictionCreate_Cuda)); - CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Cuda)); - CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionContextCreate", CeedQFunctionContextCreate_Cuda)); -diff --git a/backends/cuda-ref/ceed-cuda-ref.h b/backends/cuda-ref/ceed-cuda-ref.h -index 309c1056..c904adb6 100644 ---- a/backends/cuda-ref/ceed-cuda-ref.h -+++ b/backends/cuda-ref/ceed-cuda-ref.h -@@ -53,10 +53,14 @@ typedef struct { - typedef struct { - CUmodule module; - CUfunction Interp; -- CUfunction Grad; -+ CUfunction InterpTranspose; -+ CUfunction Deriv; -+ CUfunction DerivTranspose; - CUfunction Weight; - CeedScalar *d_interp; - CeedScalar *d_grad; -+ CeedScalar *d_div; -+ CeedScalar *d_curl; - CeedScalar *d_q_weight; - } CeedBasisNonTensor_Cuda; - -@@ -118,9 +122,12 @@ CEED_INTERN int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, CeedCopyMod - - CEED_INTERN int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, - const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis); -- - CEED_INTERN int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, - const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis); -+CEED_INTERN int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, -+ const CeedScalar *div, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis); -+CEED_INTERN int CeedBasisCreateHcurl_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, -+ const CeedScalar *curl, const CeedScalar *q_ref, const 
CeedScalar *q_weight, CeedBasis basis); - - CEED_INTERN int CeedQFunctionCreate_Cuda(CeedQFunction qf); - -diff --git a/backends/hip-ref/ceed-hip-ref-basis.c b/backends/hip-ref/ceed-hip-ref-basis.c -index 4927f1e4..e854e8dc 100644 ---- a/backends/hip-ref/ceed-hip-ref-basis.c -+++ b/backends/hip-ref/ceed-hip-ref-basis.c -@@ -19,7 +19,6 @@ - //------------------------------------------------------------------------------ - int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { - Ceed ceed; -- Ceed_Hip *ceed_Hip; - CeedInt Q_1d, dim; - const CeedInt transpose = t_mode == CEED_TRANSPOSE; - const int max_block_size = 64; -@@ -28,7 +27,6 @@ int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMod - CeedBasis_Hip *data; - - CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); -- CeedCallBackend(CeedGetData(ceed, &ceed_Hip)); - CeedCallBackend(CeedBasisGetData(basis, &data)); - - // Read vectors -@@ -37,7 +35,7 @@ int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMod - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); - - // Clear v for transpose operation -- if (t_mode == CEED_TRANSPOSE) { -+ if (transpose) { - CeedSize length; - - CeedCallBackend(CeedVectorGetLength(v, &length)); -@@ -94,17 +92,15 @@ int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMod - int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, - CeedVector v) { - Ceed ceed; -- Ceed_Hip *ceed_Hip; - CeedInt num_nodes, num_qpts; - const CeedInt transpose = t_mode == CEED_TRANSPOSE; -- int elems_per_block = 1; -- int grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); -+ const int elems_per_block = 1; -+ const int grid = CeedDivUpInt(num_elem, elems_per_block); - const CeedScalar *d_u; - CeedScalar *d_v; - CeedBasisNonTensor_Hip *data; - - CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); -- CeedCallBackend(CeedGetData(ceed, &ceed_Hip)); - CeedCallBackend(CeedBasisGetData(basis, &data)); - CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); - CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); -@@ -116,7 +112,7 @@ int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTra - CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); - - // Clear v for transpose operation -- if (t_mode == CEED_TRANSPOSE) { -+ if (transpose) { - CeedSize length; - - CeedCallBackend(CeedVectorGetLength(v, &length)); -@@ -126,16 +122,44 @@ int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTra - // Apply basis operation - switch (eval_mode) { - case CEED_EVAL_INTERP: { -- void *interp_args[] = {(void *)&num_elem, (void *)&transpose, &data->d_interp, &d_u, &d_v}; -+ void *interp_args[] = {(void *)&num_elem, &data->d_interp, &d_u, &d_v}; - const int block_size_x = transpose ? 
num_nodes : num_qpts; - -- CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Interp, grid, block_size_x, 1, elems_per_block, interp_args)); -+ if (transpose) { -+ CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->InterpTranspose, grid, block_size_x, 1, elems_per_block, interp_args)); -+ } else { -+ CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Interp, grid, block_size_x, 1, elems_per_block, interp_args)); -+ } - } break; - case CEED_EVAL_GRAD: { -- void *grad_args[] = {(void *)&num_elem, (void *)&transpose, &data->d_grad, &d_u, &d_v}; -+ void *grad_args[] = {(void *)&num_elem, &data->d_grad, &d_u, &d_v}; - const int block_size_x = transpose ? num_nodes : num_qpts; - -- CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Grad, grid, block_size_x, 1, elems_per_block, grad_args)); -+ if (transpose) { -+ CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, grad_args)); -+ } else { -+ CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, grad_args)); -+ } -+ } break; -+ case CEED_EVAL_DIV: { -+ void *div_args[] = {(void *)&num_elem, &data->d_div, &d_u, &d_v}; -+ const int block_size_x = transpose ? num_nodes : num_qpts; -+ -+ if (transpose) { -+ CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, div_args)); -+ } else { -+ CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, div_args)); -+ } -+ } break; -+ case CEED_EVAL_CURL: { -+ void *curl_args[] = {(void *)&num_elem, &data->d_curl, &d_u, &d_v}; -+ const int block_size_x = transpose ? num_nodes : num_qpts; -+ -+ if (transpose) { -+ CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, curl_args)); -+ } else { -+ CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, curl_args)); -+ } - } break; - case CEED_EVAL_WEIGHT: { - void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight, &d_v}; -@@ -143,12 +167,6 @@ int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTra - CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid, num_qpts, 1, elems_per_block, weight_args)); - } break; - // LCOV_EXCL_START -- // Evaluate the divergence to/from the quadrature points -- case CEED_EVAL_DIV: -- return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); -- // Evaluate the curl to/from the quadrature points -- case CEED_EVAL_CURL: -- return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); - // Take no action, BasisApply should not have been called - case CEED_EVAL_NONE: - return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); -@@ -193,6 +211,8 @@ static int CeedBasisDestroyNonTensor_Hip(CeedBasis basis) { - CeedCallHip(ceed, hipFree(data->d_q_weight)); - CeedCallHip(ceed, hipFree(data->d_interp)); - CeedCallHip(ceed, hipFree(data->d_grad)); -+ CeedCallHip(ceed, hipFree(data->d_div)); -+ CeedCallHip(ceed, hipFree(data->d_curl)); - CeedCallBackend(CeedFree(&data)); - return CEED_ERROR_SUCCESS; - } -@@ -237,34 +257,43 @@ int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const C - - CeedCallBackend(CeedBasisSetData(basis, data)); - -+ // Register backend functions - CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Hip)); - CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", 
CeedBasisDestroy_Hip)); - return CEED_ERROR_SUCCESS; - } - - //------------------------------------------------------------------------------ --// Create non-tensor -+// Create non-tensor H^1 - //------------------------------------------------------------------------------ - int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad, - const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { - Ceed ceed; - char *basis_kernel_path, *basis_kernel_source; -- CeedInt num_comp; -- const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); -- const CeedInt interp_bytes = q_bytes * num_nodes; -- const CeedInt grad_bytes = q_bytes * num_nodes * dim; -+ CeedInt num_comp, q_comp_interp, q_comp_grad; -+ const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); - CeedBasisNonTensor_Hip *data; - - CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); - CeedCallBackend(CeedCalloc(1, &data)); - - // Copy basis data to GPU -+ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); -+ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad)); - CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes)); - CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice)); -- CeedCallHip(ceed, hipMalloc((void **)&data->d_interp, interp_bytes)); -- CeedCallHip(ceed, hipMemcpy(data->d_interp, interp, interp_bytes, hipMemcpyHostToDevice)); -- CeedCallHip(ceed, hipMalloc((void **)&data->d_grad, grad_bytes)); -- CeedCallHip(ceed, hipMemcpy(data->d_grad, grad, grad_bytes, hipMemcpyHostToDevice)); -+ if (interp) { -+ const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; -+ -+ CeedCallHip(ceed, hipMalloc((void **)&data->d_interp, interp_bytes)); -+ CeedCallHip(ceed, hipMemcpy(data->d_interp, interp, interp_bytes, hipMemcpyHostToDevice)); -+ } -+ if (grad) { -+ const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad; -+ -+ CeedCallHip(ceed, hipMalloc((void **)&data->d_grad, grad_bytes)); -+ CeedCallHip(ceed, hipMemcpy(data->d_grad, grad, grad_bytes, hipMemcpyHostToDevice)); -+ } - - // Compile basis kernels - CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); -@@ -272,13 +301,128 @@ int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); - CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); - CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! 
-----\n"); -- CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 4, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_DIM", dim, -- "BASIS_NUM_COMP", num_comp)); -+ CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", -+ q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_grad, "BASIS_NUM_COMP", num_comp)); - CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp)); -- CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Grad", &data->Grad)); -+ CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); -+ CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Deriv", &data->Deriv)); -+ CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); - CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight)); - CeedCallBackend(CeedFree(&basis_kernel_path)); - CeedCallBackend(CeedFree(&basis_kernel_source)); -+ -+ CeedCallBackend(CeedBasisSetData(basis, data)); -+ -+ // Register backend functions -+ CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip)); -+ CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip)); -+ return CEED_ERROR_SUCCESS; -+} -+ -+//------------------------------------------------------------------------------ -+// Create non-tensor H(div) -+//------------------------------------------------------------------------------ -+int CeedBasisCreateHdiv_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *div, -+ const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { -+ Ceed ceed; -+ char *basis_kernel_path, *basis_kernel_source; -+ CeedInt num_comp, q_comp_interp, q_comp_div; -+ const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); -+ CeedBasisNonTensor_Hip *data; -+ -+ CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); -+ CeedCallBackend(CeedCalloc(1, &data)); -+ -+ // Copy basis data to GPU -+ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); -+ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div)); -+ CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes)); -+ CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice)); -+ if (interp) { -+ const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; -+ -+ CeedCallHip(ceed, hipMalloc((void **)&data->d_interp, interp_bytes)); -+ CeedCallHip(ceed, hipMemcpy(data->d_interp, interp, interp_bytes, hipMemcpyHostToDevice)); -+ } -+ if (div) { -+ const CeedInt div_bytes = q_bytes * num_nodes * q_comp_div; -+ -+ CeedCallHip(ceed, hipMalloc((void **)&data->d_div, div_bytes)); -+ CeedCallHip(ceed, hipMemcpy(data->d_div, div, div_bytes, hipMemcpyHostToDevice)); -+ } -+ -+ // Compile basis kernels -+ CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); -+ CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-nontensor.h", &basis_kernel_path)); -+ CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); -+ CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); -+ CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! 
-----\n"); -+ CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", -+ q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_div, "BASIS_NUM_COMP", num_comp)); -+ CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp)); -+ CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); -+ CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Deriv", &data->Deriv)); -+ CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); -+ CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight)); -+ CeedCallBackend(CeedFree(&basis_kernel_path)); -+ CeedCallBackend(CeedFree(&basis_kernel_source)); -+ -+ CeedCallBackend(CeedBasisSetData(basis, data)); -+ -+ // Register backend functions -+ CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip)); -+ CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip)); -+ return CEED_ERROR_SUCCESS; -+} -+ -+//------------------------------------------------------------------------------ -+// Create non-tensor H(curl) -+//------------------------------------------------------------------------------ -+int CeedBasisCreateHcurl_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, -+ const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { -+ Ceed ceed; -+ char *basis_kernel_path, *basis_kernel_source; -+ CeedInt num_comp, q_comp_interp, q_comp_curl; -+ const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); -+ CeedBasisNonTensor_Hip *data; -+ -+ CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); -+ CeedCallBackend(CeedCalloc(1, &data)); -+ -+ // Copy basis data to GPU -+ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); -+ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl)); -+ CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes)); -+ CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice)); -+ if (interp) { -+ const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; -+ -+ CeedCallHip(ceed, hipMalloc((void **)&data->d_interp, interp_bytes)); -+ CeedCallHip(ceed, hipMemcpy(data->d_interp, interp, interp_bytes, hipMemcpyHostToDevice)); -+ } -+ if (curl) { -+ const CeedInt curl_bytes = q_bytes * num_nodes * q_comp_curl; -+ -+ CeedCallHip(ceed, hipMalloc((void **)&data->d_curl, curl_bytes)); -+ CeedCallHip(ceed, hipMemcpy(data->d_curl, curl, curl_bytes, hipMemcpyHostToDevice)); -+ } -+ -+ // Compile basis kernels -+ CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); -+ CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-nontensor.h", &basis_kernel_path)); -+ CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); -+ CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); -+ CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! 
-----\n"); -+ CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", -+ q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_curl, "BASIS_NUM_COMP", num_comp)); -+ CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp)); -+ CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); -+ CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Deriv", &data->Deriv)); -+ CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); -+ CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight)); -+ CeedCallBackend(CeedFree(&basis_kernel_path)); -+ CeedCallBackend(CeedFree(&basis_kernel_source)); -+ - CeedCallBackend(CeedBasisSetData(basis, data)); - - // Register backend functions -diff --git a/backends/hip-ref/ceed-hip-ref.c b/backends/hip-ref/ceed-hip-ref.c -index 754c0b52..eca2f4dd 100644 ---- a/backends/hip-ref/ceed-hip-ref.c -+++ b/backends/hip-ref/ceed-hip-ref.c -@@ -17,8 +17,8 @@ - //------------------------------------------------------------------------------ - // HIP preferred MemType - //------------------------------------------------------------------------------ --static int CeedGetPreferredMemType_Hip(CeedMemType *type) { -- *type = CEED_MEM_DEVICE; -+static int CeedGetPreferredMemType_Hip(CeedMemType *mem_type) { -+ *mem_type = CEED_MEM_DEVICE; - return CEED_ERROR_SUCCESS; - } - -@@ -54,6 +54,8 @@ static int CeedInit_Hip_ref(const char *resource, Ceed ceed) { - CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "VectorCreate", CeedVectorCreate_Hip)); - CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateTensorH1", CeedBasisCreateTensorH1_Hip)); - CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateH1", CeedBasisCreateH1_Hip)); -+ CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateHdiv", CeedBasisCreateHdiv_Hip)); -+ CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateHcurl", CeedBasisCreateHcurl_Hip)); - CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "ElemRestrictionCreate", CeedElemRestrictionCreate_Hip)); - CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Hip)); - CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionContextCreate", CeedQFunctionContextCreate_Hip)); -diff --git a/backends/hip-ref/ceed-hip-ref.h b/backends/hip-ref/ceed-hip-ref.h -index 634bb68d..89bbc7a0 100644 ---- a/backends/hip-ref/ceed-hip-ref.h -+++ b/backends/hip-ref/ceed-hip-ref.h -@@ -57,10 +57,14 @@ typedef struct { - typedef struct { - hipModule_t module; - hipFunction_t Interp; -- hipFunction_t Grad; -+ hipFunction_t InterpTranspose; -+ hipFunction_t Deriv; -+ hipFunction_t DerivTranspose; - hipFunction_t Weight; - CeedScalar *d_interp; - CeedScalar *d_grad; -+ CeedScalar *d_div; -+ CeedScalar *d_curl; - CeedScalar *d_q_weight; - } CeedBasisNonTensor_Hip; - -@@ -122,9 +126,12 @@ CEED_INTERN int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode - - CEED_INTERN int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, - const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis); -- - CEED_INTERN int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, - const CeedScalar *grad, const 
CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis); -+CEED_INTERN int CeedBasisCreateHdiv_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, -+ const CeedScalar *div, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis); -+CEED_INTERN int CeedBasisCreateHcurl_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, -+ const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis); - - CEED_INTERN int CeedQFunctionCreate_Hip(CeedQFunction qf); - -diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h -new file mode 100644 -index 00000000..7e6450fb ---- /dev/null -+++ b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h -@@ -0,0 +1,67 @@ -+// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. -+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. -+// -+// SPDX-License-Identifier: BSD-2-Clause -+// -+// This file is part of CEED: http://github.com/ceed -+ -+/// @file -+/// Internal header for CUDA non-tensor product basis templates -+#ifndef CEED_CUDA_REF_BASIS_NONTENSOR_TEMPLATES_H -+#define CEED_CUDA_REF_BASIS_NONTENSOR_TEMPLATES_H -+ -+#include -+ -+//------------------------------------------------------------------------------ -+// Tensor contraction -+//------------------------------------------------------------------------------ -+template -+inline __device__ void Contract(const CeedInt elem, const CeedInt strides_elem_U, const CeedInt strides_elem_V, const CeedInt strides_comp_U, -+ const CeedInt strides_comp_V, const CeedInt strides_q_comp_V, const CeedScalar *__restrict__ d_B, -+ const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { -+ const CeedInt t_id = threadIdx.x; -+ const CeedScalar *U; -+ CeedScalar r_V[Q_COMP]; -+ // TODO load B in shared memory if blockDim.z > 1? -+ -+ for (CeedInt comp = 0; comp < NUM_COMP; comp++) { -+ // Run with Q threads -+ U = d_U + elem * strides_elem_U + comp * strides_comp_U; -+ for (CeedInt d = 0; d < Q_COMP; d++) r_V[d] = 0.0; -+ for (CeedInt i = 0; i < P; i++) { -+ const CeedScalar val = U[i]; -+ -+ for (CeedInt d = 0; d < Q_COMP; d++) r_V[d] += d_B[i + t_id * P + d * P * Q] * val; -+ } -+ for (CeedInt d = 0; d < Q_COMP; d++) { -+ d_V[elem * strides_elem_V + comp * strides_comp_V + d * strides_q_comp_V + t_id] = r_V[d]; -+ } -+ } -+} -+ -+//------------------------------------------------------------------------------ -+// Tensor contraction transpose -+//------------------------------------------------------------------------------ -+template -+inline __device__ void ContractTranspose(const CeedInt elem, const CeedInt strides_elem_U, const CeedInt strides_elem_V, const CeedInt strides_comp_U, -+ const CeedInt strides_comp_V, const CeedInt strides_q_comp_U, const CeedScalar *__restrict__ d_B, -+ const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { -+ const CeedInt t_id = threadIdx.x; -+ const CeedScalar *U; -+ CeedScalar r_V; -+ // TODO load B in shared memory if blockDim.z > 1? 
-+ -+ for (CeedInt comp = 0; comp < NUM_COMP; comp++) { -+ // Run with P threads -+ r_V = 0.0; -+ for (CeedInt d = 0; d < Q_COMP; d++) { -+ U = d_U + elem * strides_elem_U + comp * strides_comp_U + d * strides_q_comp_U; -+ for (CeedInt i = 0; i < Q; i++) r_V += d_B[t_id + i * P + d * P * Q] * U[i]; -+ } -+ d_V[elem * strides_elem_V + comp * strides_comp_V + t_id] = r_V; -+ } -+} -+ -+//------------------------------------------------------------------------------ -+ -+#endif // CEED_CUDA_REF_BASIS_NONTENSOR_TEMPLATES_H -diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h -index 484d755f..4b5c7f94 100644 ---- a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h -+++ b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h -@@ -12,6 +12,8 @@ - - #include - -+#include "cuda-ref-basis-nontensor-templates.h" -+ - //------------------------------------------------------------------------------ - // Non-Tensor Basis Kernels - //------------------------------------------------------------------------------ -@@ -19,65 +21,38 @@ - //------------------------------------------------------------------------------ - // Interp - //------------------------------------------------------------------------------ --extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *d_B, const CeedScalar *__restrict__ d_U, -+extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *__restrict__ d_B, const CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { -- const CeedInt t_id = threadIdx.x; -- const CeedScalar *U; -- CeedScalar V; -- // TODO load B in shared memory if blockDim.z > 1? -- - for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { -- for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { -- if (transpose) { // run with P threads -- U = d_U + elem * BASIS_Q + comp * num_elem * BASIS_Q; -- V = 0.0; -- for (CeedInt i = 0; i < BASIS_Q; i++) V += d_B[t_id + i * BASIS_P] * U[i]; -- -- d_V[elem * BASIS_P + comp * num_elem * BASIS_P + t_id] = V; -- } else { // run with Q threads -- U = d_U + elem * BASIS_P + comp * num_elem * BASIS_P; -- V = 0.0; -- for (CeedInt i = 0; i < BASIS_P; i++) V += d_B[i + t_id * BASIS_P] * U[i]; -+ Contract(elem, BASIS_P, BASIS_Q, BASIS_P * num_elem, BASIS_Q * num_elem, -+ BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V); -+ } -+} - -- d_V[elem * BASIS_Q + comp * num_elem * BASIS_Q + t_id] = V; -- } -- } -+extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ d_B, const CeedScalar *__restrict__ d_U, -+ CeedScalar *__restrict__ d_V) { -+ for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { -+ ContractTranspose(elem, BASIS_Q, BASIS_P, BASIS_Q * num_elem, BASIS_P * num_elem, -+ BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V); - } - } - - //------------------------------------------------------------------------------ --// Grad -+// Deriv - //------------------------------------------------------------------------------ --extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *d_G, const CeedScalar *__restrict__ d_U, -- CeedScalar *__restrict__ d_V) { -- const CeedInt t_id = threadIdx.x; -- const CeedScalar *U; -- // TODO load G in shared memory if blockDim.z > 1? 
-- -+extern "C" __global__ void Deriv(const CeedInt num_elem, const CeedScalar *__restrict__ d_B, const CeedScalar *__restrict__ d_U, -+ CeedScalar *__restrict__ d_V) { - for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { -- for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { -- if (transpose) { // run with P threads -- CeedScalar V = 0.0; -- for (CeedInt dim = 0; dim < BASIS_DIM; dim++) { -- U = d_U + elem * BASIS_Q + comp * num_elem * BASIS_Q + dim * BASIS_NUM_COMP * num_elem * BASIS_Q; -- for (CeedInt i = 0; i < BASIS_Q; i++) V += d_G[t_id + i * BASIS_P + dim * BASIS_P * BASIS_Q] * U[i]; -- } -- -- d_V[elem * BASIS_P + comp * num_elem * BASIS_P + t_id] = V; -- } else { // run with Q threads -- CeedScalar V[BASIS_DIM]; -- U = d_U + elem * BASIS_P + comp * num_elem * BASIS_P; -- for (CeedInt dim = 0; dim < BASIS_DIM; dim++) V[dim] = 0.0; -- for (CeedInt i = 0; i < BASIS_P; i++) { -- const CeedScalar val = U[i]; -- for (CeedInt dim = 0; dim < BASIS_DIM; dim++) V[dim] += d_G[i + t_id * BASIS_P + dim * BASIS_P * BASIS_Q] * val; -- } -+ Contract(elem, BASIS_P, BASIS_Q, BASIS_P * num_elem, BASIS_Q * num_elem, -+ BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V); -+ } -+} - -- for (CeedInt dim = 0; dim < BASIS_DIM; dim++) { -- d_V[elem * BASIS_Q + comp * num_elem * BASIS_Q + dim * BASIS_NUM_COMP * num_elem * BASIS_Q + t_id] = V[dim]; -- } -- } -- } -+extern "C" __global__ void DerivTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ d_B, const CeedScalar *__restrict__ d_U, -+ CeedScalar *__restrict__ d_V) { -+ for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { -+ ContractTranspose(elem, BASIS_Q, BASIS_P, BASIS_Q * num_elem, BASIS_P * num_elem, -+ BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V); - } - } - -@@ -86,8 +61,8 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, - //------------------------------------------------------------------------------ - extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight, CeedScalar *__restrict__ d_V) { - const CeedInt t_id = threadIdx.x; -- - // TODO load q_weight in shared memory if blockDim.z > 1? -+ - for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { - d_V[elem * BASIS_Q + t_id] = q_weight[t_id]; - } -diff --git a/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h b/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h -new file mode 100644 -index 00000000..4e568369 ---- /dev/null -+++ b/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h -@@ -0,0 +1,67 @@ -+// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. -+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
-+// -+// SPDX-License-Identifier: BSD-2-Clause -+// -+// This file is part of CEED: http://github.com/ceed -+ -+/// @file -+/// Internal header for HIP non-tensor product basis templates -+#ifndef CEED_HIP_REF_BASIS_NONTENSOR_TEMPLATES_H -+#define CEED_HIP_REF_BASIS_NONTENSOR_TEMPLATES_H -+ -+#include -+ -+//------------------------------------------------------------------------------ -+// Tensor contraction -+//------------------------------------------------------------------------------ -+template -+inline __device__ void Contract(const CeedInt elem, const CeedInt strides_elem_U, const CeedInt strides_elem_V, const CeedInt strides_comp_U, -+ const CeedInt strides_comp_V, const CeedInt strides_q_comp_V, const CeedScalar *__restrict__ d_B, -+ const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { -+ const CeedInt t_id = threadIdx.x; -+ const CeedScalar *U; -+ CeedScalar r_V[Q_COMP]; -+ // TODO load B in shared memory if blockDim.z > 1? -+ -+ for (CeedInt comp = 0; comp < NUM_COMP; comp++) { -+ // Run with Q threads -+ U = d_U + elem * strides_elem_U + comp * strides_comp_U; -+ for (CeedInt d = 0; d < Q_COMP; d++) r_V[d] = 0.0; -+ for (CeedInt i = 0; i < P; i++) { -+ const CeedScalar val = U[i]; -+ -+ for (CeedInt d = 0; d < Q_COMP; d++) r_V[d] += d_B[i + t_id * P + d * P * Q] * val; -+ } -+ for (CeedInt d = 0; d < Q_COMP; d++) { -+ d_V[elem * strides_elem_V + comp * strides_comp_V + d * strides_q_comp_V + t_id] = r_V[d]; -+ } -+ } -+} -+ -+//------------------------------------------------------------------------------ -+// Tensor contraction transpose -+//------------------------------------------------------------------------------ -+template -+inline __device__ void ContractTranspose(const CeedInt elem, const CeedInt strides_elem_U, const CeedInt strides_elem_V, const CeedInt strides_comp_U, -+ const CeedInt strides_comp_V, const CeedInt strides_q_comp_U, const CeedScalar *__restrict__ d_B, -+ const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { -+ const CeedInt t_id = threadIdx.x; -+ const CeedScalar *U; -+ CeedScalar r_V; -+ // TODO load B in shared memory if blockDim.z > 1? 
-+ -+ for (CeedInt comp = 0; comp < NUM_COMP; comp++) { -+ // Run with P threads -+ r_V = 0.0; -+ for (CeedInt d = 0; d < Q_COMP; d++) { -+ U = d_U + elem * strides_elem_U + comp * strides_comp_U + d * strides_q_comp_U; -+ for (CeedInt i = 0; i < Q; i++) r_V += d_B[t_id + i * P + d * P * Q] * U[i]; -+ } -+ d_V[elem * strides_elem_V + comp * strides_comp_V + t_id] = r_V; -+ } -+} -+ -+//------------------------------------------------------------------------------ -+ -+#endif // CEED_HIP_REF_BASIS_NONTENSOR_TEMPLATES_H -diff --git a/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h b/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h -index 101f898e..fd389f8a 100644 ---- a/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h -+++ b/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h -@@ -12,6 +12,8 @@ - - #include - -+#include "hip-ref-basis-nontensor-templates.h" -+ - //------------------------------------------------------------------------------ - // Non-Tensor Basis Kernels - //------------------------------------------------------------------------------ -@@ -19,67 +21,38 @@ - //------------------------------------------------------------------------------ - // Interp - //------------------------------------------------------------------------------ --extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *d_B, const CeedScalar *__restrict__ d_U, -+extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *__restrict__ d_B, const CeedScalar *__restrict__ d_U, - CeedScalar *__restrict__ d_V) { -- const CeedInt t_id = threadIdx.x; -- -- const CeedScalar *U; -- CeedScalar V; -- // TODO load B in shared memory if blockDim.z > 1? -- - for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { -- for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { -- if (transpose) { // run with P threads -- U = d_U + elem * BASIS_Q + comp * num_elem * BASIS_Q; -- V = 0.0; -- for (CeedInt i = 0; i < BASIS_Q; i++) V += d_B[t_id + i * BASIS_P] * U[i]; -- -- d_V[elem * BASIS_P + comp * num_elem * BASIS_P + t_id] = V; -- } else { // run with Q threads -- U = d_U + elem * BASIS_P + comp * num_elem * BASIS_P; -- V = 0.0; -- for (CeedInt i = 0; i < BASIS_P; i++) V += d_B[i + t_id * BASIS_P] * U[i]; -+ Contract(elem, BASIS_P, BASIS_Q, BASIS_P * num_elem, BASIS_Q * num_elem, -+ BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V); -+ } -+} - -- d_V[elem * BASIS_Q + comp * num_elem * BASIS_Q + t_id] = V; -- } -- } -+extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ d_B, const CeedScalar *__restrict__ d_U, -+ CeedScalar *__restrict__ d_V) { -+ for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { -+ ContractTranspose(elem, BASIS_Q, BASIS_P, BASIS_Q * num_elem, BASIS_P * num_elem, -+ BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V); - } - } - - //------------------------------------------------------------------------------ --// Grad -+// Deriv - //------------------------------------------------------------------------------ --extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *d_G, const CeedScalar *__restrict__ d_U, -- CeedScalar *__restrict__ d_V) { -- const CeedInt t_id = threadIdx.x; -- -- const CeedScalar *U; -- // TODO load G in shared memory if blockDim.z > 1? 
-- -+extern "C" __global__ void Deriv(const CeedInt num_elem, const CeedScalar *__restrict__ d_B, const CeedScalar *__restrict__ d_U, -+ CeedScalar *__restrict__ d_V) { - for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { -- for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { -- if (transpose) { // run with P threads -- CeedScalar V = 0.0; -- for (CeedInt dim = 0; dim < BASIS_DIM; dim++) { -- U = d_U + elem * BASIS_Q + comp * num_elem * BASIS_Q + dim * BASIS_NUM_COMP * num_elem * BASIS_Q; -- for (CeedInt i = 0; i < BASIS_Q; i++) V += d_G[t_id + i * BASIS_P + dim * BASIS_P * BASIS_Q] * U[i]; -- } -- -- d_V[elem * BASIS_P + comp * num_elem * BASIS_P + t_id] = V; -- } else { // run with Q threads -- CeedScalar V[BASIS_DIM]; -- U = d_U + elem * BASIS_P + comp * num_elem * BASIS_P; -- for (CeedInt dim = 0; dim < BASIS_DIM; dim++) V[dim] = 0.0; -- for (CeedInt i = 0; i < BASIS_P; i++) { -- const CeedScalar val = U[i]; -- for (CeedInt dim = 0; dim < BASIS_DIM; dim++) V[dim] += d_G[i + t_id * BASIS_P + dim * BASIS_P * BASIS_Q] * val; -- } -+ Contract(elem, BASIS_P, BASIS_Q, BASIS_P * num_elem, BASIS_Q * num_elem, -+ BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V); -+ } -+} - -- for (CeedInt dim = 0; dim < BASIS_DIM; dim++) { -- d_V[elem * BASIS_Q + comp * num_elem * BASIS_Q + dim * BASIS_NUM_COMP * num_elem * BASIS_Q + t_id] = V[dim]; -- } -- } -- } -+extern "C" __global__ void DerivTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ d_B, const CeedScalar *__restrict__ d_U, -+ CeedScalar *__restrict__ d_V) { -+ for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { -+ ContractTranspose(elem, BASIS_Q, BASIS_P, BASIS_Q * num_elem, BASIS_P * num_elem, -+ BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V); - } - } - -@@ -89,6 +62,7 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, - extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight, CeedScalar *__restrict__ d_V) { - const CeedInt t_id = threadIdx.x; - // TODO load q_weight in shared memory if blockDim.z > 1? 
-+ - for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { - d_V[elem * BASIS_Q + t_id] = q_weight[t_id]; - } -diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c -index 9e569584..1b86fa33 100644 ---- a/interface/ceed-basis.c -+++ b/interface/ceed-basis.c -@@ -1208,7 +1208,7 @@ int CeedBasisCreateHcurl(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, Cee - const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis *basis) { - CeedInt Q = num_qpts, P = num_nodes, dim = 0, curl_comp = 0; - -- if (!ceed->BasisCreateHdiv) { -+ if (!ceed->BasisCreateHcurl) { - Ceed delegate; - - CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Basis")); +diff --git a/backends/cuda-ref/ceed-cuda-ref-basis.c b/backends/cuda-ref/ceed-cuda-ref-basis.c +index 3ec0d47b..f7f07784 100644 +--- a/backends/cuda-ref/ceed-cuda-ref-basis.c ++++ b/backends/cuda-ref/ceed-cuda-ref-basis.c +@@ -20,7 +20,6 @@ + //------------------------------------------------------------------------------ + int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { + Ceed ceed; +- Ceed_Cuda *ceed_Cuda; + CeedInt Q_1d, dim; + const CeedInt transpose = t_mode == CEED_TRANSPOSE; + const int max_block_size = 32; +@@ -29,7 +28,6 @@ int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMo + CeedBasis_Cuda *data; + + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); +- CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); + CeedCallBackend(CeedBasisGetData(basis, &data)); + + // Read vectors +@@ -38,7 +36,7 @@ int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMo + CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + + // Clear v for transpose operation +- if (t_mode == CEED_TRANSPOSE) { ++ if (transpose) { + CeedSize length; + + CeedCallBackend(CeedVectorGetLength(v, &length)); +@@ -95,17 +93,15 @@ int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMo + int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + Ceed ceed; +- Ceed_Cuda *ceed_Cuda; + CeedInt num_nodes, num_qpts; + const CeedInt transpose = t_mode == CEED_TRANSPOSE; +- int elems_per_block = 1; +- int grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 
1 : 0); ++ const int elems_per_block = 1; ++ const int grid = CeedDivUpInt(num_elem, elems_per_block); + const CeedScalar *d_u; + CeedScalar *d_v; + CeedBasisNonTensor_Cuda *data; + + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); +- CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); + CeedCallBackend(CeedBasisGetData(basis, &data)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); + CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); +@@ -117,7 +113,7 @@ int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTr + CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + + // Clear v for transpose operation +- if (t_mode == CEED_TRANSPOSE) { ++ if (transpose) { + CeedSize length; + + CeedCallBackend(CeedVectorGetLength(v, &length)); +@@ -127,16 +123,44 @@ int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTr + // Apply basis operation + switch (eval_mode) { + case CEED_EVAL_INTERP: { +- void *interp_args[] = {(void *)&num_elem, (void *)&transpose, &data->d_interp, &d_u, &d_v}; ++ void *interp_args[] = {(void *)&num_elem, &data->d_interp, &d_u, &d_v}; + const int block_size_x = transpose ? num_nodes : num_qpts; + +- CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Interp, grid, block_size_x, 1, elems_per_block, interp_args)); ++ if (transpose) { ++ CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->InterpTranspose, grid, block_size_x, 1, elems_per_block, interp_args)); ++ } else { ++ CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Interp, grid, block_size_x, 1, elems_per_block, interp_args)); ++ } + } break; + case CEED_EVAL_GRAD: { +- void *grad_args[] = {(void *)&num_elem, (void *)&transpose, &data->d_grad, &d_u, &d_v}; ++ void *grad_args[] = {(void *)&num_elem, &data->d_grad, &d_u, &d_v}; + const int block_size_x = transpose ? num_nodes : num_qpts; + +- CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Grad, grid, block_size_x, 1, elems_per_block, grad_args)); ++ if (transpose) { ++ CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, grad_args)); ++ } else { ++ CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, grad_args)); ++ } ++ } break; ++ case CEED_EVAL_DIV: { ++ void *div_args[] = {(void *)&num_elem, &data->d_div, &d_u, &d_v}; ++ const int block_size_x = transpose ? num_nodes : num_qpts; ++ ++ if (transpose) { ++ CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, div_args)); ++ } else { ++ CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, div_args)); ++ } ++ } break; ++ case CEED_EVAL_CURL: { ++ void *curl_args[] = {(void *)&num_elem, &data->d_curl, &d_u, &d_v}; ++ const int block_size_x = transpose ? 
num_nodes : num_qpts; ++ ++ if (transpose) { ++ CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, curl_args)); ++ } else { ++ CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, curl_args)); ++ } + } break; + case CEED_EVAL_WEIGHT: { + void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight, &d_v}; +@@ -144,12 +168,6 @@ int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt num_elem, CeedTr + CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, num_qpts, 1, elems_per_block, weight_args)); + } break; + // LCOV_EXCL_START +- // Evaluate the divergence to/from the quadrature points +- case CEED_EVAL_DIV: +- return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); +- // Evaluate the curl to/from the quadrature points +- case CEED_EVAL_CURL: +- return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); + // Take no action, BasisApply should not have been called + case CEED_EVAL_NONE: + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); +@@ -194,6 +212,8 @@ static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) { + CeedCallCuda(ceed, cudaFree(data->d_q_weight)); + CeedCallCuda(ceed, cudaFree(data->d_interp)); + CeedCallCuda(ceed, cudaFree(data->d_grad)); ++ CeedCallCuda(ceed, cudaFree(data->d_div)); ++ CeedCallCuda(ceed, cudaFree(data->d_curl)); + CeedCallBackend(CeedFree(&data)); + return CEED_ERROR_SUCCESS; + } +@@ -238,34 +258,43 @@ int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const + + CeedCallBackend(CeedBasisSetData(basis, data)); + ++ // Register backend functions + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Cuda)); + return CEED_ERROR_SUCCESS; + } + + //------------------------------------------------------------------------------ +-// Create non-tensor ++// Create non-tensor H^1 + //------------------------------------------------------------------------------ + int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad, + const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { + Ceed ceed; + char *basis_kernel_path, *basis_kernel_source; +- CeedInt num_comp; +- const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); +- const CeedInt interp_bytes = q_bytes * num_nodes; +- const CeedInt grad_bytes = q_bytes * num_nodes * dim; ++ CeedInt num_comp, q_comp_interp, q_comp_grad; ++ const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); + CeedBasisNonTensor_Cuda *data; + + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + CeedCallBackend(CeedCalloc(1, &data)); + + // Copy basis data to GPU ++ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); ++ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad)); + CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); + CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); +- CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); +- CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); +- CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad, grad_bytes)); +- CeedCallCuda(ceed, 
cudaMemcpy(data->d_grad, grad, grad_bytes, cudaMemcpyHostToDevice)); ++ if (interp) { ++ const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; ++ ++ CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); ++ CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); ++ } ++ if (grad) { ++ const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad; ++ ++ CeedCallCuda(ceed, cudaMalloc((void **)&data->d_grad, grad_bytes)); ++ CeedCallCuda(ceed, cudaMemcpy(data->d_grad, grad, grad_bytes, cudaMemcpyHostToDevice)); ++ } + + // Compile basis kernels + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); +@@ -273,11 +302,125 @@ int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); +- CeedCallCuda(ceed, CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 4, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_DIM", dim, +- "BASIS_NUM_COMP", num_comp)); +- CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); +- CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, data->module, "Grad", &data->Grad)); +- CeedCallCuda(ceed, CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); ++ CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", ++ q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_grad, "BASIS_NUM_COMP", num_comp)); ++ CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); ++ CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); ++ CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv)); ++ CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); ++ CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); ++ CeedCallBackend(CeedFree(&basis_kernel_path)); ++ CeedCallBackend(CeedFree(&basis_kernel_source)); ++ ++ CeedCallBackend(CeedBasisSetData(basis, data)); ++ ++ // Register backend functions ++ CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); ++ CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); ++ return CEED_ERROR_SUCCESS; ++} ++ ++//------------------------------------------------------------------------------ ++// Create non-tensor H(div) ++//------------------------------------------------------------------------------ ++int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *div, ++ const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { ++ Ceed ceed; ++ char *basis_kernel_path, *basis_kernel_source; ++ CeedInt num_comp, q_comp_interp, q_comp_div; ++ const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); ++ CeedBasisNonTensor_Cuda *data; ++ ++ CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); ++ CeedCallBackend(CeedCalloc(1, &data)); ++ ++ // Copy basis data to GPU ++ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); ++ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, 
CEED_EVAL_DIV, &q_comp_div)); ++ CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); ++ CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); ++ if (interp) { ++ const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; ++ ++ CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); ++ CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); ++ } ++ if (div) { ++ const CeedInt div_bytes = q_bytes * num_nodes * q_comp_div; ++ ++ CeedCallCuda(ceed, cudaMalloc((void **)&data->d_div, div_bytes)); ++ CeedCallCuda(ceed, cudaMemcpy(data->d_div, div, div_bytes, cudaMemcpyHostToDevice)); ++ } ++ ++ // Compile basis kernels ++ CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); ++ CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path)); ++ CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); ++ CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); ++ CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); ++ CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", ++ q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_div, "BASIS_NUM_COMP", num_comp)); ++ CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); ++ CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); ++ CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv)); ++ CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); ++ CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); ++ CeedCallBackend(CeedFree(&basis_kernel_path)); ++ CeedCallBackend(CeedFree(&basis_kernel_source)); ++ ++ CeedCallBackend(CeedBasisSetData(basis, data)); ++ ++ // Register backend functions ++ CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Cuda)); ++ CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Cuda)); ++ return CEED_ERROR_SUCCESS; ++} ++ ++//------------------------------------------------------------------------------ ++// Create non-tensor H(curl) ++//------------------------------------------------------------------------------ ++int CeedBasisCreateHcurl_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, ++ const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { ++ Ceed ceed; ++ char *basis_kernel_path, *basis_kernel_source; ++ CeedInt num_comp, q_comp_interp, q_comp_curl; ++ const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); ++ CeedBasisNonTensor_Cuda *data; ++ ++ CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); ++ CeedCallBackend(CeedCalloc(1, &data)); ++ ++ // Copy basis data to GPU ++ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); ++ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl)); ++ CeedCallCuda(ceed, cudaMalloc((void **)&data->d_q_weight, q_bytes)); ++ CeedCallCuda(ceed, cudaMemcpy(data->d_q_weight, q_weight, q_bytes, cudaMemcpyHostToDevice)); ++ if (interp) { ++ const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; ++ ++ 
CeedCallCuda(ceed, cudaMalloc((void **)&data->d_interp, interp_bytes)); ++ CeedCallCuda(ceed, cudaMemcpy(data->d_interp, interp, interp_bytes, cudaMemcpyHostToDevice)); ++ } ++ if (curl) { ++ const CeedInt curl_bytes = q_bytes * num_nodes * q_comp_curl; ++ ++ CeedCallCuda(ceed, cudaMalloc((void **)&data->d_curl, curl_bytes)); ++ CeedCallCuda(ceed, cudaMemcpy(data->d_curl, curl, curl_bytes, cudaMemcpyHostToDevice)); ++ } ++ ++ // Compile basis kernels ++ CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); ++ CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-ref-basis-nontensor.h", &basis_kernel_path)); ++ CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); ++ CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); ++ CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); ++ CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", ++ q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_curl, "BASIS_NUM_COMP", num_comp)); ++ CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Interp", &data->Interp)); ++ CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); ++ CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Deriv", &data->Deriv)); ++ CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); ++ CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, "Weight", &data->Weight)); + CeedCallBackend(CeedFree(&basis_kernel_path)); + CeedCallBackend(CeedFree(&basis_kernel_source)); + +diff --git a/backends/cuda-ref/ceed-cuda-ref.c b/backends/cuda-ref/ceed-cuda-ref.c +index a008cbc4..42922fe6 100644 +--- a/backends/cuda-ref/ceed-cuda-ref.c ++++ b/backends/cuda-ref/ceed-cuda-ref.c +@@ -54,6 +54,8 @@ static int CeedInit_Cuda_ref(const char *resource, Ceed ceed) { + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "VectorCreate", CeedVectorCreate_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateTensorH1", CeedBasisCreateTensorH1_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateH1", CeedBasisCreateH1_Cuda)); ++ CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateHdiv", CeedBasisCreateHdiv_Cuda)); ++ CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateHcurl", CeedBasisCreateHcurl_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "ElemRestrictionCreate", CeedElemRestrictionCreate_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Cuda)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionContextCreate", CeedQFunctionContextCreate_Cuda)); +diff --git a/backends/cuda-ref/ceed-cuda-ref.h b/backends/cuda-ref/ceed-cuda-ref.h +index 309c1056..c904adb6 100644 +--- a/backends/cuda-ref/ceed-cuda-ref.h ++++ b/backends/cuda-ref/ceed-cuda-ref.h +@@ -53,10 +53,14 @@ typedef struct { + typedef struct { + CUmodule module; + CUfunction Interp; +- CUfunction Grad; ++ CUfunction InterpTranspose; ++ CUfunction Deriv; ++ CUfunction DerivTranspose; + CUfunction Weight; + CeedScalar *d_interp; + CeedScalar *d_grad; ++ CeedScalar *d_div; ++ CeedScalar *d_curl; + CeedScalar *d_q_weight; + } CeedBasisNonTensor_Cuda; + +@@ -118,9 +122,12 @@ CEED_INTERN int CeedElemRestrictionCreate_Cuda(CeedMemType mem_type, 
CeedCopyMod + + CEED_INTERN int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d, + const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis); +- + CEED_INTERN int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, + const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis); ++CEED_INTERN int CeedBasisCreateHdiv_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, ++ const CeedScalar *div, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis); ++CEED_INTERN int CeedBasisCreateHcurl_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, ++ const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis); + + CEED_INTERN int CeedQFunctionCreate_Cuda(CeedQFunction qf); + +diff --git a/backends/hip-ref/ceed-hip-ref-basis.c b/backends/hip-ref/ceed-hip-ref-basis.c +index 4927f1e4..e854e8dc 100644 +--- a/backends/hip-ref/ceed-hip-ref-basis.c ++++ b/backends/hip-ref/ceed-hip-ref-basis.c +@@ -19,7 +19,6 @@ + //------------------------------------------------------------------------------ + int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) { + Ceed ceed; +- Ceed_Hip *ceed_Hip; + CeedInt Q_1d, dim; + const CeedInt transpose = t_mode == CEED_TRANSPOSE; + const int max_block_size = 64; +@@ -28,7 +27,6 @@ int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMod + CeedBasis_Hip *data; + + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); +- CeedCallBackend(CeedGetData(ceed, &ceed_Hip)); + CeedCallBackend(CeedBasisGetData(basis, &data)); + + // Read vectors +@@ -37,7 +35,7 @@ int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMod + CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + + // Clear v for transpose operation +- if (t_mode == CEED_TRANSPOSE) { ++ if (transpose) { + CeedSize length; + + CeedCallBackend(CeedVectorGetLength(v, &length)); +@@ -94,17 +92,15 @@ int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMod + int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, + CeedVector v) { + Ceed ceed; +- Ceed_Hip *ceed_Hip; + CeedInt num_nodes, num_qpts; + const CeedInt transpose = t_mode == CEED_TRANSPOSE; +- int elems_per_block = 1; +- int grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 
1 : 0); ++ const int elems_per_block = 1; ++ const int grid = CeedDivUpInt(num_elem, elems_per_block); + const CeedScalar *d_u; + CeedScalar *d_v; + CeedBasisNonTensor_Hip *data; + + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); +- CeedCallBackend(CeedGetData(ceed, &ceed_Hip)); + CeedCallBackend(CeedBasisGetData(basis, &data)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); + CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes)); +@@ -116,7 +112,7 @@ int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTra + CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v)); + + // Clear v for transpose operation +- if (t_mode == CEED_TRANSPOSE) { ++ if (transpose) { + CeedSize length; + + CeedCallBackend(CeedVectorGetLength(v, &length)); +@@ -126,16 +122,44 @@ int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTra + // Apply basis operation + switch (eval_mode) { + case CEED_EVAL_INTERP: { +- void *interp_args[] = {(void *)&num_elem, (void *)&transpose, &data->d_interp, &d_u, &d_v}; ++ void *interp_args[] = {(void *)&num_elem, &data->d_interp, &d_u, &d_v}; + const int block_size_x = transpose ? num_nodes : num_qpts; + +- CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Interp, grid, block_size_x, 1, elems_per_block, interp_args)); ++ if (transpose) { ++ CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->InterpTranspose, grid, block_size_x, 1, elems_per_block, interp_args)); ++ } else { ++ CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Interp, grid, block_size_x, 1, elems_per_block, interp_args)); ++ } + } break; + case CEED_EVAL_GRAD: { +- void *grad_args[] = {(void *)&num_elem, (void *)&transpose, &data->d_grad, &d_u, &d_v}; ++ void *grad_args[] = {(void *)&num_elem, &data->d_grad, &d_u, &d_v}; + const int block_size_x = transpose ? num_nodes : num_qpts; + +- CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Grad, grid, block_size_x, 1, elems_per_block, grad_args)); ++ if (transpose) { ++ CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, grad_args)); ++ } else { ++ CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, grad_args)); ++ } ++ } break; ++ case CEED_EVAL_DIV: { ++ void *div_args[] = {(void *)&num_elem, &data->d_div, &d_u, &d_v}; ++ const int block_size_x = transpose ? num_nodes : num_qpts; ++ ++ if (transpose) { ++ CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, div_args)); ++ } else { ++ CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, div_args)); ++ } ++ } break; ++ case CEED_EVAL_CURL: { ++ void *curl_args[] = {(void *)&num_elem, &data->d_curl, &d_u, &d_v}; ++ const int block_size_x = transpose ? 
num_nodes : num_qpts; ++ ++ if (transpose) { ++ CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, curl_args)); ++ } else { ++ CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, curl_args)); ++ } + } break; + case CEED_EVAL_WEIGHT: { + void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight, &d_v}; +@@ -143,12 +167,6 @@ int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTra + CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid, num_qpts, 1, elems_per_block, weight_args)); + } break; + // LCOV_EXCL_START +- // Evaluate the divergence to/from the quadrature points +- case CEED_EVAL_DIV: +- return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); +- // Evaluate the curl to/from the quadrature points +- case CEED_EVAL_CURL: +- return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); + // Take no action, BasisApply should not have been called + case CEED_EVAL_NONE: + return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context"); +@@ -193,6 +211,8 @@ static int CeedBasisDestroyNonTensor_Hip(CeedBasis basis) { + CeedCallHip(ceed, hipFree(data->d_q_weight)); + CeedCallHip(ceed, hipFree(data->d_interp)); + CeedCallHip(ceed, hipFree(data->d_grad)); ++ CeedCallHip(ceed, hipFree(data->d_div)); ++ CeedCallHip(ceed, hipFree(data->d_curl)); + CeedCallBackend(CeedFree(&data)); + return CEED_ERROR_SUCCESS; + } +@@ -237,34 +257,43 @@ int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const C + + CeedCallBackend(CeedBasisSetData(basis, data)); + ++ // Register backend functions + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Hip)); + return CEED_ERROR_SUCCESS; + } + + //------------------------------------------------------------------------------ +-// Create non-tensor ++// Create non-tensor H^1 + //------------------------------------------------------------------------------ + int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad, + const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { + Ceed ceed; + char *basis_kernel_path, *basis_kernel_source; +- CeedInt num_comp; +- const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); +- const CeedInt interp_bytes = q_bytes * num_nodes; +- const CeedInt grad_bytes = q_bytes * num_nodes * dim; ++ CeedInt num_comp, q_comp_interp, q_comp_grad; ++ const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); + CeedBasisNonTensor_Hip *data; + + CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + CeedCallBackend(CeedCalloc(1, &data)); + + // Copy basis data to GPU ++ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); ++ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad)); + CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes)); + CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice)); +- CeedCallHip(ceed, hipMalloc((void **)&data->d_interp, interp_bytes)); +- CeedCallHip(ceed, hipMemcpy(data->d_interp, interp, interp_bytes, hipMemcpyHostToDevice)); +- CeedCallHip(ceed, hipMalloc((void **)&data->d_grad, grad_bytes)); +- CeedCallHip(ceed, hipMemcpy(data->d_grad, grad, 
grad_bytes, hipMemcpyHostToDevice)); ++ if (interp) { ++ const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; ++ ++ CeedCallHip(ceed, hipMalloc((void **)&data->d_interp, interp_bytes)); ++ CeedCallHip(ceed, hipMemcpy(data->d_interp, interp, interp_bytes, hipMemcpyHostToDevice)); ++ } ++ if (grad) { ++ const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad; ++ ++ CeedCallHip(ceed, hipMalloc((void **)&data->d_grad, grad_bytes)); ++ CeedCallHip(ceed, hipMemcpy(data->d_grad, grad, grad_bytes, hipMemcpyHostToDevice)); ++ } + + // Compile basis kernels + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); +@@ -272,13 +301,128 @@ int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); + CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); + CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); +- CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 4, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_DIM", dim, +- "BASIS_NUM_COMP", num_comp)); ++ CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", ++ q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_grad, "BASIS_NUM_COMP", num_comp)); + CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp)); +- CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Grad", &data->Grad)); ++ CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); ++ CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Deriv", &data->Deriv)); ++ CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); + CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight)); + CeedCallBackend(CeedFree(&basis_kernel_path)); + CeedCallBackend(CeedFree(&basis_kernel_source)); ++ ++ CeedCallBackend(CeedBasisSetData(basis, data)); ++ ++ // Register backend functions ++ CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip)); ++ CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip)); ++ return CEED_ERROR_SUCCESS; ++} ++ ++//------------------------------------------------------------------------------ ++// Create non-tensor H(div) ++//------------------------------------------------------------------------------ ++int CeedBasisCreateHdiv_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *div, ++ const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { ++ Ceed ceed; ++ char *basis_kernel_path, *basis_kernel_source; ++ CeedInt num_comp, q_comp_interp, q_comp_div; ++ const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); ++ CeedBasisNonTensor_Hip *data; ++ ++ CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); ++ CeedCallBackend(CeedCalloc(1, &data)); ++ ++ // Copy basis data to GPU ++ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); ++ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div)); ++ CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes)); ++ CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice)); ++ if (interp) { ++ const CeedInt interp_bytes 
= q_bytes * num_nodes * q_comp_interp; ++ ++ CeedCallHip(ceed, hipMalloc((void **)&data->d_interp, interp_bytes)); ++ CeedCallHip(ceed, hipMemcpy(data->d_interp, interp, interp_bytes, hipMemcpyHostToDevice)); ++ } ++ if (div) { ++ const CeedInt div_bytes = q_bytes * num_nodes * q_comp_div; ++ ++ CeedCallHip(ceed, hipMalloc((void **)&data->d_div, div_bytes)); ++ CeedCallHip(ceed, hipMemcpy(data->d_div, div, div_bytes, hipMemcpyHostToDevice)); ++ } ++ ++ // Compile basis kernels ++ CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); ++ CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-nontensor.h", &basis_kernel_path)); ++ CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); ++ CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); ++ CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); ++ CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", ++ q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_div, "BASIS_NUM_COMP", num_comp)); ++ CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp)); ++ CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); ++ CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Deriv", &data->Deriv)); ++ CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); ++ CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight)); ++ CeedCallBackend(CeedFree(&basis_kernel_path)); ++ CeedCallBackend(CeedFree(&basis_kernel_source)); ++ ++ CeedCallBackend(CeedBasisSetData(basis, data)); ++ ++ // Register backend functions ++ CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip)); ++ CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip)); ++ return CEED_ERROR_SUCCESS; ++} ++ ++//------------------------------------------------------------------------------ ++// Create non-tensor H(curl) ++//------------------------------------------------------------------------------ ++int CeedBasisCreateHcurl_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, ++ const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) { ++ Ceed ceed; ++ char *basis_kernel_path, *basis_kernel_source; ++ CeedInt num_comp, q_comp_interp, q_comp_curl; ++ const CeedInt q_bytes = num_qpts * sizeof(CeedScalar); ++ CeedBasisNonTensor_Hip *data; ++ ++ CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); ++ CeedCallBackend(CeedCalloc(1, &data)); ++ ++ // Copy basis data to GPU ++ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); ++ CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl)); ++ CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes)); ++ CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice)); ++ if (interp) { ++ const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp; ++ ++ CeedCallHip(ceed, hipMalloc((void **)&data->d_interp, interp_bytes)); ++ CeedCallHip(ceed, hipMemcpy(data->d_interp, interp, interp_bytes, hipMemcpyHostToDevice)); ++ } ++ if (curl) { ++ const CeedInt curl_bytes = q_bytes * num_nodes * q_comp_curl; ++ ++ CeedCallHip(ceed, 
hipMalloc((void **)&data->d_curl, curl_bytes)); ++ CeedCallHip(ceed, hipMemcpy(data->d_curl, curl, curl_bytes, hipMemcpyHostToDevice)); ++ } ++ ++ // Compile basis kernels ++ CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); ++ CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-nontensor.h", &basis_kernel_path)); ++ CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n"); ++ CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source)); ++ CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n"); ++ CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP", ++ q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_curl, "BASIS_NUM_COMP", num_comp)); ++ CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp)); ++ CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose)); ++ CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Deriv", &data->Deriv)); ++ CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "DerivTranspose", &data->DerivTranspose)); ++ CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight)); ++ CeedCallBackend(CeedFree(&basis_kernel_path)); ++ CeedCallBackend(CeedFree(&basis_kernel_source)); ++ + CeedCallBackend(CeedBasisSetData(basis, data)); + + // Register backend functions +diff --git a/backends/hip-ref/ceed-hip-ref.c b/backends/hip-ref/ceed-hip-ref.c +index 754c0b52..eca2f4dd 100644 +--- a/backends/hip-ref/ceed-hip-ref.c ++++ b/backends/hip-ref/ceed-hip-ref.c +@@ -17,8 +17,8 @@ + //------------------------------------------------------------------------------ + // HIP preferred MemType + //------------------------------------------------------------------------------ +-static int CeedGetPreferredMemType_Hip(CeedMemType *type) { +- *type = CEED_MEM_DEVICE; ++static int CeedGetPreferredMemType_Hip(CeedMemType *mem_type) { ++ *mem_type = CEED_MEM_DEVICE; + return CEED_ERROR_SUCCESS; + } + +@@ -54,6 +54,8 @@ static int CeedInit_Hip_ref(const char *resource, Ceed ceed) { + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "VectorCreate", CeedVectorCreate_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateTensorH1", CeedBasisCreateTensorH1_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateH1", CeedBasisCreateH1_Hip)); ++ CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateHdiv", CeedBasisCreateHdiv_Hip)); ++ CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "BasisCreateHcurl", CeedBasisCreateHcurl_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "ElemRestrictionCreate", CeedElemRestrictionCreate_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Hip)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionContextCreate", CeedQFunctionContextCreate_Hip)); +diff --git a/backends/hip-ref/ceed-hip-ref.h b/backends/hip-ref/ceed-hip-ref.h +index 634bb68d..89bbc7a0 100644 +--- a/backends/hip-ref/ceed-hip-ref.h ++++ b/backends/hip-ref/ceed-hip-ref.h +@@ -57,10 +57,14 @@ typedef struct { + typedef struct { + hipModule_t module; + hipFunction_t Interp; +- hipFunction_t Grad; ++ hipFunction_t InterpTranspose; ++ hipFunction_t Deriv; ++ hipFunction_t DerivTranspose; + hipFunction_t Weight; + CeedScalar 
*d_interp;
+ CeedScalar *d_grad;
++ CeedScalar *d_div;
++ CeedScalar *d_curl;
+ CeedScalar *d_q_weight;
+ } CeedBasisNonTensor_Hip;
+
+@@ -122,9 +126,12 @@ CEED_INTERN int CeedElemRestrictionCreate_Hip(CeedMemType mem_type, CeedCopyMode
+
+ CEED_INTERN int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
+ const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis);
+-
+ CEED_INTERN int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
+ const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis);
++CEED_INTERN int CeedBasisCreateHdiv_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
++ const CeedScalar *div, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis);
++CEED_INTERN int CeedBasisCreateHcurl_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
++ const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis);
+
+ CEED_INTERN int CeedQFunctionCreate_Hip(CeedQFunction qf);
+
+diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h
+new file mode 100644
+index 00000000..7e6450fb
+--- /dev/null
++++ b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor-templates.h
+@@ -0,0 +1,67 @@
++// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
++// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
++//
++// SPDX-License-Identifier: BSD-2-Clause
++//
++// This file is part of CEED: http://github.com/ceed
++
++/// @file
++/// Internal header for CUDA non-tensor product basis templates
++#ifndef CEED_CUDA_REF_BASIS_NONTENSOR_TEMPLATES_H
++#define CEED_CUDA_REF_BASIS_NONTENSOR_TEMPLATES_H
++
++#include
++
++//------------------------------------------------------------------------------
++// Tensor contraction
++//------------------------------------------------------------------------------
++template <int NUM_COMP, int Q_COMP, int P, int Q>
++inline __device__ void Contract(const CeedInt elem, const CeedInt strides_elem_U, const CeedInt strides_elem_V, const CeedInt strides_comp_U,
++ const CeedInt strides_comp_V, const CeedInt strides_q_comp_V, const CeedScalar *__restrict__ d_B,
++ const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
++ const CeedInt t_id = threadIdx.x;
++ const CeedScalar *U;
++ CeedScalar r_V[Q_COMP];
++ // TODO load B in shared memory if blockDim.z > 1?
++
++ for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
++ // Run with Q threads
++ U = d_U + elem * strides_elem_U + comp * strides_comp_U;
++ for (CeedInt d = 0; d < Q_COMP; d++) r_V[d] = 0.0;
++ for (CeedInt i = 0; i < P; i++) {
++ const CeedScalar val = U[i];
++
++ for (CeedInt d = 0; d < Q_COMP; d++) r_V[d] += d_B[i + t_id * P + d * P * Q] * val;
++ }
++ for (CeedInt d = 0; d < Q_COMP; d++) {
++ d_V[elem * strides_elem_V + comp * strides_comp_V + d * strides_q_comp_V + t_id] = r_V[d];
++ }
++ }
++}
++
++//------------------------------------------------------------------------------
++// Tensor contraction transpose
++//------------------------------------------------------------------------------
++template <int NUM_COMP, int Q_COMP, int P, int Q>
++inline __device__ void ContractTranspose(const CeedInt elem, const CeedInt strides_elem_U, const CeedInt strides_elem_V, const CeedInt strides_comp_U,
++ const CeedInt strides_comp_V, const CeedInt strides_q_comp_U, const CeedScalar *__restrict__ d_B,
++ const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
++ const CeedInt t_id = threadIdx.x;
++ const CeedScalar *U;
++ CeedScalar r_V;
++ // TODO load B in shared memory if blockDim.z > 1?
++
++ for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
++ // Run with P threads
++ r_V = 0.0;
++ for (CeedInt d = 0; d < Q_COMP; d++) {
++ U = d_U + elem * strides_elem_U + comp * strides_comp_U + d * strides_q_comp_U;
++ for (CeedInt i = 0; i < Q; i++) r_V += d_B[t_id + i * P + d * P * Q] * U[i];
++ }
++ d_V[elem * strides_elem_V + comp * strides_comp_V + t_id] = r_V;
++ }
++}
++
++//------------------------------------------------------------------------------
++
++#endif // CEED_CUDA_REF_BASIS_NONTENSOR_TEMPLATES_H
+diff --git a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h
+index 484d755f..4b5c7f94 100644
+--- a/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h
++++ b/include/ceed/jit-source/cuda/cuda-ref-basis-nontensor.h
+@@ -12,6 +12,8 @@
+
+ #include
+
++#include "cuda-ref-basis-nontensor-templates.h"
++
+ //------------------------------------------------------------------------------
+ // Non-Tensor Basis Kernels
+ //------------------------------------------------------------------------------
+@@ -19,65 +21,38 @@
+ //------------------------------------------------------------------------------
+ // Interp
+ //------------------------------------------------------------------------------
+-extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *d_B, const CeedScalar *__restrict__ d_U,
++extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *__restrict__ d_B, const CeedScalar *__restrict__ d_U,
+ CeedScalar *__restrict__ d_V) {
+- const CeedInt t_id = threadIdx.x;
+- const CeedScalar *U;
+- CeedScalar V;
+- // TODO load B in shared memory if blockDim.z > 1?
+- + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { +- for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { +- if (transpose) { // run with P threads +- U = d_U + elem * BASIS_Q + comp * num_elem * BASIS_Q; +- V = 0.0; +- for (CeedInt i = 0; i < BASIS_Q; i++) V += d_B[t_id + i * BASIS_P] * U[i]; +- +- d_V[elem * BASIS_P + comp * num_elem * BASIS_P + t_id] = V; +- } else { // run with Q threads +- U = d_U + elem * BASIS_P + comp * num_elem * BASIS_P; +- V = 0.0; +- for (CeedInt i = 0; i < BASIS_P; i++) V += d_B[i + t_id * BASIS_P] * U[i]; ++ Contract(elem, BASIS_P, BASIS_Q, BASIS_P * num_elem, BASIS_Q * num_elem, ++ BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V); ++ } ++} + +- d_V[elem * BASIS_Q + comp * num_elem * BASIS_Q + t_id] = V; +- } +- } ++extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ d_B, const CeedScalar *__restrict__ d_U, ++ CeedScalar *__restrict__ d_V) { ++ for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { ++ ContractTranspose(elem, BASIS_Q, BASIS_P, BASIS_Q * num_elem, BASIS_P * num_elem, ++ BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V); + } + } + + //------------------------------------------------------------------------------ +-// Grad ++// Deriv + //------------------------------------------------------------------------------ +-extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *d_G, const CeedScalar *__restrict__ d_U, +- CeedScalar *__restrict__ d_V) { +- const CeedInt t_id = threadIdx.x; +- const CeedScalar *U; +- // TODO load G in shared memory if blockDim.z > 1? +- ++extern "C" __global__ void Deriv(const CeedInt num_elem, const CeedScalar *__restrict__ d_B, const CeedScalar *__restrict__ d_U, ++ CeedScalar *__restrict__ d_V) { + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { +- for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { +- if (transpose) { // run with P threads +- CeedScalar V = 0.0; +- for (CeedInt dim = 0; dim < BASIS_DIM; dim++) { +- U = d_U + elem * BASIS_Q + comp * num_elem * BASIS_Q + dim * BASIS_NUM_COMP * num_elem * BASIS_Q; +- for (CeedInt i = 0; i < BASIS_Q; i++) V += d_G[t_id + i * BASIS_P + dim * BASIS_P * BASIS_Q] * U[i]; +- } +- +- d_V[elem * BASIS_P + comp * num_elem * BASIS_P + t_id] = V; +- } else { // run with Q threads +- CeedScalar V[BASIS_DIM]; +- U = d_U + elem * BASIS_P + comp * num_elem * BASIS_P; +- for (CeedInt dim = 0; dim < BASIS_DIM; dim++) V[dim] = 0.0; +- for (CeedInt i = 0; i < BASIS_P; i++) { +- const CeedScalar val = U[i]; +- for (CeedInt dim = 0; dim < BASIS_DIM; dim++) V[dim] += d_G[i + t_id * BASIS_P + dim * BASIS_P * BASIS_Q] * val; +- } ++ Contract(elem, BASIS_P, BASIS_Q, BASIS_P * num_elem, BASIS_Q * num_elem, ++ BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V); ++ } ++} + +- for (CeedInt dim = 0; dim < BASIS_DIM; dim++) { +- d_V[elem * BASIS_Q + comp * num_elem * BASIS_Q + dim * BASIS_NUM_COMP * num_elem * BASIS_Q + t_id] = V[dim]; +- } +- } +- } ++extern "C" __global__ void DerivTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ d_B, const CeedScalar *__restrict__ d_U, ++ CeedScalar *__restrict__ d_V) { ++ for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { ++ ContractTranspose(elem, BASIS_Q, BASIS_P, BASIS_Q * num_elem, BASIS_P * num_elem, ++ 
BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V); + } + } + +@@ -86,8 +61,8 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, + //------------------------------------------------------------------------------ + extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight, CeedScalar *__restrict__ d_V) { + const CeedInt t_id = threadIdx.x; +- + // TODO load q_weight in shared memory if blockDim.z > 1? ++ + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + d_V[elem * BASIS_Q + t_id] = q_weight[t_id]; + } +diff --git a/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h b/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h +new file mode 100644 +index 00000000..4e568369 +--- /dev/null ++++ b/include/ceed/jit-source/hip/hip-ref-basis-nontensor-templates.h +@@ -0,0 +1,67 @@ ++// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. ++// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. ++// ++// SPDX-License-Identifier: BSD-2-Clause ++// ++// This file is part of CEED: http://github.com/ceed ++ ++/// @file ++/// Internal header for HIP non-tensor product basis templates ++#ifndef CEED_HIP_REF_BASIS_NONTENSOR_TEMPLATES_H ++#define CEED_HIP_REF_BASIS_NONTENSOR_TEMPLATES_H ++ ++#include ++ ++//------------------------------------------------------------------------------ ++// Tensor contraction ++//------------------------------------------------------------------------------ ++template ++inline __device__ void Contract(const CeedInt elem, const CeedInt strides_elem_U, const CeedInt strides_elem_V, const CeedInt strides_comp_U, ++ const CeedInt strides_comp_V, const CeedInt strides_q_comp_V, const CeedScalar *__restrict__ d_B, ++ const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { ++ const CeedInt t_id = threadIdx.x; ++ const CeedScalar *U; ++ CeedScalar r_V[Q_COMP]; ++ // TODO load B in shared memory if blockDim.z > 1? ++ ++ for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ++ // Run with Q threads ++ U = d_U + elem * strides_elem_U + comp * strides_comp_U; ++ for (CeedInt d = 0; d < Q_COMP; d++) r_V[d] = 0.0; ++ for (CeedInt i = 0; i < P; i++) { ++ const CeedScalar val = U[i]; ++ ++ for (CeedInt d = 0; d < Q_COMP; d++) r_V[d] += d_B[i + t_id * P + d * P * Q] * val; ++ } ++ for (CeedInt d = 0; d < Q_COMP; d++) { ++ d_V[elem * strides_elem_V + comp * strides_comp_V + d * strides_q_comp_V + t_id] = r_V[d]; ++ } ++ } ++} ++ ++//------------------------------------------------------------------------------ ++// Tensor contraction transpose ++//------------------------------------------------------------------------------ ++template ++inline __device__ void ContractTranspose(const CeedInt elem, const CeedInt strides_elem_U, const CeedInt strides_elem_V, const CeedInt strides_comp_U, ++ const CeedInt strides_comp_V, const CeedInt strides_q_comp_U, const CeedScalar *__restrict__ d_B, ++ const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { ++ const CeedInt t_id = threadIdx.x; ++ const CeedScalar *U; ++ CeedScalar r_V; ++ // TODO load B in shared memory if blockDim.z > 1? 
++ ++ for (CeedInt comp = 0; comp < NUM_COMP; comp++) { ++ // Run with P threads ++ r_V = 0.0; ++ for (CeedInt d = 0; d < Q_COMP; d++) { ++ U = d_U + elem * strides_elem_U + comp * strides_comp_U + d * strides_q_comp_U; ++ for (CeedInt i = 0; i < Q; i++) r_V += d_B[t_id + i * P + d * P * Q] * U[i]; ++ } ++ d_V[elem * strides_elem_V + comp * strides_comp_V + t_id] = r_V; ++ } ++} ++ ++//------------------------------------------------------------------------------ ++ ++#endif // CEED_HIP_REF_BASIS_NONTENSOR_TEMPLATES_H +diff --git a/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h b/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h +index 101f898e..fd389f8a 100644 +--- a/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h ++++ b/include/ceed/jit-source/hip/hip-ref-basis-nontensor.h +@@ -12,6 +12,8 @@ + + #include + ++#include "hip-ref-basis-nontensor-templates.h" ++ + //------------------------------------------------------------------------------ + // Non-Tensor Basis Kernels + //------------------------------------------------------------------------------ +@@ -19,67 +21,38 @@ + //------------------------------------------------------------------------------ + // Interp + //------------------------------------------------------------------------------ +-extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *d_B, const CeedScalar *__restrict__ d_U, ++extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *__restrict__ d_B, const CeedScalar *__restrict__ d_U, + CeedScalar *__restrict__ d_V) { +- const CeedInt t_id = threadIdx.x; +- +- const CeedScalar *U; +- CeedScalar V; +- // TODO load B in shared memory if blockDim.z > 1? +- + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { +- for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { +- if (transpose) { // run with P threads +- U = d_U + elem * BASIS_Q + comp * num_elem * BASIS_Q; +- V = 0.0; +- for (CeedInt i = 0; i < BASIS_Q; i++) V += d_B[t_id + i * BASIS_P] * U[i]; +- +- d_V[elem * BASIS_P + comp * num_elem * BASIS_P + t_id] = V; +- } else { // run with Q threads +- U = d_U + elem * BASIS_P + comp * num_elem * BASIS_P; +- V = 0.0; +- for (CeedInt i = 0; i < BASIS_P; i++) V += d_B[i + t_id * BASIS_P] * U[i]; ++ Contract(elem, BASIS_P, BASIS_Q, BASIS_P * num_elem, BASIS_Q * num_elem, ++ BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V); ++ } ++} + +- d_V[elem * BASIS_Q + comp * num_elem * BASIS_Q + t_id] = V; +- } +- } ++extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ d_B, const CeedScalar *__restrict__ d_U, ++ CeedScalar *__restrict__ d_V) { ++ for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { ++ ContractTranspose(elem, BASIS_Q, BASIS_P, BASIS_Q * num_elem, BASIS_P * num_elem, ++ BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V); + } + } + + //------------------------------------------------------------------------------ +-// Grad ++// Deriv + //------------------------------------------------------------------------------ +-extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *d_G, const CeedScalar *__restrict__ d_U, +- CeedScalar *__restrict__ d_V) { +- const CeedInt t_id = threadIdx.x; +- +- const CeedScalar *U; +- // TODO load G in shared memory if blockDim.z > 1? 
+- ++extern "C" __global__ void Deriv(const CeedInt num_elem, const CeedScalar *__restrict__ d_B, const CeedScalar *__restrict__ d_U, ++ CeedScalar *__restrict__ d_V) { + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { +- for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { +- if (transpose) { // run with P threads +- CeedScalar V = 0.0; +- for (CeedInt dim = 0; dim < BASIS_DIM; dim++) { +- U = d_U + elem * BASIS_Q + comp * num_elem * BASIS_Q + dim * BASIS_NUM_COMP * num_elem * BASIS_Q; +- for (CeedInt i = 0; i < BASIS_Q; i++) V += d_G[t_id + i * BASIS_P + dim * BASIS_P * BASIS_Q] * U[i]; +- } +- +- d_V[elem * BASIS_P + comp * num_elem * BASIS_P + t_id] = V; +- } else { // run with Q threads +- CeedScalar V[BASIS_DIM]; +- U = d_U + elem * BASIS_P + comp * num_elem * BASIS_P; +- for (CeedInt dim = 0; dim < BASIS_DIM; dim++) V[dim] = 0.0; +- for (CeedInt i = 0; i < BASIS_P; i++) { +- const CeedScalar val = U[i]; +- for (CeedInt dim = 0; dim < BASIS_DIM; dim++) V[dim] += d_G[i + t_id * BASIS_P + dim * BASIS_P * BASIS_Q] * val; +- } ++ Contract(elem, BASIS_P, BASIS_Q, BASIS_P * num_elem, BASIS_Q * num_elem, ++ BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V); ++ } ++} + +- for (CeedInt dim = 0; dim < BASIS_DIM; dim++) { +- d_V[elem * BASIS_Q + comp * num_elem * BASIS_Q + dim * BASIS_NUM_COMP * num_elem * BASIS_Q + t_id] = V[dim]; +- } +- } +- } ++extern "C" __global__ void DerivTranspose(const CeedInt num_elem, const CeedScalar *__restrict__ d_B, const CeedScalar *__restrict__ d_U, ++ CeedScalar *__restrict__ d_V) { ++ for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { ++ ContractTranspose(elem, BASIS_Q, BASIS_P, BASIS_Q * num_elem, BASIS_P * num_elem, ++ BASIS_NUM_COMP * BASIS_Q * num_elem, d_B, d_U, d_V); + } + } + +@@ -89,6 +62,7 @@ extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, + extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight, CeedScalar *__restrict__ d_V) { + const CeedInt t_id = threadIdx.x; + // TODO load q_weight in shared memory if blockDim.z > 1? 
++ + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + d_V[elem * BASIS_Q + t_id] = q_weight[t_id]; + } +diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c +index 9e569584..1b86fa33 100644 +--- a/interface/ceed-basis.c ++++ b/interface/ceed-basis.c +@@ -1208,7 +1208,7 @@ int CeedBasisCreateHcurl(Ceed ceed, CeedElemTopology topo, CeedInt num_comp, Cee + const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis *basis) { + CeedInt Q = num_qpts, P = num_nodes, dim = 0, curl_comp = 0; + +- if (!ceed->BasisCreateHdiv) { ++ if (!ceed->BasisCreateHcurl) { + Ceed delegate; + + CeedCall(CeedGetObjectDelegate(ceed, &delegate, "Basis")); diff --git a/extern/patch/mfem/patch_cmake_cuda_fix.diff b/extern/patch/mfem/patch_cmake_cuda_fix.diff index b3fd1d3df5..59b073f5b5 100644 --- a/extern/patch/mfem/patch_cmake_cuda_fix.diff +++ b/extern/patch/mfem/patch_cmake_cuda_fix.diff @@ -1,105 +1,105 @@ -diff --git a/CMakeLists.txt b/CMakeLists.txt -index 6e459ed1c..e4440ac02 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -139,10 +139,9 @@ if (MFEM_USE_CUDA) - set(CMAKE_CUDA_HOST_LINK_LAUNCHER ${CMAKE_CXX_COMPILER}) - endif() - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${CUDA_FLAGS}") -+ find_package(CUDAToolkit REQUIRED) - set(CUSPARSE_FOUND TRUE) -- set(CUSPARSE_LIBRARIES "cusparse") -- set(CUBLAS_FOUND TRUE) -- set(CUBLAS_LIBRARIES "cublas") -+ get_target_property(CUSPARSE_LIBRARIES CUDA::cusparse LOCATION) - endif() - - if (XSDK_ENABLE_C) -@@ -531,7 +530,7 @@ find_package(Threads REQUIRED) - set(MFEM_TPLS OPENMP HYPRE LAPACK BLAS SuperLUDist STRUMPACK METIS SuiteSparse - SUNDIALS PETSC SLEPC MUMPS AXOM FMS CONDUIT Ginkgo GNUTLS GSLIB - NETCDF MPFR PUMI HIOP POSIXCLOCKS MFEMBacktrace ZLIB OCCA CEED RAJA UMPIRE -- ADIOS2 CUBLAS CUSPARSE MKL_CPARDISO MKL_PARDISO AMGX CALIPER CODIPACK -+ ADIOS2 CUSPARSE MKL_CPARDISO MKL_PARDISO AMGX CALIPER CODIPACK - BENCHMARK PARELAG MPI_CXX HIP HIPSPARSE MOONOLITH BLITZ ALGOIM ENZYME) - - # Add all *_FOUND libraries in the variable TPL_LIBRARIES. -diff --git a/config/cmake/modules/FindHYPRE.cmake b/config/cmake/modules/FindHYPRE.cmake -index b21599330..f152d8722 100644 ---- a/config/cmake/modules/FindHYPRE.cmake -+++ b/config/cmake/modules/FindHYPRE.cmake -@@ -14,10 +14,12 @@ - # - HYPRE_LIBRARIES - # - HYPRE_INCLUDE_DIRS - # - HYPRE_VERSION --# - HYPRE_USING_HIP (internal) - - if (HYPRE_FOUND) -- if (HYPRE_USING_HIP) -+ if (MFEM_USE_CUDA) -+ find_package(CUDAToolkit REQUIRED) -+ endif() -+ if (MFEM_USE_HIP) - find_package(rocsparse REQUIRED) - find_package(rocrand REQUIRED) - endif() -@@ -26,21 +28,7 @@ endif() - - include(MfemCmakeUtilities) - mfem_find_package(HYPRE HYPRE HYPRE_DIR "include" "HYPRE.h" "lib" "HYPRE" -- "Paths to headers required by HYPRE." "Libraries required by HYPRE." -- CHECK_BUILD HYPRE_USING_HIP FALSE -- " --#undef HYPRE_USING_HIP --#include -- --#ifndef HYPRE_USING_HIP --#error HYPRE is built without HIP. --#endif -- --int main() --{ -- return 0; --} --") -+ "Paths to headers required by HYPRE." 
"Libraries required by HYPRE.") - - if (HYPRE_FOUND AND (NOT HYPRE_VERSION)) - try_run(HYPRE_VERSION_RUN_RESULT HYPRE_VERSION_COMPILE_RESULT -@@ -57,7 +45,17 @@ if (HYPRE_FOUND AND (NOT HYPRE_VERSION)) - endif() - endif() - --if (HYPRE_FOUND AND HYPRE_USING_HIP) -+if (HYPRE_FOUND AND MFEM_USE_CUDA) -+ find_package(CUDAToolkit REQUIRED) -+ get_target_property(CUSPARSE_LIBRARIES CUDA::cusparse LOCATION) -+ get_target_property(CURAND_LIBRARIES CUDA::curand LOCATION) -+ list(APPEND HYPRE_LIBRARIES ${CUSPARSE_LIBRARIES} ${CURAND_LIBRARIES}) -+ set(HYPRE_LIBRARIES ${HYPRE_LIBRARIES} CACHE STRING -+ "HYPRE libraries + dependencies." FORCE) -+ message(STATUS "Updated HYPRE_LIBRARIES: ${HYPRE_LIBRARIES}") -+endif() -+ -+if (HYPRE_FOUND AND MFEM_USE_HIP) - find_package(rocsparse REQUIRED) - find_package(rocrand REQUIRED) - list(APPEND HYPRE_LIBRARIES ${rocsparse_LIBRARIES} ${rocrand_LIBRARIES}) -diff --git a/config/defaults.cmake b/config/defaults.cmake -index 06531934d..4bd4cdf8d 100644 ---- a/config/defaults.cmake -+++ b/config/defaults.cmake -@@ -106,12 +106,7 @@ set(HYPRE_DIR "${MFEM_DIR}/../hypre/src/hypre" CACHE PATH - # If hypre was compiled to depend on BLAS and LAPACK: - # set(HYPRE_REQUIRED_PACKAGES "BLAS" "LAPACK" CACHE STRING - # "Packages that HYPRE depends on.") --if (MFEM_USE_CUDA) -- # This is only necessary when hypre is built with cuda: -- set(HYPRE_REQUIRED_LIBRARIES "-lcusparse" "-lcurand" CACHE STRING -- "Libraries that HYPRE depends on.") --endif() --# HIP dependency for HYPRE is handled in FindHYPRE.cmake. -+# CUDA and HIP dependencies for HYPRE are handled in FindHYPRE.cmake. - - set(METIS_DIR "${MFEM_DIR}/../metis-4.0" CACHE PATH "Path to the METIS library.") - +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 6e459ed1c..e4440ac02 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -139,10 +139,9 @@ if (MFEM_USE_CUDA) + set(CMAKE_CUDA_HOST_LINK_LAUNCHER ${CMAKE_CXX_COMPILER}) + endif() + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${CUDA_FLAGS}") ++ find_package(CUDAToolkit REQUIRED) + set(CUSPARSE_FOUND TRUE) +- set(CUSPARSE_LIBRARIES "cusparse") +- set(CUBLAS_FOUND TRUE) +- set(CUBLAS_LIBRARIES "cublas") ++ get_target_property(CUSPARSE_LIBRARIES CUDA::cusparse LOCATION) + endif() + + if (XSDK_ENABLE_C) +@@ -531,7 +530,7 @@ find_package(Threads REQUIRED) + set(MFEM_TPLS OPENMP HYPRE LAPACK BLAS SuperLUDist STRUMPACK METIS SuiteSparse + SUNDIALS PETSC SLEPC MUMPS AXOM FMS CONDUIT Ginkgo GNUTLS GSLIB + NETCDF MPFR PUMI HIOP POSIXCLOCKS MFEMBacktrace ZLIB OCCA CEED RAJA UMPIRE +- ADIOS2 CUBLAS CUSPARSE MKL_CPARDISO MKL_PARDISO AMGX CALIPER CODIPACK ++ ADIOS2 CUSPARSE MKL_CPARDISO MKL_PARDISO AMGX CALIPER CODIPACK + BENCHMARK PARELAG MPI_CXX HIP HIPSPARSE MOONOLITH BLITZ ALGOIM ENZYME) + + # Add all *_FOUND libraries in the variable TPL_LIBRARIES. +diff --git a/config/cmake/modules/FindHYPRE.cmake b/config/cmake/modules/FindHYPRE.cmake +index b21599330..f152d8722 100644 +--- a/config/cmake/modules/FindHYPRE.cmake ++++ b/config/cmake/modules/FindHYPRE.cmake +@@ -14,10 +14,12 @@ + # - HYPRE_LIBRARIES + # - HYPRE_INCLUDE_DIRS + # - HYPRE_VERSION +-# - HYPRE_USING_HIP (internal) + + if (HYPRE_FOUND) +- if (HYPRE_USING_HIP) ++ if (MFEM_USE_CUDA) ++ find_package(CUDAToolkit REQUIRED) ++ endif() ++ if (MFEM_USE_HIP) + find_package(rocsparse REQUIRED) + find_package(rocrand REQUIRED) + endif() +@@ -26,21 +28,7 @@ endif() + + include(MfemCmakeUtilities) + mfem_find_package(HYPRE HYPRE HYPRE_DIR "include" "HYPRE.h" "lib" "HYPRE" +- "Paths to headers required by HYPRE." 
"Libraries required by HYPRE." +- CHECK_BUILD HYPRE_USING_HIP FALSE +- " +-#undef HYPRE_USING_HIP +-#include +- +-#ifndef HYPRE_USING_HIP +-#error HYPRE is built without HIP. +-#endif +- +-int main() +-{ +- return 0; +-} +-") ++ "Paths to headers required by HYPRE." "Libraries required by HYPRE.") + + if (HYPRE_FOUND AND (NOT HYPRE_VERSION)) + try_run(HYPRE_VERSION_RUN_RESULT HYPRE_VERSION_COMPILE_RESULT +@@ -57,7 +45,17 @@ if (HYPRE_FOUND AND (NOT HYPRE_VERSION)) + endif() + endif() + +-if (HYPRE_FOUND AND HYPRE_USING_HIP) ++if (HYPRE_FOUND AND MFEM_USE_CUDA) ++ find_package(CUDAToolkit REQUIRED) ++ get_target_property(CUSPARSE_LIBRARIES CUDA::cusparse LOCATION) ++ get_target_property(CURAND_LIBRARIES CUDA::curand LOCATION) ++ list(APPEND HYPRE_LIBRARIES ${CUSPARSE_LIBRARIES} ${CURAND_LIBRARIES}) ++ set(HYPRE_LIBRARIES ${HYPRE_LIBRARIES} CACHE STRING ++ "HYPRE libraries + dependencies." FORCE) ++ message(STATUS "Updated HYPRE_LIBRARIES: ${HYPRE_LIBRARIES}") ++endif() ++ ++if (HYPRE_FOUND AND MFEM_USE_HIP) + find_package(rocsparse REQUIRED) + find_package(rocrand REQUIRED) + list(APPEND HYPRE_LIBRARIES ${rocsparse_LIBRARIES} ${rocrand_LIBRARIES}) +diff --git a/config/defaults.cmake b/config/defaults.cmake +index 06531934d..4bd4cdf8d 100644 +--- a/config/defaults.cmake ++++ b/config/defaults.cmake +@@ -106,12 +106,7 @@ set(HYPRE_DIR "${MFEM_DIR}/../hypre/src/hypre" CACHE PATH + # If hypre was compiled to depend on BLAS and LAPACK: + # set(HYPRE_REQUIRED_PACKAGES "BLAS" "LAPACK" CACHE STRING + # "Packages that HYPRE depends on.") +-if (MFEM_USE_CUDA) +- # This is only necessary when hypre is built with cuda: +- set(HYPRE_REQUIRED_LIBRARIES "-lcusparse" "-lcurand" CACHE STRING +- "Libraries that HYPRE depends on.") +-endif() +-# HIP dependency for HYPRE is handled in FindHYPRE.cmake. ++# CUDA and HIP dependencies for HYPRE are handled in FindHYPRE.cmake. + + set(METIS_DIR "${MFEM_DIR}/../metis-4.0" CACHE PATH "Path to the METIS library.") + diff --git a/extern/patch/mfem/patch_global_variables_threadsafe.diff b/extern/patch/mfem/patch_global_variables_threadsafe.diff index f77f023bfc..3550593a8d 100644 --- a/extern/patch/mfem/patch_global_variables_threadsafe.diff +++ b/extern/patch/mfem/patch_global_variables_threadsafe.diff @@ -1,2256 +1,2256 @@ -diff --git a/fem/eltrans.cpp b/fem/eltrans.cpp -index b812e22eb..96aa854a5 100644 ---- a/fem/eltrans.cpp -+++ b/fem/eltrans.cpp -@@ -355,15 +355,11 @@ int InverseElementTransformation::Transform(const Vector &pt, - } - else - { -- const int old_type = GlobGeometryRefiner.GetType(); -- GlobGeometryRefiner.SetType(qpts_type); -- RefinedGeometry &RefG = -- *GlobGeometryRefiner.Refine(T->GetGeometryType(), order); -+ RefinedGeometry &RefG = *refiner.Refine(T->GetGeometryType(), order); - int closest_idx = (init_guess_type == ClosestPhysNode) ? - FindClosestPhysPoint(pt, RefG.RefPts) : - FindClosestRefPoint(pt, RefG.RefPts); - ip0 = &RefG.RefPts.IntPoint(closest_idx); -- GlobGeometryRefiner.SetType(old_type); - } - break; - } -diff --git a/fem/eltrans.hpp b/fem/eltrans.hpp -index 198e20df3..43eccd499 100644 ---- a/fem/eltrans.hpp -+++ b/fem/eltrans.hpp -@@ -234,6 +234,7 @@ protected: - const IntegrationPoint *ip0; - int init_guess_type; // algorithm to use - int qpts_type; // Quadrature1D type for the initial guess type -+ GeometryRefiner refiner; // geometry refiner for initial guess - int rel_qpts_order; // num_1D_qpts = max(trans_order+rel_qpts_order,0)+1 - int solver_type; // solution strategy to use - int max_iter; // max. 
number of Newton iterations -@@ -277,6 +278,7 @@ public: - ip0(NULL), - init_guess_type(Center), - qpts_type(Quadrature1D::OpenHalfUniform), -+ refiner(qpts_type), - rel_qpts_order(-1), - solver_type(NewtonElementProject), - max_iter(16), -@@ -301,7 +303,8 @@ public: - { ip0 = &init_ip; SetInitialGuessType(GivenPoint); } - - /// Set the Quadrature1D type used for the `Closest*` initial guess types. -- void SetInitGuessPointsType(int q_type) { qpts_type = q_type; } -+ void SetInitGuessPointsType(int q_type) -+ { qpts_type = q_type; refiner.SetType(q_type); } - - /// Set the relative order used for the `Closest*` initial guess types. - /** The number of points in each spatial direction is given by the formula -@@ -361,7 +364,7 @@ public: - class IsoparametricTransformation : public ElementTransformation - { - private: -- DenseMatrix dshape,d2shape; -+ DenseMatrix dshape, d2shape; - Vector shape; - - const FiniteElement *FElem; -diff --git a/fem/fe/fe_base.cpp b/fem/fe/fe_base.cpp -index b2f49a4bc..4387ae775 100644 ---- a/fem/fe/fe_base.cpp -+++ b/fem/fe/fe_base.cpp -@@ -359,135 +359,148 @@ void FiniteElement::CalcPhysHessian(ElementTransformation &Trans, - - // Hessian in physical coords - lhm.Invert(); -- Mult( hess, lhm, Hessian); -+ Mult(hess, lhm, Hessian); - } - - const DofToQuad &FiniteElement::GetDofToQuad(const IntegrationRule &ir, - DofToQuad::Mode mode) const - { -+ DofToQuad *d2q = nullptr; - MFEM_VERIFY(mode == DofToQuad::FULL, "invalid mode requested"); - -- for (int i = 0; i < dof2quad_array.Size(); i++) -- { -- const DofToQuad &d2q = *dof2quad_array[i]; -- if (d2q.IntRule == &ir && d2q.mode == mode) { return d2q; } -- } -- --#ifdef MFEM_THREAD_SAFE -- DenseMatrix vshape(dof, dim); -+#if defined(MFEM_THREAD_SAFE) && defined(MFEM_USE_OPENMP) -+ #pragma omp critical (DofToQuad) - #endif -- -- DofToQuad *d2q = new DofToQuad; -- const int nqpt = ir.GetNPoints(); -- d2q->FE = this; -- d2q->IntRule = &ir; -- d2q->mode = mode; -- d2q->ndof = dof; -- d2q->nqpt = nqpt; -- if (range_type == SCALAR) -- { -- d2q->B.SetSize(nqpt*dof); -- d2q->Bt.SetSize(dof*nqpt); -- -- Vector shape; -- vshape.GetColumnReference(0, shape); -- for (int i = 0; i < nqpt; i++) -- { -- const IntegrationPoint &ip = ir.IntPoint(i); -- CalcShape(ip, shape); -- for (int j = 0; j < dof; j++) -- { -- d2q->B[i+nqpt*j] = d2q->Bt[j+dof*i] = shape(j); -- } -- } -- } -- else if (range_type == VECTOR) - { -- d2q->B.SetSize(nqpt*dim*dof); -- d2q->Bt.SetSize(dof*nqpt*dim); -- -- for (int i = 0; i < nqpt; i++) -+ for (int i = 0; i < dof2quad_array.Size(); i++) - { -- const IntegrationPoint &ip = ir.IntPoint(i); -- CalcVShape(ip, vshape); -- for (int d = 0; d < dim; d++) -- { -- for (int j = 0; j < dof; j++) -- { -- d2q->B[i+nqpt*(d+dim*j)] = d2q->Bt[j+dof*(i+nqpt*d)] = vshape(j, d); -- } -- } -+ d2q = dof2quad_array[i]; -+ if (d2q->IntRule != &ir || d2q->mode != mode) { d2q = nullptr; } - } -- } -- else -- { -- // Skip B and Bt for unknown range type -- } -- switch (deriv_type) -- { -- case GRAD: -+ if (!d2q) - { -- d2q->G.SetSize(nqpt*dim*dof); -- d2q->Gt.SetSize(dof*nqpt*dim); -- -- for (int i = 0; i < nqpt; i++) -+#ifdef MFEM_THREAD_SAFE -+ DenseMatrix vshape(dof, dim); -+#endif -+ d2q = new DofToQuad; -+ const int nqpt = ir.GetNPoints(); -+ d2q->FE = this; -+ d2q->IntRule = &ir; -+ d2q->mode = mode; -+ d2q->ndof = dof; -+ d2q->nqpt = nqpt; -+ switch (range_type) - { -- const IntegrationPoint &ip = ir.IntPoint(i); -- CalcDShape(ip, vshape); -- for (int d = 0; d < dim; d++) -+ case SCALAR: - { -- for (int j = 0; j < dof; 
j++) -+ d2q->B.SetSize(nqpt*dof); -+ d2q->Bt.SetSize(dof*nqpt); -+ -+ Vector shape; -+ vshape.GetColumnReference(0, shape); -+ for (int i = 0; i < nqpt; i++) - { -- d2q->G[i+nqpt*(d+dim*j)] = d2q->Gt[j+dof*(i+nqpt*d)] = vshape(j, d); -+ const IntegrationPoint &ip = ir.IntPoint(i); -+ CalcShape(ip, shape); -+ for (int j = 0; j < dof; j++) -+ { -+ d2q->B[i+nqpt*j] = d2q->Bt[j+dof*i] = shape(j); -+ } - } -+ break; - } -- } -- break; -- } -- case DIV: -- { -- d2q->G.SetSize(nqpt*dof); -- d2q->Gt.SetSize(dof*nqpt); -+ case VECTOR: -+ { -+ d2q->B.SetSize(nqpt*dim*dof); -+ d2q->Bt.SetSize(dof*nqpt*dim); - -- Vector divshape; -- vshape.GetColumnReference(0, divshape); -- for (int i = 0; i < nqpt; i++) -+ for (int i = 0; i < nqpt; i++) -+ { -+ const IntegrationPoint &ip = ir.IntPoint(i); -+ CalcVShape(ip, vshape); -+ for (int d = 0; d < dim; d++) -+ { -+ for (int j = 0; j < dof; j++) -+ { -+ d2q->B[i+nqpt*(d+dim*j)] = -+ d2q->Bt[j+dof*(i+nqpt*d)] = vshape(j, d); -+ } -+ } -+ } -+ break; -+ } -+ case UNKNOWN_RANGE_TYPE: -+ // Skip B and Bt for unknown range type -+ break; -+ } -+ switch (deriv_type) - { -- const IntegrationPoint &ip = ir.IntPoint(i); -- CalcDivShape(ip, divshape); -- for (int j = 0; j < dof; j++) -+ case GRAD: - { -- d2q->G[i+nqpt*j] = d2q->Gt[j+dof*i] = divshape(j); -+ d2q->G.SetSize(nqpt*dim*dof); -+ d2q->Gt.SetSize(dof*nqpt*dim); -+ -+ for (int i = 0; i < nqpt; i++) -+ { -+ const IntegrationPoint &ip = ir.IntPoint(i); -+ CalcDShape(ip, vshape); -+ for (int d = 0; d < dim; d++) -+ { -+ for (int j = 0; j < dof; j++) -+ { -+ d2q->G[i+nqpt*(d+dim*j)] = -+ d2q->Gt[j+dof*(i+nqpt*d)] = vshape(j, d); -+ } -+ } -+ } -+ break; - } -- } -- break; -- } -- case CURL: -- { -- d2q->G.SetSize(nqpt*cdim*dof); -- d2q->Gt.SetSize(dof*nqpt*cdim); -+ case DIV: -+ { -+ d2q->G.SetSize(nqpt*dof); -+ d2q->Gt.SetSize(dof*nqpt); - -- DenseMatrix curlshape(vshape.GetData(), dof, cdim); // cdim <= dim -- for (int i = 0; i < nqpt; i++) -- { -- const IntegrationPoint &ip = ir.IntPoint(i); -- CalcCurlShape(ip, curlshape); -- for (int d = 0; d < cdim; d++) -+ Vector divshape; -+ vshape.GetColumnReference(0, divshape); -+ for (int i = 0; i < nqpt; i++) -+ { -+ const IntegrationPoint &ip = ir.IntPoint(i); -+ CalcDivShape(ip, divshape); -+ for (int j = 0; j < dof; j++) -+ { -+ d2q->G[i+nqpt*j] = d2q->Gt[j+dof*i] = divshape(j); -+ } -+ } -+ break; -+ } -+ case CURL: - { -- for (int j = 0; j < dof; j++) -+ d2q->G.SetSize(nqpt*cdim*dof); -+ d2q->Gt.SetSize(dof*nqpt*cdim); -+ -+ DenseMatrix curlshape(vshape.GetData(), dof, cdim); // cdim <= dim -+ for (int i = 0; i < nqpt; i++) - { -- d2q->G[i+nqpt*(d+cdim*j)] = d2q->Gt[j+dof*(i+nqpt*d)] = curlshape(j, d); -+ const IntegrationPoint &ip = ir.IntPoint(i); -+ CalcCurlShape(ip, curlshape); -+ for (int d = 0; d < cdim; d++) -+ { -+ for (int j = 0; j < dof; j++) -+ { -+ d2q->G[i+nqpt*(d+cdim*j)] = -+ d2q->Gt[j+dof*(i+nqpt*d)] = curlshape(j, d); -+ } -+ } - } -+ break; - } -+ case NONE: -+ // Skip G and Gt for unknown derivative type -+ break; - } -- break; -+ dof2quad_array.Append(d2q); - } -- case NONE: -- default: -- // Skip G and Gt for unknown derivative type -- break; - } -- dof2quad_array.Append(d2q); - return *d2q; - } - -@@ -904,14 +917,14 @@ VectorFiniteElement::VectorFiniteElement(int D, Geometry::Type G, - } - - void VectorFiniteElement::CalcShape( -- const IntegrationPoint &ip, Vector &shape ) const -+ const IntegrationPoint &ip, Vector &shape) const - { - mfem_error("Error: Cannot use scalar CalcShape(...) 
function with\n" - " VectorFiniteElements!"); - } - - void VectorFiniteElement::CalcDShape( -- const IntegrationPoint &ip, DenseMatrix &dshape ) const -+ const IntegrationPoint &ip, DenseMatrix &dshape) const - { - mfem_error("Error: Cannot use scalar CalcDShape(...) function with\n" - " VectorFiniteElements!"); -@@ -2183,51 +2196,72 @@ void Poly_1D::CalcChebyshev(const int p, const double x, double *u, double *d, - - const double *Poly_1D::GetPoints(const int p, const int btype) - { -+ Array *pts; - BasisType::Check(btype); - const int qtype = BasisType::GetQuadrature1D(btype); -- - if (qtype == Quadrature1D::Invalid) { return NULL; } - -- if (points_container.find(btype) == points_container.end()) -- { -- points_container[btype] = new Array(h_mt); -- } -- Array &pts = *points_container[btype]; -- if (pts.Size() <= p) -- { -- pts.SetSize(p + 1, NULL); -- } -- if (pts[p] == NULL) -+#if defined(MFEM_THREAD_SAFE) && defined(MFEM_USE_OPENMP) -+ #pragma omp critical (Poly1DGetPoints) -+#endif - { -- pts[p] = new double[p + 1]; -- quad_func.GivePolyPoints(p+1, pts[p], qtype); -+ auto it = points_container.find(btype); -+ if (it != points_container.end()) -+ { -+ pts = it->second; -+ } -+ else -+ { -+ pts = new Array(h_mt); -+ points_container[btype] = pts; -+ } -+ if (pts->Size() <= p) -+ { -+ pts->SetSize(p + 1, NULL); -+ } -+ if ((*pts)[p] == NULL) -+ { -+ (*pts)[p] = new double[p + 1]; -+ quad_func.GivePolyPoints(p + 1, (*pts)[p], qtype); -+ } - } -- return pts[p]; -+ return (*pts)[p]; - } - - Poly_1D::Basis &Poly_1D::GetBasis(const int p, const int btype) - { -+ Array *bases; - BasisType::Check(btype); - -- if ( bases_container.find(btype) == bases_container.end() ) -- { -- // we haven't been asked for basis or points of this type yet -- bases_container[btype] = new Array(h_mt); -- } -- Array &bases = *bases_container[btype]; -- if (bases.Size() <= p) -- { -- bases.SetSize(p + 1, NULL); -- } -- if (bases[p] == NULL) -+#if defined(MFEM_THREAD_SAFE) && defined(MFEM_USE_OPENMP) -+ #pragma omp critical (Poly1DGetBasis) -+#endif - { -- EvalType etype; -- if (btype == BasisType::Positive) { etype = Positive; } -- else if (btype == BasisType::IntegratedGLL) { etype = Integrated; } -- else { etype = Barycentric; } -- bases[p] = new Basis(p, GetPoints(p, btype), etype); -+ auto it = bases_container.find(btype); -+ if (it != bases_container.end()) -+ { -+ bases = it->second; -+ } -+ else -+ { -+ // we haven't been asked for basis or points of this type yet -+ bases = new Array(h_mt); -+ bases_container[btype] = bases; -+ } -+ if (bases->Size() <= p) -+ { -+ bases->SetSize(p + 1, NULL); -+ } -+ if ((*bases)[p] == NULL) -+ { -+ EvalType etype; -+ if (btype == BasisType::Positive) { etype = Positive; } -+ else if (btype == BasisType::IntegratedGLL) { etype = Integrated; } -+ else { etype = Barycentric; } -+ (*bases)[p] = new Basis(p, GetPoints(p, btype), etype); -+ } - } -- return *bases[p]; -+ return *(*bases)[p]; - } - - Poly_1D::~Poly_1D() -@@ -2236,7 +2270,7 @@ Poly_1D::~Poly_1D() - it != points_container.end() ; ++it) - { - Array& pts = *it->second; -- for ( int i = 0 ; i < pts.Size() ; ++i ) -+ for (int i = 0; i < pts.Size(); ++i) - { - delete [] pts[i]; - } -@@ -2247,7 +2281,7 @@ Poly_1D::~Poly_1D() - it != bases_container.end() ; ++it) - { - Array& bases = *it->second; -- for ( int i = 0 ; i < bases.Size() ; ++i ) -+ for (int i = 0; i < bases.Size(); ++i) - { - delete bases[i]; - } -@@ -2461,39 +2495,47 @@ const DofToQuad &TensorBasisElement::GetTensorDofToQuad( - DofToQuad::Mode mode, const 
Poly_1D::Basis &basis, bool closed, - Array &dof2quad_array) - { -+ DofToQuad *d2q = nullptr; - MFEM_VERIFY(mode == DofToQuad::TENSOR, "invalid mode requested"); - -- for (int i = 0; i < dof2quad_array.Size(); i++) -- { -- const DofToQuad &d2q = *dof2quad_array[i]; -- if (d2q.IntRule == &ir && d2q.mode == mode) { return d2q; } -- } -- -- DofToQuad *d2q = new DofToQuad; -- const int ndof = closed ? fe.GetOrder() + 1 : fe.GetOrder(); -- const int nqpt = (int)floor(pow(ir.GetNPoints(), 1.0/fe.GetDim()) + 0.5); -- d2q->FE = &fe; -- d2q->IntRule = &ir; -- d2q->mode = mode; -- d2q->ndof = ndof; -- d2q->nqpt = nqpt; -- d2q->B.SetSize(nqpt*ndof); -- d2q->Bt.SetSize(ndof*nqpt); -- d2q->G.SetSize(nqpt*ndof); -- d2q->Gt.SetSize(ndof*nqpt); -- Vector val(ndof), grad(ndof); -- for (int i = 0; i < nqpt; i++) -+#if defined(MFEM_THREAD_SAFE) && defined(MFEM_USE_OPENMP) -+ #pragma omp critical (DofToQuad) -+#endif - { -- // The first 'nqpt' points in 'ir' have the same x-coordinates as those -- // of the 1D rule. -- basis.Eval(ir.IntPoint(i).x, val, grad); -- for (int j = 0; j < ndof; j++) -- { -- d2q->B[i+nqpt*j] = d2q->Bt[j+ndof*i] = val(j); -- d2q->G[i+nqpt*j] = d2q->Gt[j+ndof*i] = grad(j); -+ for (int i = 0; i < dof2quad_array.Size(); i++) -+ { -+ d2q = dof2quad_array[i]; -+ if (d2q->IntRule != &ir || d2q->mode != mode) { d2q = nullptr; } -+ } -+ if (!d2q) -+ { -+ d2q = new DofToQuad; -+ const int ndof = closed ? fe.GetOrder() + 1 : fe.GetOrder(); -+ const int nqpt = (int)floor(pow(ir.GetNPoints(), 1.0/fe.GetDim()) + 0.5); -+ d2q->FE = &fe; -+ d2q->IntRule = &ir; -+ d2q->mode = mode; -+ d2q->ndof = ndof; -+ d2q->nqpt = nqpt; -+ d2q->B.SetSize(nqpt*ndof); -+ d2q->Bt.SetSize(ndof*nqpt); -+ d2q->G.SetSize(nqpt*ndof); -+ d2q->Gt.SetSize(ndof*nqpt); -+ Vector val(ndof), grad(ndof); -+ for (int i = 0; i < nqpt; i++) -+ { -+ // The first 'nqpt' points in 'ir' have the same x-coordinates as those -+ // of the 1D rule. -+ basis.Eval(ir.IntPoint(i).x, val, grad); -+ for (int j = 0; j < ndof; j++) -+ { -+ d2q->B[i+nqpt*j] = d2q->Bt[j+ndof*i] = val(j); -+ d2q->G[i+nqpt*j] = d2q->Gt[j+ndof*i] = grad(j); -+ } -+ } -+ dof2quad_array.Append(d2q); - } - } -- dof2quad_array.Append(d2q); - return *d2q; - } - -diff --git a/fem/fe/fe_base.hpp b/fem/fe/fe_base.hpp -index b948b4f8d..f9e31b457 100644 ---- a/fem/fe/fe_base.hpp -+++ b/fem/fe/fe_base.hpp -@@ -250,7 +250,7 @@ protected: - /// Container for all DofToQuad objects created by the FiniteElement. - /** Multiple DofToQuad objects may be needed when different quadrature rules - or different DofToQuad::Mode are used. 
*/ -- mutable Array dof2quad_array; -+ mutable Array dof2quad_array; - - public: - /// Enumeration for range_type and deriv_range_type -@@ -1026,8 +1026,8 @@ public: - }; - - private: -- typedef std::map< int, Array* > PointsMap; -- typedef std::map< int, Array* > BasisMap; -+ typedef std::map*> PointsMap; -+ typedef std::map*> BasisMap; - - MemoryType h_mt; - PointsMap points_container; -diff --git a/fem/geom.cpp b/fem/geom.cpp -index 5438d741f..2d9f4e907 100644 ---- a/fem/geom.cpp -+++ b/fem/geom.cpp -@@ -262,7 +262,7 @@ Geometry::~Geometry() - } - } - --const IntegrationRule * Geometry::GetVertices(int GeomType) -+const IntegrationRule *Geometry::GetVertices(int GeomType) const - { - switch (GeomType) - { -@@ -274,8 +274,9 @@ const IntegrationRule * Geometry::GetVertices(int GeomType) - case Geometry::CUBE: return GeomVert[5]; - case Geometry::PRISM: return GeomVert[6]; - case Geometry::PYRAMID: return GeomVert[7]; -- default: -- mfem_error ("Geometry::GetVertices(...)"); -+ case Geometry::INVALID: -+ case Geometry::NUM_GEOMETRIES: -+ mfem_error("Geometry::GetVertices(...)"); - } - // make some compilers happy. - return GeomVert[0]; -@@ -370,7 +371,8 @@ void Geometry::GetRandomPoint(int GeomType, IntegrationPoint &ip) - ip.x = 1.0 - z; - } - break; -- default: -+ case Geometry::INVALID: -+ case Geometry::NUM_GEOMETRIES: - MFEM_ABORT("Unknown type of reference element!"); - } - } -@@ -435,12 +437,14 @@ bool Geometry::CheckPoint(int GeomType, const IntegrationPoint &ip) - if (ip.x < 0.0 || ip.y < 0.0 || ip.x+ip.z > 1.0 || ip.y+ip.z > 1.0 || - ip.z < 0.0 || ip.z > 1.0) { return false; } - break; -- default: -+ case Geometry::INVALID: -+ case Geometry::NUM_GEOMETRIES: - MFEM_ABORT("Unknown type of reference element!"); - } - return true; - } - -+// static method - bool Geometry::CheckPoint(int GeomType, const IntegrationPoint &ip, double eps) - { - switch (GeomType) -@@ -516,7 +520,8 @@ bool Geometry::CheckPoint(int GeomType, const IntegrationPoint &ip, double eps) - return false; - } - break; -- default: -+ case Geometry::INVALID: -+ case Geometry::NUM_GEOMETRIES: - MFEM_ABORT("Unknown type of reference element!"); - } - return true; -@@ -640,7 +645,8 @@ bool Geometry::ProjectPoint(int GeomType, const IntegrationPoint &beg, - }; - return internal::IntersectSegment<6,3>(lbeg, lend, end); - } -- default: -+ case Geometry::INVALID: -+ case Geometry::NUM_GEOMETRIES: - MFEM_ABORT("Unknown type of reference element!"); - } - return true; -@@ -774,13 +780,16 @@ bool Geometry::ProjectPoint(int GeomType, IntegrationPoint &ip) - } - } - -- default: -+ case Geometry::POINT: - MFEM_ABORT("Reference element type is not supported!"); -+ case Geometry::INVALID: -+ case Geometry::NUM_GEOMETRIES: -+ MFEM_ABORT("Unknown type of reference element!"); - } - return true; - } - --void Geometry::GetPerfPointMat(int GeomType, DenseMatrix &pm) -+void Geometry::GetPerfPointMat(int GeomType, DenseMatrix &pm) const - { - switch (GeomType) - { -@@ -859,8 +868,11 @@ void Geometry::GetPerfPointMat(int GeomType, DenseMatrix &pm) - } - break; - -- default: -- mfem_error ("Geometry::GetPerfPointMat (...)"); -+ case Geometry::POINT: -+ MFEM_ABORT("Reference element type is not supported!"); -+ case Geometry::INVALID: -+ case Geometry::NUM_GEOMETRIES: -+ MFEM_ABORT("Unknown type of reference element!"); - } - } - -@@ -1055,11 +1067,6 @@ Constants::VertToVert::J[8][2] = - }; - - --GeometryRefiner::GeometryRefiner() --{ -- type = Quadrature1D::ClosedUniform; --} -- - GeometryRefiner::~GeometryRefiner() - { - for (int i = 0; i 
< Geometry::NumGeom; i++) -@@ -1070,10 +1077,9 @@ GeometryRefiner::~GeometryRefiner() - } - - RefinedGeometry *GeometryRefiner::FindInRGeom(Geometry::Type Geom, -- int Times, int ETimes, -- int Type) -+ int Times, int ETimes) const - { -- Array &RGA = RGeom[Geom]; -+ const Array &RGA = RGeom[Geom]; - for (int i = 0; i < RGA.Size(); i++) - { - RefinedGeometry &RG = *RGA[i]; -@@ -1085,9 +1091,10 @@ RefinedGeometry *GeometryRefiner::FindInRGeom(Geometry::Type Geom, - return NULL; - } - --IntegrationRule *GeometryRefiner::FindInIntPts(Geometry::Type Geom, int NPts) -+IntegrationRule *GeometryRefiner::FindInIntPts(Geometry::Type Geom, -+ int NPts) const - { -- Array &IPA = IntPts[Geom]; -+ const Array &IPA = IntPts[Geom]; - for (int i = 0; i < IPA.Size(); i++) - { - IntegrationRule &ir = *IPA[i]; -@@ -1096,491 +1103,538 @@ IntegrationRule *GeometryRefiner::FindInIntPts(Geometry::Type Geom, int NPts) - return NULL; - } - --RefinedGeometry * GeometryRefiner::Refine(Geometry::Type Geom, -- int Times, int ETimes) -+RefinedGeometry *GeometryRefiner::Refine(Geometry::Type Geom, int Times, -+ int ETimes) - { -+ RefinedGeometry *RG = NULL; - int i, j, k, l, m; -- - Times = std::max(Times, 1); - ETimes = Geometry::Dimension[Geom] <= 1 ? 0 : std::max(ETimes, 1); -- const double *cp = poly1d.GetPoints(Times, BasisType::GetNodalBasis(type)); -+ const double *cp = poly1d.GetPoints(Times, BasisType::GetNodalBasis(Type)); - -- RefinedGeometry *RG = FindInRGeom(Geom, Times, ETimes, type); -- if (RG) { return RG; } -- -- switch (Geom) -+#if defined(MFEM_THREAD_SAFE) && defined(MFEM_USE_OPENMP) -+ #pragma omp critical (Refine) -+#endif - { -- case Geometry::POINT: -+ RG = FindInRGeom(Geom, Times, ETimes); -+ if (!RG) - { -- RG = new RefinedGeometry(1, 1, 0); -- RG->Times = 1; -- RG->ETimes = 0; -- RG->Type = type; -- RG->RefPts.IntPoint(0).x = cp[0]; -- RG->RefGeoms[0] = 0; -- -- RGeom[Geometry::POINT].Append(RG); -- return RG; -- } -- -- case Geometry::SEGMENT: -- { -- RG = new RefinedGeometry(Times+1, 2*Times, 0); -- RG->Times = Times; -- RG->ETimes = 0; -- RG->Type = type; -- for (i = 0; i <= Times; i++) -- { -- IntegrationPoint &ip = RG->RefPts.IntPoint(i); -- ip.x = cp[i]; -- } -- Array &G = RG->RefGeoms; -- for (i = 0; i < Times; i++) -+ switch (Geom) - { -- G[2*i+0] = i; -- G[2*i+1] = i+1; -- } -- -- RGeom[Geometry::SEGMENT].Append(RG); -- return RG; -- } -- -- case Geometry::TRIANGLE: -- { -- RG = new RefinedGeometry((Times+1)*(Times+2)/2, 3*Times*Times, -- 3*Times*(ETimes+1), 3*Times); -- RG->Times = Times; -- RG->ETimes = ETimes; -- RG->Type = type; -- for (k = j = 0; j <= Times; j++) -- for (i = 0; i <= Times-j; i++, k++) -+ case Geometry::POINT: - { -- IntegrationPoint &ip = RG->RefPts.IntPoint(k); -- ip.x = cp[i]/(cp[i] + cp[j] + cp[Times-i-j]); -- ip.y = cp[j]/(cp[i] + cp[j] + cp[Times-i-j]); -+ RG = new RefinedGeometry(1, 1, 0); -+ RG->Times = 1; -+ RG->ETimes = 0; -+ RG->Type = Type; -+ RG->RefPts.IntPoint(0).x = cp[0]; -+ RG->RefGeoms[0] = 0; -+ -+ RGeom[Geometry::POINT].Append(RG); - } -- Array &G = RG->RefGeoms; -- for (l = k = j = 0; j < Times; j++, k++) -- for (i = 0; i < Times-j; i++, k++) -+ break; -+ -+ case Geometry::SEGMENT: - { -- G[l++] = k; -- G[l++] = k+1; -- G[l++] = k+Times-j+1; -- if (i+j+1 < Times) -+ RG = new RefinedGeometry(Times+1, 2*Times, 0); -+ RG->Times = Times; -+ RG->ETimes = 0; -+ RG->Type = Type; -+ for (i = 0; i <= Times; i++) - { -- G[l++] = k+1; -- G[l++] = k+Times-j+2; -- G[l++] = k+Times-j+1; -+ IntegrationPoint &ip = RG->RefPts.IntPoint(i); -+ ip.x = cp[i]; -+ } 
-+ Array &G = RG->RefGeoms; -+ for (i = 0; i < Times; i++) -+ { -+ G[2*i+0] = i; -+ G[2*i+1] = i+1; - } -- } -- Array &E = RG->RefEdges; -- int lb = 0, li = 2*RG->NumBdrEdges; -- // horizontal edges -- for (k = 0; k < Times; k += Times/ETimes) -- { -- int < = (k == 0) ? lb : li; -- j = k*(Times+1)-((k-1)*k)/2; -- for (i = 0; i < Times-k; i++) -- { -- E[lt++] = j; j++; -- E[lt++] = j; -- } -- } -- // diagonal edges -- for (k = Times; k > 0; k -= Times/ETimes) -- { -- int < = (k == Times) ? lb : li; -- j = k; -- for (i = 0; i < k; i++) -- { -- E[lt++] = j; j += Times-i; -- E[lt++] = j; -- } -- } -- // vertical edges -- for (k = 0; k < Times; k += Times/ETimes) -- { -- int < = (k == 0) ? lb : li; -- j = k; -- for (i = 0; i < Times-k; i++) -- { -- E[lt++] = j; j += Times-i+1; -- E[lt++] = j; -- } -- } -- -- RGeom[Geometry::TRIANGLE].Append(RG); -- return RG; -- } - -- case Geometry::SQUARE: -- { -- RG = new RefinedGeometry((Times+1)*(Times+1), 4*Times*Times, -- 4*(ETimes+1)*Times, 4*Times); -- RG->Times = Times; -- RG->ETimes = ETimes; -- RG->Type = type; -- for (k = j = 0; j <= Times; j++) -- for (i = 0; i <= Times; i++, k++) -- { -- IntegrationPoint &ip = RG->RefPts.IntPoint(k); -- ip.x = cp[i]; -- ip.y = cp[j]; -- } -- Array &G = RG->RefGeoms; -- for (l = k = j = 0; j < Times; j++, k++) -- for (i = 0; i < Times; i++, k++) -- { -- G[l++] = k; -- G[l++] = k+1; -- G[l++] = k+Times+2; -- G[l++] = k+Times+1; -+ RGeom[Geometry::SEGMENT].Append(RG); - } -- Array &E = RG->RefEdges; -- int lb = 0, li = 2*RG->NumBdrEdges; -- // horizontal edges -- for (k = 0; k <= Times; k += Times/ETimes) -- { -- int < = (k == 0 || k == Times) ? lb : li; -- for (i = 0, j = k*(Times+1); i < Times; i++) -- { -- E[lt++] = j; j++; -- E[lt++] = j; -- } -- } -- // vertical edges (in right-to-left order) -- for (k = Times; k >= 0; k -= Times/ETimes) -- { -- int < = (k == Times || k == 0) ? lb : li; -- for (i = 0, j = k; i < Times; i++) -+ break; -+ -+ case Geometry::TRIANGLE: - { -- E[lt++] = j; j += Times+1; -- E[lt++] = j; -- } -- } -+ RG = new RefinedGeometry((Times+1)*(Times+2)/2, 3*Times*Times, -+ 3*Times*(ETimes+1), 3*Times); -+ RG->Times = Times; -+ RG->ETimes = ETimes; -+ RG->Type = Type; -+ for (k = j = 0; j <= Times; j++) -+ { -+ for (i = 0; i <= Times-j; i++, k++) -+ { -+ IntegrationPoint &ip = RG->RefPts.IntPoint(k); -+ ip.x = cp[i]/(cp[i] + cp[j] + cp[Times-i-j]); -+ ip.y = cp[j]/(cp[i] + cp[j] + cp[Times-i-j]); -+ } -+ } -+ Array &G = RG->RefGeoms; -+ for (l = k = j = 0; j < Times; j++, k++) -+ { -+ for (i = 0; i < Times-j; i++, k++) -+ { -+ G[l++] = k; -+ G[l++] = k+1; -+ G[l++] = k+Times-j+1; -+ if (i+j+1 < Times) -+ { -+ G[l++] = k+1; -+ G[l++] = k+Times-j+2; -+ G[l++] = k+Times-j+1; -+ } -+ } -+ } -+ Array &E = RG->RefEdges; -+ int lb = 0, li = 2*RG->NumBdrEdges; -+ // horizontal edges -+ for (k = 0; k < Times; k += Times/ETimes) -+ { -+ int < = (k == 0) ? lb : li; -+ j = k*(Times+1)-((k-1)*k)/2; -+ for (i = 0; i < Times-k; i++) -+ { -+ E[lt++] = j; j++; -+ E[lt++] = j; -+ } -+ } -+ // diagonal edges -+ for (k = Times; k > 0; k -= Times/ETimes) -+ { -+ int < = (k == Times) ? lb : li; -+ j = k; -+ for (i = 0; i < k; i++) -+ { -+ E[lt++] = j; j += Times-i; -+ E[lt++] = j; -+ } -+ } -+ // vertical edges -+ for (k = 0; k < Times; k += Times/ETimes) -+ { -+ int < = (k == 0) ? 
lb : li; -+ j = k; -+ for (i = 0; i < Times-k; i++) -+ { -+ E[lt++] = j; j += Times-i+1; -+ E[lt++] = j; -+ } -+ } - -- RGeom[Geometry::SQUARE].Append(RG); -- return RG; -- } -+ RGeom[Geometry::TRIANGLE].Append(RG); -+ } -+ break; - -- case Geometry::CUBE: -- { -- RG = new RefinedGeometry ((Times+1)*(Times+1)*(Times+1), -- 8*Times*Times*Times, 0); -- RG->Times = Times; -- RG->ETimes = ETimes; -- RG->Type = type; -- for (l = k = 0; k <= Times; k++) -- for (j = 0; j <= Times; j++) -- for (i = 0; i <= Times; i++, l++) -+ case Geometry::SQUARE: -+ { -+ RG = new RefinedGeometry((Times+1)*(Times+1), 4*Times*Times, -+ 4*(ETimes+1)*Times, 4*Times); -+ RG->Times = Times; -+ RG->ETimes = ETimes; -+ RG->Type = Type; -+ for (k = j = 0; j <= Times; j++) - { -- IntegrationPoint &ip = RG->RefPts.IntPoint(l); -- ip.x = cp[i]; -- ip.y = cp[j]; -- ip.z = cp[k]; -+ for (i = 0; i <= Times; i++, k++) -+ { -+ IntegrationPoint &ip = RG->RefPts.IntPoint(k); -+ ip.x = cp[i]; -+ ip.y = cp[j]; -+ } - } -- Array &G = RG->RefGeoms; -- for (l = k = 0; k < Times; k++) -- for (j = 0; j < Times; j++) -- for (i = 0; i < Times; i++) -+ Array &G = RG->RefGeoms; -+ for (l = k = j = 0; j < Times; j++, k++) - { -- G[l++] = i+0 + (j+0 + (k+0) * (Times+1)) * (Times+1); -- G[l++] = i+1 + (j+0 + (k+0) * (Times+1)) * (Times+1); -- G[l++] = i+1 + (j+1 + (k+0) * (Times+1)) * (Times+1); -- G[l++] = i+0 + (j+1 + (k+0) * (Times+1)) * (Times+1); -- G[l++] = i+0 + (j+0 + (k+1) * (Times+1)) * (Times+1); -- G[l++] = i+1 + (j+0 + (k+1) * (Times+1)) * (Times+1); -- G[l++] = i+1 + (j+1 + (k+1) * (Times+1)) * (Times+1); -- G[l++] = i+0 + (j+1 + (k+1) * (Times+1)) * (Times+1); -+ for (i = 0; i < Times; i++, k++) -+ { -+ G[l++] = k; -+ G[l++] = k+1; -+ G[l++] = k+Times+2; -+ G[l++] = k+Times+1; -+ } -+ } -+ Array &E = RG->RefEdges; -+ int lb = 0, li = 2*RG->NumBdrEdges; -+ // horizontal edges -+ for (k = 0; k <= Times; k += Times/ETimes) -+ { -+ int < = (k == 0 || k == Times) ? lb : li; -+ for (i = 0, j = k*(Times+1); i < Times; i++) -+ { -+ E[lt++] = j; j++; -+ E[lt++] = j; -+ } -+ } -+ // vertical edges (in right-to-left order) -+ for (k = Times; k >= 0; k -= Times/ETimes) -+ { -+ int < = (k == Times || k == 0) ? 
lb : li; -+ for (i = 0, j = k; i < Times; i++) -+ { -+ E[lt++] = j; j += Times+1; -+ E[lt++] = j; -+ } - } - -- RGeom[Geometry::CUBE].Append(RG); -- return RG; -- } -+ RGeom[Geometry::SQUARE].Append(RG); -+ } -+ break; - -- case Geometry::TETRAHEDRON: -- { -- // subdivide the tetrahedron with vertices -- // (0,0,0), (0,0,1), (1,1,1), (0,1,1) -- -- // vertices: 0 <= i <= j <= k <= Times -- // (3-combination with repetitions) -- // number of vertices: (n+3)*(n+2)*(n+1)/6, n = Times -- -- // elements: the vertices are: v1=(i,j,k), v2=v1+u1, v3=v2+u2, v4=v3+u3 -- // where 0 <= i <= j <= k <= n-1 and -- // u1,u2,u3 is a permutation of (1,0,0),(0,1,0),(0,0,1) -- // such that all v2,v3,v4 have non-decreasing components -- // number of elements: n^3 -- -- const int n = Times; -- RG = new RefinedGeometry((n+3)*(n+2)*(n+1)/6, 4*n*n*n, 0); -- RG->Times = Times; -- RG->ETimes = ETimes; -- RG->Type = type; -- // enumerate and define the vertices -- Array vi((n+1)*(n+1)*(n+1)); -- vi = -1; -- m = 0; -- -- // vertices are given in lexicographic ordering on the reference -- // element -- for (int kk = 0; kk <= n; kk++) -- for (int jj = 0; jj <= n-kk; jj++) -- for (int ii = 0; ii <= n-jj-kk; ii++) -+ case Geometry::CUBE: -+ { -+ RG = new RefinedGeometry ((Times+1)*(Times+1)*(Times+1), -+ 8*Times*Times*Times, 0); -+ RG->Times = Times; -+ RG->ETimes = ETimes; -+ RG->Type = Type; -+ for (l = k = 0; k <= Times; k++) -+ { -+ for (j = 0; j <= Times; j++) -+ { -+ for (i = 0; i <= Times; i++, l++) -+ { -+ IntegrationPoint &ip = RG->RefPts.IntPoint(l); -+ ip.x = cp[i]; -+ ip.y = cp[j]; -+ ip.z = cp[k]; -+ } -+ } -+ } -+ Array &G = RG->RefGeoms; -+ for (l = k = 0; k < Times; k++) - { -- IntegrationPoint &ip = RG->RefPts.IntPoint(m); -- double w = cp[ii] + cp[jj] + cp[kk] + cp[Times-ii-jj-kk]; -- ip.x = cp[ii]/w; -- ip.y = cp[jj]/w; -- ip.z = cp[kk]/w; -- // (ii,jj,kk) are coordinates in the reference tetrahedron, -- // transform to coordinates (i,j,k) in the auxiliary -- // tetrahedron defined by (0,0,0), (0,0,1), (1,1,1), (0,1,1) -- i = jj; -- j = jj+kk; -- k = ii+jj+kk; -- l = i + (j + k * (n+1)) * (n+1); -- // map from linear Cartesian hex index in the auxiliary tet -- // to lexicographic in the reference tet -- vi[l] = m; -- m++; -+ for (j = 0; j < Times; j++) -+ { -+ for (i = 0; i < Times; i++) -+ { -+ G[l++] = i+0 + (j+0 + (k+0) * (Times+1)) * (Times+1); -+ G[l++] = i+1 + (j+0 + (k+0) * (Times+1)) * (Times+1); -+ G[l++] = i+1 + (j+1 + (k+0) * (Times+1)) * (Times+1); -+ G[l++] = i+0 + (j+1 + (k+0) * (Times+1)) * (Times+1); -+ G[l++] = i+0 + (j+0 + (k+1) * (Times+1)) * (Times+1); -+ G[l++] = i+1 + (j+0 + (k+1) * (Times+1)) * (Times+1); -+ G[l++] = i+1 + (j+1 + (k+1) * (Times+1)) * (Times+1); -+ G[l++] = i+0 + (j+1 + (k+1) * (Times+1)) * (Times+1); -+ } -+ } - } - -- if (m != (n+3)*(n+2)*(n+1)/6) -- { -- mfem_error("GeometryRefiner::Refine() for TETRAHEDRON #1"); -- } -- // elements -- Array &G = RG->RefGeoms; -- m = 0; -- for (k = 0; k < n; k++) -- for (j = 0; j <= k; j++) -- for (i = 0; i <= j; i++) -+ RGeom[Geometry::CUBE].Append(RG); -+ } -+ break; -+ -+ case Geometry::TETRAHEDRON: -+ { -+ // subdivide the tetrahedron with vertices -+ // (0,0,0), (0,0,1), (1,1,1), (0,1,1) -+ -+ // vertices: 0 <= i <= j <= k <= Times -+ // (3-combination with repetitions) -+ // number of vertices: (n+3)*(n+2)*(n+1)/6, n = Times -+ -+ // elements: the vertices are: v1=(i,j,k), v2=v1+u1, v3=v2+u2, v4=v3+u3 -+ // where 0 <= i <= j <= k <= n-1 and -+ // u1,u2,u3 is a permutation of (1,0,0),(0,1,0),(0,0,1) -+ // such that all 
v2,v3,v4 have non-decreasing components -+ // number of elements: n^3 -+ -+ const int n = Times; -+ RG = new RefinedGeometry((n+3)*(n+2)*(n+1)/6, 4*n*n*n, 0); -+ RG->Times = Times; -+ RG->ETimes = ETimes; -+ RG->Type = Type; -+ // enumerate and define the vertices -+ Array vi((n+1)*(n+1)*(n+1)); -+ vi = -1; -+ m = 0; -+ -+ // vertices are given in lexicographic ordering on the reference -+ // element -+ for (int kk = 0; kk <= n; kk++) - { -- // the ordering of the vertices is chosen to ensure: -- // 1) correct orientation -- // 2) the x,y,z edges are in the set of edges -- // {(0,1),(2,3), (0,2),(1,3)} -- // (goal is to ensure that subsequent refinement using -- // this procedure preserves the six tetrahedral shapes) -- -- // zyx: (i,j,k)-(i,j,k+1)-(i+1,j+1,k+1)-(i,j+1,k+1) -- G[m++] = vi[i+0 + (j+0 + (k+0) * (n+1)) * (n+1)]; -- G[m++] = vi[i+0 + (j+0 + (k+1) * (n+1)) * (n+1)]; -- G[m++] = vi[i+1 + (j+1 + (k+1) * (n+1)) * (n+1)]; -- G[m++] = vi[i+0 + (j+1 + (k+1) * (n+1)) * (n+1)]; -- if (j < k) -+ for (int jj = 0; jj <= n-kk; jj++) - { -- // yzx: (i,j,k)-(i+1,j+1,k+1)-(i,j+1,k)-(i,j+1,k+1) -- G[m++] = vi[i+0 + (j+0 + (k+0) * (n+1)) * (n+1)]; -- G[m++] = vi[i+1 + (j+1 + (k+1) * (n+1)) * (n+1)]; -- G[m++] = vi[i+0 + (j+1 + (k+0) * (n+1)) * (n+1)]; -- G[m++] = vi[i+0 + (j+1 + (k+1) * (n+1)) * (n+1)]; -- // yxz: (i,j,k)-(i,j+1,k)-(i+1,j+1,k+1)-(i+1,j+1,k) -- G[m++] = vi[i+0 + (j+0 + (k+0) * (n+1)) * (n+1)]; -- G[m++] = vi[i+0 + (j+1 + (k+0) * (n+1)) * (n+1)]; -- G[m++] = vi[i+1 + (j+1 + (k+1) * (n+1)) * (n+1)]; -- G[m++] = vi[i+1 + (j+1 + (k+0) * (n+1)) * (n+1)]; -+ for (int ii = 0; ii <= n-jj-kk; ii++) -+ { -+ IntegrationPoint &ip = RG->RefPts.IntPoint(m); -+ double w = cp[ii] + cp[jj] + cp[kk] + cp[Times-ii-jj-kk]; -+ ip.x = cp[ii]/w; -+ ip.y = cp[jj]/w; -+ ip.z = cp[kk]/w; -+ // (ii,jj,kk) are coordinates in the reference tetrahedron, -+ // transform to coordinates (i,j,k) in the auxiliary -+ // tetrahedron defined by (0,0,0), (0,0,1), (1,1,1), (0,1,1) -+ i = jj; -+ j = jj+kk; -+ k = ii+jj+kk; -+ l = i + (j + k * (n+1)) * (n+1); -+ // map from linear Cartesian hex index in the auxiliary tet -+ // to lexicographic in the reference tet -+ vi[l] = m; -+ m++; -+ } - } -- if (i < j) -+ } -+ -+ if (m != (n+3)*(n+2)*(n+1)/6) -+ { -+ MFEM_ABORT("GeometryRefiner::Refine() for TETRAHEDRON #1"); -+ } -+ // elements -+ Array &G = RG->RefGeoms; -+ m = 0; -+ for (k = 0; k < n; k++) -+ { -+ for (j = 0; j <= k; j++) - { -- // xzy: (i,j,k)-(i+1,j,k)-(i+1,j+1,k+1)-(i+1,j,k+1) -- G[m++] = vi[i+0 + (j+0 + (k+0) * (n+1)) * (n+1)]; -- G[m++] = vi[i+1 + (j+0 + (k+0) * (n+1)) * (n+1)]; -- G[m++] = vi[i+1 + (j+1 + (k+1) * (n+1)) * (n+1)]; -- G[m++] = vi[i+1 + (j+0 + (k+1) * (n+1)) * (n+1)]; -- if (j < k) -+ for (i = 0; i <= j; i++) - { -- // xyz: (i,j,k)-(i+1,j+1,k+1)-(i+1,j,k)-(i+1,j+1,k) -+ // the ordering of the vertices is chosen to ensure: -+ // 1) correct orientation -+ // 2) the x,y,z edges are in the set of edges -+ // {(0,1),(2,3), (0,2),(1,3)} -+ // (goal is to ensure that subsequent refinement using -+ // this procedure preserves the six tetrahedral shapes) -+ -+ // zyx: (i,j,k)-(i,j,k+1)-(i+1,j+1,k+1)-(i,j+1,k+1) - G[m++] = vi[i+0 + (j+0 + (k+0) * (n+1)) * (n+1)]; -+ G[m++] = vi[i+0 + (j+0 + (k+1) * (n+1)) * (n+1)]; - G[m++] = vi[i+1 + (j+1 + (k+1) * (n+1)) * (n+1)]; -- G[m++] = vi[i+1 + (j+0 + (k+0) * (n+1)) * (n+1)]; -- G[m++] = vi[i+1 + (j+1 + (k+0) * (n+1)) * (n+1)]; -+ G[m++] = vi[i+0 + (j+1 + (k+1) * (n+1)) * (n+1)]; -+ if (j < k) -+ { -+ // yzx: (i,j,k)-(i+1,j+1,k+1)-(i,j+1,k)-(i,j+1,k+1) -+ 
G[m++] = vi[i+0 + (j+0 + (k+0) * (n+1)) * (n+1)]; -+ G[m++] = vi[i+1 + (j+1 + (k+1) * (n+1)) * (n+1)]; -+ G[m++] = vi[i+0 + (j+1 + (k+0) * (n+1)) * (n+1)]; -+ G[m++] = vi[i+0 + (j+1 + (k+1) * (n+1)) * (n+1)]; -+ // yxz: (i,j,k)-(i,j+1,k)-(i+1,j+1,k+1)-(i+1,j+1,k) -+ G[m++] = vi[i+0 + (j+0 + (k+0) * (n+1)) * (n+1)]; -+ G[m++] = vi[i+0 + (j+1 + (k+0) * (n+1)) * (n+1)]; -+ G[m++] = vi[i+1 + (j+1 + (k+1) * (n+1)) * (n+1)]; -+ G[m++] = vi[i+1 + (j+1 + (k+0) * (n+1)) * (n+1)]; -+ } -+ if (i < j) -+ { -+ // xzy: (i,j,k)-(i+1,j,k)-(i+1,j+1,k+1)-(i+1,j,k+1) -+ G[m++] = vi[i+0 + (j+0 + (k+0) * (n+1)) * (n+1)]; -+ G[m++] = vi[i+1 + (j+0 + (k+0) * (n+1)) * (n+1)]; -+ G[m++] = vi[i+1 + (j+1 + (k+1) * (n+1)) * (n+1)]; -+ G[m++] = vi[i+1 + (j+0 + (k+1) * (n+1)) * (n+1)]; -+ if (j < k) -+ { -+ // xyz: (i,j,k)-(i+1,j+1,k+1)-(i+1,j,k)-(i+1,j+1,k) -+ G[m++] = vi[i+0 + (j+0 + (k+0) * (n+1)) * (n+1)]; -+ G[m++] = vi[i+1 + (j+1 + (k+1) * (n+1)) * (n+1)]; -+ G[m++] = vi[i+1 + (j+0 + (k+0) * (n+1)) * (n+1)]; -+ G[m++] = vi[i+1 + (j+1 + (k+0) * (n+1)) * (n+1)]; -+ } -+ // zxy: (i,j,k)-(i+1,j+1,k+1)-(i,j,k+1)-(i+1,j,k+1) -+ G[m++] = vi[i+0 + (j+0 + (k+0) * (n+1)) * (n+1)]; -+ G[m++] = vi[i+1 + (j+1 + (k+1) * (n+1)) * (n+1)]; -+ G[m++] = vi[i+0 + (j+0 + (k+1) * (n+1)) * (n+1)]; -+ G[m++] = vi[i+1 + (j+0 + (k+1) * (n+1)) * (n+1)]; -+ } - } -- // zxy: (i,j,k)-(i+1,j+1,k+1)-(i,j,k+1)-(i+1,j,k+1) -- G[m++] = vi[i+0 + (j+0 + (k+0) * (n+1)) * (n+1)]; -- G[m++] = vi[i+1 + (j+1 + (k+1) * (n+1)) * (n+1)]; -- G[m++] = vi[i+0 + (j+0 + (k+1) * (n+1)) * (n+1)]; -- G[m++] = vi[i+1 + (j+0 + (k+1) * (n+1)) * (n+1)]; - } - } -- if (m != 4*n*n*n) -- { -- mfem_error("GeometryRefiner::Refine() for TETRAHEDRON #2"); -- } -- for (i = 0; i < m; i++) -- if (G[i] < 0) -- { -- mfem_error("GeometryRefiner::Refine() for TETRAHEDRON #3"); -- } -+ if (m != 4*n*n*n) -+ { -+ MFEM_ABORT("GeometryRefiner::Refine() for TETRAHEDRON #2"); -+ } -+ for (i = 0; i < m; i++) -+ { -+ if (G[i] < 0) -+ { -+ MFEM_ABORT("GeometryRefiner::Refine() for TETRAHEDRON #3"); -+ } -+ } - -- RGeom[Geometry::TETRAHEDRON].Append(RG); -- return RG; -- } -+ RGeom[Geometry::TETRAHEDRON].Append(RG); -+ } -+ break; - -- case Geometry::PYRAMID: -- { -- const int n = Times; -- RG = new RefinedGeometry ((n+1)*(n+2)*(2*n+3)/6, -- 5*n*(2*n-1)*(2*n+1)/3, 0); -- RG->Times = Times; -- RG->ETimes = ETimes; -- RG->Type = type; -- // enumerate and define the vertices -- m = 0; -- for (k = 0; k <= n; k++) -- { -- const double *cpij = -- poly1d.GetPoints(Times - k, BasisType::GetNodalBasis(type)); -- for (j = 0; j <= n - k; j++) -- for (i = 0; i <= n - k; i++) -+ case Geometry::PYRAMID: -+ { -+ const int n = Times; -+ RG = new RefinedGeometry ((n+1)*(n+2)*(2*n+3)/6, -+ 5*n*(2*n-1)*(2*n+1)/3, 0); -+ RG->Times = Times; -+ RG->ETimes = ETimes; -+ RG->Type = Type; -+ // enumerate and define the vertices -+ m = 0; -+ for (k = 0; k <= n; k++) - { -- IntegrationPoint &ip = RG->RefPts.IntPoint(m); -- if (type == 0) -+ const double *cpij = -+ poly1d.GetPoints(Times - k, BasisType::GetNodalBasis(Type)); -+ for (j = 0; j <= n - k; j++) - { -- ip.x = (n > k) ? (double(i) / (n - k)) : 0.0; -- ip.y = (n > k) ? (double(j) / (n - k)) : 0.0; -- ip.z = double(k) / n; -+ for (i = 0; i <= n - k; i++) -+ { -+ IntegrationPoint &ip = RG->RefPts.IntPoint(m); -+ if (Type == 0) -+ { -+ ip.x = (n > k) ? (double(i) / (n - k)) : 0.0; -+ ip.y = (n > k) ? 
(double(j) / (n - k)) : 0.0; -+ ip.z = double(k) / n; -+ } -+ else -+ { -+ ip.x = cpij[i] * (1.0 - cp[k]); -+ ip.y = cpij[j] * (1.0 - cp[k]); -+ ip.z = cp[k]; -+ } -+ m++; -+ } - } -- else -+ } -+ if (m != (n+1)*(n+2)*(2*n+3)/6) -+ { -+ MFEM_ABORT("GeometryRefiner::Refine() for PYRAMID #1"); -+ } -+ // elements -+ Array &G = RG->RefGeoms; -+ m = 0; -+ for (k = 0; k < n; k++) -+ { -+ int lk = k * (k * (2 * k - 6 * n - 9) + 6 * n * (n + 3) + 13) / 6; -+ int lkp1 = (k + 1) * -+ (k * (2 * k - 6 * n -5) + 6 * n * (n + 2) + 6) / 6; -+ for (j = 0; j < n - k; j++) - { -- ip.x = cpij[i] * (1.0 - cp[k]); -- ip.y = cpij[j] * (1.0 - cp[k]); -- ip.z = cp[k]; -+ for (i = 0; i < n - k; i++) -+ { -+ G[m++] = lk + j * (n - k + 1) + i; -+ G[m++] = lk + j * (n - k + 1) + i + 1; -+ G[m++] = lk + (j + 1) * (n - k + 1) + i + 1; -+ G[m++] = lk + (j + 1) * (n - k + 1) + i; -+ G[m++] = lkp1 + j * (n - k) + i; -+ } -+ } -+ for (j = 0; j < n - k - 1; j++) -+ { -+ for (i = 0; i < n - k - 1; i++) -+ { -+ G[m++] = lkp1 + j * (n - k) + i; -+ G[m++] = lkp1 + (j + 1) * (n - k) + i; -+ G[m++] = lkp1 + (j + 1) * (n - k) + i + 1; -+ G[m++] = lkp1 + j * (n - k) + i + 1; -+ G[m++] = lk + (j + 1) * (n - k + 1) + i + 1; -+ } -+ } -+ for (j = 0; j < n - k; j++) -+ { -+ for (i = 0; i < n - k - 1; i++) -+ { -+ G[m++] = lk + j * (n - k + 1) + i + 1; -+ G[m++] = lk + (j + 1) * (n - k + 1) + i + 1; -+ G[m++] = lkp1 + j * (n - k) + i; -+ G[m++] = lkp1 + j * (n - k) + i + 1; -+ G[m++] = -1; -+ } -+ } -+ for (j = 0; j < n - k - 1; j++) -+ { -+ for (i = 0; i < n - k; i++) -+ { -+ G[m++] = lk + (j + 1) * (n - k + 1) + i; -+ G[m++] = lk + (j + 1) * (n - k + 1) + i + 1; -+ G[m++] = lkp1 + (j + 1) * (n - k) + i; -+ G[m++] = lkp1 + j * (n - k) + i; -+ G[m++] = -1; -+ } - } -- m++; - } -- } -- if (m != (n+1)*(n+2)*(2*n+3)/6) -- { -- mfem_error("GeometryRefiner::Refine() for PYRAMID #1"); -- } -- // elements -- Array &G = RG->RefGeoms; -- m = 0; -- for (k = 0; k < n; k++) -- { -- int lk = k * (k * (2 * k - 6 * n - 9) + 6 * n * (n + 3) + 13) / 6; -- int lkp1 = (k + 1) * -- (k * (2 * k - 6 * n -5) + 6 * n * (n + 2) + 6) / 6; -- for (j = 0; j < n - k; j++) -- { -- for (i = 0; i < n - k; i++) -+ if (m != 5*n*(2*n-1)*(2*n+1)/3) - { -- G[m++] = lk + j * (n - k + 1) + i; -- G[m++] = lk + j * (n - k + 1) + i + 1; -- G[m++] = lk + (j + 1) * (n - k + 1) + i + 1; -- G[m++] = lk + (j + 1) * (n - k + 1) + i; -- G[m++] = lkp1 + j * (n - k) + i; -+ MFEM_ABORT("GeometryRefiner::Refine() for PYRAMID #2"); - } -+ -+ RGeom[Geometry::PYRAMID].Append(RG); - } -- for (j = 0; j < n - k - 1; j++) -+ break; -+ -+ case Geometry::PRISM: - { -- for (i = 0; i < n - k - 1; i++) -+ const int n = Times; -+ RG = new RefinedGeometry ((n+1)*(n+1)*(n+2)/2, 6*n*n*n, 0); -+ RG->Times = Times; -+ RG->ETimes = ETimes; -+ RG->Type = Type; -+ // enumerate and define the vertices -+ m = 0; -+ for (l = k = 0; k <= n; k++) - { -- G[m++] = lkp1 + j * (n - k) + i; -- G[m++] = lkp1 + (j + 1) * (n - k) + i; -- G[m++] = lkp1 + (j + 1) * (n - k) + i + 1; -- G[m++] = lkp1 + j * (n - k) + i + 1; -- G[m++] = lk + (j + 1) * (n - k + 1) + i + 1; -+ for (j = 0; j <= n; j++) -+ { -+ for (i = 0; i <= n-j; i++, l++) -+ { -+ IntegrationPoint &ip = RG->RefPts.IntPoint(l); -+ ip.x = cp[i]/(cp[i] + cp[j] + cp[n-i-j]); -+ ip.y = cp[j]/(cp[i] + cp[j] + cp[n-i-j]); -+ ip.z = cp[k]; -+ m++; -+ } -+ } - } -- } -- for (j = 0; j < n - k; j++) -- { -- for (i = 0; i < n - k - 1; i++) -+ if (m != (n+1)*(n+1)*(n+2)/2) - { -- G[m++] = lk + j * (n - k + 1) + i + 1; -- G[m++] = lk + (j + 1) * (n - k + 1) + i + 1; -- 
G[m++] = lkp1 + j * (n - k) + i; -- G[m++] = lkp1 + j * (n - k) + i + 1; -- G[m++] = -1; -+ MFEM_ABORT("GeometryRefiner::Refine() for PRISM #1"); - } -- } -- for (j = 0; j < n - k - 1; j++) -- { -- for (i = 0; i < n - k; i++) -+ // elements -+ Array &G = RG->RefGeoms; -+ m = 0; -+ for (m = k = 0; k < n; k++) - { -- G[m++] = lk + (j + 1) * (n - k + 1) + i; -- G[m++] = lk + (j + 1) * (n - k + 1) + i + 1; -- G[m++] = lkp1 + (j + 1) * (n - k) + i; -- G[m++] = lkp1 + j * (n - k) + i; -- G[m++] = -1; -+ for (l = j = 0; j < n; j++, l++) -+ { -+ for (i = 0; i < n-j; i++, l++) -+ { -+ G[m++] = l + (k+0) * (n+1) * (n+2) / 2; -+ G[m++] = l + 1 + (k+0) * (n+1) * (n+2) / 2; -+ G[m++] = l - j + (2 + (k+0) * (n+2)) * (n+1) / 2; -+ G[m++] = l + (k+1) * (n+1) * (n+2) / 2; -+ G[m++] = l + 1 + (k+1) * (n+1) * (Times+2) / 2; -+ G[m++] = l - j + (2 + (k+1) * (n+2)) * (n+1) / 2; -+ if (i+j+1 < n) -+ { -+ G[m++] = l + 1 + (k+0) * (n+1) * (n+2)/2; -+ G[m++] = l - j + (2 + (k+0) * (n+1)) * (n+2) / 2; -+ G[m++] = l - j + (2 + (k+0) * (n+2)) * (n+1) / 2; -+ G[m++] = l + 1 + (k+1) * (n+1) * (n+2) / 2; -+ G[m++] = l - j + (2 + (k+1) * (n+1)) * (n+2) / 2; -+ G[m++] = l - j + (2 + (k+1) * (n+2)) * (n+1) / 2; -+ } -+ } -+ } - } -- } -- } -- if (m != 5*n*(2*n-1)*(2*n+1)/3) -- { -- mfem_error("GeometryRefiner::Refine() for PYRAMID #2"); -- } -- RGeom[Geometry::PYRAMID].Append(RG); -- return RG; -- } -- -- case Geometry::PRISM: -- { -- const int n = Times; -- RG = new RefinedGeometry ((n+1)*(n+1)*(n+2)/2, 6*n*n*n, 0); -- RG->Times = Times; -- RG->ETimes = ETimes; -- RG->Type = type; -- // enumerate and define the vertices -- m = 0; -- for (l = k = 0; k <= n; k++) -- for (j = 0; j <= n; j++) -- for (i = 0; i <= n-j; i++, l++) -+ if (m != 6*n*n*n) - { -- IntegrationPoint &ip = RG->RefPts.IntPoint(l); -- ip.x = cp[i]/(cp[i] + cp[j] + cp[n-i-j]); -- ip.y = cp[j]/(cp[i] + cp[j] + cp[n-i-j]); -- ip.z = cp[k]; -- m++; -+ MFEM_ABORT("GeometryRefiner::Refine() for PRISM #2"); - } -- if (m != (n+1)*(n+1)*(n+2)/2) -- { -- mfem_error("GeometryRefiner::Refine() for PRISM #1"); -- } -- // elements -- Array &G = RG->RefGeoms; -- m = 0; -- for (m = k = 0; k < n; k++) -- for (l = j = 0; j < n; j++, l++) -- for (i = 0; i < n-j; i++, l++) -+ for (i = 0; i < m; i++) - { -- G[m++] = l + (k+0) * (n+1) * (n+2) / 2; -- G[m++] = l + 1 + (k+0) * (n+1) * (n+2) / 2; -- G[m++] = l - j + (2 + (k+0) * (n+2)) * (n+1) / 2; -- G[m++] = l + (k+1) * (n+1) * (n+2) / 2; -- G[m++] = l + 1 + (k+1) * (n+1) * (Times+2) / 2; -- G[m++] = l - j + (2 + (k+1) * (n+2)) * (n+1) / 2; -- if (i+j+1 < n) -+ if (G[i] < 0) - { -- G[m++] = l + 1 + (k+0) * (n+1) * (n+2)/2; -- G[m++] = l - j + (2 + (k+0) * (n+1)) * (n+2) / 2; -- G[m++] = l - j + (2 + (k+0) * (n+2)) * (n+1) / 2; -- G[m++] = l + 1 + (k+1) * (n+1) * (n+2) / 2; -- G[m++] = l - j + (2 + (k+1) * (n+1)) * (n+2) / 2; -- G[m++] = l - j + (2 + (k+1) * (n+2)) * (n+1) / 2; -+ MFEM_ABORT("GeometryRefiner::Refine() for PRISM #3"); - } - } -- if (m != 6*n*n*n) -- { -- mfem_error("GeometryRefiner::Refine() for PRISM #2"); -- } -- for (i = 0; i < m; i++) -- if (G[i] < 0) -- { -- mfem_error("GeometryRefiner::Refine() for PRISM #3"); -+ -+ RGeom[Geometry::PRISM].Append(RG); - } -+ break; - -- RGeom[Geometry::PRISM].Append(RG); -- return RG; -+ case Geometry::INVALID: -+ case Geometry::NUM_GEOMETRIES: -+ MFEM_ABORT("Unknown type of reference element!"); -+ } - } -- -- default: -- -- return NULL; - } -+ -+ return RG; - } - - const IntegrationRule *GeometryRefiner::RefineInterior(Geometry::Type Geom, -@@ -1596,15 +1650,23 @@ const 
IntegrationRule *GeometryRefiner::RefineInterior(Geometry::Type Geom, - { - return NULL; - } -- ir = FindInIntPts(Geom, Times-1); -- if (ir) { return ir; } -- -- ir = new IntegrationRule(Times-1); -- for (int i = 1; i < Times; i++) -+#if defined(MFEM_THREAD_SAFE) && defined(MFEM_USE_OPENMP) -+ #pragma omp critical (RefineInterior) -+#endif - { -- IntegrationPoint &ip = ir->IntPoint(i-1); -- ip.x = double(i) / Times; -- ip.y = ip.z = 0.0; -+ ir = FindInIntPts(Geometry::SEGMENT, Times-1); -+ if (!ir) -+ { -+ ir = new IntegrationRule(Times-1); -+ for (int i = 1; i < Times; i++) -+ { -+ IntegrationPoint &ip = ir->IntPoint(i-1); -+ ip.x = double(i) / Times; -+ ip.y = ip.z = 0.0; -+ } -+ -+ IntPts[Geometry::SEGMENT].Append(ir); -+ } - } - } - break; -@@ -1615,18 +1677,28 @@ const IntegrationRule *GeometryRefiner::RefineInterior(Geometry::Type Geom, - { - return NULL; - } -- ir = FindInIntPts(Geom, ((Times-1)*(Times-2))/2); -- if (ir) { return ir; } -- -- ir = new IntegrationRule(((Times-1)*(Times-2))/2); -- for (int k = 0, j = 1; j < Times-1; j++) -- for (int i = 1; i < Times-j; i++, k++) -+#if defined(MFEM_THREAD_SAFE) && defined(MFEM_USE_OPENMP) -+ #pragma omp critical (RefineInterior) -+#endif -+ { -+ ir = FindInIntPts(Geometry::TRIANGLE, ((Times-1)*(Times-2))/2); -+ if (!ir) - { -- IntegrationPoint &ip = ir->IntPoint(k); -- ip.x = double(i) / Times; -- ip.y = double(j) / Times; -- ip.z = 0.0; -+ ir = new IntegrationRule(((Times-1)*(Times-2))/2); -+ for (int k = 0, j = 1; j < Times-1; j++) -+ { -+ for (int i = 1; i < Times-j; i++, k++) -+ { -+ IntegrationPoint &ip = ir->IntPoint(k); -+ ip.x = double(i) / Times; -+ ip.y = double(j) / Times; -+ ip.z = 0.0; -+ } -+ } -+ -+ IntPts[Geometry::TRIANGLE].Append(ir); - } -+ } - } - break; - -@@ -1636,32 +1708,46 @@ const IntegrationRule *GeometryRefiner::RefineInterior(Geometry::Type Geom, - { - return NULL; - } -- ir = FindInIntPts(Geom, (Times-1)*(Times-1)); -- if (ir) { return ir; } -- -- ir = new IntegrationRule((Times-1)*(Times-1)); -- for (int k = 0, j = 1; j < Times; j++) -- for (int i = 1; i < Times; i++, k++) -+#if defined(MFEM_THREAD_SAFE) && defined(MFEM_USE_OPENMP) -+ #pragma omp critical (RefineInterior) -+#endif -+ { -+ ir = FindInIntPts(Geometry::SQUARE, (Times-1)*(Times-1)); -+ if (!ir) - { -- IntegrationPoint &ip = ir->IntPoint(k); -- ip.x = double(i) / Times; -- ip.y = double(j) / Times; -- ip.z = 0.0; -+ ir = new IntegrationRule((Times-1)*(Times-1)); -+ for (int k = 0, j = 1; j < Times; j++) -+ { -+ for (int i = 1; i < Times; i++, k++) -+ { -+ IntegrationPoint &ip = ir->IntPoint(k); -+ ip.x = double(i) / Times; -+ ip.y = double(j) / Times; -+ ip.z = 0.0; -+ } -+ } -+ -+ IntPts[Geometry::SQUARE].Append(ir); - } -+ } - } - break; - -- default: -- mfem_error("GeometryRefiner::RefineInterior(...)"); -+ case Geometry::POINT: -+ case Geometry::TETRAHEDRON: -+ case Geometry::CUBE: -+ case Geometry::PYRAMID: -+ case Geometry::PRISM: -+ MFEM_ABORT("Reference element type is not supported!"); -+ case Geometry::INVALID: -+ case Geometry::NUM_GEOMETRIES: -+ MFEM_ABORT("Unknown type of reference element!"); - } - -- MFEM_ASSERT(ir != NULL, "Failed to construct the refined IntegrationRule."); -- IntPts[Geom].Append(ir); -- - return ir; - } - -- -+// static method - int GeometryRefiner::GetRefinementLevelFromPoints(Geometry::Type geom, int Npts) - { - switch (geom) -@@ -1719,16 +1805,17 @@ int GeometryRefiner::GetRefinementLevelFromPoints(Geometry::Type geom, int Npts) - } - return -1; - } -- default: -- { -- mfem_error("Non existing Geometry."); 
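// A minimal standalone sketch of the find-or-create-under-critical pattern that the
// RefineInterior() hunks above use, assuming OpenMP is enabled; Rule, GetOrBuildRule and
// rule_cache are hypothetical stand-ins, not MFEM API.
#include <map>
#include <vector>

struct Rule { std::vector<double> pts; };     // stand-in for IntegrationRule

static std::map<int, Rule *> rule_cache;      // stand-in for IntPts[Geometry::SEGMENT]

Rule *GetOrBuildRule(int Times)
{
   Rule *ir = nullptr;
#if defined(_OPENMP)
   #pragma omp critical (RefineInterior)      // same name as above: one shared section
#endif
   {
      auto it = rule_cache.find(Times);
      if (it != rule_cache.end()) { ir = it->second; }
      else
      {
         ir = new Rule;
         for (int i = 1; i < Times; i++) { ir->pts.push_back(double(i) / Times); }
         rule_cache[Times] = ir;               // publish while still inside the section
      }
   }
   return ir;                                  // single exit point after the block
}
// Branching out of an OpenMP structured block is not allowed, which is presumably why the
// guarded code above assigns to "ir" and falls through instead of keeping the early
// "return ir;" of the unguarded version.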
-- } -+ case Geometry::PYRAMID: -+ MFEM_ABORT("Reference element type is not supported!"); -+ case Geometry::INVALID: -+ case Geometry::NUM_GEOMETRIES: -+ MFEM_ABORT("Unknown type of reference element!"); - } - - return -1; - } - -- -+// static method - int GeometryRefiner::GetRefinementLevelFromElems(Geometry::Type geom, int Nels) - { - switch (geom) -@@ -1760,10 +1847,11 @@ int GeometryRefiner::GetRefinementLevelFromElems(Geometry::Type geom, int Nels) - } - return -1; - } -- default: -- { -- mfem_error("Non existing Geometry."); -- } -+ case Geometry::PYRAMID: -+ MFEM_ABORT("Reference element type is not supported!"); -+ case Geometry::INVALID: -+ case Geometry::NUM_GEOMETRIES: -+ MFEM_ABORT("Unknown type of reference element!"); - } - - return -1; -diff --git a/fem/geom.hpp b/fem/geom.hpp -index 698912d97..1290f4e2a 100644 ---- a/fem/geom.hpp -+++ b/fem/geom.hpp -@@ -65,10 +65,10 @@ public: - - /** @brief Return an IntegrationRule consisting of all vertices of the given - Geometry::Type, @a GeomType. */ -- const IntegrationRule *GetVertices(int GeomType); -+ const IntegrationRule *GetVertices(int GeomType) const; - - /// Return the center of the given Geometry::Type, @a GeomType. -- const IntegrationPoint &GetCenter(int GeomType) -+ const IntegrationPoint &GetCenter(int GeomType) const - { return GeomCenter[GeomType]; } - - /// Get a random point in the reference element specified by @a GeomType. -@@ -97,9 +97,9 @@ public: - - const DenseMatrix &GetGeomToPerfGeomJac(int GeomType) const - { return *GeomToPerfGeomJac[GeomType]; } -- DenseMatrix *GetPerfGeomToGeomJac(int GeomType) -- { return PerfGeomToGeomJac[GeomType]; } -- void GetPerfPointMat(int GeomType, DenseMatrix &pm); -+ const DenseMatrix &GetPerfGeomToGeomJac(int GeomType) const -+ { return *PerfGeomToGeomJac[GeomType]; } -+ void GetPerfPointMat(int GeomType, DenseMatrix &pm) const; - void JacToPerfJac(int GeomType, const DenseMatrix &J, - DenseMatrix &PJ) const; - -@@ -123,7 +123,7 @@ public: - } - - /// Return the number of boundary "faces" of a given Geometry::Type. -- int NumBdr(int GeomType) { return NumBdrArray[GeomType]; } -+ int NumBdr(int GeomType) const { return NumBdrArray[GeomType]; } - }; - - template <> struct -@@ -317,27 +317,27 @@ public: - int Type; - - RefinedGeometry(int NPts, int NRefG, int NRefE, int NBdrE = 0) : -- RefPts(NPts), RefGeoms(NRefG), RefEdges(NRefE), NumBdrEdges(NBdrE) { } -+ RefPts(NPts), RefGeoms(NRefG), RefEdges(NRefE), NumBdrEdges(NBdrE) {} - }; - - class GeometryRefiner - { - private: -- int type; // Quadrature1D type (ClosedUniform is default) -+ int Type; // Quadrature1D type (ClosedUniform is default) - Array RGeom[Geometry::NumGeom]; - Array IntPts[Geometry::NumGeom]; - -- RefinedGeometry *FindInRGeom(Geometry::Type Geom, int Times, int ETimes, -- int Type); -- IntegrationRule *FindInIntPts(Geometry::Type Geom, int NPts); -+ RefinedGeometry *FindInRGeom(Geometry::Type Geom, int Times, -+ int ETimes) const; -+ IntegrationRule *FindInIntPts(Geometry::Type Geom, int NPts) const; - - public: -- GeometryRefiner(); -+ GeometryRefiner(int t = Quadrature1D::ClosedUniform) : Type(t) {} - - /// Set the Quadrature1D type of points to use for subdivision. -- void SetType(const int t) { type = t; } -+ void SetType(int t) { Type = t; } - /// Get the Quadrature1D type of points used for subdivision. 
-- int GetType() const { return type; } -+ int GetType() const { return Type; } - - RefinedGeometry *Refine(Geometry::Type Geom, int Times, int ETimes = 1); - -@@ -345,10 +345,10 @@ public: - const IntegrationRule *RefineInterior(Geometry::Type Geom, int Times); - - /// Get the Refinement level based on number of points -- virtual int GetRefinementLevelFromPoints(Geometry::Type Geom, int Npts); -+ static int GetRefinementLevelFromPoints(Geometry::Type Geom, int Npts); - - /// Get the Refinement level based on number of elements -- virtual int GetRefinementLevelFromElems(Geometry::Type geom, int Npts); -+ static int GetRefinementLevelFromElems(Geometry::Type geom, int Npts); - - ~GeometryRefiner(); - }; -diff --git a/fem/intrules.cpp b/fem/intrules.cpp -index 8c5c354e3..cb9544852 100644 ---- a/fem/intrules.cpp -+++ b/fem/intrules.cpp -@@ -737,7 +737,7 @@ void QuadratureFunctions1D::GivePolyPoints(const int np, double *pts, - ClosedGL(np, &ir); - break; - } -- default: -+ case Quadrature1D::Invalid: - { - MFEM_ABORT("Asking for an unknown type of 1D Quadrature points, " - "type = " << type); -@@ -831,7 +831,10 @@ void QuadratureFunctions1D::CalculateUniformWeights(IntegrationRule *ir, - hinv = p+1; - ihoffset = 1; - break; -- default: -+ case Quadrature1D::GaussLegendre: -+ case Quadrature1D::GaussLobatto: -+ case Quadrature1D::ClosedGL: -+ case Quadrature1D::Invalid: - MFEM_ABORT("invalid Quadrature1D type: " << type); - } - // set w0 = (-1)^p*(p!)/(hinv^p) -@@ -940,10 +943,10 @@ IntegrationRules IntRules(0, Quadrature1D::GaussLegendre); - - IntegrationRules RefinedIntRules(1, Quadrature1D::GaussLegendre); - --IntegrationRules::IntegrationRules(int Ref, int type_): -- quad_type(type_) -+IntegrationRules::IntegrationRules(int ref, int type) -+ : quad_type(type) - { -- refined = Ref; -+ refined = ref; - - if (refined < 0) { own_rules = 0; return; } - -@@ -975,11 +978,19 @@ IntegrationRules::IntegrationRules(int Ref, int type_): - - CubeIntRules.SetSize(32, h_mt); - CubeIntRules = NULL; -+ -+#if defined(MFEM_THREAD_SAFE) && defined(MFEM_USE_OPENMP) -+ IntRuleLocks.SetSize(Geometry::NUM_GEOMETRIES, h_mt); -+ for (int i = 0; i < Geometry::NUM_GEOMETRIES; i++) -+ { -+ omp_init_lock(&IntRuleLocks[i]); -+ } -+#endif - } - - const IntegrationRule &IntegrationRules::Get(int GeomType, int Order) - { -- Array *ir_array; -+ Array *ir_array = NULL; - - switch (GeomType) - { -@@ -991,9 +1002,9 @@ const IntegrationRule &IntegrationRules::Get(int GeomType, int Order) - case Geometry::CUBE: ir_array = &CubeIntRules; break; - case Geometry::PRISM: ir_array = &PrismIntRules; break; - case Geometry::PYRAMID: ir_array = &PyramidIntRules; break; -- default: -- mfem_error("IntegrationRules::Get(...) 
: Unknown geometry type!"); -- ir_array = NULL; -+ case Geometry::INVALID: -+ case Geometry::NUM_GEOMETRIES: -+ MFEM_ABORT("Unknown type of reference element!"); - } - - if (Order < 0) -@@ -1001,36 +1012,35 @@ const IntegrationRule &IntegrationRules::Get(int GeomType, int Order) - Order = 0; - } - -+#if defined(MFEM_THREAD_SAFE) && defined(MFEM_USE_OPENMP) -+ omp_set_lock(&IntRuleLocks[GeomType]); -+#endif -+ - if (!HaveIntRule(*ir_array, Order)) - { --#ifdef MFEM_USE_LEGACY_OPENMP -- #pragma omp critical --#endif -- { -- if (!HaveIntRule(*ir_array, Order)) -- { -- IntegrationRule *ir = GenerateIntegrationRule(GeomType, Order); -+ IntegrationRule *ir = GenerateIntegrationRule(GeomType, Order); - #ifdef MFEM_DEBUG -- int RealOrder = Order; -- while (RealOrder+1 < ir_array->Size() && -- (*ir_array)[RealOrder+1] == ir) -- { -- RealOrder++; -- } -- MFEM_VERIFY(RealOrder == ir->GetOrder(), "internal error"); -+ int RealOrder = Order; -+ while (RealOrder+1 < ir_array->Size() && (*ir_array)[RealOrder+1] == ir) -+ { -+ RealOrder++; -+ } -+ MFEM_VERIFY(RealOrder == ir->GetOrder(), "internal error"); - #else -- MFEM_CONTRACT_VAR(ir); -+ MFEM_CONTRACT_VAR(ir); - #endif -- } -- } - } - -+#if defined(MFEM_THREAD_SAFE) && defined(MFEM_USE_OPENMP) -+ omp_unset_lock(&IntRuleLocks[GeomType]); -+#endif -+ - return *(*ir_array)[Order]; - } - - void IntegrationRules::Set(int GeomType, int Order, IntegrationRule &IntRule) - { -- Array *ir_array; -+ Array *ir_array = NULL; - - switch (GeomType) - { -@@ -1042,11 +1052,15 @@ void IntegrationRules::Set(int GeomType, int Order, IntegrationRule &IntRule) - case Geometry::CUBE: ir_array = &CubeIntRules; break; - case Geometry::PRISM: ir_array = &PrismIntRules; break; - case Geometry::PYRAMID: ir_array = &PyramidIntRules; break; -- default: -- mfem_error("IntegrationRules::Set(...) : Unknown geometry type!"); -- ir_array = NULL; -+ case Geometry::INVALID: -+ case Geometry::NUM_GEOMETRIES: -+ MFEM_ABORT("Unknown type of reference element!"); - } - -+#if defined(MFEM_THREAD_SAFE) && defined(MFEM_USE_OPENMP) -+ omp_set_lock(&IntRuleLocks[GeomType]); -+#endif -+ - if (HaveIntRule(*ir_array, Order)) - { - MFEM_ABORT("Overwriting set rules is not supported!"); -@@ -1055,16 +1069,19 @@ void IntegrationRules::Set(int GeomType, int Order, IntegrationRule &IntRule) - AllocIntRule(*ir_array, Order); - - (*ir_array)[Order] = &IntRule; -+ -+#if defined(MFEM_THREAD_SAFE) && defined(MFEM_USE_OPENMP) -+ omp_unset_lock(&IntRuleLocks[GeomType]); -+#endif - } - --void IntegrationRules::DeleteIntRuleArray(Array &ir_array) -+void IntegrationRules::DeleteIntRuleArray( -+ Array &ir_array) const - { -- int i; -- IntegrationRule *ir = NULL; -- - // Many of the intrules have multiple contiguous copies in the ir_array - // so we have to be careful to not delete them twice. 
-- for (i = 0; i < ir_array.Size(); i++) -+ IntegrationRule *ir = NULL; -+ for (int i = 0; i < ir_array.Size(); i++) - { - if (ir_array[i] != NULL && ir_array[i] != ir) - { -@@ -1076,6 +1093,13 @@ void IntegrationRules::DeleteIntRuleArray(Array &ir_array) - - IntegrationRules::~IntegrationRules() - { -+#if defined(MFEM_THREAD_SAFE) && defined(MFEM_USE_OPENMP) -+ for (int i = 0; i < Geometry::NUM_GEOMETRIES; i++) -+ { -+ omp_destroy_lock(&IntRuleLocks[i]); -+ } -+#endif -+ - if (!own_rules) { return; } - - DeleteIntRuleArray(PointIntRules); -@@ -1110,10 +1134,11 @@ IntegrationRule *IntegrationRules::GenerateIntegrationRule(int GeomType, - return PrismIntegrationRule(Order); - case Geometry::PYRAMID: - return PyramidIntegrationRule(Order); -- default: -- mfem_error("IntegrationRules::Set(...) : Unknown geometry type!"); -- return NULL; -+ case Geometry::INVALID: -+ case Geometry::NUM_GEOMETRIES: -+ MFEM_ABORT("Unknown type of reference element!"); - } -+ return NULL; - } - - -@@ -1122,7 +1147,7 @@ IntegrationRule *IntegrationRules::PointIntegrationRule(int Order) - { - if (Order > 1) - { -- mfem_error("Point Integration Rule of Order > 1 not defined"); -+ MFEM_ABORT("Point Integration Rule of Order > 1 not defined"); - return NULL; - } - -@@ -1185,7 +1210,7 @@ IntegrationRule *IntegrationRules::SegmentIntegrationRule(int Order) - QuadratureFunctions1D::OpenHalfUniform(n, ir); - break; - } -- default: -+ case Quadrature1D::Invalid: - { - MFEM_ABORT("unknown Quadrature1D type: " << quad_type); - } -@@ -1762,8 +1787,8 @@ IntegrationRule *IntegrationRules::PyramidIntegrationRule(int Order) - - for (int k=0; kIntPoint(k); -+ const IntegrationPoint &ipc = irc.IntPoint(k); -+ IntegrationPoint &ipp = PyramidIntRules[Order]->IntPoint(k); - ipp.x = ipc.x * (1.0 - ipc.z); - ipp.y = ipc.y * (1.0 - ipc.z); - ipp.z = ipc.z; -@@ -1775,8 +1800,8 @@ IntegrationRule *IntegrationRules::PyramidIntegrationRule(int Order) - // Integration rules for reference prism - IntegrationRule *IntegrationRules::PrismIntegrationRule(int Order) - { -- const IntegrationRule & irt = Get(Geometry::TRIANGLE, Order); -- const IntegrationRule & irs = Get(Geometry::SEGMENT, Order); -+ const IntegrationRule &irt = Get(Geometry::TRIANGLE, Order); -+ const IntegrationRule &irs = Get(Geometry::SEGMENT, Order); - int nt = irt.GetNPoints(); - int ns = irs.GetNPoints(); - AllocIntRule(PrismIntRules, Order); -@@ -1790,12 +1815,12 @@ IntegrationRule *IntegrationRules::PrismIntegrationRule(int Order) - - for (int ks=0; ksIntPoint(kp); -+ const IntegrationPoint &ipt = irt.IntPoint(kt); -+ IntegrationPoint &ipp = PrismIntRules[Order]->IntPoint(kp); - ipp.x = ipt.x; - ipp.y = ipt.y; - ipp.z = ips.x; -diff --git a/fem/intrules.hpp b/fem/intrules.hpp -index bf38766d7..1d5757994 100644 ---- a/fem/intrules.hpp -+++ b/fem/intrules.hpp -@@ -14,6 +14,9 @@ - - #include "../config/config.hpp" - #include "../general/array.hpp" -+#if defined(MFEM_THREAD_SAFE) && defined(MFEM_USE_OPENMP) -+#include -+#endif - - #include - #include -@@ -428,14 +431,18 @@ private: - Array PrismIntRules; - Array CubeIntRules; - -- void AllocIntRule(Array &ir_array, int Order) -+#if defined(MFEM_THREAD_SAFE) && defined(MFEM_USE_OPENMP) -+ Array IntRuleLocks; -+#endif -+ -+ void AllocIntRule(Array &ir_array, int Order) const - { - if (ir_array.Size() <= Order) - { - ir_array.SetSize(Order + 1, NULL); - } - } -- bool HaveIntRule(Array &ir_array, int Order) -+ bool HaveIntRule(Array &ir_array, int Order) const - { - return (ir_array.Size() > Order && ir_array[Order] != NULL); - } 
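// A minimal sketch of the per-geometry lock lifecycle used by IntegrationRules above,
// assuming compilation with OpenMP: omp_init_lock in the constructor, omp_set_lock /
// omp_unset_lock around every cache access, omp_destroy_lock in the destructor.
// RuleCache, Build and the double* payload are hypothetical stand-ins, not MFEM API.
#include <omp.h>
#include <vector>

class RuleCache
{
   static const int NumGeom = 8;              // one lock per geometry type
   omp_lock_t locks[NumGeom];
   std::vector<double *> rules[NumGeom];      // stand-in for Array<IntegrationRule *>

   static double *Build(int geom, int order)  // placeholder for rule construction
   { return new double(geom + order); }

public:
   RuleCache()  { for (int g = 0; g < NumGeom; g++) { omp_init_lock(&locks[g]); } }

   ~RuleCache()
   {
      for (int g = 0; g < NumGeom; g++)
      {
         for (double *r : rules[g]) { delete r; }
         omp_destroy_lock(&locks[g]);
      }
   }

   const double &Get(int geom, int order)
   {
      omp_set_lock(&locks[geom]);             // serialize callers per geometry
      if ((int)rules[geom].size() <= order) { rules[geom].resize(order + 1, nullptr); }
      if (rules[geom][order] == nullptr) { rules[geom][order] = Build(geom, order); }
      const double &r = *rules[geom][order];
      omp_unset_lock(&locks[geom]);           // release before returning the reference
      return r;
   }
};
// One lock per geometry lets rules for different geometries be built concurrently while
// still serializing access to each array, matching how Get/Set above lock
// IntRuleLocks[GeomType] rather than a single global lock.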
-@@ -443,6 +450,7 @@ private: - { - return Order | 1; // valid for all quad_type's - } -+ void DeleteIntRuleArray(Array &ir_array) const; - - /// The following methods allocate new IntegrationRule objects without - /// checking if they already exist. To avoid memory leaks use -@@ -457,12 +465,10 @@ private: - IntegrationRule *PrismIntegrationRule(int Order); - IntegrationRule *CubeIntegrationRule(int Order); - -- void DeleteIntRuleArray(Array &ir_array); -- - public: - /// Sets initial sizes for the integration rule arrays, but rules - /// are defined the first time they are requested with the Get method. -- explicit IntegrationRules(int Ref = 0, -+ explicit IntegrationRules(int ref = 0, - int type = Quadrature1D::GaussLegendre); - - /// Returns an integration rule for given GeomType and Order. -diff --git a/mesh/mesh.cpp b/mesh/mesh.cpp -index 5e96c3f39..600f2fc2a 100644 ---- a/mesh/mesh.cpp -+++ b/mesh/mesh.cpp -@@ -4446,9 +4446,7 @@ void Mesh::MakeRefined_(Mesh &orig_mesh, const Array ref_factors, - - Array rdofs; - DenseMatrix phys_pts; -- -- GeometryRefiner refiner; -- refiner.SetType(q_type); -+ GeometryRefiner refiner(q_type); - - // Add refined elements and set vertex coordinates - for (int el = 0; el < orig_ne; el++) -diff --git a/tests/unit/fem/test_lexicographic_ordering.cpp b/tests/unit/fem/test_lexicographic_ordering.cpp -index f68395bfc..6c73cb4e3 100644 ---- a/tests/unit/fem/test_lexicographic_ordering.cpp -+++ b/tests/unit/fem/test_lexicographic_ordering.cpp -@@ -20,8 +20,7 @@ void VerifyOrdering(NodalFiniteElement &el) - Geometry::Type geom = el.GetGeomType(); - const Array &p = el.GetLexicographicOrdering(); - -- GeometryRefiner refiner; -- refiner.SetType(BasisType::GaussLobatto); -+ GeometryRefiner refiner(BasisType::GaussLobatto); - RefinedGeometry *ref_geom = refiner.Refine(geom, order); - - double error = 0.0; +diff --git a/fem/eltrans.cpp b/fem/eltrans.cpp +index b812e22eb..96aa854a5 100644 +--- a/fem/eltrans.cpp ++++ b/fem/eltrans.cpp +@@ -355,15 +355,11 @@ int InverseElementTransformation::Transform(const Vector &pt, + } + else + { +- const int old_type = GlobGeometryRefiner.GetType(); +- GlobGeometryRefiner.SetType(qpts_type); +- RefinedGeometry &RefG = +- *GlobGeometryRefiner.Refine(T->GetGeometryType(), order); ++ RefinedGeometry &RefG = *refiner.Refine(T->GetGeometryType(), order); + int closest_idx = (init_guess_type == ClosestPhysNode) ? + FindClosestPhysPoint(pt, RefG.RefPts) : + FindClosestRefPoint(pt, RefG.RefPts); + ip0 = &RefG.RefPts.IntPoint(closest_idx); +- GlobGeometryRefiner.SetType(old_type); + } + break; + } +diff --git a/fem/eltrans.hpp b/fem/eltrans.hpp +index 198e20df3..43eccd499 100644 +--- a/fem/eltrans.hpp ++++ b/fem/eltrans.hpp +@@ -234,6 +234,7 @@ protected: + const IntegrationPoint *ip0; + int init_guess_type; // algorithm to use + int qpts_type; // Quadrature1D type for the initial guess type ++ GeometryRefiner refiner; // geometry refiner for initial guess + int rel_qpts_order; // num_1D_qpts = max(trans_order+rel_qpts_order,0)+1 + int solver_type; // solution strategy to use + int max_iter; // max. number of Newton iterations +@@ -277,6 +278,7 @@ public: + ip0(NULL), + init_guess_type(Center), + qpts_type(Quadrature1D::OpenHalfUniform), ++ refiner(qpts_type), + rel_qpts_order(-1), + solver_type(NewtonElementProject), + max_iter(16), +@@ -301,7 +303,8 @@ public: + { ip0 = &init_ip; SetInitialGuessType(GivenPoint); } + + /// Set the Quadrature1D type used for the `Closest*` initial guess types. 
+- void SetInitGuessPointsType(int q_type) { qpts_type = q_type; } ++ void SetInitGuessPointsType(int q_type) ++ { qpts_type = q_type; refiner.SetType(q_type); } + + /// Set the relative order used for the `Closest*` initial guess types. + /** The number of points in each spatial direction is given by the formula +@@ -361,7 +364,7 @@ public: + class IsoparametricTransformation : public ElementTransformation + { + private: +- DenseMatrix dshape,d2shape; ++ DenseMatrix dshape, d2shape; + Vector shape; + + const FiniteElement *FElem; +diff --git a/fem/fe/fe_base.cpp b/fem/fe/fe_base.cpp +index b2f49a4bc..4387ae775 100644 +--- a/fem/fe/fe_base.cpp ++++ b/fem/fe/fe_base.cpp +@@ -359,135 +359,148 @@ void FiniteElement::CalcPhysHessian(ElementTransformation &Trans, + + // Hessian in physical coords + lhm.Invert(); +- Mult( hess, lhm, Hessian); ++ Mult(hess, lhm, Hessian); + } + + const DofToQuad &FiniteElement::GetDofToQuad(const IntegrationRule &ir, + DofToQuad::Mode mode) const + { ++ DofToQuad *d2q = nullptr; + MFEM_VERIFY(mode == DofToQuad::FULL, "invalid mode requested"); + +- for (int i = 0; i < dof2quad_array.Size(); i++) +- { +- const DofToQuad &d2q = *dof2quad_array[i]; +- if (d2q.IntRule == &ir && d2q.mode == mode) { return d2q; } +- } +- +-#ifdef MFEM_THREAD_SAFE +- DenseMatrix vshape(dof, dim); ++#if defined(MFEM_THREAD_SAFE) && defined(MFEM_USE_OPENMP) ++ #pragma omp critical (DofToQuad) + #endif +- +- DofToQuad *d2q = new DofToQuad; +- const int nqpt = ir.GetNPoints(); +- d2q->FE = this; +- d2q->IntRule = &ir; +- d2q->mode = mode; +- d2q->ndof = dof; +- d2q->nqpt = nqpt; +- if (range_type == SCALAR) +- { +- d2q->B.SetSize(nqpt*dof); +- d2q->Bt.SetSize(dof*nqpt); +- +- Vector shape; +- vshape.GetColumnReference(0, shape); +- for (int i = 0; i < nqpt; i++) +- { +- const IntegrationPoint &ip = ir.IntPoint(i); +- CalcShape(ip, shape); +- for (int j = 0; j < dof; j++) +- { +- d2q->B[i+nqpt*j] = d2q->Bt[j+dof*i] = shape(j); +- } +- } +- } +- else if (range_type == VECTOR) + { +- d2q->B.SetSize(nqpt*dim*dof); +- d2q->Bt.SetSize(dof*nqpt*dim); +- +- for (int i = 0; i < nqpt; i++) ++ for (int i = 0; i < dof2quad_array.Size(); i++) + { +- const IntegrationPoint &ip = ir.IntPoint(i); +- CalcVShape(ip, vshape); +- for (int d = 0; d < dim; d++) +- { +- for (int j = 0; j < dof; j++) +- { +- d2q->B[i+nqpt*(d+dim*j)] = d2q->Bt[j+dof*(i+nqpt*d)] = vshape(j, d); +- } +- } ++ d2q = dof2quad_array[i]; ++ if (d2q->IntRule != &ir || d2q->mode != mode) { d2q = nullptr; } + } +- } +- else +- { +- // Skip B and Bt for unknown range type +- } +- switch (deriv_type) +- { +- case GRAD: ++ if (!d2q) + { +- d2q->G.SetSize(nqpt*dim*dof); +- d2q->Gt.SetSize(dof*nqpt*dim); +- +- for (int i = 0; i < nqpt; i++) ++#ifdef MFEM_THREAD_SAFE ++ DenseMatrix vshape(dof, dim); ++#endif ++ d2q = new DofToQuad; ++ const int nqpt = ir.GetNPoints(); ++ d2q->FE = this; ++ d2q->IntRule = &ir; ++ d2q->mode = mode; ++ d2q->ndof = dof; ++ d2q->nqpt = nqpt; ++ switch (range_type) + { +- const IntegrationPoint &ip = ir.IntPoint(i); +- CalcDShape(ip, vshape); +- for (int d = 0; d < dim; d++) ++ case SCALAR: + { +- for (int j = 0; j < dof; j++) ++ d2q->B.SetSize(nqpt*dof); ++ d2q->Bt.SetSize(dof*nqpt); ++ ++ Vector shape; ++ vshape.GetColumnReference(0, shape); ++ for (int i = 0; i < nqpt; i++) + { +- d2q->G[i+nqpt*(d+dim*j)] = d2q->Gt[j+dof*(i+nqpt*d)] = vshape(j, d); ++ const IntegrationPoint &ip = ir.IntPoint(i); ++ CalcShape(ip, shape); ++ for (int j = 0; j < dof; j++) ++ { ++ d2q->B[i+nqpt*j] = d2q->Bt[j+dof*i] = shape(j); ++ } + } 
++ break; + } +- } +- break; +- } +- case DIV: +- { +- d2q->G.SetSize(nqpt*dof); +- d2q->Gt.SetSize(dof*nqpt); ++ case VECTOR: ++ { ++ d2q->B.SetSize(nqpt*dim*dof); ++ d2q->Bt.SetSize(dof*nqpt*dim); + +- Vector divshape; +- vshape.GetColumnReference(0, divshape); +- for (int i = 0; i < nqpt; i++) ++ for (int i = 0; i < nqpt; i++) ++ { ++ const IntegrationPoint &ip = ir.IntPoint(i); ++ CalcVShape(ip, vshape); ++ for (int d = 0; d < dim; d++) ++ { ++ for (int j = 0; j < dof; j++) ++ { ++ d2q->B[i+nqpt*(d+dim*j)] = ++ d2q->Bt[j+dof*(i+nqpt*d)] = vshape(j, d); ++ } ++ } ++ } ++ break; ++ } ++ case UNKNOWN_RANGE_TYPE: ++ // Skip B and Bt for unknown range type ++ break; ++ } ++ switch (deriv_type) + { +- const IntegrationPoint &ip = ir.IntPoint(i); +- CalcDivShape(ip, divshape); +- for (int j = 0; j < dof; j++) ++ case GRAD: + { +- d2q->G[i+nqpt*j] = d2q->Gt[j+dof*i] = divshape(j); ++ d2q->G.SetSize(nqpt*dim*dof); ++ d2q->Gt.SetSize(dof*nqpt*dim); ++ ++ for (int i = 0; i < nqpt; i++) ++ { ++ const IntegrationPoint &ip = ir.IntPoint(i); ++ CalcDShape(ip, vshape); ++ for (int d = 0; d < dim; d++) ++ { ++ for (int j = 0; j < dof; j++) ++ { ++ d2q->G[i+nqpt*(d+dim*j)] = ++ d2q->Gt[j+dof*(i+nqpt*d)] = vshape(j, d); ++ } ++ } ++ } ++ break; + } +- } +- break; +- } +- case CURL: +- { +- d2q->G.SetSize(nqpt*cdim*dof); +- d2q->Gt.SetSize(dof*nqpt*cdim); ++ case DIV: ++ { ++ d2q->G.SetSize(nqpt*dof); ++ d2q->Gt.SetSize(dof*nqpt); + +- DenseMatrix curlshape(vshape.GetData(), dof, cdim); // cdim <= dim +- for (int i = 0; i < nqpt; i++) +- { +- const IntegrationPoint &ip = ir.IntPoint(i); +- CalcCurlShape(ip, curlshape); +- for (int d = 0; d < cdim; d++) ++ Vector divshape; ++ vshape.GetColumnReference(0, divshape); ++ for (int i = 0; i < nqpt; i++) ++ { ++ const IntegrationPoint &ip = ir.IntPoint(i); ++ CalcDivShape(ip, divshape); ++ for (int j = 0; j < dof; j++) ++ { ++ d2q->G[i+nqpt*j] = d2q->Gt[j+dof*i] = divshape(j); ++ } ++ } ++ break; ++ } ++ case CURL: + { +- for (int j = 0; j < dof; j++) ++ d2q->G.SetSize(nqpt*cdim*dof); ++ d2q->Gt.SetSize(dof*nqpt*cdim); ++ ++ DenseMatrix curlshape(vshape.GetData(), dof, cdim); // cdim <= dim ++ for (int i = 0; i < nqpt; i++) + { +- d2q->G[i+nqpt*(d+cdim*j)] = d2q->Gt[j+dof*(i+nqpt*d)] = curlshape(j, d); ++ const IntegrationPoint &ip = ir.IntPoint(i); ++ CalcCurlShape(ip, curlshape); ++ for (int d = 0; d < cdim; d++) ++ { ++ for (int j = 0; j < dof; j++) ++ { ++ d2q->G[i+nqpt*(d+cdim*j)] = ++ d2q->Gt[j+dof*(i+nqpt*d)] = curlshape(j, d); ++ } ++ } + } ++ break; + } ++ case NONE: ++ // Skip G and Gt for unknown derivative type ++ break; + } +- break; ++ dof2quad_array.Append(d2q); + } +- case NONE: +- default: +- // Skip G and Gt for unknown derivative type +- break; + } +- dof2quad_array.Append(d2q); + return *d2q; + } + +@@ -904,14 +917,14 @@ VectorFiniteElement::VectorFiniteElement(int D, Geometry::Type G, + } + + void VectorFiniteElement::CalcShape( +- const IntegrationPoint &ip, Vector &shape ) const ++ const IntegrationPoint &ip, Vector &shape) const + { + mfem_error("Error: Cannot use scalar CalcShape(...) function with\n" + " VectorFiniteElements!"); + } + + void VectorFiniteElement::CalcDShape( +- const IntegrationPoint &ip, DenseMatrix &dshape ) const ++ const IntegrationPoint &ip, DenseMatrix &dshape) const + { + mfem_error("Error: Cannot use scalar CalcDShape(...) 
function with\n" + " VectorFiniteElements!"); +@@ -2183,51 +2196,72 @@ void Poly_1D::CalcChebyshev(const int p, const double x, double *u, double *d, + + const double *Poly_1D::GetPoints(const int p, const int btype) + { ++ Array *pts; + BasisType::Check(btype); + const int qtype = BasisType::GetQuadrature1D(btype); +- + if (qtype == Quadrature1D::Invalid) { return NULL; } + +- if (points_container.find(btype) == points_container.end()) +- { +- points_container[btype] = new Array(h_mt); +- } +- Array &pts = *points_container[btype]; +- if (pts.Size() <= p) +- { +- pts.SetSize(p + 1, NULL); +- } +- if (pts[p] == NULL) ++#if defined(MFEM_THREAD_SAFE) && defined(MFEM_USE_OPENMP) ++ #pragma omp critical (Poly1DGetPoints) ++#endif + { +- pts[p] = new double[p + 1]; +- quad_func.GivePolyPoints(p+1, pts[p], qtype); ++ auto it = points_container.find(btype); ++ if (it != points_container.end()) ++ { ++ pts = it->second; ++ } ++ else ++ { ++ pts = new Array(h_mt); ++ points_container[btype] = pts; ++ } ++ if (pts->Size() <= p) ++ { ++ pts->SetSize(p + 1, NULL); ++ } ++ if ((*pts)[p] == NULL) ++ { ++ (*pts)[p] = new double[p + 1]; ++ quad_func.GivePolyPoints(p + 1, (*pts)[p], qtype); ++ } + } +- return pts[p]; ++ return (*pts)[p]; + } + + Poly_1D::Basis &Poly_1D::GetBasis(const int p, const int btype) + { ++ Array *bases; + BasisType::Check(btype); + +- if ( bases_container.find(btype) == bases_container.end() ) +- { +- // we haven't been asked for basis or points of this type yet +- bases_container[btype] = new Array(h_mt); +- } +- Array &bases = *bases_container[btype]; +- if (bases.Size() <= p) +- { +- bases.SetSize(p + 1, NULL); +- } +- if (bases[p] == NULL) ++#if defined(MFEM_THREAD_SAFE) && defined(MFEM_USE_OPENMP) ++ #pragma omp critical (Poly1DGetBasis) ++#endif + { +- EvalType etype; +- if (btype == BasisType::Positive) { etype = Positive; } +- else if (btype == BasisType::IntegratedGLL) { etype = Integrated; } +- else { etype = Barycentric; } +- bases[p] = new Basis(p, GetPoints(p, btype), etype); ++ auto it = bases_container.find(btype); ++ if (it != bases_container.end()) ++ { ++ bases = it->second; ++ } ++ else ++ { ++ // we haven't been asked for basis or points of this type yet ++ bases = new Array(h_mt); ++ bases_container[btype] = bases; ++ } ++ if (bases->Size() <= p) ++ { ++ bases->SetSize(p + 1, NULL); ++ } ++ if ((*bases)[p] == NULL) ++ { ++ EvalType etype; ++ if (btype == BasisType::Positive) { etype = Positive; } ++ else if (btype == BasisType::IntegratedGLL) { etype = Integrated; } ++ else { etype = Barycentric; } ++ (*bases)[p] = new Basis(p, GetPoints(p, btype), etype); ++ } + } +- return *bases[p]; ++ return *(*bases)[p]; + } + + Poly_1D::~Poly_1D() +@@ -2236,7 +2270,7 @@ Poly_1D::~Poly_1D() + it != points_container.end() ; ++it) + { + Array& pts = *it->second; +- for ( int i = 0 ; i < pts.Size() ; ++i ) ++ for (int i = 0; i < pts.Size(); ++i) + { + delete [] pts[i]; + } +@@ -2247,7 +2281,7 @@ Poly_1D::~Poly_1D() + it != bases_container.end() ; ++it) + { + Array& bases = *it->second; +- for ( int i = 0 ; i < bases.Size() ; ++i ) ++ for (int i = 0; i < bases.Size(); ++i) + { + delete bases[i]; + } +@@ -2461,39 +2495,47 @@ const DofToQuad &TensorBasisElement::GetTensorDofToQuad( + DofToQuad::Mode mode, const Poly_1D::Basis &basis, bool closed, + Array &dof2quad_array) + { ++ DofToQuad *d2q = nullptr; + MFEM_VERIFY(mode == DofToQuad::TENSOR, "invalid mode requested"); + +- for (int i = 0; i < dof2quad_array.Size(); i++) +- { +- const DofToQuad &d2q = *dof2quad_array[i]; +- if 
(d2q.IntRule == &ir && d2q.mode == mode) { return d2q; } +- } +- +- DofToQuad *d2q = new DofToQuad; +- const int ndof = closed ? fe.GetOrder() + 1 : fe.GetOrder(); +- const int nqpt = (int)floor(pow(ir.GetNPoints(), 1.0/fe.GetDim()) + 0.5); +- d2q->FE = &fe; +- d2q->IntRule = &ir; +- d2q->mode = mode; +- d2q->ndof = ndof; +- d2q->nqpt = nqpt; +- d2q->B.SetSize(nqpt*ndof); +- d2q->Bt.SetSize(ndof*nqpt); +- d2q->G.SetSize(nqpt*ndof); +- d2q->Gt.SetSize(ndof*nqpt); +- Vector val(ndof), grad(ndof); +- for (int i = 0; i < nqpt; i++) ++#if defined(MFEM_THREAD_SAFE) && defined(MFEM_USE_OPENMP) ++ #pragma omp critical (DofToQuad) ++#endif + { +- // The first 'nqpt' points in 'ir' have the same x-coordinates as those +- // of the 1D rule. +- basis.Eval(ir.IntPoint(i).x, val, grad); +- for (int j = 0; j < ndof; j++) +- { +- d2q->B[i+nqpt*j] = d2q->Bt[j+ndof*i] = val(j); +- d2q->G[i+nqpt*j] = d2q->Gt[j+ndof*i] = grad(j); ++ for (int i = 0; i < dof2quad_array.Size(); i++) ++ { ++ d2q = dof2quad_array[i]; ++ if (d2q->IntRule != &ir || d2q->mode != mode) { d2q = nullptr; } ++ } ++ if (!d2q) ++ { ++ d2q = new DofToQuad; ++ const int ndof = closed ? fe.GetOrder() + 1 : fe.GetOrder(); ++ const int nqpt = (int)floor(pow(ir.GetNPoints(), 1.0/fe.GetDim()) + 0.5); ++ d2q->FE = &fe; ++ d2q->IntRule = &ir; ++ d2q->mode = mode; ++ d2q->ndof = ndof; ++ d2q->nqpt = nqpt; ++ d2q->B.SetSize(nqpt*ndof); ++ d2q->Bt.SetSize(ndof*nqpt); ++ d2q->G.SetSize(nqpt*ndof); ++ d2q->Gt.SetSize(ndof*nqpt); ++ Vector val(ndof), grad(ndof); ++ for (int i = 0; i < nqpt; i++) ++ { ++ // The first 'nqpt' points in 'ir' have the same x-coordinates as those ++ // of the 1D rule. ++ basis.Eval(ir.IntPoint(i).x, val, grad); ++ for (int j = 0; j < ndof; j++) ++ { ++ d2q->B[i+nqpt*j] = d2q->Bt[j+ndof*i] = val(j); ++ d2q->G[i+nqpt*j] = d2q->Gt[j+ndof*i] = grad(j); ++ } ++ } ++ dof2quad_array.Append(d2q); + } + } +- dof2quad_array.Append(d2q); + return *d2q; + } + +diff --git a/fem/fe/fe_base.hpp b/fem/fe/fe_base.hpp +index b948b4f8d..f9e31b457 100644 +--- a/fem/fe/fe_base.hpp ++++ b/fem/fe/fe_base.hpp +@@ -250,7 +250,7 @@ protected: + /// Container for all DofToQuad objects created by the FiniteElement. + /** Multiple DofToQuad objects may be needed when different quadrature rules + or different DofToQuad::Mode are used. */ +- mutable Array dof2quad_array; ++ mutable Array dof2quad_array; + + public: + /// Enumeration for range_type and deriv_range_type +@@ -1026,8 +1026,8 @@ public: + }; + + private: +- typedef std::map< int, Array* > PointsMap; +- typedef std::map< int, Array* > BasisMap; ++ typedef std::map*> PointsMap; ++ typedef std::map*> BasisMap; + + MemoryType h_mt; + PointsMap points_container; +diff --git a/fem/geom.cpp b/fem/geom.cpp +index 5438d741f..2d9f4e907 100644 +--- a/fem/geom.cpp ++++ b/fem/geom.cpp +@@ -262,7 +262,7 @@ Geometry::~Geometry() + } + } + +-const IntegrationRule * Geometry::GetVertices(int GeomType) ++const IntegrationRule *Geometry::GetVertices(int GeomType) const + { + switch (GeomType) + { +@@ -274,8 +274,9 @@ const IntegrationRule * Geometry::GetVertices(int GeomType) + case Geometry::CUBE: return GeomVert[5]; + case Geometry::PRISM: return GeomVert[6]; + case Geometry::PYRAMID: return GeomVert[7]; +- default: +- mfem_error ("Geometry::GetVertices(...)"); ++ case Geometry::INVALID: ++ case Geometry::NUM_GEOMETRIES: ++ mfem_error("Geometry::GetVertices(...)"); + } + // make some compilers happy. 
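// Throughout the hunks above, "default:" labels are replaced by explicitly listing the
// remaining enumerators (Geometry::INVALID, Geometry::NUM_GEOMETRIES, the unsupported
// types). A small sketch of the motivation, using a hypothetical enum rather than the
// MFEM Geometry class: with every enumerator spelled out and no "default:", compilers
// that implement -Wswitch can flag any switch that forgets a newly added value, whereas
// a "default:" silently absorbs it.
#include <cstdio>
#include <cstdlib>

enum class Shape { Point, Segment, Triangle, Square, Invalid, NumShapes };

int NumVertices(Shape s)
{
   switch (s)
   {
      case Shape::Point:    return 1;
      case Shape::Segment:  return 2;
      case Shape::Triangle: return 3;
      case Shape::Square:   return 4;
      // No "default:": adding a new Shape enumerator without handling it here makes
      // -Wswitch report this switch at compile time.
      case Shape::Invalid:
      case Shape::NumShapes:
         std::fprintf(stderr, "unknown shape\n");
         std::abort();
   }
   return -1;   // unreachable, but keeps the return path well-formed for strict compilers
}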
+ return GeomVert[0]; +@@ -370,7 +371,8 @@ void Geometry::GetRandomPoint(int GeomType, IntegrationPoint &ip) + ip.x = 1.0 - z; + } + break; +- default: ++ case Geometry::INVALID: ++ case Geometry::NUM_GEOMETRIES: + MFEM_ABORT("Unknown type of reference element!"); + } + } +@@ -435,12 +437,14 @@ bool Geometry::CheckPoint(int GeomType, const IntegrationPoint &ip) + if (ip.x < 0.0 || ip.y < 0.0 || ip.x+ip.z > 1.0 || ip.y+ip.z > 1.0 || + ip.z < 0.0 || ip.z > 1.0) { return false; } + break; +- default: ++ case Geometry::INVALID: ++ case Geometry::NUM_GEOMETRIES: + MFEM_ABORT("Unknown type of reference element!"); + } + return true; + } + ++// static method + bool Geometry::CheckPoint(int GeomType, const IntegrationPoint &ip, double eps) + { + switch (GeomType) +@@ -516,7 +520,8 @@ bool Geometry::CheckPoint(int GeomType, const IntegrationPoint &ip, double eps) + return false; + } + break; +- default: ++ case Geometry::INVALID: ++ case Geometry::NUM_GEOMETRIES: + MFEM_ABORT("Unknown type of reference element!"); + } + return true; +@@ -640,7 +645,8 @@ bool Geometry::ProjectPoint(int GeomType, const IntegrationPoint &beg, + }; + return internal::IntersectSegment<6,3>(lbeg, lend, end); + } +- default: ++ case Geometry::INVALID: ++ case Geometry::NUM_GEOMETRIES: + MFEM_ABORT("Unknown type of reference element!"); + } + return true; +@@ -774,13 +780,16 @@ bool Geometry::ProjectPoint(int GeomType, IntegrationPoint &ip) + } + } + +- default: ++ case Geometry::POINT: + MFEM_ABORT("Reference element type is not supported!"); ++ case Geometry::INVALID: ++ case Geometry::NUM_GEOMETRIES: ++ MFEM_ABORT("Unknown type of reference element!"); + } + return true; + } + +-void Geometry::GetPerfPointMat(int GeomType, DenseMatrix &pm) ++void Geometry::GetPerfPointMat(int GeomType, DenseMatrix &pm) const + { + switch (GeomType) + { +@@ -859,8 +868,11 @@ void Geometry::GetPerfPointMat(int GeomType, DenseMatrix &pm) + } + break; + +- default: +- mfem_error ("Geometry::GetPerfPointMat (...)"); ++ case Geometry::POINT: ++ MFEM_ABORT("Reference element type is not supported!"); ++ case Geometry::INVALID: ++ case Geometry::NUM_GEOMETRIES: ++ MFEM_ABORT("Unknown type of reference element!"); + } + } + +@@ -1055,11 +1067,6 @@ Constants::VertToVert::J[8][2] = + }; + + +-GeometryRefiner::GeometryRefiner() +-{ +- type = Quadrature1D::ClosedUniform; +-} +- + GeometryRefiner::~GeometryRefiner() + { + for (int i = 0; i < Geometry::NumGeom; i++) +@@ -1070,10 +1077,9 @@ GeometryRefiner::~GeometryRefiner() + } + + RefinedGeometry *GeometryRefiner::FindInRGeom(Geometry::Type Geom, +- int Times, int ETimes, +- int Type) ++ int Times, int ETimes) const + { +- Array &RGA = RGeom[Geom]; ++ const Array &RGA = RGeom[Geom]; + for (int i = 0; i < RGA.Size(); i++) + { + RefinedGeometry &RG = *RGA[i]; +@@ -1085,9 +1091,10 @@ RefinedGeometry *GeometryRefiner::FindInRGeom(Geometry::Type Geom, + return NULL; + } + +-IntegrationRule *GeometryRefiner::FindInIntPts(Geometry::Type Geom, int NPts) ++IntegrationRule *GeometryRefiner::FindInIntPts(Geometry::Type Geom, ++ int NPts) const + { +- Array &IPA = IntPts[Geom]; ++ const Array &IPA = IntPts[Geom]; + for (int i = 0; i < IPA.Size(); i++) + { + IntegrationRule &ir = *IPA[i]; +@@ -1096,491 +1103,538 @@ IntegrationRule *GeometryRefiner::FindInIntPts(Geometry::Type Geom, int NPts) + return NULL; + } + +-RefinedGeometry * GeometryRefiner::Refine(Geometry::Type Geom, +- int Times, int ETimes) ++RefinedGeometry *GeometryRefiner::Refine(Geometry::Type Geom, int Times, ++ int ETimes) + { ++ RefinedGeometry 
*RG = NULL; + int i, j, k, l, m; +- + Times = std::max(Times, 1); + ETimes = Geometry::Dimension[Geom] <= 1 ? 0 : std::max(ETimes, 1); +- const double *cp = poly1d.GetPoints(Times, BasisType::GetNodalBasis(type)); ++ const double *cp = poly1d.GetPoints(Times, BasisType::GetNodalBasis(Type)); + +- RefinedGeometry *RG = FindInRGeom(Geom, Times, ETimes, type); +- if (RG) { return RG; } +- +- switch (Geom) ++#if defined(MFEM_THREAD_SAFE) && defined(MFEM_USE_OPENMP) ++ #pragma omp critical (Refine) ++#endif + { +- case Geometry::POINT: ++ RG = FindInRGeom(Geom, Times, ETimes); ++ if (!RG) + { +- RG = new RefinedGeometry(1, 1, 0); +- RG->Times = 1; +- RG->ETimes = 0; +- RG->Type = type; +- RG->RefPts.IntPoint(0).x = cp[0]; +- RG->RefGeoms[0] = 0; +- +- RGeom[Geometry::POINT].Append(RG); +- return RG; +- } +- +- case Geometry::SEGMENT: +- { +- RG = new RefinedGeometry(Times+1, 2*Times, 0); +- RG->Times = Times; +- RG->ETimes = 0; +- RG->Type = type; +- for (i = 0; i <= Times; i++) +- { +- IntegrationPoint &ip = RG->RefPts.IntPoint(i); +- ip.x = cp[i]; +- } +- Array &G = RG->RefGeoms; +- for (i = 0; i < Times; i++) ++ switch (Geom) + { +- G[2*i+0] = i; +- G[2*i+1] = i+1; +- } +- +- RGeom[Geometry::SEGMENT].Append(RG); +- return RG; +- } +- +- case Geometry::TRIANGLE: +- { +- RG = new RefinedGeometry((Times+1)*(Times+2)/2, 3*Times*Times, +- 3*Times*(ETimes+1), 3*Times); +- RG->Times = Times; +- RG->ETimes = ETimes; +- RG->Type = type; +- for (k = j = 0; j <= Times; j++) +- for (i = 0; i <= Times-j; i++, k++) ++ case Geometry::POINT: + { +- IntegrationPoint &ip = RG->RefPts.IntPoint(k); +- ip.x = cp[i]/(cp[i] + cp[j] + cp[Times-i-j]); +- ip.y = cp[j]/(cp[i] + cp[j] + cp[Times-i-j]); ++ RG = new RefinedGeometry(1, 1, 0); ++ RG->Times = 1; ++ RG->ETimes = 0; ++ RG->Type = Type; ++ RG->RefPts.IntPoint(0).x = cp[0]; ++ RG->RefGeoms[0] = 0; ++ ++ RGeom[Geometry::POINT].Append(RG); + } +- Array &G = RG->RefGeoms; +- for (l = k = j = 0; j < Times; j++, k++) +- for (i = 0; i < Times-j; i++, k++) ++ break; ++ ++ case Geometry::SEGMENT: + { +- G[l++] = k; +- G[l++] = k+1; +- G[l++] = k+Times-j+1; +- if (i+j+1 < Times) ++ RG = new RefinedGeometry(Times+1, 2*Times, 0); ++ RG->Times = Times; ++ RG->ETimes = 0; ++ RG->Type = Type; ++ for (i = 0; i <= Times; i++) + { +- G[l++] = k+1; +- G[l++] = k+Times-j+2; +- G[l++] = k+Times-j+1; ++ IntegrationPoint &ip = RG->RefPts.IntPoint(i); ++ ip.x = cp[i]; ++ } ++ Array &G = RG->RefGeoms; ++ for (i = 0; i < Times; i++) ++ { ++ G[2*i+0] = i; ++ G[2*i+1] = i+1; + } +- } +- Array &E = RG->RefEdges; +- int lb = 0, li = 2*RG->NumBdrEdges; +- // horizontal edges +- for (k = 0; k < Times; k += Times/ETimes) +- { +- int < = (k == 0) ? lb : li; +- j = k*(Times+1)-((k-1)*k)/2; +- for (i = 0; i < Times-k; i++) +- { +- E[lt++] = j; j++; +- E[lt++] = j; +- } +- } +- // diagonal edges +- for (k = Times; k > 0; k -= Times/ETimes) +- { +- int < = (k == Times) ? lb : li; +- j = k; +- for (i = 0; i < k; i++) +- { +- E[lt++] = j; j += Times-i; +- E[lt++] = j; +- } +- } +- // vertical edges +- for (k = 0; k < Times; k += Times/ETimes) +- { +- int < = (k == 0) ? 
lb : li; +- j = k; +- for (i = 0; i < Times-k; i++) +- { +- E[lt++] = j; j += Times-i+1; +- E[lt++] = j; +- } +- } +- +- RGeom[Geometry::TRIANGLE].Append(RG); +- return RG; +- } + +- case Geometry::SQUARE: +- { +- RG = new RefinedGeometry((Times+1)*(Times+1), 4*Times*Times, +- 4*(ETimes+1)*Times, 4*Times); +- RG->Times = Times; +- RG->ETimes = ETimes; +- RG->Type = type; +- for (k = j = 0; j <= Times; j++) +- for (i = 0; i <= Times; i++, k++) +- { +- IntegrationPoint &ip = RG->RefPts.IntPoint(k); +- ip.x = cp[i]; +- ip.y = cp[j]; +- } +- Array &G = RG->RefGeoms; +- for (l = k = j = 0; j < Times; j++, k++) +- for (i = 0; i < Times; i++, k++) +- { +- G[l++] = k; +- G[l++] = k+1; +- G[l++] = k+Times+2; +- G[l++] = k+Times+1; ++ RGeom[Geometry::SEGMENT].Append(RG); + } +- Array &E = RG->RefEdges; +- int lb = 0, li = 2*RG->NumBdrEdges; +- // horizontal edges +- for (k = 0; k <= Times; k += Times/ETimes) +- { +- int < = (k == 0 || k == Times) ? lb : li; +- for (i = 0, j = k*(Times+1); i < Times; i++) +- { +- E[lt++] = j; j++; +- E[lt++] = j; +- } +- } +- // vertical edges (in right-to-left order) +- for (k = Times; k >= 0; k -= Times/ETimes) +- { +- int < = (k == Times || k == 0) ? lb : li; +- for (i = 0, j = k; i < Times; i++) ++ break; ++ ++ case Geometry::TRIANGLE: + { +- E[lt++] = j; j += Times+1; +- E[lt++] = j; +- } +- } ++ RG = new RefinedGeometry((Times+1)*(Times+2)/2, 3*Times*Times, ++ 3*Times*(ETimes+1), 3*Times); ++ RG->Times = Times; ++ RG->ETimes = ETimes; ++ RG->Type = Type; ++ for (k = j = 0; j <= Times; j++) ++ { ++ for (i = 0; i <= Times-j; i++, k++) ++ { ++ IntegrationPoint &ip = RG->RefPts.IntPoint(k); ++ ip.x = cp[i]/(cp[i] + cp[j] + cp[Times-i-j]); ++ ip.y = cp[j]/(cp[i] + cp[j] + cp[Times-i-j]); ++ } ++ } ++ Array &G = RG->RefGeoms; ++ for (l = k = j = 0; j < Times; j++, k++) ++ { ++ for (i = 0; i < Times-j; i++, k++) ++ { ++ G[l++] = k; ++ G[l++] = k+1; ++ G[l++] = k+Times-j+1; ++ if (i+j+1 < Times) ++ { ++ G[l++] = k+1; ++ G[l++] = k+Times-j+2; ++ G[l++] = k+Times-j+1; ++ } ++ } ++ } ++ Array &E = RG->RefEdges; ++ int lb = 0, li = 2*RG->NumBdrEdges; ++ // horizontal edges ++ for (k = 0; k < Times; k += Times/ETimes) ++ { ++ int < = (k == 0) ? lb : li; ++ j = k*(Times+1)-((k-1)*k)/2; ++ for (i = 0; i < Times-k; i++) ++ { ++ E[lt++] = j; j++; ++ E[lt++] = j; ++ } ++ } ++ // diagonal edges ++ for (k = Times; k > 0; k -= Times/ETimes) ++ { ++ int < = (k == Times) ? lb : li; ++ j = k; ++ for (i = 0; i < k; i++) ++ { ++ E[lt++] = j; j += Times-i; ++ E[lt++] = j; ++ } ++ } ++ // vertical edges ++ for (k = 0; k < Times; k += Times/ETimes) ++ { ++ int < = (k == 0) ? 
lb : li; ++ j = k; ++ for (i = 0; i < Times-k; i++) ++ { ++ E[lt++] = j; j += Times-i+1; ++ E[lt++] = j; ++ } ++ } + +- RGeom[Geometry::SQUARE].Append(RG); +- return RG; +- } ++ RGeom[Geometry::TRIANGLE].Append(RG); ++ } ++ break; + +- case Geometry::CUBE: +- { +- RG = new RefinedGeometry ((Times+1)*(Times+1)*(Times+1), +- 8*Times*Times*Times, 0); +- RG->Times = Times; +- RG->ETimes = ETimes; +- RG->Type = type; +- for (l = k = 0; k <= Times; k++) +- for (j = 0; j <= Times; j++) +- for (i = 0; i <= Times; i++, l++) ++ case Geometry::SQUARE: ++ { ++ RG = new RefinedGeometry((Times+1)*(Times+1), 4*Times*Times, ++ 4*(ETimes+1)*Times, 4*Times); ++ RG->Times = Times; ++ RG->ETimes = ETimes; ++ RG->Type = Type; ++ for (k = j = 0; j <= Times; j++) + { +- IntegrationPoint &ip = RG->RefPts.IntPoint(l); +- ip.x = cp[i]; +- ip.y = cp[j]; +- ip.z = cp[k]; ++ for (i = 0; i <= Times; i++, k++) ++ { ++ IntegrationPoint &ip = RG->RefPts.IntPoint(k); ++ ip.x = cp[i]; ++ ip.y = cp[j]; ++ } + } +- Array &G = RG->RefGeoms; +- for (l = k = 0; k < Times; k++) +- for (j = 0; j < Times; j++) +- for (i = 0; i < Times; i++) ++ Array &G = RG->RefGeoms; ++ for (l = k = j = 0; j < Times; j++, k++) + { +- G[l++] = i+0 + (j+0 + (k+0) * (Times+1)) * (Times+1); +- G[l++] = i+1 + (j+0 + (k+0) * (Times+1)) * (Times+1); +- G[l++] = i+1 + (j+1 + (k+0) * (Times+1)) * (Times+1); +- G[l++] = i+0 + (j+1 + (k+0) * (Times+1)) * (Times+1); +- G[l++] = i+0 + (j+0 + (k+1) * (Times+1)) * (Times+1); +- G[l++] = i+1 + (j+0 + (k+1) * (Times+1)) * (Times+1); +- G[l++] = i+1 + (j+1 + (k+1) * (Times+1)) * (Times+1); +- G[l++] = i+0 + (j+1 + (k+1) * (Times+1)) * (Times+1); ++ for (i = 0; i < Times; i++, k++) ++ { ++ G[l++] = k; ++ G[l++] = k+1; ++ G[l++] = k+Times+2; ++ G[l++] = k+Times+1; ++ } ++ } ++ Array &E = RG->RefEdges; ++ int lb = 0, li = 2*RG->NumBdrEdges; ++ // horizontal edges ++ for (k = 0; k <= Times; k += Times/ETimes) ++ { ++ int < = (k == 0 || k == Times) ? lb : li; ++ for (i = 0, j = k*(Times+1); i < Times; i++) ++ { ++ E[lt++] = j; j++; ++ E[lt++] = j; ++ } ++ } ++ // vertical edges (in right-to-left order) ++ for (k = Times; k >= 0; k -= Times/ETimes) ++ { ++ int < = (k == Times || k == 0) ? 
lb : li; ++ for (i = 0, j = k; i < Times; i++) ++ { ++ E[lt++] = j; j += Times+1; ++ E[lt++] = j; ++ } + } + +- RGeom[Geometry::CUBE].Append(RG); +- return RG; +- } ++ RGeom[Geometry::SQUARE].Append(RG); ++ } ++ break; + +- case Geometry::TETRAHEDRON: +- { +- // subdivide the tetrahedron with vertices +- // (0,0,0), (0,0,1), (1,1,1), (0,1,1) +- +- // vertices: 0 <= i <= j <= k <= Times +- // (3-combination with repetitions) +- // number of vertices: (n+3)*(n+2)*(n+1)/6, n = Times +- +- // elements: the vertices are: v1=(i,j,k), v2=v1+u1, v3=v2+u2, v4=v3+u3 +- // where 0 <= i <= j <= k <= n-1 and +- // u1,u2,u3 is a permutation of (1,0,0),(0,1,0),(0,0,1) +- // such that all v2,v3,v4 have non-decreasing components +- // number of elements: n^3 +- +- const int n = Times; +- RG = new RefinedGeometry((n+3)*(n+2)*(n+1)/6, 4*n*n*n, 0); +- RG->Times = Times; +- RG->ETimes = ETimes; +- RG->Type = type; +- // enumerate and define the vertices +- Array vi((n+1)*(n+1)*(n+1)); +- vi = -1; +- m = 0; +- +- // vertices are given in lexicographic ordering on the reference +- // element +- for (int kk = 0; kk <= n; kk++) +- for (int jj = 0; jj <= n-kk; jj++) +- for (int ii = 0; ii <= n-jj-kk; ii++) ++ case Geometry::CUBE: ++ { ++ RG = new RefinedGeometry ((Times+1)*(Times+1)*(Times+1), ++ 8*Times*Times*Times, 0); ++ RG->Times = Times; ++ RG->ETimes = ETimes; ++ RG->Type = Type; ++ for (l = k = 0; k <= Times; k++) ++ { ++ for (j = 0; j <= Times; j++) ++ { ++ for (i = 0; i <= Times; i++, l++) ++ { ++ IntegrationPoint &ip = RG->RefPts.IntPoint(l); ++ ip.x = cp[i]; ++ ip.y = cp[j]; ++ ip.z = cp[k]; ++ } ++ } ++ } ++ Array &G = RG->RefGeoms; ++ for (l = k = 0; k < Times; k++) + { +- IntegrationPoint &ip = RG->RefPts.IntPoint(m); +- double w = cp[ii] + cp[jj] + cp[kk] + cp[Times-ii-jj-kk]; +- ip.x = cp[ii]/w; +- ip.y = cp[jj]/w; +- ip.z = cp[kk]/w; +- // (ii,jj,kk) are coordinates in the reference tetrahedron, +- // transform to coordinates (i,j,k) in the auxiliary +- // tetrahedron defined by (0,0,0), (0,0,1), (1,1,1), (0,1,1) +- i = jj; +- j = jj+kk; +- k = ii+jj+kk; +- l = i + (j + k * (n+1)) * (n+1); +- // map from linear Cartesian hex index in the auxiliary tet +- // to lexicographic in the reference tet +- vi[l] = m; +- m++; ++ for (j = 0; j < Times; j++) ++ { ++ for (i = 0; i < Times; i++) ++ { ++ G[l++] = i+0 + (j+0 + (k+0) * (Times+1)) * (Times+1); ++ G[l++] = i+1 + (j+0 + (k+0) * (Times+1)) * (Times+1); ++ G[l++] = i+1 + (j+1 + (k+0) * (Times+1)) * (Times+1); ++ G[l++] = i+0 + (j+1 + (k+0) * (Times+1)) * (Times+1); ++ G[l++] = i+0 + (j+0 + (k+1) * (Times+1)) * (Times+1); ++ G[l++] = i+1 + (j+0 + (k+1) * (Times+1)) * (Times+1); ++ G[l++] = i+1 + (j+1 + (k+1) * (Times+1)) * (Times+1); ++ G[l++] = i+0 + (j+1 + (k+1) * (Times+1)) * (Times+1); ++ } ++ } + } + +- if (m != (n+3)*(n+2)*(n+1)/6) +- { +- mfem_error("GeometryRefiner::Refine() for TETRAHEDRON #1"); +- } +- // elements +- Array &G = RG->RefGeoms; +- m = 0; +- for (k = 0; k < n; k++) +- for (j = 0; j <= k; j++) +- for (i = 0; i <= j; i++) ++ RGeom[Geometry::CUBE].Append(RG); ++ } ++ break; ++ ++ case Geometry::TETRAHEDRON: ++ { ++ // subdivide the tetrahedron with vertices ++ // (0,0,0), (0,0,1), (1,1,1), (0,1,1) ++ ++ // vertices: 0 <= i <= j <= k <= Times ++ // (3-combination with repetitions) ++ // number of vertices: (n+3)*(n+2)*(n+1)/6, n = Times ++ ++ // elements: the vertices are: v1=(i,j,k), v2=v1+u1, v3=v2+u2, v4=v3+u3 ++ // where 0 <= i <= j <= k <= n-1 and ++ // u1,u2,u3 is a permutation of (1,0,0),(0,1,0),(0,0,1) ++ // such that all 
v2,v3,v4 have non-decreasing components ++ // number of elements: n^3 ++ ++ const int n = Times; ++ RG = new RefinedGeometry((n+3)*(n+2)*(n+1)/6, 4*n*n*n, 0); ++ RG->Times = Times; ++ RG->ETimes = ETimes; ++ RG->Type = Type; ++ // enumerate and define the vertices ++ Array vi((n+1)*(n+1)*(n+1)); ++ vi = -1; ++ m = 0; ++ ++ // vertices are given in lexicographic ordering on the reference ++ // element ++ for (int kk = 0; kk <= n; kk++) + { +- // the ordering of the vertices is chosen to ensure: +- // 1) correct orientation +- // 2) the x,y,z edges are in the set of edges +- // {(0,1),(2,3), (0,2),(1,3)} +- // (goal is to ensure that subsequent refinement using +- // this procedure preserves the six tetrahedral shapes) +- +- // zyx: (i,j,k)-(i,j,k+1)-(i+1,j+1,k+1)-(i,j+1,k+1) +- G[m++] = vi[i+0 + (j+0 + (k+0) * (n+1)) * (n+1)]; +- G[m++] = vi[i+0 + (j+0 + (k+1) * (n+1)) * (n+1)]; +- G[m++] = vi[i+1 + (j+1 + (k+1) * (n+1)) * (n+1)]; +- G[m++] = vi[i+0 + (j+1 + (k+1) * (n+1)) * (n+1)]; +- if (j < k) ++ for (int jj = 0; jj <= n-kk; jj++) + { +- // yzx: (i,j,k)-(i+1,j+1,k+1)-(i,j+1,k)-(i,j+1,k+1) +- G[m++] = vi[i+0 + (j+0 + (k+0) * (n+1)) * (n+1)]; +- G[m++] = vi[i+1 + (j+1 + (k+1) * (n+1)) * (n+1)]; +- G[m++] = vi[i+0 + (j+1 + (k+0) * (n+1)) * (n+1)]; +- G[m++] = vi[i+0 + (j+1 + (k+1) * (n+1)) * (n+1)]; +- // yxz: (i,j,k)-(i,j+1,k)-(i+1,j+1,k+1)-(i+1,j+1,k) +- G[m++] = vi[i+0 + (j+0 + (k+0) * (n+1)) * (n+1)]; +- G[m++] = vi[i+0 + (j+1 + (k+0) * (n+1)) * (n+1)]; +- G[m++] = vi[i+1 + (j+1 + (k+1) * (n+1)) * (n+1)]; +- G[m++] = vi[i+1 + (j+1 + (k+0) * (n+1)) * (n+1)]; ++ for (int ii = 0; ii <= n-jj-kk; ii++) ++ { ++ IntegrationPoint &ip = RG->RefPts.IntPoint(m); ++ double w = cp[ii] + cp[jj] + cp[kk] + cp[Times-ii-jj-kk]; ++ ip.x = cp[ii]/w; ++ ip.y = cp[jj]/w; ++ ip.z = cp[kk]/w; ++ // (ii,jj,kk) are coordinates in the reference tetrahedron, ++ // transform to coordinates (i,j,k) in the auxiliary ++ // tetrahedron defined by (0,0,0), (0,0,1), (1,1,1), (0,1,1) ++ i = jj; ++ j = jj+kk; ++ k = ii+jj+kk; ++ l = i + (j + k * (n+1)) * (n+1); ++ // map from linear Cartesian hex index in the auxiliary tet ++ // to lexicographic in the reference tet ++ vi[l] = m; ++ m++; ++ } + } +- if (i < j) ++ } ++ ++ if (m != (n+3)*(n+2)*(n+1)/6) ++ { ++ MFEM_ABORT("GeometryRefiner::Refine() for TETRAHEDRON #1"); ++ } ++ // elements ++ Array &G = RG->RefGeoms; ++ m = 0; ++ for (k = 0; k < n; k++) ++ { ++ for (j = 0; j <= k; j++) + { +- // xzy: (i,j,k)-(i+1,j,k)-(i+1,j+1,k+1)-(i+1,j,k+1) +- G[m++] = vi[i+0 + (j+0 + (k+0) * (n+1)) * (n+1)]; +- G[m++] = vi[i+1 + (j+0 + (k+0) * (n+1)) * (n+1)]; +- G[m++] = vi[i+1 + (j+1 + (k+1) * (n+1)) * (n+1)]; +- G[m++] = vi[i+1 + (j+0 + (k+1) * (n+1)) * (n+1)]; +- if (j < k) ++ for (i = 0; i <= j; i++) + { +- // xyz: (i,j,k)-(i+1,j+1,k+1)-(i+1,j,k)-(i+1,j+1,k) ++ // the ordering of the vertices is chosen to ensure: ++ // 1) correct orientation ++ // 2) the x,y,z edges are in the set of edges ++ // {(0,1),(2,3), (0,2),(1,3)} ++ // (goal is to ensure that subsequent refinement using ++ // this procedure preserves the six tetrahedral shapes) ++ ++ // zyx: (i,j,k)-(i,j,k+1)-(i+1,j+1,k+1)-(i,j+1,k+1) + G[m++] = vi[i+0 + (j+0 + (k+0) * (n+1)) * (n+1)]; ++ G[m++] = vi[i+0 + (j+0 + (k+1) * (n+1)) * (n+1)]; + G[m++] = vi[i+1 + (j+1 + (k+1) * (n+1)) * (n+1)]; +- G[m++] = vi[i+1 + (j+0 + (k+0) * (n+1)) * (n+1)]; +- G[m++] = vi[i+1 + (j+1 + (k+0) * (n+1)) * (n+1)]; ++ G[m++] = vi[i+0 + (j+1 + (k+1) * (n+1)) * (n+1)]; ++ if (j < k) ++ { ++ // yzx: (i,j,k)-(i+1,j+1,k+1)-(i,j+1,k)-(i,j+1,k+1) ++ 
G[m++] = vi[i+0 + (j+0 + (k+0) * (n+1)) * (n+1)]; ++ G[m++] = vi[i+1 + (j+1 + (k+1) * (n+1)) * (n+1)]; ++ G[m++] = vi[i+0 + (j+1 + (k+0) * (n+1)) * (n+1)]; ++ G[m++] = vi[i+0 + (j+1 + (k+1) * (n+1)) * (n+1)]; ++ // yxz: (i,j,k)-(i,j+1,k)-(i+1,j+1,k+1)-(i+1,j+1,k) ++ G[m++] = vi[i+0 + (j+0 + (k+0) * (n+1)) * (n+1)]; ++ G[m++] = vi[i+0 + (j+1 + (k+0) * (n+1)) * (n+1)]; ++ G[m++] = vi[i+1 + (j+1 + (k+1) * (n+1)) * (n+1)]; ++ G[m++] = vi[i+1 + (j+1 + (k+0) * (n+1)) * (n+1)]; ++ } ++ if (i < j) ++ { ++ // xzy: (i,j,k)-(i+1,j,k)-(i+1,j+1,k+1)-(i+1,j,k+1) ++ G[m++] = vi[i+0 + (j+0 + (k+0) * (n+1)) * (n+1)]; ++ G[m++] = vi[i+1 + (j+0 + (k+0) * (n+1)) * (n+1)]; ++ G[m++] = vi[i+1 + (j+1 + (k+1) * (n+1)) * (n+1)]; ++ G[m++] = vi[i+1 + (j+0 + (k+1) * (n+1)) * (n+1)]; ++ if (j < k) ++ { ++ // xyz: (i,j,k)-(i+1,j+1,k+1)-(i+1,j,k)-(i+1,j+1,k) ++ G[m++] = vi[i+0 + (j+0 + (k+0) * (n+1)) * (n+1)]; ++ G[m++] = vi[i+1 + (j+1 + (k+1) * (n+1)) * (n+1)]; ++ G[m++] = vi[i+1 + (j+0 + (k+0) * (n+1)) * (n+1)]; ++ G[m++] = vi[i+1 + (j+1 + (k+0) * (n+1)) * (n+1)]; ++ } ++ // zxy: (i,j,k)-(i+1,j+1,k+1)-(i,j,k+1)-(i+1,j,k+1) ++ G[m++] = vi[i+0 + (j+0 + (k+0) * (n+1)) * (n+1)]; ++ G[m++] = vi[i+1 + (j+1 + (k+1) * (n+1)) * (n+1)]; ++ G[m++] = vi[i+0 + (j+0 + (k+1) * (n+1)) * (n+1)]; ++ G[m++] = vi[i+1 + (j+0 + (k+1) * (n+1)) * (n+1)]; ++ } + } +- // zxy: (i,j,k)-(i+1,j+1,k+1)-(i,j,k+1)-(i+1,j,k+1) +- G[m++] = vi[i+0 + (j+0 + (k+0) * (n+1)) * (n+1)]; +- G[m++] = vi[i+1 + (j+1 + (k+1) * (n+1)) * (n+1)]; +- G[m++] = vi[i+0 + (j+0 + (k+1) * (n+1)) * (n+1)]; +- G[m++] = vi[i+1 + (j+0 + (k+1) * (n+1)) * (n+1)]; + } + } +- if (m != 4*n*n*n) +- { +- mfem_error("GeometryRefiner::Refine() for TETRAHEDRON #2"); +- } +- for (i = 0; i < m; i++) +- if (G[i] < 0) +- { +- mfem_error("GeometryRefiner::Refine() for TETRAHEDRON #3"); +- } ++ if (m != 4*n*n*n) ++ { ++ MFEM_ABORT("GeometryRefiner::Refine() for TETRAHEDRON #2"); ++ } ++ for (i = 0; i < m; i++) ++ { ++ if (G[i] < 0) ++ { ++ MFEM_ABORT("GeometryRefiner::Refine() for TETRAHEDRON #3"); ++ } ++ } + +- RGeom[Geometry::TETRAHEDRON].Append(RG); +- return RG; +- } ++ RGeom[Geometry::TETRAHEDRON].Append(RG); ++ } ++ break; + +- case Geometry::PYRAMID: +- { +- const int n = Times; +- RG = new RefinedGeometry ((n+1)*(n+2)*(2*n+3)/6, +- 5*n*(2*n-1)*(2*n+1)/3, 0); +- RG->Times = Times; +- RG->ETimes = ETimes; +- RG->Type = type; +- // enumerate and define the vertices +- m = 0; +- for (k = 0; k <= n; k++) +- { +- const double *cpij = +- poly1d.GetPoints(Times - k, BasisType::GetNodalBasis(type)); +- for (j = 0; j <= n - k; j++) +- for (i = 0; i <= n - k; i++) ++ case Geometry::PYRAMID: ++ { ++ const int n = Times; ++ RG = new RefinedGeometry ((n+1)*(n+2)*(2*n+3)/6, ++ 5*n*(2*n-1)*(2*n+1)/3, 0); ++ RG->Times = Times; ++ RG->ETimes = ETimes; ++ RG->Type = Type; ++ // enumerate and define the vertices ++ m = 0; ++ for (k = 0; k <= n; k++) + { +- IntegrationPoint &ip = RG->RefPts.IntPoint(m); +- if (type == 0) ++ const double *cpij = ++ poly1d.GetPoints(Times - k, BasisType::GetNodalBasis(Type)); ++ for (j = 0; j <= n - k; j++) + { +- ip.x = (n > k) ? (double(i) / (n - k)) : 0.0; +- ip.y = (n > k) ? (double(j) / (n - k)) : 0.0; +- ip.z = double(k) / n; ++ for (i = 0; i <= n - k; i++) ++ { ++ IntegrationPoint &ip = RG->RefPts.IntPoint(m); ++ if (Type == 0) ++ { ++ ip.x = (n > k) ? (double(i) / (n - k)) : 0.0; ++ ip.y = (n > k) ? 
(double(j) / (n - k)) : 0.0; ++ ip.z = double(k) / n; ++ } ++ else ++ { ++ ip.x = cpij[i] * (1.0 - cp[k]); ++ ip.y = cpij[j] * (1.0 - cp[k]); ++ ip.z = cp[k]; ++ } ++ m++; ++ } + } +- else ++ } ++ if (m != (n+1)*(n+2)*(2*n+3)/6) ++ { ++ MFEM_ABORT("GeometryRefiner::Refine() for PYRAMID #1"); ++ } ++ // elements ++ Array &G = RG->RefGeoms; ++ m = 0; ++ for (k = 0; k < n; k++) ++ { ++ int lk = k * (k * (2 * k - 6 * n - 9) + 6 * n * (n + 3) + 13) / 6; ++ int lkp1 = (k + 1) * ++ (k * (2 * k - 6 * n -5) + 6 * n * (n + 2) + 6) / 6; ++ for (j = 0; j < n - k; j++) + { +- ip.x = cpij[i] * (1.0 - cp[k]); +- ip.y = cpij[j] * (1.0 - cp[k]); +- ip.z = cp[k]; ++ for (i = 0; i < n - k; i++) ++ { ++ G[m++] = lk + j * (n - k + 1) + i; ++ G[m++] = lk + j * (n - k + 1) + i + 1; ++ G[m++] = lk + (j + 1) * (n - k + 1) + i + 1; ++ G[m++] = lk + (j + 1) * (n - k + 1) + i; ++ G[m++] = lkp1 + j * (n - k) + i; ++ } ++ } ++ for (j = 0; j < n - k - 1; j++) ++ { ++ for (i = 0; i < n - k - 1; i++) ++ { ++ G[m++] = lkp1 + j * (n - k) + i; ++ G[m++] = lkp1 + (j + 1) * (n - k) + i; ++ G[m++] = lkp1 + (j + 1) * (n - k) + i + 1; ++ G[m++] = lkp1 + j * (n - k) + i + 1; ++ G[m++] = lk + (j + 1) * (n - k + 1) + i + 1; ++ } ++ } ++ for (j = 0; j < n - k; j++) ++ { ++ for (i = 0; i < n - k - 1; i++) ++ { ++ G[m++] = lk + j * (n - k + 1) + i + 1; ++ G[m++] = lk + (j + 1) * (n - k + 1) + i + 1; ++ G[m++] = lkp1 + j * (n - k) + i; ++ G[m++] = lkp1 + j * (n - k) + i + 1; ++ G[m++] = -1; ++ } ++ } ++ for (j = 0; j < n - k - 1; j++) ++ { ++ for (i = 0; i < n - k; i++) ++ { ++ G[m++] = lk + (j + 1) * (n - k + 1) + i; ++ G[m++] = lk + (j + 1) * (n - k + 1) + i + 1; ++ G[m++] = lkp1 + (j + 1) * (n - k) + i; ++ G[m++] = lkp1 + j * (n - k) + i; ++ G[m++] = -1; ++ } + } +- m++; + } +- } +- if (m != (n+1)*(n+2)*(2*n+3)/6) +- { +- mfem_error("GeometryRefiner::Refine() for PYRAMID #1"); +- } +- // elements +- Array &G = RG->RefGeoms; +- m = 0; +- for (k = 0; k < n; k++) +- { +- int lk = k * (k * (2 * k - 6 * n - 9) + 6 * n * (n + 3) + 13) / 6; +- int lkp1 = (k + 1) * +- (k * (2 * k - 6 * n -5) + 6 * n * (n + 2) + 6) / 6; +- for (j = 0; j < n - k; j++) +- { +- for (i = 0; i < n - k; i++) ++ if (m != 5*n*(2*n-1)*(2*n+1)/3) + { +- G[m++] = lk + j * (n - k + 1) + i; +- G[m++] = lk + j * (n - k + 1) + i + 1; +- G[m++] = lk + (j + 1) * (n - k + 1) + i + 1; +- G[m++] = lk + (j + 1) * (n - k + 1) + i; +- G[m++] = lkp1 + j * (n - k) + i; ++ MFEM_ABORT("GeometryRefiner::Refine() for PYRAMID #2"); + } ++ ++ RGeom[Geometry::PYRAMID].Append(RG); + } +- for (j = 0; j < n - k - 1; j++) ++ break; ++ ++ case Geometry::PRISM: + { +- for (i = 0; i < n - k - 1; i++) ++ const int n = Times; ++ RG = new RefinedGeometry ((n+1)*(n+1)*(n+2)/2, 6*n*n*n, 0); ++ RG->Times = Times; ++ RG->ETimes = ETimes; ++ RG->Type = Type; ++ // enumerate and define the vertices ++ m = 0; ++ for (l = k = 0; k <= n; k++) + { +- G[m++] = lkp1 + j * (n - k) + i; +- G[m++] = lkp1 + (j + 1) * (n - k) + i; +- G[m++] = lkp1 + (j + 1) * (n - k) + i + 1; +- G[m++] = lkp1 + j * (n - k) + i + 1; +- G[m++] = lk + (j + 1) * (n - k + 1) + i + 1; ++ for (j = 0; j <= n; j++) ++ { ++ for (i = 0; i <= n-j; i++, l++) ++ { ++ IntegrationPoint &ip = RG->RefPts.IntPoint(l); ++ ip.x = cp[i]/(cp[i] + cp[j] + cp[n-i-j]); ++ ip.y = cp[j]/(cp[i] + cp[j] + cp[n-i-j]); ++ ip.z = cp[k]; ++ m++; ++ } ++ } + } +- } +- for (j = 0; j < n - k; j++) +- { +- for (i = 0; i < n - k - 1; i++) ++ if (m != (n+1)*(n+1)*(n+2)/2) + { +- G[m++] = lk + j * (n - k + 1) + i + 1; +- G[m++] = lk + (j + 1) * (n - k + 1) + i + 1; +- 
G[m++] = lkp1 + j * (n - k) + i; +- G[m++] = lkp1 + j * (n - k) + i + 1; +- G[m++] = -1; ++ MFEM_ABORT("GeometryRefiner::Refine() for PRISM #1"); + } +- } +- for (j = 0; j < n - k - 1; j++) +- { +- for (i = 0; i < n - k; i++) ++ // elements ++ Array &G = RG->RefGeoms; ++ m = 0; ++ for (m = k = 0; k < n; k++) + { +- G[m++] = lk + (j + 1) * (n - k + 1) + i; +- G[m++] = lk + (j + 1) * (n - k + 1) + i + 1; +- G[m++] = lkp1 + (j + 1) * (n - k) + i; +- G[m++] = lkp1 + j * (n - k) + i; +- G[m++] = -1; ++ for (l = j = 0; j < n; j++, l++) ++ { ++ for (i = 0; i < n-j; i++, l++) ++ { ++ G[m++] = l + (k+0) * (n+1) * (n+2) / 2; ++ G[m++] = l + 1 + (k+0) * (n+1) * (n+2) / 2; ++ G[m++] = l - j + (2 + (k+0) * (n+2)) * (n+1) / 2; ++ G[m++] = l + (k+1) * (n+1) * (n+2) / 2; ++ G[m++] = l + 1 + (k+1) * (n+1) * (Times+2) / 2; ++ G[m++] = l - j + (2 + (k+1) * (n+2)) * (n+1) / 2; ++ if (i+j+1 < n) ++ { ++ G[m++] = l + 1 + (k+0) * (n+1) * (n+2)/2; ++ G[m++] = l - j + (2 + (k+0) * (n+1)) * (n+2) / 2; ++ G[m++] = l - j + (2 + (k+0) * (n+2)) * (n+1) / 2; ++ G[m++] = l + 1 + (k+1) * (n+1) * (n+2) / 2; ++ G[m++] = l - j + (2 + (k+1) * (n+1)) * (n+2) / 2; ++ G[m++] = l - j + (2 + (k+1) * (n+2)) * (n+1) / 2; ++ } ++ } ++ } + } +- } +- } +- if (m != 5*n*(2*n-1)*(2*n+1)/3) +- { +- mfem_error("GeometryRefiner::Refine() for PYRAMID #2"); +- } +- RGeom[Geometry::PYRAMID].Append(RG); +- return RG; +- } +- +- case Geometry::PRISM: +- { +- const int n = Times; +- RG = new RefinedGeometry ((n+1)*(n+1)*(n+2)/2, 6*n*n*n, 0); +- RG->Times = Times; +- RG->ETimes = ETimes; +- RG->Type = type; +- // enumerate and define the vertices +- m = 0; +- for (l = k = 0; k <= n; k++) +- for (j = 0; j <= n; j++) +- for (i = 0; i <= n-j; i++, l++) ++ if (m != 6*n*n*n) + { +- IntegrationPoint &ip = RG->RefPts.IntPoint(l); +- ip.x = cp[i]/(cp[i] + cp[j] + cp[n-i-j]); +- ip.y = cp[j]/(cp[i] + cp[j] + cp[n-i-j]); +- ip.z = cp[k]; +- m++; ++ MFEM_ABORT("GeometryRefiner::Refine() for PRISM #2"); + } +- if (m != (n+1)*(n+1)*(n+2)/2) +- { +- mfem_error("GeometryRefiner::Refine() for PRISM #1"); +- } +- // elements +- Array &G = RG->RefGeoms; +- m = 0; +- for (m = k = 0; k < n; k++) +- for (l = j = 0; j < n; j++, l++) +- for (i = 0; i < n-j; i++, l++) ++ for (i = 0; i < m; i++) + { +- G[m++] = l + (k+0) * (n+1) * (n+2) / 2; +- G[m++] = l + 1 + (k+0) * (n+1) * (n+2) / 2; +- G[m++] = l - j + (2 + (k+0) * (n+2)) * (n+1) / 2; +- G[m++] = l + (k+1) * (n+1) * (n+2) / 2; +- G[m++] = l + 1 + (k+1) * (n+1) * (Times+2) / 2; +- G[m++] = l - j + (2 + (k+1) * (n+2)) * (n+1) / 2; +- if (i+j+1 < n) ++ if (G[i] < 0) + { +- G[m++] = l + 1 + (k+0) * (n+1) * (n+2)/2; +- G[m++] = l - j + (2 + (k+0) * (n+1)) * (n+2) / 2; +- G[m++] = l - j + (2 + (k+0) * (n+2)) * (n+1) / 2; +- G[m++] = l + 1 + (k+1) * (n+1) * (n+2) / 2; +- G[m++] = l - j + (2 + (k+1) * (n+1)) * (n+2) / 2; +- G[m++] = l - j + (2 + (k+1) * (n+2)) * (n+1) / 2; ++ MFEM_ABORT("GeometryRefiner::Refine() for PRISM #3"); + } + } +- if (m != 6*n*n*n) +- { +- mfem_error("GeometryRefiner::Refine() for PRISM #2"); +- } +- for (i = 0; i < m; i++) +- if (G[i] < 0) +- { +- mfem_error("GeometryRefiner::Refine() for PRISM #3"); ++ ++ RGeom[Geometry::PRISM].Append(RG); + } ++ break; + +- RGeom[Geometry::PRISM].Append(RG); +- return RG; ++ case Geometry::INVALID: ++ case Geometry::NUM_GEOMETRIES: ++ MFEM_ABORT("Unknown type of reference element!"); ++ } + } +- +- default: +- +- return NULL; + } ++ ++ return RG; + } + + const IntegrationRule *GeometryRefiner::RefineInterior(Geometry::Type Geom, +@@ -1596,15 +1650,23 @@ const 
IntegrationRule *GeometryRefiner::RefineInterior(Geometry::Type Geom, + { + return NULL; + } +- ir = FindInIntPts(Geom, Times-1); +- if (ir) { return ir; } +- +- ir = new IntegrationRule(Times-1); +- for (int i = 1; i < Times; i++) ++#if defined(MFEM_THREAD_SAFE) && defined(MFEM_USE_OPENMP) ++ #pragma omp critical (RefineInterior) ++#endif + { +- IntegrationPoint &ip = ir->IntPoint(i-1); +- ip.x = double(i) / Times; +- ip.y = ip.z = 0.0; ++ ir = FindInIntPts(Geometry::SEGMENT, Times-1); ++ if (!ir) ++ { ++ ir = new IntegrationRule(Times-1); ++ for (int i = 1; i < Times; i++) ++ { ++ IntegrationPoint &ip = ir->IntPoint(i-1); ++ ip.x = double(i) / Times; ++ ip.y = ip.z = 0.0; ++ } ++ ++ IntPts[Geometry::SEGMENT].Append(ir); ++ } + } + } + break; +@@ -1615,18 +1677,28 @@ const IntegrationRule *GeometryRefiner::RefineInterior(Geometry::Type Geom, + { + return NULL; + } +- ir = FindInIntPts(Geom, ((Times-1)*(Times-2))/2); +- if (ir) { return ir; } +- +- ir = new IntegrationRule(((Times-1)*(Times-2))/2); +- for (int k = 0, j = 1; j < Times-1; j++) +- for (int i = 1; i < Times-j; i++, k++) ++#if defined(MFEM_THREAD_SAFE) && defined(MFEM_USE_OPENMP) ++ #pragma omp critical (RefineInterior) ++#endif ++ { ++ ir = FindInIntPts(Geometry::TRIANGLE, ((Times-1)*(Times-2))/2); ++ if (!ir) + { +- IntegrationPoint &ip = ir->IntPoint(k); +- ip.x = double(i) / Times; +- ip.y = double(j) / Times; +- ip.z = 0.0; ++ ir = new IntegrationRule(((Times-1)*(Times-2))/2); ++ for (int k = 0, j = 1; j < Times-1; j++) ++ { ++ for (int i = 1; i < Times-j; i++, k++) ++ { ++ IntegrationPoint &ip = ir->IntPoint(k); ++ ip.x = double(i) / Times; ++ ip.y = double(j) / Times; ++ ip.z = 0.0; ++ } ++ } ++ ++ IntPts[Geometry::TRIANGLE].Append(ir); + } ++ } + } + break; + +@@ -1636,32 +1708,46 @@ const IntegrationRule *GeometryRefiner::RefineInterior(Geometry::Type Geom, + { + return NULL; + } +- ir = FindInIntPts(Geom, (Times-1)*(Times-1)); +- if (ir) { return ir; } +- +- ir = new IntegrationRule((Times-1)*(Times-1)); +- for (int k = 0, j = 1; j < Times; j++) +- for (int i = 1; i < Times; i++, k++) ++#if defined(MFEM_THREAD_SAFE) && defined(MFEM_USE_OPENMP) ++ #pragma omp critical (RefineInterior) ++#endif ++ { ++ ir = FindInIntPts(Geometry::SQUARE, (Times-1)*(Times-1)); ++ if (!ir) + { +- IntegrationPoint &ip = ir->IntPoint(k); +- ip.x = double(i) / Times; +- ip.y = double(j) / Times; +- ip.z = 0.0; ++ ir = new IntegrationRule((Times-1)*(Times-1)); ++ for (int k = 0, j = 1; j < Times; j++) ++ { ++ for (int i = 1; i < Times; i++, k++) ++ { ++ IntegrationPoint &ip = ir->IntPoint(k); ++ ip.x = double(i) / Times; ++ ip.y = double(j) / Times; ++ ip.z = 0.0; ++ } ++ } ++ ++ IntPts[Geometry::SQUARE].Append(ir); + } ++ } + } + break; + +- default: +- mfem_error("GeometryRefiner::RefineInterior(...)"); ++ case Geometry::POINT: ++ case Geometry::TETRAHEDRON: ++ case Geometry::CUBE: ++ case Geometry::PYRAMID: ++ case Geometry::PRISM: ++ MFEM_ABORT("Reference element type is not supported!"); ++ case Geometry::INVALID: ++ case Geometry::NUM_GEOMETRIES: ++ MFEM_ABORT("Unknown type of reference element!"); + } + +- MFEM_ASSERT(ir != NULL, "Failed to construct the refined IntegrationRule."); +- IntPts[Geom].Append(ir); +- + return ir; + } + +- ++// static method + int GeometryRefiner::GetRefinementLevelFromPoints(Geometry::Type geom, int Npts) + { + switch (geom) +@@ -1719,16 +1805,17 @@ int GeometryRefiner::GetRefinementLevelFromPoints(Geometry::Type geom, int Npts) + } + return -1; + } +- default: +- { +- mfem_error("Non existing Geometry."); 
+- } ++ case Geometry::PYRAMID: ++ MFEM_ABORT("Reference element type is not supported!"); ++ case Geometry::INVALID: ++ case Geometry::NUM_GEOMETRIES: ++ MFEM_ABORT("Unknown type of reference element!"); + } + + return -1; + } + +- ++// static method + int GeometryRefiner::GetRefinementLevelFromElems(Geometry::Type geom, int Nels) + { + switch (geom) +@@ -1760,10 +1847,11 @@ int GeometryRefiner::GetRefinementLevelFromElems(Geometry::Type geom, int Nels) + } + return -1; + } +- default: +- { +- mfem_error("Non existing Geometry."); +- } ++ case Geometry::PYRAMID: ++ MFEM_ABORT("Reference element type is not supported!"); ++ case Geometry::INVALID: ++ case Geometry::NUM_GEOMETRIES: ++ MFEM_ABORT("Unknown type of reference element!"); + } + + return -1; +diff --git a/fem/geom.hpp b/fem/geom.hpp +index 698912d97..1290f4e2a 100644 +--- a/fem/geom.hpp ++++ b/fem/geom.hpp +@@ -65,10 +65,10 @@ public: + + /** @brief Return an IntegrationRule consisting of all vertices of the given + Geometry::Type, @a GeomType. */ +- const IntegrationRule *GetVertices(int GeomType); ++ const IntegrationRule *GetVertices(int GeomType) const; + + /// Return the center of the given Geometry::Type, @a GeomType. +- const IntegrationPoint &GetCenter(int GeomType) ++ const IntegrationPoint &GetCenter(int GeomType) const + { return GeomCenter[GeomType]; } + + /// Get a random point in the reference element specified by @a GeomType. +@@ -97,9 +97,9 @@ public: + + const DenseMatrix &GetGeomToPerfGeomJac(int GeomType) const + { return *GeomToPerfGeomJac[GeomType]; } +- DenseMatrix *GetPerfGeomToGeomJac(int GeomType) +- { return PerfGeomToGeomJac[GeomType]; } +- void GetPerfPointMat(int GeomType, DenseMatrix &pm); ++ const DenseMatrix &GetPerfGeomToGeomJac(int GeomType) const ++ { return *PerfGeomToGeomJac[GeomType]; } ++ void GetPerfPointMat(int GeomType, DenseMatrix &pm) const; + void JacToPerfJac(int GeomType, const DenseMatrix &J, + DenseMatrix &PJ) const; + +@@ -123,7 +123,7 @@ public: + } + + /// Return the number of boundary "faces" of a given Geometry::Type. +- int NumBdr(int GeomType) { return NumBdrArray[GeomType]; } ++ int NumBdr(int GeomType) const { return NumBdrArray[GeomType]; } + }; + + template <> struct +@@ -317,27 +317,27 @@ public: + int Type; + + RefinedGeometry(int NPts, int NRefG, int NRefE, int NBdrE = 0) : +- RefPts(NPts), RefGeoms(NRefG), RefEdges(NRefE), NumBdrEdges(NBdrE) { } ++ RefPts(NPts), RefGeoms(NRefG), RefEdges(NRefE), NumBdrEdges(NBdrE) {} + }; + + class GeometryRefiner + { + private: +- int type; // Quadrature1D type (ClosedUniform is default) ++ int Type; // Quadrature1D type (ClosedUniform is default) + Array RGeom[Geometry::NumGeom]; + Array IntPts[Geometry::NumGeom]; + +- RefinedGeometry *FindInRGeom(Geometry::Type Geom, int Times, int ETimes, +- int Type); +- IntegrationRule *FindInIntPts(Geometry::Type Geom, int NPts); ++ RefinedGeometry *FindInRGeom(Geometry::Type Geom, int Times, ++ int ETimes) const; ++ IntegrationRule *FindInIntPts(Geometry::Type Geom, int NPts) const; + + public: +- GeometryRefiner(); ++ GeometryRefiner(int t = Quadrature1D::ClosedUniform) : Type(t) {} + + /// Set the Quadrature1D type of points to use for subdivision. +- void SetType(const int t) { type = t; } ++ void SetType(int t) { Type = t; } + /// Get the Quadrature1D type of points used for subdivision. 
+- int GetType() const { return type; } ++ int GetType() const { return Type; } + + RefinedGeometry *Refine(Geometry::Type Geom, int Times, int ETimes = 1); + +@@ -345,10 +345,10 @@ public: + const IntegrationRule *RefineInterior(Geometry::Type Geom, int Times); + + /// Get the Refinement level based on number of points +- virtual int GetRefinementLevelFromPoints(Geometry::Type Geom, int Npts); ++ static int GetRefinementLevelFromPoints(Geometry::Type Geom, int Npts); + + /// Get the Refinement level based on number of elements +- virtual int GetRefinementLevelFromElems(Geometry::Type geom, int Npts); ++ static int GetRefinementLevelFromElems(Geometry::Type geom, int Npts); + + ~GeometryRefiner(); + }; +diff --git a/fem/intrules.cpp b/fem/intrules.cpp +index 8c5c354e3..cb9544852 100644 +--- a/fem/intrules.cpp ++++ b/fem/intrules.cpp +@@ -737,7 +737,7 @@ void QuadratureFunctions1D::GivePolyPoints(const int np, double *pts, + ClosedGL(np, &ir); + break; + } +- default: ++ case Quadrature1D::Invalid: + { + MFEM_ABORT("Asking for an unknown type of 1D Quadrature points, " + "type = " << type); +@@ -831,7 +831,10 @@ void QuadratureFunctions1D::CalculateUniformWeights(IntegrationRule *ir, + hinv = p+1; + ihoffset = 1; + break; +- default: ++ case Quadrature1D::GaussLegendre: ++ case Quadrature1D::GaussLobatto: ++ case Quadrature1D::ClosedGL: ++ case Quadrature1D::Invalid: + MFEM_ABORT("invalid Quadrature1D type: " << type); + } + // set w0 = (-1)^p*(p!)/(hinv^p) +@@ -940,10 +943,10 @@ IntegrationRules IntRules(0, Quadrature1D::GaussLegendre); + + IntegrationRules RefinedIntRules(1, Quadrature1D::GaussLegendre); + +-IntegrationRules::IntegrationRules(int Ref, int type_): +- quad_type(type_) ++IntegrationRules::IntegrationRules(int ref, int type) ++ : quad_type(type) + { +- refined = Ref; ++ refined = ref; + + if (refined < 0) { own_rules = 0; return; } + +@@ -975,11 +978,19 @@ IntegrationRules::IntegrationRules(int Ref, int type_): + + CubeIntRules.SetSize(32, h_mt); + CubeIntRules = NULL; ++ ++#if defined(MFEM_THREAD_SAFE) && defined(MFEM_USE_OPENMP) ++ IntRuleLocks.SetSize(Geometry::NUM_GEOMETRIES, h_mt); ++ for (int i = 0; i < Geometry::NUM_GEOMETRIES; i++) ++ { ++ omp_init_lock(&IntRuleLocks[i]); ++ } ++#endif + } + + const IntegrationRule &IntegrationRules::Get(int GeomType, int Order) + { +- Array *ir_array; ++ Array *ir_array = NULL; + + switch (GeomType) + { +@@ -991,9 +1002,9 @@ const IntegrationRule &IntegrationRules::Get(int GeomType, int Order) + case Geometry::CUBE: ir_array = &CubeIntRules; break; + case Geometry::PRISM: ir_array = &PrismIntRules; break; + case Geometry::PYRAMID: ir_array = &PyramidIntRules; break; +- default: +- mfem_error("IntegrationRules::Get(...) 
: Unknown geometry type!"); +- ir_array = NULL; ++ case Geometry::INVALID: ++ case Geometry::NUM_GEOMETRIES: ++ MFEM_ABORT("Unknown type of reference element!"); + } + + if (Order < 0) +@@ -1001,36 +1012,35 @@ const IntegrationRule &IntegrationRules::Get(int GeomType, int Order) + Order = 0; + } + ++#if defined(MFEM_THREAD_SAFE) && defined(MFEM_USE_OPENMP) ++ omp_set_lock(&IntRuleLocks[GeomType]); ++#endif ++ + if (!HaveIntRule(*ir_array, Order)) + { +-#ifdef MFEM_USE_LEGACY_OPENMP +- #pragma omp critical +-#endif +- { +- if (!HaveIntRule(*ir_array, Order)) +- { +- IntegrationRule *ir = GenerateIntegrationRule(GeomType, Order); ++ IntegrationRule *ir = GenerateIntegrationRule(GeomType, Order); + #ifdef MFEM_DEBUG +- int RealOrder = Order; +- while (RealOrder+1 < ir_array->Size() && +- (*ir_array)[RealOrder+1] == ir) +- { +- RealOrder++; +- } +- MFEM_VERIFY(RealOrder == ir->GetOrder(), "internal error"); ++ int RealOrder = Order; ++ while (RealOrder+1 < ir_array->Size() && (*ir_array)[RealOrder+1] == ir) ++ { ++ RealOrder++; ++ } ++ MFEM_VERIFY(RealOrder == ir->GetOrder(), "internal error"); + #else +- MFEM_CONTRACT_VAR(ir); ++ MFEM_CONTRACT_VAR(ir); + #endif +- } +- } + } + ++#if defined(MFEM_THREAD_SAFE) && defined(MFEM_USE_OPENMP) ++ omp_unset_lock(&IntRuleLocks[GeomType]); ++#endif ++ + return *(*ir_array)[Order]; + } + + void IntegrationRules::Set(int GeomType, int Order, IntegrationRule &IntRule) + { +- Array *ir_array; ++ Array *ir_array = NULL; + + switch (GeomType) + { +@@ -1042,11 +1052,15 @@ void IntegrationRules::Set(int GeomType, int Order, IntegrationRule &IntRule) + case Geometry::CUBE: ir_array = &CubeIntRules; break; + case Geometry::PRISM: ir_array = &PrismIntRules; break; + case Geometry::PYRAMID: ir_array = &PyramidIntRules; break; +- default: +- mfem_error("IntegrationRules::Set(...) : Unknown geometry type!"); +- ir_array = NULL; ++ case Geometry::INVALID: ++ case Geometry::NUM_GEOMETRIES: ++ MFEM_ABORT("Unknown type of reference element!"); + } + ++#if defined(MFEM_THREAD_SAFE) && defined(MFEM_USE_OPENMP) ++ omp_set_lock(&IntRuleLocks[GeomType]); ++#endif ++ + if (HaveIntRule(*ir_array, Order)) + { + MFEM_ABORT("Overwriting set rules is not supported!"); +@@ -1055,16 +1069,19 @@ void IntegrationRules::Set(int GeomType, int Order, IntegrationRule &IntRule) + AllocIntRule(*ir_array, Order); + + (*ir_array)[Order] = &IntRule; ++ ++#if defined(MFEM_THREAD_SAFE) && defined(MFEM_USE_OPENMP) ++ omp_unset_lock(&IntRuleLocks[GeomType]); ++#endif + } + +-void IntegrationRules::DeleteIntRuleArray(Array &ir_array) ++void IntegrationRules::DeleteIntRuleArray( ++ Array &ir_array) const + { +- int i; +- IntegrationRule *ir = NULL; +- + // Many of the intrules have multiple contiguous copies in the ir_array + // so we have to be careful to not delete them twice. 
+- for (i = 0; i < ir_array.Size(); i++) ++ IntegrationRule *ir = NULL; ++ for (int i = 0; i < ir_array.Size(); i++) + { + if (ir_array[i] != NULL && ir_array[i] != ir) + { +@@ -1076,6 +1093,13 @@ void IntegrationRules::DeleteIntRuleArray(Array &ir_array) + + IntegrationRules::~IntegrationRules() + { ++#if defined(MFEM_THREAD_SAFE) && defined(MFEM_USE_OPENMP) ++ for (int i = 0; i < Geometry::NUM_GEOMETRIES; i++) ++ { ++ omp_destroy_lock(&IntRuleLocks[i]); ++ } ++#endif ++ + if (!own_rules) { return; } + + DeleteIntRuleArray(PointIntRules); +@@ -1110,10 +1134,11 @@ IntegrationRule *IntegrationRules::GenerateIntegrationRule(int GeomType, + return PrismIntegrationRule(Order); + case Geometry::PYRAMID: + return PyramidIntegrationRule(Order); +- default: +- mfem_error("IntegrationRules::Set(...) : Unknown geometry type!"); +- return NULL; ++ case Geometry::INVALID: ++ case Geometry::NUM_GEOMETRIES: ++ MFEM_ABORT("Unknown type of reference element!"); + } ++ return NULL; + } + + +@@ -1122,7 +1147,7 @@ IntegrationRule *IntegrationRules::PointIntegrationRule(int Order) + { + if (Order > 1) + { +- mfem_error("Point Integration Rule of Order > 1 not defined"); ++ MFEM_ABORT("Point Integration Rule of Order > 1 not defined"); + return NULL; + } + +@@ -1185,7 +1210,7 @@ IntegrationRule *IntegrationRules::SegmentIntegrationRule(int Order) + QuadratureFunctions1D::OpenHalfUniform(n, ir); + break; + } +- default: ++ case Quadrature1D::Invalid: + { + MFEM_ABORT("unknown Quadrature1D type: " << quad_type); + } +@@ -1762,8 +1787,8 @@ IntegrationRule *IntegrationRules::PyramidIntegrationRule(int Order) + + for (int k=0; kIntPoint(k); ++ const IntegrationPoint &ipc = irc.IntPoint(k); ++ IntegrationPoint &ipp = PyramidIntRules[Order]->IntPoint(k); + ipp.x = ipc.x * (1.0 - ipc.z); + ipp.y = ipc.y * (1.0 - ipc.z); + ipp.z = ipc.z; +@@ -1775,8 +1800,8 @@ IntegrationRule *IntegrationRules::PyramidIntegrationRule(int Order) + // Integration rules for reference prism + IntegrationRule *IntegrationRules::PrismIntegrationRule(int Order) + { +- const IntegrationRule & irt = Get(Geometry::TRIANGLE, Order); +- const IntegrationRule & irs = Get(Geometry::SEGMENT, Order); ++ const IntegrationRule &irt = Get(Geometry::TRIANGLE, Order); ++ const IntegrationRule &irs = Get(Geometry::SEGMENT, Order); + int nt = irt.GetNPoints(); + int ns = irs.GetNPoints(); + AllocIntRule(PrismIntRules, Order); +@@ -1790,12 +1815,12 @@ IntegrationRule *IntegrationRules::PrismIntegrationRule(int Order) + + for (int ks=0; ksIntPoint(kp); ++ const IntegrationPoint &ipt = irt.IntPoint(kt); ++ IntegrationPoint &ipp = PrismIntRules[Order]->IntPoint(kp); + ipp.x = ipt.x; + ipp.y = ipt.y; + ipp.z = ips.x; +diff --git a/fem/intrules.hpp b/fem/intrules.hpp +index bf38766d7..1d5757994 100644 +--- a/fem/intrules.hpp ++++ b/fem/intrules.hpp +@@ -14,6 +14,9 @@ + + #include "../config/config.hpp" + #include "../general/array.hpp" ++#if defined(MFEM_THREAD_SAFE) && defined(MFEM_USE_OPENMP) ++#include ++#endif + + #include + #include +@@ -428,14 +431,18 @@ private: + Array PrismIntRules; + Array CubeIntRules; + +- void AllocIntRule(Array &ir_array, int Order) ++#if defined(MFEM_THREAD_SAFE) && defined(MFEM_USE_OPENMP) ++ Array IntRuleLocks; ++#endif ++ ++ void AllocIntRule(Array &ir_array, int Order) const + { + if (ir_array.Size() <= Order) + { + ir_array.SetSize(Order + 1, NULL); + } + } +- bool HaveIntRule(Array &ir_array, int Order) ++ bool HaveIntRule(Array &ir_array, int Order) const + { + return (ir_array.Size() > Order && ir_array[Order] != NULL); + } 
+@@ -443,6 +450,7 @@ private: + { + return Order | 1; // valid for all quad_type's + } ++ void DeleteIntRuleArray(Array &ir_array) const; + + /// The following methods allocate new IntegrationRule objects without + /// checking if they already exist. To avoid memory leaks use +@@ -457,12 +465,10 @@ private: + IntegrationRule *PrismIntegrationRule(int Order); + IntegrationRule *CubeIntegrationRule(int Order); + +- void DeleteIntRuleArray(Array &ir_array); +- + public: + /// Sets initial sizes for the integration rule arrays, but rules + /// are defined the first time they are requested with the Get method. +- explicit IntegrationRules(int Ref = 0, ++ explicit IntegrationRules(int ref = 0, + int type = Quadrature1D::GaussLegendre); + + /// Returns an integration rule for given GeomType and Order. +diff --git a/mesh/mesh.cpp b/mesh/mesh.cpp +index 5e96c3f39..600f2fc2a 100644 +--- a/mesh/mesh.cpp ++++ b/mesh/mesh.cpp +@@ -4446,9 +4446,7 @@ void Mesh::MakeRefined_(Mesh &orig_mesh, const Array ref_factors, + + Array rdofs; + DenseMatrix phys_pts; +- +- GeometryRefiner refiner; +- refiner.SetType(q_type); ++ GeometryRefiner refiner(q_type); + + // Add refined elements and set vertex coordinates + for (int el = 0; el < orig_ne; el++) +diff --git a/tests/unit/fem/test_lexicographic_ordering.cpp b/tests/unit/fem/test_lexicographic_ordering.cpp +index f68395bfc..6c73cb4e3 100644 +--- a/tests/unit/fem/test_lexicographic_ordering.cpp ++++ b/tests/unit/fem/test_lexicographic_ordering.cpp +@@ -20,8 +20,7 @@ void VerifyOrdering(NodalFiniteElement &el) + Geometry::Type geom = el.GetGeomType(); + const Array &p = el.GetLexicographicOrdering(); + +- GeometryRefiner refiner; +- refiner.SetType(BasisType::GaussLobatto); ++ GeometryRefiner refiner(BasisType::GaussLobatto); + RefinedGeometry *ref_geom = refiner.Refine(geom, order); + + double error = 0.0; diff --git a/extern/patch/mfem/patch_mesh_partitioner_dev.diff b/extern/patch/mfem/patch_mesh_partitioner_dev.diff index 1ff2a7f8ed..201a8c350c 100644 --- a/extern/patch/mfem/patch_mesh_partitioner_dev.diff +++ b/extern/patch/mfem/patch_mesh_partitioner_dev.diff @@ -1,1883 +1,1883 @@ -diff --git a/.gitignore b/.gitignore -index 030672a06..8cc9a33f7 100644 ---- a/.gitignore -+++ b/.gitignore -@@ -227,7 +227,7 @@ miniapps/meshing/mobius-strip.mesh - miniapps/meshing/klein-bottle.mesh - miniapps/meshing/toroid-*.mesh - miniapps/meshing/twist-*.mesh --miniapps/meshing/mesh-explorer.mesh -+miniapps/meshing/mesh-explorer.mesh* - miniapps/meshing/partitioning.txt - miniapps/meshing/mesh-explorer-visit* - miniapps/meshing/mesh-explorer-paraview/ -diff --git a/examples/ex1p-test.cpp b/examples/ex1p-test.cpp -new file mode 100644 -index 000000000..9b17ae982 ---- /dev/null -+++ b/examples/ex1p-test.cpp -@@ -0,0 +1,301 @@ -+// MFEM Example 1 - Parallel Version -+// -+// Compile with: make ex1p -+// -+// Sample runs: mpirun -np 4 ex1p -m ../data/square-disc.mesh -+// mpirun -np 4 ex1p -m ../data/star.mesh -+// mpirun -np 4 ex1p -m ../data/star-mixed.mesh -+// mpirun -np 4 ex1p -m ../data/escher.mesh -+// mpirun -np 4 ex1p -m ../data/fichera.mesh -+// mpirun -np 4 ex1p -m ../data/fichera-mixed.mesh -+// mpirun -np 4 ex1p -m ../data/toroid-wedge.mesh -+// mpirun -np 4 ex1p -m ../data/octahedron.mesh -o 1 -+// mpirun -np 4 ex1p -m ../data/periodic-annulus-sector.msh -+// mpirun -np 4 ex1p -m ../data/periodic-torus-sector.msh -+// mpirun -np 4 ex1p -m ../data/square-disc-p2.vtk -o 2 -+// mpirun -np 4 ex1p -m ../data/square-disc-p3.mesh -o 3 -+// mpirun -np 4 ex1p -m 
../data/square-disc-nurbs.mesh -o -1 -+// mpirun -np 4 ex1p -m ../data/star-mixed-p2.mesh -o 2 -+// mpirun -np 4 ex1p -m ../data/disc-nurbs.mesh -o -1 -+// mpirun -np 4 ex1p -m ../data/pipe-nurbs.mesh -o -1 -+// mpirun -np 4 ex1p -m ../data/ball-nurbs.mesh -o 2 -+// mpirun -np 4 ex1p -m ../data/fichera-mixed-p2.mesh -o 2 -+// mpirun -np 4 ex1p -m ../data/star-surf.mesh -+// mpirun -np 4 ex1p -m ../data/square-disc-surf.mesh -+// mpirun -np 4 ex1p -m ../data/inline-segment.mesh -+// mpirun -np 4 ex1p -m ../data/amr-quad.mesh -+// mpirun -np 4 ex1p -m ../data/amr-hex.mesh -+// mpirun -np 4 ex1p -m ../data/mobius-strip.mesh -+// mpirun -np 4 ex1p -m ../data/mobius-strip.mesh -o -1 -sc -+// -+// Device sample runs: -+// mpirun -np 4 ex1p -pa -d cuda -+// mpirun -np 4 ex1p -pa -d occa-cuda -+// mpirun -np 4 ex1p -pa -d raja-omp -+// mpirun -np 4 ex1p -pa -d ceed-cpu -+// mpirun -np 4 ex1p -pa -d ceed-cpu -o 4 -a -+// * mpirun -np 4 ex1p -pa -d ceed-cuda -+// * mpirun -np 4 ex1p -pa -d ceed-hip -+// mpirun -np 4 ex1p -pa -d ceed-cuda:/gpu/cuda/shared -+// mpirun -np 4 ex1p -m ../data/beam-tet.mesh -pa -d ceed-cpu -+// -+// Description: This example code demonstrates the use of MFEM to define a -+// simple finite element discretization of the Laplace problem -+// -Delta u = 1 with homogeneous Dirichlet boundary conditions. -+// Specifically, we discretize using a FE space of the specified -+// order, or if order < 1 using an isoparametric/isogeometric -+// space (i.e. quadratic for quadratic curvilinear mesh, NURBS for -+// NURBS mesh, etc.) -+// -+// The example highlights the use of mesh refinement, finite -+// element grid functions, as well as linear and bilinear forms -+// corresponding to the left-hand side and right-hand side of the -+// discrete linear system. We also cover the explicit elimination -+// of essential boundary conditions, static condensation, and the -+// optional connection to the GLVis tool for visualization. -+ -+#include "mfem.hpp" -+#include -+#include -+ -+using namespace std; -+using namespace mfem; -+ -+int main(int argc, char *argv[]) -+{ -+ // 1. Initialize MPI. -+ MPI_Session mpi; -+ int num_procs = mpi.WorldSize(); -+ int myid = mpi.WorldRank(); -+ -+ // 2. Parse command-line options. -+ const char *mesh_file = "../data/star.mesh"; -+ int order = 1; -+ bool static_cond = false; -+ bool pa = false; -+ const char *device_config = "cpu"; -+ bool visualization = true; -+ bool algebraic_ceed = false; -+ -+ OptionsParser args(argc, argv); -+ args.AddOption(&mesh_file, "-m", "--mesh", -+ "Mesh file to use."); -+ args.AddOption(&order, "-o", "--order", -+ "Finite element order (polynomial degree) or -1 for" -+ " isoparametric space."); -+ args.AddOption(&static_cond, "-sc", "--static-condensation", "-no-sc", -+ "--no-static-condensation", "Enable static condensation."); -+ args.AddOption(&pa, "-pa", "--partial-assembly", "-no-pa", -+ "--no-partial-assembly", "Enable Partial Assembly."); -+ args.AddOption(&device_config, "-d", "--device", -+ "Device configuration string, see Device::Configure()."); -+#ifdef MFEM_USE_CEED -+ args.AddOption(&algebraic_ceed, "-a", "--algebraic", -+ "-no-a", "--no-algebraic", -+ "Use algebraic Ceed solver"); -+#endif -+ args.AddOption(&visualization, "-vis", "--visualization", "-no-vis", -+ "--no-visualization", -+ "Enable or disable GLVis visualization."); -+ args.Parse(); -+ if (!args.Good()) -+ { -+ if (myid == 0) -+ { -+ args.PrintUsage(cout); -+ } -+ return 1; -+ } -+ if (myid == 0) -+ { -+ args.PrintOptions(cout); -+ } -+ -+ // 3. 
Enable hardware devices such as GPUs, and programming models such as -+ // CUDA, OCCA, RAJA and OpenMP based on command line options. -+ Device device(device_config); -+ if (myid == 0) { device.Print(); } -+ -+ // 4. Read the (serial) mesh from the given mesh file on all processors. We -+ // can handle triangular, quadrilateral, tetrahedral, hexahedral, surface -+ // and volume meshes with the same code. -+ Mesh mesh(mesh_file, 1, 1); -+ int dim = mesh.Dimension(); -+ -+ // 5. Refine the serial mesh on all processors to increase the resolution. In -+ // this example we do 'ref_levels' of uniform refinement. We choose -+ // 'ref_levels' to be the largest number that gives a final mesh with no -+ // more than 10,000 elements. -+ { -+ int ref_levels = -+ (int)floor(log(10000./mesh.GetNE())/log(2.)/dim); -+ for (int l = 0; l < ref_levels; l++) -+ { -+ mesh.UniformRefinement(); -+ } -+ } -+ -+ // 6. Define a parallel mesh by a partitioning of the serial mesh. Refine -+ // this mesh further in parallel to increase the resolution. Once the -+ // parallel mesh is defined, the serial mesh can be deleted. -+ // ParMesh pmesh(MPI_COMM_WORLD, mesh); -+ mesh.Clear(); -+ ifstream mesh_ifs( -+ MakeParFilename("../miniapps/meshing/mesh-explorer.mesh.", -+ myid)); -+ ParMesh pmesh(MPI_COMM_WORLD, mesh_ifs, /* refine: */ false); -+ dim = pmesh.Dimension(); -+ pmesh.PrintInfo(cout); -+ { -+ int par_ref_levels = 0; -+ for (int l = 0; l < par_ref_levels; l++) -+ { -+ pmesh.UniformRefinement(); -+ } -+ } -+ -+ // 7. Define a parallel finite element space on the parallel mesh. Here we -+ // use continuous Lagrange finite elements of the specified order. If -+ // order < 1, we instead use an isoparametric/isogeometric space. -+ FiniteElementCollection *fec; -+ bool delete_fec; -+ if (order > 0) -+ { -+ fec = new H1_FECollection(order, dim); -+ delete_fec = true; -+ } -+ else if (pmesh.GetNodes()) -+ { -+ fec = pmesh.GetNodes()->OwnFEC(); -+ delete_fec = false; -+ if (myid == 0) -+ { -+ cout << "Using isoparametric FEs: " << fec->Name() << endl; -+ } -+ } -+ else -+ { -+ fec = new H1_FECollection(order = 1, dim); -+ delete_fec = true; -+ } -+ ParFiniteElementSpace fespace(&pmesh, fec); -+ HYPRE_BigInt size = fespace.GlobalTrueVSize(); -+ if (myid == 0) -+ { -+ cout << "Number of finite element unknowns: " << size << endl; -+ } -+ -+ // 8. Determine the list of true (i.e. parallel conforming) essential -+ // boundary dofs. In this example, the boundary conditions are defined -+ // by marking all the boundary attributes from the mesh as essential -+ // (Dirichlet) and converting them to a list of true dofs. -+ Array ess_tdof_list; -+ if (pmesh.bdr_attributes.Size()) -+ { -+ Array ess_bdr(pmesh.bdr_attributes.Max()); -+ ess_bdr = 1; -+ fespace.GetEssentialTrueDofs(ess_bdr, ess_tdof_list); -+ } -+ -+ // 9. Set up the parallel linear form b(.) which corresponds to the -+ // right-hand side of the FEM linear system, which in this case is -+ // (1,phi_i) where phi_i are the basis functions in fespace. -+ ParLinearForm b(&fespace); -+ ConstantCoefficient one(1.0); -+ b.AddDomainIntegrator(new DomainLFIntegrator(one)); -+ b.Assemble(); -+ -+ // 10. Define the solution vector x as a parallel finite element grid -+ // function corresponding to fespace. Initialize x with initial guess of -+ // zero, which satisfies the boundary conditions. -+ ParGridFunction x(&fespace); -+ x = 0.0; -+ -+ // 11. Set up the parallel bilinear form a(.,.) 
on the finite element space -+ // corresponding to the Laplacian operator -Delta, by adding the -+ // Diffusion domain integrator. -+ ParBilinearForm a(&fespace); -+ if (pa) { a.SetAssemblyLevel(AssemblyLevel::PARTIAL); } -+ a.AddDomainIntegrator(new DiffusionIntegrator(one)); -+ -+ // 12. Assemble the parallel bilinear form and the corresponding linear -+ // system, applying any necessary transformations such as: parallel -+ // assembly, eliminating boundary conditions, applying conforming -+ // constraints for non-conforming AMR, static condensation, etc. -+ if (static_cond) { a.EnableStaticCondensation(); } -+ a.Assemble(); -+ -+ OperatorPtr A; -+ Vector B, X; -+ a.FormLinearSystem(ess_tdof_list, x, b, A, X, B); -+ -+ // 13. Solve the linear system A X = B. -+ // * With full assembly, use the BoomerAMG preconditioner from hypre. -+ // * With partial assembly, use Jacobi smoothing, for now. -+ Solver *prec = NULL; -+ if (pa) -+ { -+ if (UsesTensorBasis(fespace)) -+ { -+ if (algebraic_ceed) -+ { -+ prec = new ceed::AlgebraicSolver(a, ess_tdof_list); -+ } -+ else -+ { -+ prec = new OperatorJacobiSmoother(a, ess_tdof_list); -+ } -+ } -+ } -+ else -+ { -+ prec = new HypreBoomerAMG; -+ } -+ CGSolver cg(MPI_COMM_WORLD); -+ cg.SetRelTol(1e-12); -+ cg.SetMaxIter(2000); -+ cg.SetPrintLevel(1); -+ if (prec) { cg.SetPreconditioner(*prec); } -+ cg.SetOperator(*A); -+ cg.Mult(B, X); -+ delete prec; -+ -+ // 14. Recover the parallel grid function corresponding to X. This is the -+ // local finite element solution on each processor. -+ a.RecoverFEMSolution(X, b, x); -+ -+ // 15. Save the refined mesh and the solution in parallel. This output can -+ // be viewed later using GLVis: "glvis -np -m mesh -g sol". -+ { -+ ostringstream mesh_name, sol_name; -+ mesh_name << "mesh." << setfill('0') << setw(6) << myid; -+ sol_name << "sol." << setfill('0') << setw(6) << myid; -+ -+ ofstream mesh_ofs(mesh_name.str().c_str()); -+ mesh_ofs.precision(8); -+ pmesh.Print(mesh_ofs); -+ -+ ofstream sol_ofs(sol_name.str().c_str()); -+ sol_ofs.precision(8); -+ x.Save(sol_ofs); -+ } -+ -+ // 16. Send the solution by socket to a GLVis server. -+ if (visualization) -+ { -+ char vishost[] = "localhost"; -+ int visport = 19916; -+ socketstream sol_sock(vishost, visport); -+ sol_sock << "parallel " << num_procs << " " << myid << "\n"; -+ sol_sock.precision(8); -+ sol_sock << "solution\n" << pmesh << x << flush; -+ } -+ -+ // 17. Free the used memory. -+ if (delete_fec) -+ { -+ delete fec; -+ } -+ -+ return 0; -+} -diff --git a/fem/pfespace.cpp b/fem/pfespace.cpp -index 76ac230a1..8761e1489 100644 ---- a/fem/pfespace.cpp -+++ b/fem/pfespace.cpp -@@ -716,13 +716,10 @@ void ParFiniteElementSpace::CheckNDSTriaDofs() - } - - // Check for shared triangle faces -- bool strias = false; -+ bool strias = false; -+ for (int g = 1; g < pmesh->GetNGroups(); g++) - { -- int ngrps = pmesh->GetNGroups(); -- for (int g = 1; g < ngrps; g++) -- { -- strias |= pmesh->GroupNTriangles(g); -- } -+ strias |= pmesh->GroupNTriangles(g); - } - - // Combine results -diff --git a/general/array.hpp b/general/array.hpp -index 0f16b3023..a19b099a1 100644 ---- a/general/array.hpp -+++ b/general/array.hpp -@@ -77,8 +77,8 @@ public: - /** @brief Creates array using an existing c-array of asize elements; - allocsize is set to -asize to indicate that the data will not - be deleted. 
*/ -- inline Array(T *data_, int asize) -- { data.Wrap(data_, asize, false); size = asize; } -+ inline Array(T *data_, int asize, bool own_data = false) -+ { data.Wrap(data_, asize, own_data); size = asize; } - - /// Copy constructor: deep copy from @a src - /** This method supports source arrays using any MemoryType. */ -@@ -206,7 +206,7 @@ public: - inline void Copy(Array ©) const; - - /// Make this Array a reference to a pointer. -- inline void MakeRef(T *, int); -+ inline void MakeRef(T *data_, int size_, bool own_data = false); - - /// Make this Array a reference to 'master'. - inline void MakeRef(const Array &master); -@@ -869,11 +869,11 @@ inline void Array::Copy(Array ©) const - } - - template --inline void Array::MakeRef(T *p, int s) -+inline void Array::MakeRef(T *data_, int size_, bool own_data) - { - data.Delete(); -- data.Wrap(p, s, false); -- size = s; -+ data.Wrap(data_, size_, own_data); -+ size = size_; - } - - template -diff --git a/general/communication.cpp b/general/communication.cpp -index 0c2fffc1f..e8002a273 100644 ---- a/general/communication.cpp -+++ b/general/communication.cpp -@@ -275,7 +275,7 @@ void GroupTopology::Save(ostream &os) const - os << "\ncommunication_groups\n"; - os << "number_of_groups " << NGroups() << "\n\n"; - -- os << "# number of entities in each group, followed by group ids in group\n"; -+ os << "# number of entities in each group, followed by ranks in group\n"; - for (int group_id = 0; group_id < NGroups(); ++group_id) - { - int group_size = GetGroupSize(group_id); -diff --git a/general/table.hpp b/general/table.hpp -index 2ed9f4a1b..96373b2d1 100644 ---- a/general/table.hpp -+++ b/general/table.hpp -@@ -208,6 +208,7 @@ void Transpose (const Table &A, Table &At, int ncols_A_ = -1); - Table * Transpose (const Table &A); - - /// Transpose an Array -+/** @note The column (TYPE II) indices in each row of @a At will be sorted. */ - void Transpose(const Array &A, Table &At, int ncols_A_ = -1); - - /// C = A * B (as boolean matrices) -diff --git a/mesh/mesh.cpp b/mesh/mesh.cpp -index a8ec98649..5f82de812 100644 ---- a/mesh/mesh.cpp -+++ b/mesh/mesh.cpp -@@ -19,6 +19,7 @@ - #include "../general/device.hpp" - #include "../general/tic_toc.hpp" - #include "../general/gecko.hpp" -+#include "../general/sets.hpp" - #include "../fem/quadinterpolator.hpp" - - #include -@@ -31,6 +32,7 @@ - #include - #include - #include -+#include - - // Include the METIS header, if using version 5. If using METIS 4, the needed - // declarations are inlined below, i.e. no header is needed. 
-@@ -1290,7 +1292,7 @@ Mesh::FaceInformation::operator Mesh::FaceInfo() const - return res; - } - --std::ostream& operator<<(std::ostream& os, const Mesh::FaceInformation& info) -+std::ostream &operator<<(std::ostream &os, const Mesh::FaceInformation& info) - { - os << "face topology="; - switch (info.topology) -@@ -2989,7 +2991,7 @@ void Mesh::FinalizeTopology(bool generate_bdr) - { - GetElementToFaceTable(); - GenerateFaces(); -- if (NumOfBdrElements == 0 && generate_bdr) -+ if (ReduceInt(NumOfBdrElements) == 0 && generate_bdr) - { - GenerateBoundaryElements(); - GetElementToFaceTable(); // update be_to_face -@@ -3009,7 +3011,7 @@ void Mesh::FinalizeTopology(bool generate_bdr) - if (Dim == 2) - { - GenerateFaces(); // 'Faces' in 2D refers to the edges -- if (NumOfBdrElements == 0 && generate_bdr) -+ if (ReduceInt(NumOfBdrElements) == 0 && generate_bdr) - { - GenerateBoundaryElements(); - } -@@ -3023,7 +3025,7 @@ void Mesh::FinalizeTopology(bool generate_bdr) - if (Dim == 1) - { - GenerateFaces(); -- if (NumOfBdrElements == 0 && generate_bdr) -+ if (ReduceInt(NumOfBdrElements) == 0 && generate_bdr) - { - // be_to_face will be set inside GenerateBoundaryElements - GenerateBoundaryElements(); -@@ -5643,6 +5645,12 @@ const FiniteElementSpace *Mesh::GetNodalFESpace() const - - void Mesh::SetCurvature(int order, bool discont, int space_dim, int ordering) - { -+ if (order <= 0) -+ { -+ delete Nodes; -+ Nodes = nullptr; -+ return; -+ } - space_dim = (space_dim == -1) ? spaceDim : space_dim; - FiniteElementCollection* nfec; - if (discont) -@@ -10685,7 +10693,7 @@ void Mesh::Printer(std::ostream &os, std::string section_delimiter) const - } - } - --void Mesh::PrintTopo(std::ostream &os,const Array &e_to_k) const -+void Mesh::PrintTopo(std::ostream &os, const Array &e_to_k) const - { - int i; - Array vert; -@@ -12556,6 +12564,878 @@ void Mesh::GetGeometricParametersFromJacobian(const DenseMatrix &J, - } - - -+MeshPart::EntityHelper::EntityHelper( -+ int dim_, const Array (&entity_to_vertex_)[Geometry::NumGeom]) -+ : dim(dim_), -+ entity_to_vertex(entity_to_vertex_) -+{ -+ int geom_offset = 0; -+ for (int g = Geometry::DimStart[dim]; g < Geometry::DimStart[dim+1]; g++) -+ { -+ geom_offsets[g] = geom_offset; -+ geom_offset += entity_to_vertex[g].Size()/Geometry::NumVerts[g]; -+ } -+ geom_offsets[Geometry::DimStart[dim+1]] = geom_offset; -+ num_entities = geom_offset; -+} -+ -+MeshPart::Entity MeshPart::EntityHelper::FindEntity(int bytype_entity_id) -+{ -+ // Find the 'geom' that corresponds to 'bytype_entity_id' -+ int geom = Geometry::DimStart[dim]; -+ while (geom_offsets[geom+1] <= bytype_entity_id) { geom++; } -+ MFEM_ASSERT(geom < Geometry::NumGeom, "internal error"); -+ MFEM_ASSERT(Geometry::Dimension[geom] == dim, "internal error"); -+ const int nv = Geometry::NumVerts[geom]; -+ const int geom_elem_id = bytype_entity_id - geom_offsets[geom]; -+ const int *v = &entity_to_vertex[geom][nv*geom_elem_id]; -+ return { geom, nv, v }; -+} -+ -+void MeshPart::Print(std::ostream &os) const -+{ -+ os << "MFEM mesh v1.2\n"; -+ -+ // optional -+ os << -+ "\n#\n# MFEM Geometry Types (see mesh/geom.hpp):\n#\n" -+ "# POINT = 0\n" -+ "# SEGMENT = 1\n" -+ "# TRIANGLE = 2\n" -+ "# SQUARE = 3\n" -+ "# TETRAHEDRON = 4\n" -+ "# CUBE = 5\n" -+ "# PRISM = 6\n" -+ "# PYRAMID = 7\n" -+ "#\n"; -+ -+ const int dim = dimension; -+ os << "\ndimension\n" << dim; -+ -+ os << "\n\nelements\n" << num_elements << '\n'; -+ { -+ const bool have_element_map = (element_map.Size() == num_elements); -+ MFEM_ASSERT(have_element_map || 
element_map.Size() == 0, -+ "invalid MeshPart state"); -+ EntityHelper elem_helper(dim, entity_to_vertex); -+ MFEM_ASSERT(elem_helper.num_entities == num_elements, -+ "invalid MeshPart state"); -+ for (int nat_elem_id = 0; nat_elem_id < num_elements; nat_elem_id++) -+ { -+ const int bytype_elem_id = have_element_map ? -+ element_map[nat_elem_id] : nat_elem_id; -+ const Entity ent = elem_helper.FindEntity(bytype_elem_id); -+ // Print the element -+ os << attributes[nat_elem_id] << ' ' << ent.geom; -+ for (int i = 0; i < ent.num_verts; i++) -+ { -+ os << ' ' << ent.verts[i]; -+ } -+ os << '\n'; -+ } -+ } -+ -+ os << "\nboundary\n" << num_bdr_elements << '\n'; -+ { -+ const bool have_boundary_map = (boundary_map.Size() == num_bdr_elements); -+ MFEM_ASSERT(have_boundary_map || boundary_map.Size() == 0, -+ "invalid MeshPart state"); -+ EntityHelper bdr_helper(dim-1, entity_to_vertex); -+ MFEM_ASSERT(bdr_helper.num_entities == num_bdr_elements, -+ "invalid MeshPart state"); -+ for (int nat_bdr_id = 0; nat_bdr_id < num_bdr_elements; nat_bdr_id++) -+ { -+ const int bytype_bdr_id = have_boundary_map ? -+ boundary_map[nat_bdr_id] : nat_bdr_id; -+ const Entity ent = bdr_helper.FindEntity(bytype_bdr_id); -+ // Print the boundary element -+ os << bdr_attributes[nat_bdr_id] << ' ' << ent.geom; -+ for (int i = 0; i < ent.num_verts; i++) -+ { -+ os << ' ' << ent.verts[i]; -+ } -+ os << '\n'; -+ } -+ } -+ -+ os << "\nvertices\n" << num_vertices << '\n'; -+ if (!nodes) -+ { -+ const int sdim = space_dimension; -+ os << sdim << '\n'; -+ for (int i = 0; i < num_vertices; i++) -+ { -+ os << vertex_coordinates[i*sdim]; -+ for (int d = 1; d < sdim; d++) -+ { -+ os << ' ' << vertex_coordinates[i*sdim+d]; -+ } -+ os << '\n'; -+ } -+ } -+ else -+ { -+ os << "\nnodes\n"; -+ nodes->Save(os); -+ } -+ -+ os << "\nmfem_serial_mesh_end\n"; -+ -+ // Start: GroupTopology::Save -+ const int num_groups = my_groups.Size(); -+ os << "\ncommunication_groups\n"; -+ os << "number_of_groups " << num_groups << "\n\n"; -+ -+ os << "# number of entities in each group, followed by ranks in group\n"; -+ for (int group_id = 0; group_id < num_groups; ++group_id) -+ { -+ const int group_size = my_groups.RowSize(group_id); -+ const int *group_ptr = my_groups.GetRow(group_id); -+ os << group_size; -+ for (int group_member_index = 0; group_member_index < group_size; -+ ++group_member_index) -+ { -+ os << ' ' << group_ptr[group_member_index]; -+ } -+ os << '\n'; -+ } -+ // End: GroupTopology::Save -+ -+ const Table &g2v = group__shared_entity_to_vertex[Geometry::POINT]; -+ const Table &g2ev = group__shared_entity_to_vertex[Geometry::SEGMENT]; -+ const Table &g2tv = group__shared_entity_to_vertex[Geometry::TRIANGLE]; -+ const Table &g2qv = group__shared_entity_to_vertex[Geometry::SQUARE]; -+ -+ MFEM_VERIFY(g2v.RowSize(0) == 0, "internal erroor"); -+ os << "\ntotal_shared_vertices " << g2v.Size_of_connections() << '\n'; -+ if (dimension >= 2) -+ { -+ MFEM_VERIFY(g2ev.RowSize(0) == 0, "internal erroor"); -+ os << "total_shared_edges " << g2ev.Size_of_connections()/2 << '\n'; -+ } -+ if (dimension >= 3) -+ { -+ MFEM_VERIFY(g2tv.RowSize(0) == 0, "internal erroor"); -+ MFEM_VERIFY(g2qv.RowSize(0) == 0, "internal erroor"); -+ const int total_shared_faces = -+ g2tv.Size_of_connections()/3 + g2qv.Size_of_connections()/4; -+ os << "total_shared_faces " << total_shared_faces << '\n'; -+ } -+ os << "\n# group 0 has no shared entities\n"; -+ for (int gr = 1; gr < num_groups; gr++) -+ { -+ { -+ const int nv = g2v.RowSize(gr); -+ const int *sv = 
g2v.GetRow(gr); -+ os << "\n# group " << gr << "\nshared_vertices " << nv << '\n'; -+ for (int i = 0; i < nv; i++) -+ { -+ os << sv[i] << '\n'; -+ } -+ } -+ if (dimension >= 2) -+ { -+ const int ne = g2ev.RowSize(gr)/2; -+ const int *se = g2ev.GetRow(gr); -+ os << "\nshared_edges " << ne << '\n'; -+ for (int i = 0; i < ne; i++) -+ { -+ const int *v = se + 2*i; -+ os << v[0] << ' ' << v[1] << '\n'; -+ } -+ } -+ if (dimension >= 3) -+ { -+ const int nt = g2tv.RowSize(gr)/3; -+ const int *st = g2tv.GetRow(gr); -+ const int nq = g2qv.RowSize(gr)/4; -+ const int *sq = g2qv.GetRow(gr); -+ os << "\nshared_faces " << nt+nq << '\n'; -+ for (int i = 0; i < nt; i++) -+ { -+ os << Geometry::TRIANGLE; -+ const int *v = st + 3*i; -+ for (int j = 0; j < 3; j++) { os << ' ' << v[j]; } -+ os << '\n'; -+ } -+ for (int i = 0; i < nq; i++) -+ { -+ os << Geometry::SQUARE; -+ const int *v = sq + 4*i; -+ for (int j = 0; j < 4; j++) { os << ' ' << v[j]; } -+ os << '\n'; -+ } -+ } -+ } -+ -+ // Write out section end tag for mesh. -+ os << "mfem_mesh_end" << endl; -+} -+ -+Mesh &MeshPart::GetMesh() -+{ -+ if (mesh) { return *mesh; } -+ -+ mesh.reset(new Mesh(dimension, -+ num_vertices, -+ num_elements, -+ num_bdr_elements, -+ space_dimension)); -+ -+ // Add elements -+ { -+ const bool have_element_map = (element_map.Size() == num_elements); -+ MFEM_ASSERT(have_element_map || element_map.Size() == 0, -+ "invalid MeshPart state"); -+ EntityHelper elem_helper(dimension, entity_to_vertex); -+ MFEM_ASSERT(elem_helper.num_entities == num_elements, -+ "invalid MeshPart state"); -+ const bool have_tet_refine_flags = (tet_refine_flags.Size() > 0); -+ for (int nat_elem_id = 0; nat_elem_id < num_elements; nat_elem_id++) -+ { -+ const int bytype_elem_id = have_element_map ? -+ element_map[nat_elem_id] : nat_elem_id; -+ const Entity ent = elem_helper.FindEntity(bytype_elem_id); -+ Element *el = mesh->NewElement(ent.geom); -+ el->SetVertices(ent.verts); -+ el->SetAttribute(attributes[nat_elem_id]); -+ if (ent.geom == Geometry::TETRAHEDRON && have_tet_refine_flags) -+ { -+ constexpr int geom_tet = Geometry::TETRAHEDRON; -+ const int tet_id = (ent.verts - entity_to_vertex[geom_tet])/4; -+ const int ref_flag = tet_refine_flags[tet_id]; -+ static_cast(el)->SetRefinementFlag(ref_flag); -+ } -+ mesh->AddElement(el); -+ } -+ } -+ -+ // Add boundary elements -+ { -+ const bool have_boundary_map = (boundary_map.Size() == num_bdr_elements); -+ MFEM_ASSERT(have_boundary_map || boundary_map.Size() == 0, -+ "invalid MeshPart state"); -+ EntityHelper bdr_helper(dimension-1, entity_to_vertex); -+ MFEM_ASSERT(bdr_helper.num_entities == num_bdr_elements, -+ "invalid MeshPart state"); -+ for (int nat_bdr_id = 0; nat_bdr_id < num_bdr_elements; nat_bdr_id++) -+ { -+ const int bytype_bdr_id = have_boundary_map ? 
-+ boundary_map[nat_bdr_id] : nat_bdr_id; -+ const Entity ent = bdr_helper.FindEntity(bytype_bdr_id); -+ Element *bdr = mesh->NewElement(ent.geom); -+ bdr->SetVertices(ent.verts); -+ bdr->SetAttribute(bdr_attributes[nat_bdr_id]); -+ mesh->AddBdrElement(bdr); -+ } -+ } -+ -+ // Add vertices -+ if (vertex_coordinates.Size() == space_dimension*num_vertices) -+ { -+ MFEM_ASSERT(!nodes, "invalid MeshPart state"); -+ for (int vert_id = 0; vert_id < num_vertices; vert_id++) -+ { -+ mesh->AddVertex(vertex_coordinates + space_dimension*vert_id); -+ } -+ } -+ else -+ { -+ MFEM_ASSERT(vertex_coordinates.Size() == 0, "invalid MeshPart state"); -+ for (int vert_id = 0; vert_id < num_vertices; vert_id++) -+ { -+ mesh->AddVertex(0., 0., 0.); -+ } -+ // 'mesh.Nodes' cannot be set here -- they can be set later, if needed -+ } -+ -+ mesh->FinalizeTopology(/* generate_bdr: */ false); -+ -+ return *mesh; -+} -+ -+ -+MeshPartitioner::MeshPartitioner(Mesh &mesh_, -+ int num_parts_, -+ int *partitioning_, -+ int part_method) -+ : mesh(mesh_), -+ partitioning(partitioning_), -+ own_partitioning(false) -+{ -+ if (partitioning == nullptr) -+ { -+ partitioning = mesh.GeneratePartitioning(num_parts_, part_method); -+ own_partitioning = true; -+ } -+ -+ Transpose(Array(partitioning, mesh.GetNE()), -+ part_to_element, num_parts_); -+ // Note: the element ids in each row of 'part_to_element' are sorted. -+ -+ const int dim = mesh.Dimension(); -+ if (dim >= 2) -+ { -+ Transpose(mesh.ElementToEdgeTable(), edge_to_element, mesh.GetNEdges()); -+ } -+ -+ Array boundary_to_part(mesh.GetNBE()); -+ // Same logic as in ParMesh::BuildLocalBoundary -+ if (dim >= 3) -+ { -+ for (int i = 0; i < boundary_to_part.Size(); i++) -+ { -+ int face, o, el1, el2; -+ mesh.GetBdrElementFace(i, &face, &o); -+ mesh.GetFaceElements(face, &el1, &el2); -+ boundary_to_part[i] = -+ partitioning[(o % 2 == 0 || el2 < 0) ? el1 : el2]; -+ } -+ } -+ else if (dim == 2) -+ { -+ for (int i = 0; i < boundary_to_part.Size(); i++) -+ { -+ int edge = mesh.GetBdrElementFaceIndex(i); -+ int el1 = edge_to_element.GetRow(edge)[0]; -+ boundary_to_part[i] = partitioning[el1]; -+ } -+ } -+ else if (dim == 1) -+ { -+ for (int i = 0; i < boundary_to_part.Size(); i++) -+ { -+ int vert = mesh.GetBdrElementFaceIndex(i); -+ int el1, el2; -+ mesh.GetFaceElements(vert, &el1, &el2); -+ boundary_to_part[i] = partitioning[el1]; -+ } -+ } -+ Transpose(boundary_to_part, part_to_boundary, num_parts_); -+ // Note: the boundary element ids in each row of 'part_to_boundary' are -+ // sorted. 
-+ boundary_to_part.DeleteAll(); -+ -+ Table *vert_element = mesh.GetVertexToElementTable(); // we must delete this -+ vertex_to_element.Swap(*vert_element); -+ delete vert_element; -+} -+ -+void MeshPartitioner::ExtractPart(int part_id, MeshPart &mesh_part) const -+{ -+ const int num_parts = part_to_element.Size(); -+ -+ MFEM_VERIFY(0 <= part_id && part_id < num_parts, -+ "invalid part_id = " << part_id -+ << ", num_parts = " << num_parts); -+ -+ const int dim = mesh.Dimension(); -+ const int sdim = mesh.SpaceDimension(); -+ const int num_elems = part_to_element.RowSize(part_id); -+ const int *elem_list = part_to_element.GetRow(part_id); // sorted -+ const int num_bdr_elems = part_to_boundary.RowSize(part_id); -+ const int *bdr_elem_list = part_to_boundary.GetRow(part_id); // sorted -+ -+ // Initialize 'mesh_part' -+ mesh_part.dimension = dim; -+ mesh_part.space_dimension = sdim; -+ mesh_part.num_vertices = 0; -+ mesh_part.num_elements = num_elems; -+ mesh_part.num_bdr_elements = num_bdr_elems; -+ for (int g = 0; g < Geometry::NumGeom; g++) -+ { -+ mesh_part.entity_to_vertex[g].SetSize(0); // can reuse Array allocation -+ } -+ mesh_part.tet_refine_flags.SetSize(0); -+ mesh_part.element_map.SetSize(0); // 0 or 'num_elements', if needed -+ mesh_part.boundary_map.SetSize(0); // 0 or 'num_bdr_elements', if needed -+ mesh_part.attributes.SetSize(num_elems); -+ mesh_part.bdr_attributes.SetSize(num_bdr_elems); -+ mesh_part.vertex_coordinates.SetSize(0); -+ -+ mesh_part.num_parts = num_parts; -+ mesh_part.my_part_id = part_id; -+ mesh_part.my_groups.Clear(); -+ for (int g = 0; g < Geometry::NumGeom; g++) -+ { -+ mesh_part.group__shared_entity_to_vertex[g].Clear(); -+ } -+ mesh_part.nodes.reset(nullptr); -+ mesh_part.nodal_fes.reset(nullptr); -+ mesh_part.mesh.reset(nullptr); -+ -+ // Initialize: -+ // - 'mesh_part.entity_to_vertex' for the elements (boundary elements are -+ // set later); vertex ids are global at this point - they will be mapped to -+ // local ids later -+ // - 'mesh_part.attributes' -+ // - 'mesh_part.tet_refine_flags' if needed -+ int geom_marker = 0, num_geom = 0; -+ for (int i = 0; i < num_elems; i++) -+ { -+ const Element *elem = mesh.GetElement(elem_list[i]); -+ const int geom = elem->GetGeometryType(); -+ const int nv = Geometry::NumVerts[geom]; -+ const int *v = elem->GetVertices(); -+ MFEM_VERIFY(numeric_limits::max() - nv >= -+ mesh_part.entity_to_vertex[geom].Size(), -+ "overflow in 'entity_to_vertex[geom]', geom: " -+ << Geometry::Name[geom]); -+ mesh_part.entity_to_vertex[geom].Append(v, nv); -+ mesh_part.attributes[i] = elem->GetAttribute(); -+ if (geom == Geometry::TETRAHEDRON) -+ { -+ // Create 'mesh_part.tet_refine_flags' but only if we find at least one -+ // non-zero flag in a tetrahedron. 
-+ const Tetrahedron *tet = static_cast(elem); -+ const int ref_flag = tet->GetRefinementFlag(); -+ if (mesh_part.tet_refine_flags.Size() == 0) -+ { -+ if (ref_flag) -+ { -+ // This is the first time we encounter non-zero 'ref_flag' -+ const int num_tets = mesh_part.entity_to_vertex[geom].Size()/nv; -+ mesh_part.tet_refine_flags.SetSize(num_tets, 0); -+ mesh_part.tet_refine_flags.Last() = ref_flag; -+ } -+ } -+ else -+ { -+ mesh_part.tet_refine_flags.Append(ref_flag); -+ } -+ } -+ if ((geom_marker & (1 << geom)) == 0) -+ { -+ geom_marker |= (1 << geom); -+ num_geom++; -+ } -+ } -+ MFEM_ASSERT(mesh_part.tet_refine_flags.Size() == 0 || -+ mesh_part.tet_refine_flags.Size() == -+ mesh_part.entity_to_vertex[Geometry::TETRAHEDRON].Size()/4, -+ "internal error"); -+ // Initialize 'mesh_part.element_map' if needed -+ if (num_geom > 1) -+ { -+ int offsets[Geometry::NumGeom]; -+ int offset = 0; -+ for (int g = Geometry::DimStart[dim]; g < Geometry::DimStart[dim+1]; g++) -+ { -+ offsets[g] = offset; -+ offset += mesh_part.entity_to_vertex[g].Size()/Geometry::NumVerts[g]; -+ } -+ mesh_part.element_map.SetSize(num_elems); -+ for (int i = 0; i < num_elems; i++) -+ { -+ const int geom = mesh.GetElementGeometry(elem_list[i]); -+ mesh_part.element_map[i] = offsets[geom]++; -+ } -+ } -+ -+ // Initialize: -+ // - 'mesh_part.entity_to_vertex' for the boundary elements; vertex ids are -+ // global at this point - they will be mapped to local ids later -+ // - 'mesh_part.bdr_attributes' -+ geom_marker = 0; num_geom = 0; -+ for (int i = 0; i < num_bdr_elems; i++) -+ { -+ const Element *bdr_elem = mesh.GetBdrElement(bdr_elem_list[i]); -+ const int geom = bdr_elem->GetGeometryType(); -+ const int nv = Geometry::NumVerts[geom]; -+ const int *v = bdr_elem->GetVertices(); -+ MFEM_VERIFY(numeric_limits::max() - nv >= -+ mesh_part.entity_to_vertex[geom].Size(), -+ "overflow in 'entity_to_vertex[geom]', geom: " -+ << Geometry::Name[geom]); -+ mesh_part.entity_to_vertex[geom].Append(v, nv); -+ mesh_part.bdr_attributes[i] = bdr_elem->GetAttribute(); -+ if ((geom_marker & (1 << geom)) == 0) -+ { -+ geom_marker |= (1 << geom); -+ num_geom++; -+ } -+ } -+ // Initialize 'mesh_part.boundary_map' if needed -+ if (num_geom > 1) -+ { -+ int offsets[Geometry::NumGeom]; -+ int offset = 0; -+ for (int g = Geometry::DimStart[dim-1]; g < Geometry::DimStart[dim]; g++) -+ { -+ offsets[g] = offset; -+ offset += mesh_part.entity_to_vertex[g].Size()/Geometry::NumVerts[g]; -+ } -+ mesh_part.boundary_map.SetSize(num_bdr_elems); -+ for (int i = 0; i < num_bdr_elems; i++) -+ { -+ const int geom = mesh.GetBdrElementGeometry(bdr_elem_list[i]); -+ mesh_part.boundary_map[i] = offsets[geom]++; -+ } -+ } -+ -+ // Create the vertex id map, 'vertex_loc_to_glob', which maps local ids to -+ // global ones; the map is sorted, preserving the global ordering. 
-+ Array vertex_loc_to_glob; -+ { -+ std::unordered_set vertex_set; -+ for (int i = 0; i < num_elems; i++) -+ { -+ const Element *elem = mesh.GetElement(elem_list[i]); -+ const int geom = elem->GetGeometryType(); -+ const int nv = Geometry::NumVerts[geom]; -+ const int *v = elem->GetVertices(); -+ vertex_set.insert(v, v + nv); -+ } -+ vertex_loc_to_glob.SetSize(vertex_set.size()); -+ std::copy(vertex_set.begin(), vertex_set.end(), // src -+ vertex_loc_to_glob.begin()); // dest -+ } -+ vertex_loc_to_glob.Sort(); -+ -+ // Initialize 'mesh_part.num_vertices' -+ mesh_part.num_vertices = vertex_loc_to_glob.Size(); -+ -+ // Update the vertex ids in the arrays 'mesh_part.entity_to_vertex' from -+ // global to local. -+ for (int g = 0; g < Geometry::NumGeom; g++) -+ { -+ Array &vert_array = mesh_part.entity_to_vertex[g]; -+ for (int i = 0; i < vert_array.Size(); i++) -+ { -+ const int glob_id = vert_array[i]; -+ const int loc_id = vertex_loc_to_glob.FindSorted(glob_id); -+ MFEM_ASSERT(loc_id >= 0, "internal error: global vertex id not found"); -+ vert_array[i] = loc_id; -+ } -+ } -+ -+ // Initialize one of 'mesh_part.vertex_coordinates' or 'mesh_part.nodes' -+ if (!mesh.GetNodes()) -+ { -+ MFEM_VERIFY(numeric_limits::max()/sdim >= vertex_loc_to_glob.Size(), -+ "overflow in 'vertex_coordinates', num_vertices = " -+ << vertex_loc_to_glob.Size() << ", sdim = " << sdim); -+ mesh_part.vertex_coordinates.SetSize(sdim*vertex_loc_to_glob.Size()); -+ for (int i = 0; i < vertex_loc_to_glob.Size(); i++) -+ { -+ const double *coord = mesh.GetVertex(vertex_loc_to_glob[i]); -+ for (int d = 0; d < sdim; d++) -+ { -+ mesh_part.vertex_coordinates[i*sdim+d] = coord[d]; -+ } -+ } -+ } -+ else -+ { -+ const GridFunction &glob_nodes = *mesh.GetNodes(); -+ mesh_part.nodal_fes = ExtractFESpace(mesh_part, *glob_nodes.FESpace()); -+ // Initialized 'mesh_part.mesh'. -+ // Note: the nodes of 'mesh_part.mesh' are not set. -+ -+ mesh_part.nodes = ExtractGridFunction(mesh_part, glob_nodes, -+ *mesh_part.nodal_fes); -+ -+ // Attach the 'mesh_part.nodes' to the 'mesh_part.mesh'. -+ mesh_part.mesh->NewNodes(*mesh_part.nodes, /* make_owner: */ false); -+ // Note: the vertices of 'mesh_part.mesh' are not set. -+ } -+ -+ // Begin constructing the "neighbor" groups, i.e. the groups that contain -+ // 'part_id'. -+ ListOfIntegerSets groups; -+ { -+ // the first group is the local one -+ IntegerSet group; -+ group.Recreate(1, &part_id); -+ groups.Insert(group); -+ } -+ -+ // 'shared_faces' : shared face id -> (global_face_id, group_id) -+ // Note: 'shared_faces' will be sorted by 'global_face_id'. -+ Array> shared_faces; -+ -+ // Add "neighbor" groups defined by faces -+ // Construct 'shared_faces'. -+ if (dim >= 3) -+ { -+ std::unordered_set face_set; -+ // Construct 'face_set' -+ const Table &elem_to_face = mesh.ElementToFaceTable(); -+ for (int loc_elem_id = 0; loc_elem_id < num_elems; loc_elem_id++) -+ { -+ const int glob_elem_id = elem_list[loc_elem_id]; -+ const int nfaces = elem_to_face.RowSize(glob_elem_id); -+ const int *faces = elem_to_face.GetRow(glob_elem_id); -+ face_set.insert(faces, faces + nfaces); -+ } -+ // Construct 'shared_faces'; add "neighbor" groups defined by faces. 
-+ IntegerSet group; -+ for (int glob_face_id : face_set) -+ { -+ int el[2]; -+ mesh.GetFaceElements(glob_face_id, &el[0], &el[1]); -+ if (el[1] < 0) { continue; } -+ el[0] = partitioning[el[0]]; -+ el[1] = partitioning[el[1]]; -+ MFEM_ASSERT(el[0] == part_id || el[1] == part_id, "internal error"); -+ if (el[0] != part_id || el[1] != part_id) -+ { -+ group.Recreate(2, el); -+ const int group_id = groups.Insert(group); -+ shared_faces.Append(Pair(glob_face_id, group_id)); -+ } -+ } -+ shared_faces.Sort(); // sort the shared faces by 'glob_face_id' -+ } -+ -+ // 'shared_edges' : shared edge id -> (global_edge_id, group_id) -+ // Note: 'shared_edges' will be sorted by 'global_edge_id'. -+ Array> shared_edges; -+ -+ // Add "neighbor" groups defined by edges. -+ // Construct 'shared_edges'. -+ if (dim >= 2) -+ { -+ std::unordered_set edge_set; -+ // Construct 'edge_set' -+ const Table &elem_to_edge = mesh.ElementToEdgeTable(); -+ for (int loc_elem_id = 0; loc_elem_id < num_elems; loc_elem_id++) -+ { -+ const int glob_elem_id = elem_list[loc_elem_id]; -+ const int nedges = elem_to_edge.RowSize(glob_elem_id); -+ const int *edges = elem_to_edge.GetRow(glob_elem_id); -+ edge_set.insert(edges, edges + nedges); -+ } -+ // Construct 'shared_edges'; add "neighbor" groups defined by edges. -+ IntegerSet group; -+ for (int glob_edge_id : edge_set) -+ { -+ const int nelem = edge_to_element.RowSize(glob_edge_id); -+ const int *elem = edge_to_element.GetRow(glob_edge_id); -+ Array &gr = group; // reference to the 'group' internal Array -+ gr.SetSize(nelem); -+ for (int j = 0; j < nelem; j++) -+ { -+ gr[j] = partitioning[elem[j]]; -+ } -+ gr.Sort(); -+ gr.Unique(); -+ MFEM_ASSERT(gr.FindSorted(part_id) >= 0, "internal error"); -+ if (group.Size() > 1) -+ { -+ const int group_id = groups.Insert(group); -+ shared_edges.Append(Pair(glob_edge_id, group_id)); -+ } -+ } -+ shared_edges.Sort(); // sort the shared edges by 'glob_edge_id' -+ } -+ -+ // 'shared_verts' : shared vertex id -> (global_vertex_id, group_id) -+ // Note: 'shared_verts' will be sorted by 'global_vertex_id'. -+ Array> shared_verts; -+ -+ // Add "neighbor" groups defined by vertices. -+ // Construct 'shared_verts'. -+ { -+ IntegerSet group; -+ for (int i = 0; i < vertex_loc_to_glob.Size(); i++) -+ { -+ // 'vertex_to_element' maps global vertex ids to global element ids -+ const int glob_vertex_id = vertex_loc_to_glob[i]; -+ const int nelem = vertex_to_element.RowSize(glob_vertex_id); -+ const int *elem = vertex_to_element.GetRow(glob_vertex_id); -+ Array &gr = group; // reference to the 'group' internal Array -+ gr.SetSize(nelem); -+ for (int j = 0; j < nelem; j++) -+ { -+ gr[j] = partitioning[elem[j]]; -+ } -+ gr.Sort(); -+ gr.Unique(); -+ MFEM_ASSERT(gr.FindSorted(part_id) >= 0, "internal error"); -+ if (group.Size() > 1) -+ { -+ const int group_id = groups.Insert(group); -+ shared_verts.Append(Pair(glob_vertex_id, group_id)); -+ } -+ } -+ } -+ -+ // Done constructing the "neighbor" groups in 'groups'. 
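// Illustrative sketch (editor's addition, not from the patch itself): the
// group construction above, stripped down to plain STL. The "group" generated
// by a mesh entity is the sorted, de-duplicated set of part ids of the
// elements adjacent to that entity; the entity is shared precisely when the
// set has more than one member. The helper name is hypothetical.
#include <set>
#include <vector>

std::vector<int> EntityGroup(const std::vector<int> &adjacent_elements,
                             const int *partitioning)
{
   std::set<int> parts;  // plays the role of gr.Sort(); gr.Unique();
   for (int el : adjacent_elements) { parts.insert(partitioning[el]); }
   return std::vector<int>(parts.begin(), parts.end());
}
// A singleton group means the entity is interior to this part; a larger group
// is registered in 'groups' (a ListOfIntegerSets) and the entity is recorded
// in 'shared_faces', 'shared_edges' or 'shared_verts' with its group id.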
-+ const int num_groups = groups.Size(); -+ -+ // Define 'mesh_part.my_groups' -+ groups.AsTable(mesh_part.my_groups); -+ -+ // Construct 'mesh_part.group__shared_entity_to_vertex[Geometry::POINT]' -+ Table &group__shared_vertex_to_vertex = -+ mesh_part.group__shared_entity_to_vertex[Geometry::POINT]; -+ group__shared_vertex_to_vertex.MakeI(num_groups); -+ for (int sv = 0; sv < shared_verts.Size(); sv++) -+ { -+ const int group_id = shared_verts[sv].two; -+ group__shared_vertex_to_vertex.AddAColumnInRow(group_id); -+ } -+ group__shared_vertex_to_vertex.MakeJ(); -+ for (int sv = 0; sv < shared_verts.Size(); sv++) -+ { -+ const int glob_vertex_id = shared_verts[sv].one; -+ const int group_id = shared_verts[sv].two; -+ const int loc_vertex_id = vertex_loc_to_glob.FindSorted(glob_vertex_id); -+ MFEM_ASSERT(loc_vertex_id >= 0, "internal error"); -+ group__shared_vertex_to_vertex.AddConnection(group_id, loc_vertex_id); -+ } -+ group__shared_vertex_to_vertex.ShiftUpI(); -+ -+ // Construct 'mesh_part.group__shared_entity_to_vertex[Geometry::SEGMENT]' -+ if (dim >= 2) -+ { -+ Table &group__shared_edge_to_vertex = -+ mesh_part.group__shared_entity_to_vertex[Geometry::SEGMENT]; -+ group__shared_edge_to_vertex.MakeI(num_groups); -+ for (int se = 0; se < shared_edges.Size(); se++) -+ { -+ const int group_id = shared_edges[se].two; -+ group__shared_edge_to_vertex.AddColumnsInRow(group_id, 2); -+ } -+ group__shared_edge_to_vertex.MakeJ(); -+ const Table &edge_to_vertex = *mesh.GetEdgeVertexTable(); -+ for (int se = 0; se < shared_edges.Size(); se++) -+ { -+ const int glob_edge_id = shared_edges[se].one; -+ const int group_id = shared_edges[se].two; -+ const int *v = edge_to_vertex.GetRow(glob_edge_id); -+ for (int i = 0; i < 2; i++) -+ { -+ const int loc_vertex_id = vertex_loc_to_glob.FindSorted(v[i]); -+ MFEM_ASSERT(loc_vertex_id >= 0, "internal error"); -+ group__shared_edge_to_vertex.AddConnection(group_id, loc_vertex_id); -+ } -+ } -+ group__shared_edge_to_vertex.ShiftUpI(); -+ } -+ -+ // Construct 'mesh_part.group__shared_entity_to_vertex[Geometry::TRIANGLE]' -+ // and 'mesh_part.group__shared_entity_to_vertex[Geometry::SQUARE]'. -+ if (dim >= 3) -+ { -+ Table &group__shared_tria_to_vertex = -+ mesh_part.group__shared_entity_to_vertex[Geometry::TRIANGLE]; -+ Table &group__shared_quad_to_vertex = -+ mesh_part.group__shared_entity_to_vertex[Geometry::SQUARE]; -+ Array vertex_ids; -+ group__shared_tria_to_vertex.MakeI(num_groups); -+ group__shared_quad_to_vertex.MakeI(num_groups); -+ for (int sf = 0; sf < shared_faces.Size(); sf++) -+ { -+ const int glob_face_id = shared_faces[sf].one; -+ const int group_id = shared_faces[sf].two; -+ const int geom = mesh.GetFaceGeometry(glob_face_id); -+ mesh_part.group__shared_entity_to_vertex[geom]. -+ AddColumnsInRow(group_id, Geometry::NumVerts[geom]); -+ } -+ group__shared_tria_to_vertex.MakeJ(); -+ group__shared_quad_to_vertex.MakeJ(); -+ for (int sf = 0; sf < shared_faces.Size(); sf++) -+ { -+ const int glob_face_id = shared_faces[sf].one; -+ const int group_id = shared_faces[sf].two; -+ const int geom = mesh.GetFaceGeometry(glob_face_id); -+ mesh.GetFaceVertices(glob_face_id, vertex_ids); -+ // Rotate shared triangles that have an adjacent tetrahedron with a -+ // nonzero refinement flag. -+ // See also ParMesh::BuildSharedFaceElems. 
-+ if (geom == Geometry::TRIANGLE) -+ { -+ int glob_el_id[2]; -+ mesh.GetFaceElements(glob_face_id, &glob_el_id[0], &glob_el_id[1]); -+ int side = 0; -+ const Element *el = mesh.GetElement(glob_el_id[0]); -+ const Tetrahedron *tet = nullptr; -+ if (el->GetGeometryType() == Geometry::TETRAHEDRON) -+ { -+ tet = static_cast(el); -+ } -+ else -+ { -+ side = 1; -+ el = mesh.GetElement(glob_el_id[1]); -+ if (el->GetGeometryType() == Geometry::TETRAHEDRON) -+ { -+ tet = static_cast(el); -+ } -+ } -+ if (tet && tet->GetRefinementFlag()) -+ { -+ // mark the shared face for refinement by reorienting -+ // it according to the refinement flag in the tetrahedron -+ // to which this shared face belongs to. -+ int info[2]; -+ mesh.GetFaceInfos(glob_face_id, &info[0], &info[1]); -+ tet->GetMarkedFace(info[side]/64, &vertex_ids[0]); -+ } -+ } -+ for (int i = 0; i < vertex_ids.Size(); i++) -+ { -+ const int glob_id = vertex_ids[i]; -+ const int loc_id = vertex_loc_to_glob.FindSorted(glob_id); -+ MFEM_ASSERT(loc_id >= 0, "internal error"); -+ vertex_ids[i] = loc_id; -+ } -+ mesh_part.group__shared_entity_to_vertex[geom]. -+ AddConnections(group_id, vertex_ids, vertex_ids.Size()); -+ } -+ group__shared_tria_to_vertex.ShiftUpI(); -+ group__shared_quad_to_vertex.ShiftUpI(); -+ } -+} -+ -+std::unique_ptr -+MeshPartitioner::ExtractFESpace(MeshPart &mesh_part, -+ const FiniteElementSpace &global_fespace) const -+{ -+ mesh_part.GetMesh(); // initialize 'mesh_part.mesh' -+ // Note: the nodes of 'mesh_part.mesh' are not set. -+ -+ return std::unique_ptr( -+ new FiniteElementSpace(mesh_part.mesh.get(), -+ global_fespace.FEColl(), -+ global_fespace.GetVDim(), -+ global_fespace.GetOrdering())); -+} -+ -+std::unique_ptr -+MeshPartitioner::ExtractGridFunction(MeshPart &mesh_part, -+ const GridFunction &global_gf, -+ FiniteElementSpace &local_fespace) const -+{ -+ std::unique_ptr local_gf(new GridFunction(&local_fespace)); -+ -+ // Transfer data from 'global_gf' to 'local_gf'. -+ Array gvdofs, lvdofs; -+ Vector loc_vals; -+ const int part_id = mesh_part.my_part_id; -+ const int num_elems = part_to_element.RowSize(part_id); -+ const int *elem_list = part_to_element.GetRow(part_id); // sorted -+ for (int loc_elem_id = 0; loc_elem_id < num_elems; loc_elem_id++) -+ { -+ const int glob_elem_id = elem_list[loc_elem_id]; -+ auto glob_dt = global_gf.FESpace()->GetElementVDofs(glob_elem_id, gvdofs); -+ global_gf.GetSubVector(gvdofs, loc_vals); -+ if (glob_dt) { glob_dt->InvTransformPrimal(loc_vals); } -+ auto local_dt = local_fespace.GetElementVDofs(loc_elem_id, lvdofs); -+ if (local_dt) { local_dt->TransformPrimal(loc_vals); } -+ local_gf->SetSubVector(lvdofs, loc_vals); -+ } -+ return local_gf; -+} -+ -+MeshPartitioner::~MeshPartitioner() -+{ -+ if (own_partitioning) { delete [] partitioning; } -+} -+ -+ - GeometricFactors::GeometricFactors(const Mesh *mesh, const IntegrationRule &ir, - int flags, MemoryType d_mt) - { -diff --git a/mesh/mesh.hpp b/mesh/mesh.hpp -index b9c5538c3..a6957720a 100644 ---- a/mesh/mesh.hpp -+++ b/mesh/mesh.hpp -@@ -27,6 +27,7 @@ - #include "../general/adios2stream.hpp" - #endif - #include -+#include - - namespace mfem - { -@@ -72,8 +73,10 @@ protected: - visualization purpose in GLVis. 
*/ - mutable int nbInteriorFaces, nbBoundaryFaces; - -- int meshgen; // see MeshGenerator() -- int mesh_geoms; // sum of (1 << geom) for all geom of all dimensions -+ // see MeshGenerator(); global in parallel -+ int meshgen; -+ // sum of (1 << geom) for all geom of all dimensions; local in parallel -+ int mesh_geoms; - - // Counter for Mesh transformations: refinement, derefinement, rebalancing. - // Used for checking during Update operations on objects depending on the -@@ -295,11 +298,11 @@ protected: - void Destroy(); // Delete all owned data. - void ResetLazyData(); - -- Element *ReadElementWithoutAttr(std::istream &); -- static void PrintElementWithoutAttr(const Element *, std::ostream &); -+ Element *ReadElementWithoutAttr(std::istream &input); -+ static void PrintElementWithoutAttr(const Element *el, std::ostream &os); - -- Element *ReadElement(std::istream &); -- static void PrintElement(const Element *, std::ostream &); -+ Element *ReadElement(std::istream &input); -+ static void PrintElement(const Element *el, std::ostream &os); - - // Readers for different mesh formats, used in the Load() method. - // The implementations of these methods are in mesh_readers.cpp. -@@ -456,7 +459,7 @@ protected: - - void UpdateNURBS(); - -- void PrintTopo(std::ostream &out, const Array &e_to_k) const; -+ void PrintTopo(std::ostream &os, const Array &e_to_k) const; - - /// Used in GetFaceElementTransformations (...) - void GetLocalPtToSegTransformation(IsoparametricTransformation &, int); -@@ -565,7 +568,7 @@ protected: - // If NURBS mesh, write NURBS format. If NCMesh, write mfem v1.1 format. - // If section_delimiter is empty, write mfem v1.0 format. Otherwise, write - // mfem v1.2 format with the given section_delimiter at the end. -- void Printer(std::ostream &out = mfem::out, -+ void Printer(std::ostream &os = mfem::out, - std::string section_delimiter = "") const; - - /** Creates mesh for the parallelepiped [0,sx]x[0,sy]x[0,sz], divided into -@@ -859,7 +862,7 @@ public: - - int AddBdrPoint(int v, int attr = 1); - -- void GenerateBoundaryElements(); -+ virtual void GenerateBoundaryElements(); - /// Finalize the construction of a triangular Mesh. - void FinalizeTriMesh(int generate_edges = 0, int refine = 0, - bool fix_orientation = true); -@@ -2101,7 +2104,7 @@ public: - std::ostream &os, int elem_attr = 0) const; - - void PrintElementsWithPartitioning (int *partitioning, -- std::ostream &out, -+ std::ostream &os, - int interior_faces = 0); - - /// Print set of disjoint surfaces: -@@ -2109,13 +2112,13 @@ public: - * If Aface_face(i,j) != 0, print face j as a boundary - * element with attribute i+1. - */ -- void PrintSurfaces(const Table &Aface_face, std::ostream &out) const; -+ void PrintSurfaces(const Table &Aface_face, std::ostream &os) const; - - /// Auxiliary method used by PrintCharacteristics(). - /** It is also used in the `mesh-explorer` miniapp. */ - static void PrintElementsByGeometry(int dim, - const Array &num_elems_by_geom, -- std::ostream &out); -+ std::ostream &os); - - /** @brief Compute and print mesh characteristics such as number of vertices, - number of elements, number of boundary elements, minimal and maximal -@@ -2135,7 +2138,7 @@ public: - - #ifdef MFEM_DEBUG - /// Output an NCMesh-compatible debug dump. 
-- void DebugDump(std::ostream &out) const; -+ void DebugDump(std::ostream &os) const; - #endif - - /// @} -@@ -2214,9 +2217,194 @@ public: - /// @} - }; - --/** Overload operator<< for std::ostream and Mesh; valid also for the derived -- class ParMesh */ --std::ostream &operator<<(std::ostream &out, const Mesh &mesh); -+ -+// Class containing a minimal description of a part (a subset of the elements) -+// of a Mesh and its connectivity to other parts. The main purpose of this class -+// is to be communicated between MPI ranks for repartitioning purposes. It can -+// also be used to implement parallel mesh I/O functions with partitionings that -+// have number of parts different from the number of MPI tasks. -+// -+// Note: parts of NURBS or non-conforming meshes cannot be fully described by -+// this class alone. -+class MeshPart -+{ -+protected: -+ struct Entity { int geom; int num_verts; const int *verts; }; -+ struct EntityHelper -+ { -+ int dim, num_entities; -+ int geom_offsets[Geometry::NumGeom+1]; -+ typedef const Array entity_to_vertex_type[Geometry::NumGeom]; -+ entity_to_vertex_type &entity_to_vertex; -+ -+ EntityHelper(int dim_, -+ const Array (&entity_to_vertex_)[Geometry::NumGeom]); -+ Entity FindEntity(int bytype_entity_id); -+ }; -+ -+public: -+ // Reference space dimension of the elements -+ int dimension; -+ -+ // Dimension of the physical space into which the MeshPart is embedded. -+ int space_dimension; -+ -+ // Number of vertices -+ int num_vertices; -+ -+ // Number of elements with reference space dimension equal to 'dimension'. -+ int num_elements; -+ -+ // Number of boundary elements with reference space dimension equal to -+ // 'dimension'-1. -+ int num_bdr_elements; -+ -+ // Each 'entity_to_vertex[geom]' describes the entities of Geometry::Type -+ // 'geom' in terms of their vertices. The number of entities of type 'geom' -+ // is: -+ // num_entities[geom] = size('entity_to_vertex[geom]')/num_vertices[geom] -+ // The number of all elements, 'num_elements', is: -+ // 'num_elements' = sum_{dim[geom]=='dimension'} num_entities[geom] -+ // and the number of all boundary elements, 'num_bdr_elements' is: -+ // 'num_bdr_elements' = sum_{dim[geom]=='dimension'-1} num_entities[geom] -+ // Note that 'entity_to_vertex' does NOT describe all "faces" in the mesh -+ // part (i.e. all 'dimension'-1 entities) but only the boundary elements. -+ Array entity_to_vertex[Geometry::NumGeom]; -+ -+ // Store the refinement flags for tetraheral elements. If all tets have zero -+ // refinement flags then this array is empty, i.e. has size 0. -+ Array tet_refine_flags; -+ -+ // "By-type" element/boundary ordering: ordered by Geometry::Type and within -+ // each Geometry::Type 'geom' ordered as in 'entity_to_vertex[geom]'. -+ -+ // Optional re-ordering of the elements that will be used by (Par)Mesh -+ // objects constructed from this MeshPart. This array maps "natural" element -+ // ids (used by the Mesh/ParMesh objects) to "by-type" element ids (see -+ // above): -+ // "by-type" element id = element_map["natural" element id] -+ // The size of the array is either 'num_elements' or 0 when no re-ordering is -+ // needed (then "by-type" id == "natural" id). -+ Array element_map; -+ -+ // Optional re-ordering for the boundary elements, similar to 'element_map'. -+ Array boundary_map; -+ -+ // Element attributes. Ordered using the "natural" element ordering defined -+ // by the array 'element_map'. The size of this array is 'num_elements'. 
-+ Array attributes; -+ -+ // Boundary element attributes. Ordered using the "natural" boundary element -+ // ordering defined by the array 'boundary_map'. The size of this array is -+ // 'num_bdr_elements'. -+ Array bdr_attributes; -+ -+ // Optional vertex coordinates. The size of the array is either -+ // size = 'space_dimension' * 'num_vertices' -+ // or 0 when the vertex coordinates are not used, i.e. when the MeshPart uses -+ // a nodal GridFunction to describe its location in physical space. This -+ // array uses Ordering::byVDIM: "X0,Y0,Z0, X1,Y1,Z1, ...". -+ Array vertex_coordinates; -+ -+ // Optional serial Mesh object constructed on demand using the method -+ // GetMesh(). One use case for it is when one wants to construct FE spaces -+ // and GridFunction%s on the MeshPart for saving or MPI communication. -+ std::unique_ptr mesh; -+ -+ // Nodal FE space defined on 'mesh' used by the GridFunction 'nodes'. Uses -+ // the FE collection from the global nodal FE space. -+ std::unique_ptr nodal_fes; -+ -+ // 'nodes': pointer to a GridFunction describing the physical location of the -+ // MeshPart. Used for describing high-order and periodic meshes. This -+ // GridFunction is defined on the FE space 'nodal_fes' which, in turn, is -+ // defined on the Mesh 'mesh'. -+ std::unique_ptr nodes; -+ -+ // Connectivity to other MeshPart objects -+ // -------------------------------------- -+ -+ // Total number of MeshParts -+ int num_parts; -+ -+ // Index of the part described by this MeshPart: -+ // 0 <= 'my_part_id' < 'num_parts' -+ int my_part_id; -+ -+ // A group G is a subsets of the set { 0, 1, ..., 'num_parts'-1 } for which -+ // there is a mesh entity E (of any dimension) in the global mesh such that -+ // G is the set of the parts assigned to the elements adjacent to E. The -+ // MeshPart describes only the "neighbor" groups, i.e. the groups that -+ // contain 'my_part_id'. The Table 'my_groups' defines the "neighbor" groups -+ // in terms of their part ids. In other words, it maps "neighbor" group ids -+ // to a (sorted) list of part ids. In particular, the number of "neighbor" -+ // groups is given by 'my_groups.Size()'. The "local" group { 'my_part_id' } -+ // has index 0 in 'my_groups'. -+ Table my_groups; -+ -+ // Shared entities for this MeshPart are mesh entities of all dimensions less -+ // than 'dimension' that are generated by the elements of this MeshPart and -+ // at least one other MeshPart. -+ // -+ // The Table 'group__shared_entity_to_vertex[geom]' defines, for each group, -+ // the shared entities of Geometry::Type 'geom'. Each row (corresponding to a -+ // "neighbor" group, as defined by 'my_groups') in the Table defines the -+ // shared entities in a way similar to the arrays 'entity_to_vertex[geom]'. -+ // The "local" group (with index 0) does not have any shared entities, so the -+ // 0-th row in the Table is always empty. -+ // -+ // IMPORTANT: the desciptions of the groups in this MeshPart must match their -+ // descriptions in all neighboring MeshParts. This includes the ordering of -+ // the shared entities within the group, as well as the vertex ordering of -+ // each shared entity. -+ Table group__shared_entity_to_vertex[Geometry::NumGeom]; -+ -+ // Write the MeshPart to a stream using the parallel format "MFEM mesh v1.2". -+ void Print(std::ostream &os) const; -+ -+ // Construct a serrial Mesh object from the MeshPart. 
The nodes of 'mesh' are -+ // NOT initialized by this method, however, the nodal FE space and nodal -+ // GridFunction can be created and then attached to the 'mesh'. The Mesh is -+ // constructed only if 'mesh' is empty, otherwise the method simply returns -+ // the object held by 'mesh'. -+ Mesh &GetMesh(); -+}; -+ -+ -+// TODO: documentation -+class MeshPartitioner -+{ -+protected: -+ Mesh &mesh; -+ int *partitioning; -+ bool own_partitioning; -+ Table part_to_element; -+ Table part_to_boundary; -+ Table edge_to_element; -+ Table vertex_to_element; -+ -+public: -+ // TODO: documentation -+ MeshPartitioner(Mesh &mesh_, int num_parts_, int *partitioning_ = NULL, -+ int part_method = 1); -+ -+ // TODO: documentation -+ void ExtractPart(int part_id, MeshPart &mesh_part) const; -+ -+ // TODO: documentation -+ std::unique_ptr -+ ExtractFESpace(MeshPart &mesh_part, -+ const FiniteElementSpace &global_fespace) const; -+ -+ // TODO: documentation -+ std::unique_ptr -+ ExtractGridFunction(MeshPart &mesh_part, -+ const GridFunction &global_gf, -+ FiniteElementSpace &local_fespace) const; -+ -+ // Destructor -+ ~MeshPartitioner(); -+}; - - - /** @brief Structure for storing mesh geometric factors: coordinates, Jacobians, -@@ -2225,7 +2413,6 @@ std::ostream &operator<<(std::ostream &out, const Mesh &mesh); - Mesh. See Mesh::GetGeometricFactors(). */ - class GeometricFactors - { -- - private: - void Compute(const GridFunction &nodes, - MemoryType d_mt = MemoryType::DEFAULT); -@@ -2273,6 +2460,7 @@ public: - Vector detJ; - }; - -+ - /** @brief Structure for storing face geometric factors: coordinates, Jacobians, - determinants of the Jacobians, and normal vectors. */ - /** Typically objects of this type are constructed and owned by objects of class -@@ -2327,6 +2515,7 @@ public: - Vector normal; - }; - -+ - /// Class used to extrude the nodes of a mesh - class NodeExtrudeCoefficient : public VectorCoefficient - { -@@ -2358,8 +2547,12 @@ inline void ShiftRight(int &a, int &b, int &c) - a = c; c = b; b = t; - } - -+/** Overload operator<< for std::ostream and Mesh; valid also for the derived -+ class ParMesh */ -+std::ostream &operator<<(std::ostream &os, const Mesh &mesh); -+ - /// @brief Print function for Mesh::FaceInformation. --std::ostream& operator<<(std::ostream& os, const Mesh::FaceInformation& info); -+std::ostream& operator<<(std::ostream &os, const Mesh::FaceInformation& info); - - } - -diff --git a/mesh/pmesh.cpp b/mesh/pmesh.cpp -index 26e2f4655..47a091c04 100644 ---- a/mesh/pmesh.cpp -+++ b/mesh/pmesh.cpp -@@ -250,6 +250,8 @@ ParMesh::ParMesh(MPI_Comm comm, Mesh &mesh, int *partitioning_, - BuildSharedVertMapping(nsvert, vert_element, vert_global_local); - delete vert_element; - -+ // FIXME: the next two lines are already done above! Any reason to do them -+ // again? - SetMeshGen(); - meshgen = mesh.meshgen; // copy the global 'meshgen' - } -@@ -1526,6 +1528,7 @@ ParMesh ParMesh::MakeSimplicial(ParMesh &orig_mesh) - void ParMesh::Finalize(bool refine, bool fix_orientation) - { - const int meshgen_save = meshgen; // Mesh::Finalize() may call SetMeshGen() -+ // 'mesh_geoms' is local, so there's no need to save and restore it. 
- - Mesh::Finalize(refine, fix_orientation); - -@@ -6213,6 +6216,7 @@ void ParMesh::ParPrint(ostream &os) const - { - os << "total_shared_faces " << sface_lface.Size() << '\n'; - } -+ os << "\n# group 0 has no shared entities\n"; - for (int gr = 1; gr < GetNGroups(); gr++) - { - { -diff --git a/mesh/pmesh.hpp b/mesh/pmesh.hpp -index 06f09dc0c..b6682defe 100644 ---- a/mesh/pmesh.hpp -+++ b/mesh/pmesh.hpp -@@ -108,6 +108,8 @@ protected: - // Convert the local 'meshgen' to a global one. - void ReduceMeshGen(); - -+ void GenerateBoundaryElements() override { /* TODO */ } -+ - // Determine sedge_ledge and sface_lface. - void FinalizeParTopo(); - -diff --git a/mesh/tetrahedron.cpp b/mesh/tetrahedron.cpp -index 0815fc0a7..c1b0ae6d8 100644 ---- a/mesh/tetrahedron.cpp -+++ b/mesh/tetrahedron.cpp -@@ -55,7 +55,7 @@ void Tetrahedron::Init(int ind1, int ind2, int ind3, int ind4, int attr, - } - - void Tetrahedron::ParseRefinementFlag(int refinement_edges[2], int &type, -- int &flag) -+ int &flag) const - { - int i, f = refinement_flag; - -@@ -136,9 +136,10 @@ void Tetrahedron::CreateRefinementFlag(int refinement_edges[2], int type, - refinement_flag |= refinement_edges[0]; - } - --void Tetrahedron::GetMarkedFace(const int face, int *fv) -+void Tetrahedron::GetMarkedFace(const int face, int *fv) const - { -- int re[2], type, flag, *tv = this->indices; -+ int re[2], type, flag; -+ const int *tv = this->indices; - ParseRefinementFlag(re, type, flag); - switch (face) - { -diff --git a/mesh/tetrahedron.hpp b/mesh/tetrahedron.hpp -index ad018a037..ef8f36eb8 100644 ---- a/mesh/tetrahedron.hpp -+++ b/mesh/tetrahedron.hpp -@@ -58,12 +58,13 @@ public: - /// Return element's type. - virtual Type GetType() const { return Element::TETRAHEDRON; } - -- void ParseRefinementFlag(int refinement_edges[2], int &type, int &flag); -+ void ParseRefinementFlag(int refinement_edges[2], int &type, -+ int &flag) const; - void CreateRefinementFlag(int refinement_edges[2], int type, int flag = 0); - -- void GetMarkedFace(const int face, int *fv); -+ void GetMarkedFace(const int face, int *fv) const; - -- int GetRefinementFlag() { return refinement_flag; } -+ int GetRefinementFlag() const { return refinement_flag; } - - void SetRefinementFlag(int rf) { refinement_flag = rf; } - -diff --git a/miniapps/meshing/makefile b/miniapps/meshing/makefile -index 1ccec0455..e34a5637e 100644 ---- a/miniapps/meshing/makefile -+++ b/miniapps/meshing/makefile -@@ -123,7 +123,7 @@ clean-build: - rm -rf *.dSYM *.TVD.*breakpoints - - clean-exec: -- @rm -f mobius-strip.mesh klein-bottle.mesh mesh-explorer.mesh -+ @rm -f mobius-strip.mesh klein-bottle.mesh mesh-explorer.mesh* - @rm -f toroid-*.mesh twist-*.mesh trimmer.mesh reflected.mesh - @rm -f partitioning.txt shaper.mesh extruder.mesh - @rm -f optimized* perturbed* polar-nc.mesh -diff --git a/miniapps/meshing/mesh-explorer.cpp b/miniapps/meshing/mesh-explorer.cpp -index f05e18e83..67e8b1f65 100644 ---- a/miniapps/meshing/mesh-explorer.cpp -+++ b/miniapps/meshing/mesh-explorer.cpp -@@ -308,6 +308,7 @@ int main (int argc, char *argv[]) - partitioning = 0; - bdr_partitioning.SetSize(mesh->GetNBE()); - bdr_partitioning = 0; -+ np = 1; - } - else - { -@@ -382,7 +383,8 @@ int main (int argc, char *argv[]) - "f) Find physical point in reference space\n" - "p) Generate a partitioning\n" - "o) Reorder elements\n" -- "S) Save in MFEM format\n" -+ "S) Save in MFEM serial format\n" -+ "D) Save in MFEM parallel format using the current partitioning\n" - "V) Save in VTK format (only linear and quadratic 
meshes)\n" - "D) Save as a DataCollection\n" - "q) Quit\n" -@@ -971,9 +973,8 @@ int main (int argc, char *argv[]) - cin >> nxyz[2]; np *= nxyz[2]; - } - } -- int *part = mesh->CartesianPartitioning(nxyz); -- partitioning = Array(part, mesh->GetNE()); -- delete [] part; -+ partitioning.MakeRef(mesh->CartesianPartitioning(nxyz), -+ mesh->GetNE(), true); - recover_bdr_partitioning(mesh, partitioning, bdr_partitioning); - } - else if (pk == 's') -@@ -984,7 +985,7 @@ int main (int argc, char *argv[]) - partitioning.SetSize(mesh->GetNE()); - for (int i = 0; i < mesh->GetNE(); i++) - { -- partitioning[i] = i * np / mesh->GetNE(); -+ partitioning[i] = (long long)i * np / mesh->GetNE(); - } - recover_bdr_partitioning(mesh, partitioning, bdr_partitioning); - } -@@ -997,9 +998,8 @@ int main (int argc, char *argv[]) - } - cout << "Enter number of processors: " << flush; - cin >> np; -- int *part = mesh->GeneratePartitioning(np, part_method); -- partitioning = Array(part, mesh->GetNE()); -- delete [] part; -+ partitioning.MakeRef(mesh->GeneratePartitioning(np, part_method), -+ mesh->GetNE(), true); - recover_bdr_partitioning(mesh, partitioning, bdr_partitioning); - } - if (partitioning) -@@ -1197,6 +1197,25 @@ int main (int argc, char *argv[]) - cout << "New mesh file: " << omesh_file << endl; - } - -+ if (mk == 'D') -+ { -+ const char mesh_prefix[] = "mesh-explorer.mesh."; -+ MeshPartitioner partitioner(*mesh, np, partitioning); -+ MeshPart mesh_part; -+ int precision; -+ cout << "Enter desired precision: " << flush; -+ cin >> precision; -+ for (int i = 0; i < np; i++) -+ { -+ partitioner.ExtractPart(i, mesh_part); -+ -+ ofstream omesh(MakeParFilename(mesh_prefix, i)); -+ omesh.precision(precision); -+ mesh_part.Print(omesh); -+ } -+ cout << "New parallel mesh files: " << mesh_prefix << "" << endl; -+ } -+ - if (mk == 'V') - { - const char omesh_file[] = "mesh-explorer.vtk"; +diff --git a/.gitignore b/.gitignore +index 030672a06..8cc9a33f7 100644 +--- a/.gitignore ++++ b/.gitignore +@@ -227,7 +227,7 @@ miniapps/meshing/mobius-strip.mesh + miniapps/meshing/klein-bottle.mesh + miniapps/meshing/toroid-*.mesh + miniapps/meshing/twist-*.mesh +-miniapps/meshing/mesh-explorer.mesh ++miniapps/meshing/mesh-explorer.mesh* + miniapps/meshing/partitioning.txt + miniapps/meshing/mesh-explorer-visit* + miniapps/meshing/mesh-explorer-paraview/ +diff --git a/examples/ex1p-test.cpp b/examples/ex1p-test.cpp +new file mode 100644 +index 000000000..9b17ae982 +--- /dev/null ++++ b/examples/ex1p-test.cpp +@@ -0,0 +1,301 @@ ++// MFEM Example 1 - Parallel Version ++// ++// Compile with: make ex1p ++// ++// Sample runs: mpirun -np 4 ex1p -m ../data/square-disc.mesh ++// mpirun -np 4 ex1p -m ../data/star.mesh ++// mpirun -np 4 ex1p -m ../data/star-mixed.mesh ++// mpirun -np 4 ex1p -m ../data/escher.mesh ++// mpirun -np 4 ex1p -m ../data/fichera.mesh ++// mpirun -np 4 ex1p -m ../data/fichera-mixed.mesh ++// mpirun -np 4 ex1p -m ../data/toroid-wedge.mesh ++// mpirun -np 4 ex1p -m ../data/octahedron.mesh -o 1 ++// mpirun -np 4 ex1p -m ../data/periodic-annulus-sector.msh ++// mpirun -np 4 ex1p -m ../data/periodic-torus-sector.msh ++// mpirun -np 4 ex1p -m ../data/square-disc-p2.vtk -o 2 ++// mpirun -np 4 ex1p -m ../data/square-disc-p3.mesh -o 3 ++// mpirun -np 4 ex1p -m ../data/square-disc-nurbs.mesh -o -1 ++// mpirun -np 4 ex1p -m ../data/star-mixed-p2.mesh -o 2 ++// mpirun -np 4 ex1p -m ../data/disc-nurbs.mesh -o -1 ++// mpirun -np 4 ex1p -m ../data/pipe-nurbs.mesh -o -1 ++// mpirun -np 4 ex1p -m ../data/ball-nurbs.mesh -o 2 
++// mpirun -np 4 ex1p -m ../data/fichera-mixed-p2.mesh -o 2 ++// mpirun -np 4 ex1p -m ../data/star-surf.mesh ++// mpirun -np 4 ex1p -m ../data/square-disc-surf.mesh ++// mpirun -np 4 ex1p -m ../data/inline-segment.mesh ++// mpirun -np 4 ex1p -m ../data/amr-quad.mesh ++// mpirun -np 4 ex1p -m ../data/amr-hex.mesh ++// mpirun -np 4 ex1p -m ../data/mobius-strip.mesh ++// mpirun -np 4 ex1p -m ../data/mobius-strip.mesh -o -1 -sc ++// ++// Device sample runs: ++// mpirun -np 4 ex1p -pa -d cuda ++// mpirun -np 4 ex1p -pa -d occa-cuda ++// mpirun -np 4 ex1p -pa -d raja-omp ++// mpirun -np 4 ex1p -pa -d ceed-cpu ++// mpirun -np 4 ex1p -pa -d ceed-cpu -o 4 -a ++// * mpirun -np 4 ex1p -pa -d ceed-cuda ++// * mpirun -np 4 ex1p -pa -d ceed-hip ++// mpirun -np 4 ex1p -pa -d ceed-cuda:/gpu/cuda/shared ++// mpirun -np 4 ex1p -m ../data/beam-tet.mesh -pa -d ceed-cpu ++// ++// Description: This example code demonstrates the use of MFEM to define a ++// simple finite element discretization of the Laplace problem ++// -Delta u = 1 with homogeneous Dirichlet boundary conditions. ++// Specifically, we discretize using a FE space of the specified ++// order, or if order < 1 using an isoparametric/isogeometric ++// space (i.e. quadratic for quadratic curvilinear mesh, NURBS for ++// NURBS mesh, etc.) ++// ++// The example highlights the use of mesh refinement, finite ++// element grid functions, as well as linear and bilinear forms ++// corresponding to the left-hand side and right-hand side of the ++// discrete linear system. We also cover the explicit elimination ++// of essential boundary conditions, static condensation, and the ++// optional connection to the GLVis tool for visualization. ++ ++#include "mfem.hpp" ++#include ++#include ++ ++using namespace std; ++using namespace mfem; ++ ++int main(int argc, char *argv[]) ++{ ++ // 1. Initialize MPI. ++ MPI_Session mpi; ++ int num_procs = mpi.WorldSize(); ++ int myid = mpi.WorldRank(); ++ ++ // 2. Parse command-line options. ++ const char *mesh_file = "../data/star.mesh"; ++ int order = 1; ++ bool static_cond = false; ++ bool pa = false; ++ const char *device_config = "cpu"; ++ bool visualization = true; ++ bool algebraic_ceed = false; ++ ++ OptionsParser args(argc, argv); ++ args.AddOption(&mesh_file, "-m", "--mesh", ++ "Mesh file to use."); ++ args.AddOption(&order, "-o", "--order", ++ "Finite element order (polynomial degree) or -1 for" ++ " isoparametric space."); ++ args.AddOption(&static_cond, "-sc", "--static-condensation", "-no-sc", ++ "--no-static-condensation", "Enable static condensation."); ++ args.AddOption(&pa, "-pa", "--partial-assembly", "-no-pa", ++ "--no-partial-assembly", "Enable Partial Assembly."); ++ args.AddOption(&device_config, "-d", "--device", ++ "Device configuration string, see Device::Configure()."); ++#ifdef MFEM_USE_CEED ++ args.AddOption(&algebraic_ceed, "-a", "--algebraic", ++ "-no-a", "--no-algebraic", ++ "Use algebraic Ceed solver"); ++#endif ++ args.AddOption(&visualization, "-vis", "--visualization", "-no-vis", ++ "--no-visualization", ++ "Enable or disable GLVis visualization."); ++ args.Parse(); ++ if (!args.Good()) ++ { ++ if (myid == 0) ++ { ++ args.PrintUsage(cout); ++ } ++ return 1; ++ } ++ if (myid == 0) ++ { ++ args.PrintOptions(cout); ++ } ++ ++ // 3. Enable hardware devices such as GPUs, and programming models such as ++ // CUDA, OCCA, RAJA and OpenMP based on command line options. ++ Device device(device_config); ++ if (myid == 0) { device.Print(); } ++ ++ // 4. 
Read the (serial) mesh from the given mesh file on all processors. We ++ // can handle triangular, quadrilateral, tetrahedral, hexahedral, surface ++ // and volume meshes with the same code. ++ Mesh mesh(mesh_file, 1, 1); ++ int dim = mesh.Dimension(); ++ ++ // 5. Refine the serial mesh on all processors to increase the resolution. In ++ // this example we do 'ref_levels' of uniform refinement. We choose ++ // 'ref_levels' to be the largest number that gives a final mesh with no ++ // more than 10,000 elements. ++ { ++ int ref_levels = ++ (int)floor(log(10000./mesh.GetNE())/log(2.)/dim); ++ for (int l = 0; l < ref_levels; l++) ++ { ++ mesh.UniformRefinement(); ++ } ++ } ++ ++ // 6. Define a parallel mesh by a partitioning of the serial mesh. Refine ++ // this mesh further in parallel to increase the resolution. Once the ++ // parallel mesh is defined, the serial mesh can be deleted. ++ // ParMesh pmesh(MPI_COMM_WORLD, mesh); ++ mesh.Clear(); ++ ifstream mesh_ifs( ++ MakeParFilename("../miniapps/meshing/mesh-explorer.mesh.", ++ myid)); ++ ParMesh pmesh(MPI_COMM_WORLD, mesh_ifs, /* refine: */ false); ++ dim = pmesh.Dimension(); ++ pmesh.PrintInfo(cout); ++ { ++ int par_ref_levels = 0; ++ for (int l = 0; l < par_ref_levels; l++) ++ { ++ pmesh.UniformRefinement(); ++ } ++ } ++ ++ // 7. Define a parallel finite element space on the parallel mesh. Here we ++ // use continuous Lagrange finite elements of the specified order. If ++ // order < 1, we instead use an isoparametric/isogeometric space. ++ FiniteElementCollection *fec; ++ bool delete_fec; ++ if (order > 0) ++ { ++ fec = new H1_FECollection(order, dim); ++ delete_fec = true; ++ } ++ else if (pmesh.GetNodes()) ++ { ++ fec = pmesh.GetNodes()->OwnFEC(); ++ delete_fec = false; ++ if (myid == 0) ++ { ++ cout << "Using isoparametric FEs: " << fec->Name() << endl; ++ } ++ } ++ else ++ { ++ fec = new H1_FECollection(order = 1, dim); ++ delete_fec = true; ++ } ++ ParFiniteElementSpace fespace(&pmesh, fec); ++ HYPRE_BigInt size = fespace.GlobalTrueVSize(); ++ if (myid == 0) ++ { ++ cout << "Number of finite element unknowns: " << size << endl; ++ } ++ ++ // 8. Determine the list of true (i.e. parallel conforming) essential ++ // boundary dofs. In this example, the boundary conditions are defined ++ // by marking all the boundary attributes from the mesh as essential ++ // (Dirichlet) and converting them to a list of true dofs. ++ Array ess_tdof_list; ++ if (pmesh.bdr_attributes.Size()) ++ { ++ Array ess_bdr(pmesh.bdr_attributes.Max()); ++ ess_bdr = 1; ++ fespace.GetEssentialTrueDofs(ess_bdr, ess_tdof_list); ++ } ++ ++ // 9. Set up the parallel linear form b(.) which corresponds to the ++ // right-hand side of the FEM linear system, which in this case is ++ // (1,phi_i) where phi_i are the basis functions in fespace. ++ ParLinearForm b(&fespace); ++ ConstantCoefficient one(1.0); ++ b.AddDomainIntegrator(new DomainLFIntegrator(one)); ++ b.Assemble(); ++ ++ // 10. Define the solution vector x as a parallel finite element grid ++ // function corresponding to fespace. Initialize x with initial guess of ++ // zero, which satisfies the boundary conditions. ++ ParGridFunction x(&fespace); ++ x = 0.0; ++ ++ // 11. Set up the parallel bilinear form a(.,.) on the finite element space ++ // corresponding to the Laplacian operator -Delta, by adding the ++ // Diffusion domain integrator. ++ ParBilinearForm a(&fespace); ++ if (pa) { a.SetAssemblyLevel(AssemblyLevel::PARTIAL); } ++ a.AddDomainIntegrator(new DiffusionIntegrator(one)); ++ ++ // 12. 
Assemble the parallel bilinear form and the corresponding linear ++ // system, applying any necessary transformations such as: parallel ++ // assembly, eliminating boundary conditions, applying conforming ++ // constraints for non-conforming AMR, static condensation, etc. ++ if (static_cond) { a.EnableStaticCondensation(); } ++ a.Assemble(); ++ ++ OperatorPtr A; ++ Vector B, X; ++ a.FormLinearSystem(ess_tdof_list, x, b, A, X, B); ++ ++ // 13. Solve the linear system A X = B. ++ // * With full assembly, use the BoomerAMG preconditioner from hypre. ++ // * With partial assembly, use Jacobi smoothing, for now. ++ Solver *prec = NULL; ++ if (pa) ++ { ++ if (UsesTensorBasis(fespace)) ++ { ++ if (algebraic_ceed) ++ { ++ prec = new ceed::AlgebraicSolver(a, ess_tdof_list); ++ } ++ else ++ { ++ prec = new OperatorJacobiSmoother(a, ess_tdof_list); ++ } ++ } ++ } ++ else ++ { ++ prec = new HypreBoomerAMG; ++ } ++ CGSolver cg(MPI_COMM_WORLD); ++ cg.SetRelTol(1e-12); ++ cg.SetMaxIter(2000); ++ cg.SetPrintLevel(1); ++ if (prec) { cg.SetPreconditioner(*prec); } ++ cg.SetOperator(*A); ++ cg.Mult(B, X); ++ delete prec; ++ ++ // 14. Recover the parallel grid function corresponding to X. This is the ++ // local finite element solution on each processor. ++ a.RecoverFEMSolution(X, b, x); ++ ++ // 15. Save the refined mesh and the solution in parallel. This output can ++ // be viewed later using GLVis: "glvis -np -m mesh -g sol". ++ { ++ ostringstream mesh_name, sol_name; ++ mesh_name << "mesh." << setfill('0') << setw(6) << myid; ++ sol_name << "sol." << setfill('0') << setw(6) << myid; ++ ++ ofstream mesh_ofs(mesh_name.str().c_str()); ++ mesh_ofs.precision(8); ++ pmesh.Print(mesh_ofs); ++ ++ ofstream sol_ofs(sol_name.str().c_str()); ++ sol_ofs.precision(8); ++ x.Save(sol_ofs); ++ } ++ ++ // 16. Send the solution by socket to a GLVis server. ++ if (visualization) ++ { ++ char vishost[] = "localhost"; ++ int visport = 19916; ++ socketstream sol_sock(vishost, visport); ++ sol_sock << "parallel " << num_procs << " " << myid << "\n"; ++ sol_sock.precision(8); ++ sol_sock << "solution\n" << pmesh << x << flush; ++ } ++ ++ // 17. Free the used memory. ++ if (delete_fec) ++ { ++ delete fec; ++ } ++ ++ return 0; ++} +diff --git a/fem/pfespace.cpp b/fem/pfespace.cpp +index 76ac230a1..8761e1489 100644 +--- a/fem/pfespace.cpp ++++ b/fem/pfespace.cpp +@@ -716,13 +716,10 @@ void ParFiniteElementSpace::CheckNDSTriaDofs() + } + + // Check for shared triangle faces +- bool strias = false; ++ bool strias = false; ++ for (int g = 1; g < pmesh->GetNGroups(); g++) + { +- int ngrps = pmesh->GetNGroups(); +- for (int g = 1; g < ngrps; g++) +- { +- strias |= pmesh->GroupNTriangles(g); +- } ++ strias |= pmesh->GroupNTriangles(g); + } + + // Combine results +diff --git a/general/array.hpp b/general/array.hpp +index 0f16b3023..a19b099a1 100644 +--- a/general/array.hpp ++++ b/general/array.hpp +@@ -77,8 +77,8 @@ public: + /** @brief Creates array using an existing c-array of asize elements; + allocsize is set to -asize to indicate that the data will not + be deleted. */ +- inline Array(T *data_, int asize) +- { data.Wrap(data_, asize, false); size = asize; } ++ inline Array(T *data_, int asize, bool own_data = false) ++ { data.Wrap(data_, asize, own_data); size = asize; } + + /// Copy constructor: deep copy from @a src + /** This method supports source arrays using any MemoryType. */ +@@ -206,7 +206,7 @@ public: + inline void Copy(Array ©) const; + + /// Make this Array a reference to a pointer. 
+- inline void MakeRef(T *, int); ++ inline void MakeRef(T *data_, int size_, bool own_data = false); + + /// Make this Array a reference to 'master'. + inline void MakeRef(const Array &master); +@@ -869,11 +869,11 @@ inline void Array::Copy(Array ©) const + } + + template +-inline void Array::MakeRef(T *p, int s) ++inline void Array::MakeRef(T *data_, int size_, bool own_data) + { + data.Delete(); +- data.Wrap(p, s, false); +- size = s; ++ data.Wrap(data_, size_, own_data); ++ size = size_; + } + + template +diff --git a/general/communication.cpp b/general/communication.cpp +index 0c2fffc1f..e8002a273 100644 +--- a/general/communication.cpp ++++ b/general/communication.cpp +@@ -275,7 +275,7 @@ void GroupTopology::Save(ostream &os) const + os << "\ncommunication_groups\n"; + os << "number_of_groups " << NGroups() << "\n\n"; + +- os << "# number of entities in each group, followed by group ids in group\n"; ++ os << "# number of entities in each group, followed by ranks in group\n"; + for (int group_id = 0; group_id < NGroups(); ++group_id) + { + int group_size = GetGroupSize(group_id); +diff --git a/general/table.hpp b/general/table.hpp +index 2ed9f4a1b..96373b2d1 100644 +--- a/general/table.hpp ++++ b/general/table.hpp +@@ -208,6 +208,7 @@ void Transpose (const Table &A, Table &At, int ncols_A_ = -1); + Table * Transpose (const Table &A); + + /// Transpose an Array ++/** @note The column (TYPE II) indices in each row of @a At will be sorted. */ + void Transpose(const Array &A, Table &At, int ncols_A_ = -1); + + /// C = A * B (as boolean matrices) +diff --git a/mesh/mesh.cpp b/mesh/mesh.cpp +index a8ec98649..5f82de812 100644 +--- a/mesh/mesh.cpp ++++ b/mesh/mesh.cpp +@@ -19,6 +19,7 @@ + #include "../general/device.hpp" + #include "../general/tic_toc.hpp" + #include "../general/gecko.hpp" ++#include "../general/sets.hpp" + #include "../fem/quadinterpolator.hpp" + + #include +@@ -31,6 +32,7 @@ + #include + #include + #include ++#include + + // Include the METIS header, if using version 5. If using METIS 4, the needed + // declarations are inlined below, i.e. no header is needed. 
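// Illustrative sketch (editor's addition, not from the patch itself): what the
// new 'own_data' flag on Array::MakeRef enables. An Array<int> can now adopt
// the heap buffer returned by Mesh::GeneratePartitioning() and release it in
// its destructor, which is exactly how the mesh-explorer changes below drop
// their manual 'delete [] part;'. The function name is hypothetical.
#include "mfem.hpp"

void UseOwningRef(mfem::Mesh &mesh, int num_parts)
{
   mfem::Array<int> partitioning;
   // own_data = true: 'partitioning' now owns the int[GetNE()] allocation.
   partitioning.MakeRef(mesh.GeneratePartitioning(num_parts),
                        mesh.GetNE(), true);
   // ... use 'partitioning' ...
}  // buffer freed here; no explicit delete [] required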
+@@ -1290,7 +1292,7 @@ Mesh::FaceInformation::operator Mesh::FaceInfo() const + return res; + } + +-std::ostream& operator<<(std::ostream& os, const Mesh::FaceInformation& info) ++std::ostream &operator<<(std::ostream &os, const Mesh::FaceInformation& info) + { + os << "face topology="; + switch (info.topology) +@@ -2989,7 +2991,7 @@ void Mesh::FinalizeTopology(bool generate_bdr) + { + GetElementToFaceTable(); + GenerateFaces(); +- if (NumOfBdrElements == 0 && generate_bdr) ++ if (ReduceInt(NumOfBdrElements) == 0 && generate_bdr) + { + GenerateBoundaryElements(); + GetElementToFaceTable(); // update be_to_face +@@ -3009,7 +3011,7 @@ void Mesh::FinalizeTopology(bool generate_bdr) + if (Dim == 2) + { + GenerateFaces(); // 'Faces' in 2D refers to the edges +- if (NumOfBdrElements == 0 && generate_bdr) ++ if (ReduceInt(NumOfBdrElements) == 0 && generate_bdr) + { + GenerateBoundaryElements(); + } +@@ -3023,7 +3025,7 @@ void Mesh::FinalizeTopology(bool generate_bdr) + if (Dim == 1) + { + GenerateFaces(); +- if (NumOfBdrElements == 0 && generate_bdr) ++ if (ReduceInt(NumOfBdrElements) == 0 && generate_bdr) + { + // be_to_face will be set inside GenerateBoundaryElements + GenerateBoundaryElements(); +@@ -5643,6 +5645,12 @@ const FiniteElementSpace *Mesh::GetNodalFESpace() const + + void Mesh::SetCurvature(int order, bool discont, int space_dim, int ordering) + { ++ if (order <= 0) ++ { ++ delete Nodes; ++ Nodes = nullptr; ++ return; ++ } + space_dim = (space_dim == -1) ? spaceDim : space_dim; + FiniteElementCollection* nfec; + if (discont) +@@ -10685,7 +10693,7 @@ void Mesh::Printer(std::ostream &os, std::string section_delimiter) const + } + } + +-void Mesh::PrintTopo(std::ostream &os,const Array &e_to_k) const ++void Mesh::PrintTopo(std::ostream &os, const Array &e_to_k) const + { + int i; + Array vert; +@@ -12556,6 +12564,878 @@ void Mesh::GetGeometricParametersFromJacobian(const DenseMatrix &J, + } + + ++MeshPart::EntityHelper::EntityHelper( ++ int dim_, const Array (&entity_to_vertex_)[Geometry::NumGeom]) ++ : dim(dim_), ++ entity_to_vertex(entity_to_vertex_) ++{ ++ int geom_offset = 0; ++ for (int g = Geometry::DimStart[dim]; g < Geometry::DimStart[dim+1]; g++) ++ { ++ geom_offsets[g] = geom_offset; ++ geom_offset += entity_to_vertex[g].Size()/Geometry::NumVerts[g]; ++ } ++ geom_offsets[Geometry::DimStart[dim+1]] = geom_offset; ++ num_entities = geom_offset; ++} ++ ++MeshPart::Entity MeshPart::EntityHelper::FindEntity(int bytype_entity_id) ++{ ++ // Find the 'geom' that corresponds to 'bytype_entity_id' ++ int geom = Geometry::DimStart[dim]; ++ while (geom_offsets[geom+1] <= bytype_entity_id) { geom++; } ++ MFEM_ASSERT(geom < Geometry::NumGeom, "internal error"); ++ MFEM_ASSERT(Geometry::Dimension[geom] == dim, "internal error"); ++ const int nv = Geometry::NumVerts[geom]; ++ const int geom_elem_id = bytype_entity_id - geom_offsets[geom]; ++ const int *v = &entity_to_vertex[geom][nv*geom_elem_id]; ++ return { geom, nv, v }; ++} ++ ++void MeshPart::Print(std::ostream &os) const ++{ ++ os << "MFEM mesh v1.2\n"; ++ ++ // optional ++ os << ++ "\n#\n# MFEM Geometry Types (see mesh/geom.hpp):\n#\n" ++ "# POINT = 0\n" ++ "# SEGMENT = 1\n" ++ "# TRIANGLE = 2\n" ++ "# SQUARE = 3\n" ++ "# TETRAHEDRON = 4\n" ++ "# CUBE = 5\n" ++ "# PRISM = 6\n" ++ "# PYRAMID = 7\n" ++ "#\n"; ++ ++ const int dim = dimension; ++ os << "\ndimension\n" << dim; ++ ++ os << "\n\nelements\n" << num_elements << '\n'; ++ { ++ const bool have_element_map = (element_map.Size() == num_elements); ++ MFEM_ASSERT(have_element_map || 
element_map.Size() == 0,
++                  "invalid MeshPart state");
++      EntityHelper elem_helper(dim, entity_to_vertex);
++      MFEM_ASSERT(elem_helper.num_entities == num_elements,
++                  "invalid MeshPart state");
++      for (int nat_elem_id = 0; nat_elem_id < num_elements; nat_elem_id++)
++      {
++         const int bytype_elem_id = have_element_map ?
++                                    element_map[nat_elem_id] : nat_elem_id;
++         const Entity ent = elem_helper.FindEntity(bytype_elem_id);
++         // Print the element
++         os << attributes[nat_elem_id] << ' ' << ent.geom;
++         for (int i = 0; i < ent.num_verts; i++)
++         {
++            os << ' ' << ent.verts[i];
++         }
++         os << '\n';
++      }
++   }
++
++   os << "\nboundary\n" << num_bdr_elements << '\n';
++   {
++      const bool have_boundary_map = (boundary_map.Size() == num_bdr_elements);
++      MFEM_ASSERT(have_boundary_map || boundary_map.Size() == 0,
++                  "invalid MeshPart state");
++      EntityHelper bdr_helper(dim-1, entity_to_vertex);
++      MFEM_ASSERT(bdr_helper.num_entities == num_bdr_elements,
++                  "invalid MeshPart state");
++      for (int nat_bdr_id = 0; nat_bdr_id < num_bdr_elements; nat_bdr_id++)
++      {
++         const int bytype_bdr_id = have_boundary_map ?
++                                   boundary_map[nat_bdr_id] : nat_bdr_id;
++         const Entity ent = bdr_helper.FindEntity(bytype_bdr_id);
++         // Print the boundary element
++         os << bdr_attributes[nat_bdr_id] << ' ' << ent.geom;
++         for (int i = 0; i < ent.num_verts; i++)
++         {
++            os << ' ' << ent.verts[i];
++         }
++         os << '\n';
++      }
++   }
++
++   os << "\nvertices\n" << num_vertices << '\n';
++   if (!nodes)
++   {
++      const int sdim = space_dimension;
++      os << sdim << '\n';
++      for (int i = 0; i < num_vertices; i++)
++      {
++         os << vertex_coordinates[i*sdim];
++         for (int d = 1; d < sdim; d++)
++         {
++            os << ' ' << vertex_coordinates[i*sdim+d];
++         }
++         os << '\n';
++      }
++   }
++   else
++   {
++      os << "\nnodes\n";
++      nodes->Save(os);
++   }
++
++   os << "\nmfem_serial_mesh_end\n";
++
++   // Start: GroupTopology::Save
++   const int num_groups = my_groups.Size();
++   os << "\ncommunication_groups\n";
++   os << "number_of_groups " << num_groups << "\n\n";
++
++   os << "# number of entities in each group, followed by ranks in group\n";
++   for (int group_id = 0; group_id < num_groups; ++group_id)
++   {
++      const int group_size = my_groups.RowSize(group_id);
++      const int *group_ptr = my_groups.GetRow(group_id);
++      os << group_size;
++      for (int group_member_index = 0; group_member_index < group_size;
++           ++group_member_index)
++      {
++         os << ' ' << group_ptr[group_member_index];
++      }
++      os << '\n';
++   }
++   // End: GroupTopology::Save
++
++   const Table &g2v = group__shared_entity_to_vertex[Geometry::POINT];
++   const Table &g2ev = group__shared_entity_to_vertex[Geometry::SEGMENT];
++   const Table &g2tv = group__shared_entity_to_vertex[Geometry::TRIANGLE];
++   const Table &g2qv = group__shared_entity_to_vertex[Geometry::SQUARE];
++
++   MFEM_VERIFY(g2v.RowSize(0) == 0, "internal error");
++   os << "\ntotal_shared_vertices " << g2v.Size_of_connections() << '\n';
++   if (dimension >= 2)
++   {
++      MFEM_VERIFY(g2ev.RowSize(0) == 0, "internal error");
++      os << "total_shared_edges " << g2ev.Size_of_connections()/2 << '\n';
++   }
++   if (dimension >= 3)
++   {
++      MFEM_VERIFY(g2tv.RowSize(0) == 0, "internal error");
++      MFEM_VERIFY(g2qv.RowSize(0) == 0, "internal error");
++      const int total_shared_faces =
++         g2tv.Size_of_connections()/3 + g2qv.Size_of_connections()/4;
++      os << "total_shared_faces " << total_shared_faces << '\n';
++   }
++   os << "\n# group 0 has no shared entities\n";
++   for (int gr = 1; gr < num_groups; gr++)
++   {
++      {
++         const int nv = g2v.RowSize(gr);
++         const int *sv =
g2v.GetRow(gr); ++ os << "\n# group " << gr << "\nshared_vertices " << nv << '\n'; ++ for (int i = 0; i < nv; i++) ++ { ++ os << sv[i] << '\n'; ++ } ++ } ++ if (dimension >= 2) ++ { ++ const int ne = g2ev.RowSize(gr)/2; ++ const int *se = g2ev.GetRow(gr); ++ os << "\nshared_edges " << ne << '\n'; ++ for (int i = 0; i < ne; i++) ++ { ++ const int *v = se + 2*i; ++ os << v[0] << ' ' << v[1] << '\n'; ++ } ++ } ++ if (dimension >= 3) ++ { ++ const int nt = g2tv.RowSize(gr)/3; ++ const int *st = g2tv.GetRow(gr); ++ const int nq = g2qv.RowSize(gr)/4; ++ const int *sq = g2qv.GetRow(gr); ++ os << "\nshared_faces " << nt+nq << '\n'; ++ for (int i = 0; i < nt; i++) ++ { ++ os << Geometry::TRIANGLE; ++ const int *v = st + 3*i; ++ for (int j = 0; j < 3; j++) { os << ' ' << v[j]; } ++ os << '\n'; ++ } ++ for (int i = 0; i < nq; i++) ++ { ++ os << Geometry::SQUARE; ++ const int *v = sq + 4*i; ++ for (int j = 0; j < 4; j++) { os << ' ' << v[j]; } ++ os << '\n'; ++ } ++ } ++ } ++ ++ // Write out section end tag for mesh. ++ os << "mfem_mesh_end" << endl; ++} ++ ++Mesh &MeshPart::GetMesh() ++{ ++ if (mesh) { return *mesh; } ++ ++ mesh.reset(new Mesh(dimension, ++ num_vertices, ++ num_elements, ++ num_bdr_elements, ++ space_dimension)); ++ ++ // Add elements ++ { ++ const bool have_element_map = (element_map.Size() == num_elements); ++ MFEM_ASSERT(have_element_map || element_map.Size() == 0, ++ "invalid MeshPart state"); ++ EntityHelper elem_helper(dimension, entity_to_vertex); ++ MFEM_ASSERT(elem_helper.num_entities == num_elements, ++ "invalid MeshPart state"); ++ const bool have_tet_refine_flags = (tet_refine_flags.Size() > 0); ++ for (int nat_elem_id = 0; nat_elem_id < num_elements; nat_elem_id++) ++ { ++ const int bytype_elem_id = have_element_map ? ++ element_map[nat_elem_id] : nat_elem_id; ++ const Entity ent = elem_helper.FindEntity(bytype_elem_id); ++ Element *el = mesh->NewElement(ent.geom); ++ el->SetVertices(ent.verts); ++ el->SetAttribute(attributes[nat_elem_id]); ++ if (ent.geom == Geometry::TETRAHEDRON && have_tet_refine_flags) ++ { ++ constexpr int geom_tet = Geometry::TETRAHEDRON; ++ const int tet_id = (ent.verts - entity_to_vertex[geom_tet])/4; ++ const int ref_flag = tet_refine_flags[tet_id]; ++ static_cast(el)->SetRefinementFlag(ref_flag); ++ } ++ mesh->AddElement(el); ++ } ++ } ++ ++ // Add boundary elements ++ { ++ const bool have_boundary_map = (boundary_map.Size() == num_bdr_elements); ++ MFEM_ASSERT(have_boundary_map || boundary_map.Size() == 0, ++ "invalid MeshPart state"); ++ EntityHelper bdr_helper(dimension-1, entity_to_vertex); ++ MFEM_ASSERT(bdr_helper.num_entities == num_bdr_elements, ++ "invalid MeshPart state"); ++ for (int nat_bdr_id = 0; nat_bdr_id < num_bdr_elements; nat_bdr_id++) ++ { ++ const int bytype_bdr_id = have_boundary_map ? 
++ boundary_map[nat_bdr_id] : nat_bdr_id; ++ const Entity ent = bdr_helper.FindEntity(bytype_bdr_id); ++ Element *bdr = mesh->NewElement(ent.geom); ++ bdr->SetVertices(ent.verts); ++ bdr->SetAttribute(bdr_attributes[nat_bdr_id]); ++ mesh->AddBdrElement(bdr); ++ } ++ } ++ ++ // Add vertices ++ if (vertex_coordinates.Size() == space_dimension*num_vertices) ++ { ++ MFEM_ASSERT(!nodes, "invalid MeshPart state"); ++ for (int vert_id = 0; vert_id < num_vertices; vert_id++) ++ { ++ mesh->AddVertex(vertex_coordinates + space_dimension*vert_id); ++ } ++ } ++ else ++ { ++ MFEM_ASSERT(vertex_coordinates.Size() == 0, "invalid MeshPart state"); ++ for (int vert_id = 0; vert_id < num_vertices; vert_id++) ++ { ++ mesh->AddVertex(0., 0., 0.); ++ } ++ // 'mesh.Nodes' cannot be set here -- they can be set later, if needed ++ } ++ ++ mesh->FinalizeTopology(/* generate_bdr: */ false); ++ ++ return *mesh; ++} ++ ++ ++MeshPartitioner::MeshPartitioner(Mesh &mesh_, ++ int num_parts_, ++ int *partitioning_, ++ int part_method) ++ : mesh(mesh_), ++ partitioning(partitioning_), ++ own_partitioning(false) ++{ ++ if (partitioning == nullptr) ++ { ++ partitioning = mesh.GeneratePartitioning(num_parts_, part_method); ++ own_partitioning = true; ++ } ++ ++ Transpose(Array(partitioning, mesh.GetNE()), ++ part_to_element, num_parts_); ++ // Note: the element ids in each row of 'part_to_element' are sorted. ++ ++ const int dim = mesh.Dimension(); ++ if (dim >= 2) ++ { ++ Transpose(mesh.ElementToEdgeTable(), edge_to_element, mesh.GetNEdges()); ++ } ++ ++ Array boundary_to_part(mesh.GetNBE()); ++ // Same logic as in ParMesh::BuildLocalBoundary ++ if (dim >= 3) ++ { ++ for (int i = 0; i < boundary_to_part.Size(); i++) ++ { ++ int face, o, el1, el2; ++ mesh.GetBdrElementFace(i, &face, &o); ++ mesh.GetFaceElements(face, &el1, &el2); ++ boundary_to_part[i] = ++ partitioning[(o % 2 == 0 || el2 < 0) ? el1 : el2]; ++ } ++ } ++ else if (dim == 2) ++ { ++ for (int i = 0; i < boundary_to_part.Size(); i++) ++ { ++ int edge = mesh.GetBdrElementFaceIndex(i); ++ int el1 = edge_to_element.GetRow(edge)[0]; ++ boundary_to_part[i] = partitioning[el1]; ++ } ++ } ++ else if (dim == 1) ++ { ++ for (int i = 0; i < boundary_to_part.Size(); i++) ++ { ++ int vert = mesh.GetBdrElementFaceIndex(i); ++ int el1, el2; ++ mesh.GetFaceElements(vert, &el1, &el2); ++ boundary_to_part[i] = partitioning[el1]; ++ } ++ } ++ Transpose(boundary_to_part, part_to_boundary, num_parts_); ++ // Note: the boundary element ids in each row of 'part_to_boundary' are ++ // sorted. 
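// A minimal standalone sketch (plain C++, hypothetical names -- not MFEM API) of
// the inversion performed above: turning an element -> part assignment into
// sorted per-part element lists, which is the role Transpose() plays for
// 'part_to_element' and 'part_to_boundary'.
#include <vector>

std::vector<std::vector<int>> InvertPartitioning(const std::vector<int> &elem_to_part,
                                                 int num_parts)
{
   std::vector<std::vector<int>> part_to_elems(num_parts);
   for (int e = 0; e < static_cast<int>(elem_to_part.size()); e++)
   {
      // Elements are visited in increasing order, so each row stays sorted.
      part_to_elems[elem_to_part[e]].push_back(e);
   }
   return part_to_elems;
}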
++ boundary_to_part.DeleteAll(); ++ ++ Table *vert_element = mesh.GetVertexToElementTable(); // we must delete this ++ vertex_to_element.Swap(*vert_element); ++ delete vert_element; ++} ++ ++void MeshPartitioner::ExtractPart(int part_id, MeshPart &mesh_part) const ++{ ++ const int num_parts = part_to_element.Size(); ++ ++ MFEM_VERIFY(0 <= part_id && part_id < num_parts, ++ "invalid part_id = " << part_id ++ << ", num_parts = " << num_parts); ++ ++ const int dim = mesh.Dimension(); ++ const int sdim = mesh.SpaceDimension(); ++ const int num_elems = part_to_element.RowSize(part_id); ++ const int *elem_list = part_to_element.GetRow(part_id); // sorted ++ const int num_bdr_elems = part_to_boundary.RowSize(part_id); ++ const int *bdr_elem_list = part_to_boundary.GetRow(part_id); // sorted ++ ++ // Initialize 'mesh_part' ++ mesh_part.dimension = dim; ++ mesh_part.space_dimension = sdim; ++ mesh_part.num_vertices = 0; ++ mesh_part.num_elements = num_elems; ++ mesh_part.num_bdr_elements = num_bdr_elems; ++ for (int g = 0; g < Geometry::NumGeom; g++) ++ { ++ mesh_part.entity_to_vertex[g].SetSize(0); // can reuse Array allocation ++ } ++ mesh_part.tet_refine_flags.SetSize(0); ++ mesh_part.element_map.SetSize(0); // 0 or 'num_elements', if needed ++ mesh_part.boundary_map.SetSize(0); // 0 or 'num_bdr_elements', if needed ++ mesh_part.attributes.SetSize(num_elems); ++ mesh_part.bdr_attributes.SetSize(num_bdr_elems); ++ mesh_part.vertex_coordinates.SetSize(0); ++ ++ mesh_part.num_parts = num_parts; ++ mesh_part.my_part_id = part_id; ++ mesh_part.my_groups.Clear(); ++ for (int g = 0; g < Geometry::NumGeom; g++) ++ { ++ mesh_part.group__shared_entity_to_vertex[g].Clear(); ++ } ++ mesh_part.nodes.reset(nullptr); ++ mesh_part.nodal_fes.reset(nullptr); ++ mesh_part.mesh.reset(nullptr); ++ ++ // Initialize: ++ // - 'mesh_part.entity_to_vertex' for the elements (boundary elements are ++ // set later); vertex ids are global at this point - they will be mapped to ++ // local ids later ++ // - 'mesh_part.attributes' ++ // - 'mesh_part.tet_refine_flags' if needed ++ int geom_marker = 0, num_geom = 0; ++ for (int i = 0; i < num_elems; i++) ++ { ++ const Element *elem = mesh.GetElement(elem_list[i]); ++ const int geom = elem->GetGeometryType(); ++ const int nv = Geometry::NumVerts[geom]; ++ const int *v = elem->GetVertices(); ++ MFEM_VERIFY(numeric_limits::max() - nv >= ++ mesh_part.entity_to_vertex[geom].Size(), ++ "overflow in 'entity_to_vertex[geom]', geom: " ++ << Geometry::Name[geom]); ++ mesh_part.entity_to_vertex[geom].Append(v, nv); ++ mesh_part.attributes[i] = elem->GetAttribute(); ++ if (geom == Geometry::TETRAHEDRON) ++ { ++ // Create 'mesh_part.tet_refine_flags' but only if we find at least one ++ // non-zero flag in a tetrahedron. 
++ const Tetrahedron *tet = static_cast(elem); ++ const int ref_flag = tet->GetRefinementFlag(); ++ if (mesh_part.tet_refine_flags.Size() == 0) ++ { ++ if (ref_flag) ++ { ++ // This is the first time we encounter non-zero 'ref_flag' ++ const int num_tets = mesh_part.entity_to_vertex[geom].Size()/nv; ++ mesh_part.tet_refine_flags.SetSize(num_tets, 0); ++ mesh_part.tet_refine_flags.Last() = ref_flag; ++ } ++ } ++ else ++ { ++ mesh_part.tet_refine_flags.Append(ref_flag); ++ } ++ } ++ if ((geom_marker & (1 << geom)) == 0) ++ { ++ geom_marker |= (1 << geom); ++ num_geom++; ++ } ++ } ++ MFEM_ASSERT(mesh_part.tet_refine_flags.Size() == 0 || ++ mesh_part.tet_refine_flags.Size() == ++ mesh_part.entity_to_vertex[Geometry::TETRAHEDRON].Size()/4, ++ "internal error"); ++ // Initialize 'mesh_part.element_map' if needed ++ if (num_geom > 1) ++ { ++ int offsets[Geometry::NumGeom]; ++ int offset = 0; ++ for (int g = Geometry::DimStart[dim]; g < Geometry::DimStart[dim+1]; g++) ++ { ++ offsets[g] = offset; ++ offset += mesh_part.entity_to_vertex[g].Size()/Geometry::NumVerts[g]; ++ } ++ mesh_part.element_map.SetSize(num_elems); ++ for (int i = 0; i < num_elems; i++) ++ { ++ const int geom = mesh.GetElementGeometry(elem_list[i]); ++ mesh_part.element_map[i] = offsets[geom]++; ++ } ++ } ++ ++ // Initialize: ++ // - 'mesh_part.entity_to_vertex' for the boundary elements; vertex ids are ++ // global at this point - they will be mapped to local ids later ++ // - 'mesh_part.bdr_attributes' ++ geom_marker = 0; num_geom = 0; ++ for (int i = 0; i < num_bdr_elems; i++) ++ { ++ const Element *bdr_elem = mesh.GetBdrElement(bdr_elem_list[i]); ++ const int geom = bdr_elem->GetGeometryType(); ++ const int nv = Geometry::NumVerts[geom]; ++ const int *v = bdr_elem->GetVertices(); ++ MFEM_VERIFY(numeric_limits::max() - nv >= ++ mesh_part.entity_to_vertex[geom].Size(), ++ "overflow in 'entity_to_vertex[geom]', geom: " ++ << Geometry::Name[geom]); ++ mesh_part.entity_to_vertex[geom].Append(v, nv); ++ mesh_part.bdr_attributes[i] = bdr_elem->GetAttribute(); ++ if ((geom_marker & (1 << geom)) == 0) ++ { ++ geom_marker |= (1 << geom); ++ num_geom++; ++ } ++ } ++ // Initialize 'mesh_part.boundary_map' if needed ++ if (num_geom > 1) ++ { ++ int offsets[Geometry::NumGeom]; ++ int offset = 0; ++ for (int g = Geometry::DimStart[dim-1]; g < Geometry::DimStart[dim]; g++) ++ { ++ offsets[g] = offset; ++ offset += mesh_part.entity_to_vertex[g].Size()/Geometry::NumVerts[g]; ++ } ++ mesh_part.boundary_map.SetSize(num_bdr_elems); ++ for (int i = 0; i < num_bdr_elems; i++) ++ { ++ const int geom = mesh.GetBdrElementGeometry(bdr_elem_list[i]); ++ mesh_part.boundary_map[i] = offsets[geom]++; ++ } ++ } ++ ++ // Create the vertex id map, 'vertex_loc_to_glob', which maps local ids to ++ // global ones; the map is sorted, preserving the global ordering. 
++ Array vertex_loc_to_glob; ++ { ++ std::unordered_set vertex_set; ++ for (int i = 0; i < num_elems; i++) ++ { ++ const Element *elem = mesh.GetElement(elem_list[i]); ++ const int geom = elem->GetGeometryType(); ++ const int nv = Geometry::NumVerts[geom]; ++ const int *v = elem->GetVertices(); ++ vertex_set.insert(v, v + nv); ++ } ++ vertex_loc_to_glob.SetSize(vertex_set.size()); ++ std::copy(vertex_set.begin(), vertex_set.end(), // src ++ vertex_loc_to_glob.begin()); // dest ++ } ++ vertex_loc_to_glob.Sort(); ++ ++ // Initialize 'mesh_part.num_vertices' ++ mesh_part.num_vertices = vertex_loc_to_glob.Size(); ++ ++ // Update the vertex ids in the arrays 'mesh_part.entity_to_vertex' from ++ // global to local. ++ for (int g = 0; g < Geometry::NumGeom; g++) ++ { ++ Array &vert_array = mesh_part.entity_to_vertex[g]; ++ for (int i = 0; i < vert_array.Size(); i++) ++ { ++ const int glob_id = vert_array[i]; ++ const int loc_id = vertex_loc_to_glob.FindSorted(glob_id); ++ MFEM_ASSERT(loc_id >= 0, "internal error: global vertex id not found"); ++ vert_array[i] = loc_id; ++ } ++ } ++ ++ // Initialize one of 'mesh_part.vertex_coordinates' or 'mesh_part.nodes' ++ if (!mesh.GetNodes()) ++ { ++ MFEM_VERIFY(numeric_limits::max()/sdim >= vertex_loc_to_glob.Size(), ++ "overflow in 'vertex_coordinates', num_vertices = " ++ << vertex_loc_to_glob.Size() << ", sdim = " << sdim); ++ mesh_part.vertex_coordinates.SetSize(sdim*vertex_loc_to_glob.Size()); ++ for (int i = 0; i < vertex_loc_to_glob.Size(); i++) ++ { ++ const double *coord = mesh.GetVertex(vertex_loc_to_glob[i]); ++ for (int d = 0; d < sdim; d++) ++ { ++ mesh_part.vertex_coordinates[i*sdim+d] = coord[d]; ++ } ++ } ++ } ++ else ++ { ++ const GridFunction &glob_nodes = *mesh.GetNodes(); ++ mesh_part.nodal_fes = ExtractFESpace(mesh_part, *glob_nodes.FESpace()); ++ // Initialized 'mesh_part.mesh'. ++ // Note: the nodes of 'mesh_part.mesh' are not set. ++ ++ mesh_part.nodes = ExtractGridFunction(mesh_part, glob_nodes, ++ *mesh_part.nodal_fes); ++ ++ // Attach the 'mesh_part.nodes' to the 'mesh_part.mesh'. ++ mesh_part.mesh->NewNodes(*mesh_part.nodes, /* make_owner: */ false); ++ // Note: the vertices of 'mesh_part.mesh' are not set. ++ } ++ ++ // Begin constructing the "neighbor" groups, i.e. the groups that contain ++ // 'part_id'. ++ ListOfIntegerSets groups; ++ { ++ // the first group is the local one ++ IntegerSet group; ++ group.Recreate(1, &part_id); ++ groups.Insert(group); ++ } ++ ++ // 'shared_faces' : shared face id -> (global_face_id, group_id) ++ // Note: 'shared_faces' will be sorted by 'global_face_id'. ++ Array> shared_faces; ++ ++ // Add "neighbor" groups defined by faces ++ // Construct 'shared_faces'. ++ if (dim >= 3) ++ { ++ std::unordered_set face_set; ++ // Construct 'face_set' ++ const Table &elem_to_face = mesh.ElementToFaceTable(); ++ for (int loc_elem_id = 0; loc_elem_id < num_elems; loc_elem_id++) ++ { ++ const int glob_elem_id = elem_list[loc_elem_id]; ++ const int nfaces = elem_to_face.RowSize(glob_elem_id); ++ const int *faces = elem_to_face.GetRow(glob_elem_id); ++ face_set.insert(faces, faces + nfaces); ++ } ++ // Construct 'shared_faces'; add "neighbor" groups defined by faces. 
++ IntegerSet group; ++ for (int glob_face_id : face_set) ++ { ++ int el[2]; ++ mesh.GetFaceElements(glob_face_id, &el[0], &el[1]); ++ if (el[1] < 0) { continue; } ++ el[0] = partitioning[el[0]]; ++ el[1] = partitioning[el[1]]; ++ MFEM_ASSERT(el[0] == part_id || el[1] == part_id, "internal error"); ++ if (el[0] != part_id || el[1] != part_id) ++ { ++ group.Recreate(2, el); ++ const int group_id = groups.Insert(group); ++ shared_faces.Append(Pair(glob_face_id, group_id)); ++ } ++ } ++ shared_faces.Sort(); // sort the shared faces by 'glob_face_id' ++ } ++ ++ // 'shared_edges' : shared edge id -> (global_edge_id, group_id) ++ // Note: 'shared_edges' will be sorted by 'global_edge_id'. ++ Array> shared_edges; ++ ++ // Add "neighbor" groups defined by edges. ++ // Construct 'shared_edges'. ++ if (dim >= 2) ++ { ++ std::unordered_set edge_set; ++ // Construct 'edge_set' ++ const Table &elem_to_edge = mesh.ElementToEdgeTable(); ++ for (int loc_elem_id = 0; loc_elem_id < num_elems; loc_elem_id++) ++ { ++ const int glob_elem_id = elem_list[loc_elem_id]; ++ const int nedges = elem_to_edge.RowSize(glob_elem_id); ++ const int *edges = elem_to_edge.GetRow(glob_elem_id); ++ edge_set.insert(edges, edges + nedges); ++ } ++ // Construct 'shared_edges'; add "neighbor" groups defined by edges. ++ IntegerSet group; ++ for (int glob_edge_id : edge_set) ++ { ++ const int nelem = edge_to_element.RowSize(glob_edge_id); ++ const int *elem = edge_to_element.GetRow(glob_edge_id); ++ Array &gr = group; // reference to the 'group' internal Array ++ gr.SetSize(nelem); ++ for (int j = 0; j < nelem; j++) ++ { ++ gr[j] = partitioning[elem[j]]; ++ } ++ gr.Sort(); ++ gr.Unique(); ++ MFEM_ASSERT(gr.FindSorted(part_id) >= 0, "internal error"); ++ if (group.Size() > 1) ++ { ++ const int group_id = groups.Insert(group); ++ shared_edges.Append(Pair(glob_edge_id, group_id)); ++ } ++ } ++ shared_edges.Sort(); // sort the shared edges by 'glob_edge_id' ++ } ++ ++ // 'shared_verts' : shared vertex id -> (global_vertex_id, group_id) ++ // Note: 'shared_verts' will be sorted by 'global_vertex_id'. ++ Array> shared_verts; ++ ++ // Add "neighbor" groups defined by vertices. ++ // Construct 'shared_verts'. ++ { ++ IntegerSet group; ++ for (int i = 0; i < vertex_loc_to_glob.Size(); i++) ++ { ++ // 'vertex_to_element' maps global vertex ids to global element ids ++ const int glob_vertex_id = vertex_loc_to_glob[i]; ++ const int nelem = vertex_to_element.RowSize(glob_vertex_id); ++ const int *elem = vertex_to_element.GetRow(glob_vertex_id); ++ Array &gr = group; // reference to the 'group' internal Array ++ gr.SetSize(nelem); ++ for (int j = 0; j < nelem; j++) ++ { ++ gr[j] = partitioning[elem[j]]; ++ } ++ gr.Sort(); ++ gr.Unique(); ++ MFEM_ASSERT(gr.FindSorted(part_id) >= 0, "internal error"); ++ if (group.Size() > 1) ++ { ++ const int group_id = groups.Insert(group); ++ shared_verts.Append(Pair(glob_vertex_id, group_id)); ++ } ++ } ++ } ++ ++ // Done constructing the "neighbor" groups in 'groups'. 
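// Standalone sketch (plain C++, hypothetical names -- not MFEM API) of how a
// "neighbor" group is formed above: the group of a mesh entity is the sorted,
// duplicate-free set of part ids of the elements adjacent to it, and the entity
// is shared exactly when that set contains more than one part.
#include <algorithm>
#include <vector>

std::vector<int> EntityGroup(const std::vector<int> &adjacent_elems,
                             const std::vector<int> &elem_to_part)
{
   std::vector<int> group;
   group.reserve(adjacent_elems.size());
   for (int e : adjacent_elems) { group.push_back(elem_to_part[e]); }
   std::sort(group.begin(), group.end());
   group.erase(std::unique(group.begin(), group.end()), group.end());
   return group;  // group.size() > 1 means the entity is shared between parts
}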
++ const int num_groups = groups.Size(); ++ ++ // Define 'mesh_part.my_groups' ++ groups.AsTable(mesh_part.my_groups); ++ ++ // Construct 'mesh_part.group__shared_entity_to_vertex[Geometry::POINT]' ++ Table &group__shared_vertex_to_vertex = ++ mesh_part.group__shared_entity_to_vertex[Geometry::POINT]; ++ group__shared_vertex_to_vertex.MakeI(num_groups); ++ for (int sv = 0; sv < shared_verts.Size(); sv++) ++ { ++ const int group_id = shared_verts[sv].two; ++ group__shared_vertex_to_vertex.AddAColumnInRow(group_id); ++ } ++ group__shared_vertex_to_vertex.MakeJ(); ++ for (int sv = 0; sv < shared_verts.Size(); sv++) ++ { ++ const int glob_vertex_id = shared_verts[sv].one; ++ const int group_id = shared_verts[sv].two; ++ const int loc_vertex_id = vertex_loc_to_glob.FindSorted(glob_vertex_id); ++ MFEM_ASSERT(loc_vertex_id >= 0, "internal error"); ++ group__shared_vertex_to_vertex.AddConnection(group_id, loc_vertex_id); ++ } ++ group__shared_vertex_to_vertex.ShiftUpI(); ++ ++ // Construct 'mesh_part.group__shared_entity_to_vertex[Geometry::SEGMENT]' ++ if (dim >= 2) ++ { ++ Table &group__shared_edge_to_vertex = ++ mesh_part.group__shared_entity_to_vertex[Geometry::SEGMENT]; ++ group__shared_edge_to_vertex.MakeI(num_groups); ++ for (int se = 0; se < shared_edges.Size(); se++) ++ { ++ const int group_id = shared_edges[se].two; ++ group__shared_edge_to_vertex.AddColumnsInRow(group_id, 2); ++ } ++ group__shared_edge_to_vertex.MakeJ(); ++ const Table &edge_to_vertex = *mesh.GetEdgeVertexTable(); ++ for (int se = 0; se < shared_edges.Size(); se++) ++ { ++ const int glob_edge_id = shared_edges[se].one; ++ const int group_id = shared_edges[se].two; ++ const int *v = edge_to_vertex.GetRow(glob_edge_id); ++ for (int i = 0; i < 2; i++) ++ { ++ const int loc_vertex_id = vertex_loc_to_glob.FindSorted(v[i]); ++ MFEM_ASSERT(loc_vertex_id >= 0, "internal error"); ++ group__shared_edge_to_vertex.AddConnection(group_id, loc_vertex_id); ++ } ++ } ++ group__shared_edge_to_vertex.ShiftUpI(); ++ } ++ ++ // Construct 'mesh_part.group__shared_entity_to_vertex[Geometry::TRIANGLE]' ++ // and 'mesh_part.group__shared_entity_to_vertex[Geometry::SQUARE]'. ++ if (dim >= 3) ++ { ++ Table &group__shared_tria_to_vertex = ++ mesh_part.group__shared_entity_to_vertex[Geometry::TRIANGLE]; ++ Table &group__shared_quad_to_vertex = ++ mesh_part.group__shared_entity_to_vertex[Geometry::SQUARE]; ++ Array vertex_ids; ++ group__shared_tria_to_vertex.MakeI(num_groups); ++ group__shared_quad_to_vertex.MakeI(num_groups); ++ for (int sf = 0; sf < shared_faces.Size(); sf++) ++ { ++ const int glob_face_id = shared_faces[sf].one; ++ const int group_id = shared_faces[sf].two; ++ const int geom = mesh.GetFaceGeometry(glob_face_id); ++ mesh_part.group__shared_entity_to_vertex[geom]. ++ AddColumnsInRow(group_id, Geometry::NumVerts[geom]); ++ } ++ group__shared_tria_to_vertex.MakeJ(); ++ group__shared_quad_to_vertex.MakeJ(); ++ for (int sf = 0; sf < shared_faces.Size(); sf++) ++ { ++ const int glob_face_id = shared_faces[sf].one; ++ const int group_id = shared_faces[sf].two; ++ const int geom = mesh.GetFaceGeometry(glob_face_id); ++ mesh.GetFaceVertices(glob_face_id, vertex_ids); ++ // Rotate shared triangles that have an adjacent tetrahedron with a ++ // nonzero refinement flag. ++ // See also ParMesh::BuildSharedFaceElems. 
++ if (geom == Geometry::TRIANGLE) ++ { ++ int glob_el_id[2]; ++ mesh.GetFaceElements(glob_face_id, &glob_el_id[0], &glob_el_id[1]); ++ int side = 0; ++ const Element *el = mesh.GetElement(glob_el_id[0]); ++ const Tetrahedron *tet = nullptr; ++ if (el->GetGeometryType() == Geometry::TETRAHEDRON) ++ { ++ tet = static_cast(el); ++ } ++ else ++ { ++ side = 1; ++ el = mesh.GetElement(glob_el_id[1]); ++ if (el->GetGeometryType() == Geometry::TETRAHEDRON) ++ { ++ tet = static_cast(el); ++ } ++ } ++ if (tet && tet->GetRefinementFlag()) ++ { ++ // mark the shared face for refinement by reorienting ++ // it according to the refinement flag in the tetrahedron ++ // to which this shared face belongs to. ++ int info[2]; ++ mesh.GetFaceInfos(glob_face_id, &info[0], &info[1]); ++ tet->GetMarkedFace(info[side]/64, &vertex_ids[0]); ++ } ++ } ++ for (int i = 0; i < vertex_ids.Size(); i++) ++ { ++ const int glob_id = vertex_ids[i]; ++ const int loc_id = vertex_loc_to_glob.FindSorted(glob_id); ++ MFEM_ASSERT(loc_id >= 0, "internal error"); ++ vertex_ids[i] = loc_id; ++ } ++ mesh_part.group__shared_entity_to_vertex[geom]. ++ AddConnections(group_id, vertex_ids, vertex_ids.Size()); ++ } ++ group__shared_tria_to_vertex.ShiftUpI(); ++ group__shared_quad_to_vertex.ShiftUpI(); ++ } ++} ++ ++std::unique_ptr ++MeshPartitioner::ExtractFESpace(MeshPart &mesh_part, ++ const FiniteElementSpace &global_fespace) const ++{ ++ mesh_part.GetMesh(); // initialize 'mesh_part.mesh' ++ // Note: the nodes of 'mesh_part.mesh' are not set. ++ ++ return std::unique_ptr( ++ new FiniteElementSpace(mesh_part.mesh.get(), ++ global_fespace.FEColl(), ++ global_fespace.GetVDim(), ++ global_fespace.GetOrdering())); ++} ++ ++std::unique_ptr ++MeshPartitioner::ExtractGridFunction(MeshPart &mesh_part, ++ const GridFunction &global_gf, ++ FiniteElementSpace &local_fespace) const ++{ ++ std::unique_ptr local_gf(new GridFunction(&local_fespace)); ++ ++ // Transfer data from 'global_gf' to 'local_gf'. ++ Array gvdofs, lvdofs; ++ Vector loc_vals; ++ const int part_id = mesh_part.my_part_id; ++ const int num_elems = part_to_element.RowSize(part_id); ++ const int *elem_list = part_to_element.GetRow(part_id); // sorted ++ for (int loc_elem_id = 0; loc_elem_id < num_elems; loc_elem_id++) ++ { ++ const int glob_elem_id = elem_list[loc_elem_id]; ++ auto glob_dt = global_gf.FESpace()->GetElementVDofs(glob_elem_id, gvdofs); ++ global_gf.GetSubVector(gvdofs, loc_vals); ++ if (glob_dt) { glob_dt->InvTransformPrimal(loc_vals); } ++ auto local_dt = local_fespace.GetElementVDofs(loc_elem_id, lvdofs); ++ if (local_dt) { local_dt->TransformPrimal(loc_vals); } ++ local_gf->SetSubVector(lvdofs, loc_vals); ++ } ++ return local_gf; ++} ++ ++MeshPartitioner::~MeshPartitioner() ++{ ++ if (own_partitioning) { delete [] partitioning; } ++} ++ ++ + GeometricFactors::GeometricFactors(const Mesh *mesh, const IntegrationRule &ir, + int flags, MemoryType d_mt) + { +diff --git a/mesh/mesh.hpp b/mesh/mesh.hpp +index b9c5538c3..a6957720a 100644 +--- a/mesh/mesh.hpp ++++ b/mesh/mesh.hpp +@@ -27,6 +27,7 @@ + #include "../general/adios2stream.hpp" + #endif + #include ++#include + + namespace mfem + { +@@ -72,8 +73,10 @@ protected: + visualization purpose in GLVis. 
*/ + mutable int nbInteriorFaces, nbBoundaryFaces; + +- int meshgen; // see MeshGenerator() +- int mesh_geoms; // sum of (1 << geom) for all geom of all dimensions ++ // see MeshGenerator(); global in parallel ++ int meshgen; ++ // sum of (1 << geom) for all geom of all dimensions; local in parallel ++ int mesh_geoms; + + // Counter for Mesh transformations: refinement, derefinement, rebalancing. + // Used for checking during Update operations on objects depending on the +@@ -295,11 +298,11 @@ protected: + void Destroy(); // Delete all owned data. + void ResetLazyData(); + +- Element *ReadElementWithoutAttr(std::istream &); +- static void PrintElementWithoutAttr(const Element *, std::ostream &); ++ Element *ReadElementWithoutAttr(std::istream &input); ++ static void PrintElementWithoutAttr(const Element *el, std::ostream &os); + +- Element *ReadElement(std::istream &); +- static void PrintElement(const Element *, std::ostream &); ++ Element *ReadElement(std::istream &input); ++ static void PrintElement(const Element *el, std::ostream &os); + + // Readers for different mesh formats, used in the Load() method. + // The implementations of these methods are in mesh_readers.cpp. +@@ -456,7 +459,7 @@ protected: + + void UpdateNURBS(); + +- void PrintTopo(std::ostream &out, const Array &e_to_k) const; ++ void PrintTopo(std::ostream &os, const Array &e_to_k) const; + + /// Used in GetFaceElementTransformations (...) + void GetLocalPtToSegTransformation(IsoparametricTransformation &, int); +@@ -565,7 +568,7 @@ protected: + // If NURBS mesh, write NURBS format. If NCMesh, write mfem v1.1 format. + // If section_delimiter is empty, write mfem v1.0 format. Otherwise, write + // mfem v1.2 format with the given section_delimiter at the end. +- void Printer(std::ostream &out = mfem::out, ++ void Printer(std::ostream &os = mfem::out, + std::string section_delimiter = "") const; + + /** Creates mesh for the parallelepiped [0,sx]x[0,sy]x[0,sz], divided into +@@ -859,7 +862,7 @@ public: + + int AddBdrPoint(int v, int attr = 1); + +- void GenerateBoundaryElements(); ++ virtual void GenerateBoundaryElements(); + /// Finalize the construction of a triangular Mesh. + void FinalizeTriMesh(int generate_edges = 0, int refine = 0, + bool fix_orientation = true); +@@ -2101,7 +2104,7 @@ public: + std::ostream &os, int elem_attr = 0) const; + + void PrintElementsWithPartitioning (int *partitioning, +- std::ostream &out, ++ std::ostream &os, + int interior_faces = 0); + + /// Print set of disjoint surfaces: +@@ -2109,13 +2112,13 @@ public: + * If Aface_face(i,j) != 0, print face j as a boundary + * element with attribute i+1. + */ +- void PrintSurfaces(const Table &Aface_face, std::ostream &out) const; ++ void PrintSurfaces(const Table &Aface_face, std::ostream &os) const; + + /// Auxiliary method used by PrintCharacteristics(). + /** It is also used in the `mesh-explorer` miniapp. */ + static void PrintElementsByGeometry(int dim, + const Array &num_elems_by_geom, +- std::ostream &out); ++ std::ostream &os); + + /** @brief Compute and print mesh characteristics such as number of vertices, + number of elements, number of boundary elements, minimal and maximal +@@ -2135,7 +2138,7 @@ public: + + #ifdef MFEM_DEBUG + /// Output an NCMesh-compatible debug dump. 
+- void DebugDump(std::ostream &out) const; ++ void DebugDump(std::ostream &os) const; + #endif + + /// @} +@@ -2214,9 +2217,194 @@ public: + /// @} + }; + +-/** Overload operator<< for std::ostream and Mesh; valid also for the derived +- class ParMesh */ +-std::ostream &operator<<(std::ostream &out, const Mesh &mesh); ++ ++// Class containing a minimal description of a part (a subset of the elements) ++// of a Mesh and its connectivity to other parts. The main purpose of this class ++// is to be communicated between MPI ranks for repartitioning purposes. It can ++// also be used to implement parallel mesh I/O functions with partitionings that ++// have number of parts different from the number of MPI tasks. ++// ++// Note: parts of NURBS or non-conforming meshes cannot be fully described by ++// this class alone. ++class MeshPart ++{ ++protected: ++ struct Entity { int geom; int num_verts; const int *verts; }; ++ struct EntityHelper ++ { ++ int dim, num_entities; ++ int geom_offsets[Geometry::NumGeom+1]; ++ typedef const Array entity_to_vertex_type[Geometry::NumGeom]; ++ entity_to_vertex_type &entity_to_vertex; ++ ++ EntityHelper(int dim_, ++ const Array (&entity_to_vertex_)[Geometry::NumGeom]); ++ Entity FindEntity(int bytype_entity_id); ++ }; ++ ++public: ++ // Reference space dimension of the elements ++ int dimension; ++ ++ // Dimension of the physical space into which the MeshPart is embedded. ++ int space_dimension; ++ ++ // Number of vertices ++ int num_vertices; ++ ++ // Number of elements with reference space dimension equal to 'dimension'. ++ int num_elements; ++ ++ // Number of boundary elements with reference space dimension equal to ++ // 'dimension'-1. ++ int num_bdr_elements; ++ ++ // Each 'entity_to_vertex[geom]' describes the entities of Geometry::Type ++ // 'geom' in terms of their vertices. The number of entities of type 'geom' ++ // is: ++ // num_entities[geom] = size('entity_to_vertex[geom]')/num_vertices[geom] ++ // The number of all elements, 'num_elements', is: ++ // 'num_elements' = sum_{dim[geom]=='dimension'} num_entities[geom] ++ // and the number of all boundary elements, 'num_bdr_elements' is: ++ // 'num_bdr_elements' = sum_{dim[geom]=='dimension'-1} num_entities[geom] ++ // Note that 'entity_to_vertex' does NOT describe all "faces" in the mesh ++ // part (i.e. all 'dimension'-1 entities) but only the boundary elements. ++ Array entity_to_vertex[Geometry::NumGeom]; ++ ++ // Store the refinement flags for tetraheral elements. If all tets have zero ++ // refinement flags then this array is empty, i.e. has size 0. ++ Array tet_refine_flags; ++ ++ // "By-type" element/boundary ordering: ordered by Geometry::Type and within ++ // each Geometry::Type 'geom' ordered as in 'entity_to_vertex[geom]'. ++ ++ // Optional re-ordering of the elements that will be used by (Par)Mesh ++ // objects constructed from this MeshPart. This array maps "natural" element ++ // ids (used by the Mesh/ParMesh objects) to "by-type" element ids (see ++ // above): ++ // "by-type" element id = element_map["natural" element id] ++ // The size of the array is either 'num_elements' or 0 when no re-ordering is ++ // needed (then "by-type" id == "natural" id). ++ Array element_map; ++ ++ // Optional re-ordering for the boundary elements, similar to 'element_map'. ++ Array boundary_map; ++ ++ // Element attributes. Ordered using the "natural" element ordering defined ++ // by the array 'element_map'. The size of this array is 'num_elements'. 
++   Array<int> attributes;
++
++   // Boundary element attributes. Ordered using the "natural" boundary element
++   // ordering defined by the array 'boundary_map'. The size of this array is
++   // 'num_bdr_elements'.
++   Array<int> bdr_attributes;
++
++   // Optional vertex coordinates. The size of the array is either
++   // size = 'space_dimension' * 'num_vertices'
++   // or 0 when the vertex coordinates are not used, i.e. when the MeshPart uses
++   // a nodal GridFunction to describe its location in physical space. This
++   // array uses Ordering::byVDIM: "X0,Y0,Z0, X1,Y1,Z1, ...".
++   Array<double> vertex_coordinates;
++
++   // Optional serial Mesh object constructed on demand using the method
++   // GetMesh(). One use case for it is when one wants to construct FE spaces
++   // and GridFunction%s on the MeshPart for saving or MPI communication.
++   std::unique_ptr<Mesh> mesh;
++
++   // Nodal FE space defined on 'mesh' used by the GridFunction 'nodes'. Uses
++   // the FE collection from the global nodal FE space.
++   std::unique_ptr<FiniteElementSpace> nodal_fes;
++
++   // 'nodes': pointer to a GridFunction describing the physical location of the
++   // MeshPart. Used for describing high-order and periodic meshes. This
++   // GridFunction is defined on the FE space 'nodal_fes' which, in turn, is
++   // defined on the Mesh 'mesh'.
++   std::unique_ptr<GridFunction> nodes;
++
++   // Connectivity to other MeshPart objects
++   // --------------------------------------
++
++   // Total number of MeshParts
++   int num_parts;
++
++   // Index of the part described by this MeshPart:
++   // 0 <= 'my_part_id' < 'num_parts'
++   int my_part_id;
++
++   // A group G is a subset of the set { 0, 1, ..., 'num_parts'-1 } for which
++   // there is a mesh entity E (of any dimension) in the global mesh such that
++   // G is the set of the parts assigned to the elements adjacent to E. The
++   // MeshPart describes only the "neighbor" groups, i.e. the groups that
++   // contain 'my_part_id'. The Table 'my_groups' defines the "neighbor" groups
++   // in terms of their part ids. In other words, it maps "neighbor" group ids
++   // to a (sorted) list of part ids. In particular, the number of "neighbor"
++   // groups is given by 'my_groups.Size()'. The "local" group { 'my_part_id' }
++   // has index 0 in 'my_groups'.
++   Table my_groups;
++
++   // Shared entities for this MeshPart are mesh entities of all dimensions less
++   // than 'dimension' that are generated by the elements of this MeshPart and
++   // at least one other MeshPart.
++   //
++   // The Table 'group__shared_entity_to_vertex[geom]' defines, for each group,
++   // the shared entities of Geometry::Type 'geom'. Each row (corresponding to a
++   // "neighbor" group, as defined by 'my_groups') in the Table defines the
++   // shared entities in a way similar to the arrays 'entity_to_vertex[geom]'.
++   // The "local" group (with index 0) does not have any shared entities, so the
++   // 0-th row in the Table is always empty.
++   //
++   // IMPORTANT: the descriptions of the groups in this MeshPart must match their
++   // descriptions in all neighboring MeshParts. This includes the ordering of
++   // the shared entities within the group, as well as the vertex ordering of
++   // each shared entity.
++   Table group__shared_entity_to_vertex[Geometry::NumGeom];
++
++   // Write the MeshPart to a stream using the parallel format "MFEM mesh v1.2".
++   void Print(std::ostream &os) const;
++
++   // Construct a serial Mesh object from the MeshPart.
The nodes of 'mesh' are ++ // NOT initialized by this method, however, the nodal FE space and nodal ++ // GridFunction can be created and then attached to the 'mesh'. The Mesh is ++ // constructed only if 'mesh' is empty, otherwise the method simply returns ++ // the object held by 'mesh'. ++ Mesh &GetMesh(); ++}; ++ ++ ++// TODO: documentation ++class MeshPartitioner ++{ ++protected: ++ Mesh &mesh; ++ int *partitioning; ++ bool own_partitioning; ++ Table part_to_element; ++ Table part_to_boundary; ++ Table edge_to_element; ++ Table vertex_to_element; ++ ++public: ++ // TODO: documentation ++ MeshPartitioner(Mesh &mesh_, int num_parts_, int *partitioning_ = NULL, ++ int part_method = 1); ++ ++ // TODO: documentation ++ void ExtractPart(int part_id, MeshPart &mesh_part) const; ++ ++ // TODO: documentation ++ std::unique_ptr ++ ExtractFESpace(MeshPart &mesh_part, ++ const FiniteElementSpace &global_fespace) const; ++ ++ // TODO: documentation ++ std::unique_ptr ++ ExtractGridFunction(MeshPart &mesh_part, ++ const GridFunction &global_gf, ++ FiniteElementSpace &local_fespace) const; ++ ++ // Destructor ++ ~MeshPartitioner(); ++}; + + + /** @brief Structure for storing mesh geometric factors: coordinates, Jacobians, +@@ -2225,7 +2413,6 @@ std::ostream &operator<<(std::ostream &out, const Mesh &mesh); + Mesh. See Mesh::GetGeometricFactors(). */ + class GeometricFactors + { +- + private: + void Compute(const GridFunction &nodes, + MemoryType d_mt = MemoryType::DEFAULT); +@@ -2273,6 +2460,7 @@ public: + Vector detJ; + }; + ++ + /** @brief Structure for storing face geometric factors: coordinates, Jacobians, + determinants of the Jacobians, and normal vectors. */ + /** Typically objects of this type are constructed and owned by objects of class +@@ -2327,6 +2515,7 @@ public: + Vector normal; + }; + ++ + /// Class used to extrude the nodes of a mesh + class NodeExtrudeCoefficient : public VectorCoefficient + { +@@ -2358,8 +2547,12 @@ inline void ShiftRight(int &a, int &b, int &c) + a = c; c = b; b = t; + } + ++/** Overload operator<< for std::ostream and Mesh; valid also for the derived ++ class ParMesh */ ++std::ostream &operator<<(std::ostream &os, const Mesh &mesh); ++ + /// @brief Print function for Mesh::FaceInformation. +-std::ostream& operator<<(std::ostream& os, const Mesh::FaceInformation& info); ++std::ostream& operator<<(std::ostream &os, const Mesh::FaceInformation& info); + + } + +diff --git a/mesh/pmesh.cpp b/mesh/pmesh.cpp +index 26e2f4655..47a091c04 100644 +--- a/mesh/pmesh.cpp ++++ b/mesh/pmesh.cpp +@@ -250,6 +250,8 @@ ParMesh::ParMesh(MPI_Comm comm, Mesh &mesh, int *partitioning_, + BuildSharedVertMapping(nsvert, vert_element, vert_global_local); + delete vert_element; + ++ // FIXME: the next two lines are already done above! Any reason to do them ++ // again? + SetMeshGen(); + meshgen = mesh.meshgen; // copy the global 'meshgen' + } +@@ -1526,6 +1528,7 @@ ParMesh ParMesh::MakeSimplicial(ParMesh &orig_mesh) + void ParMesh::Finalize(bool refine, bool fix_orientation) + { + const int meshgen_save = meshgen; // Mesh::Finalize() may call SetMeshGen() ++ // 'mesh_geoms' is local, so there's no need to save and restore it. 
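// Sketch of the intended driver-side use of the MeshPartitioner/MeshPart classes
// added by this patch, mirroring the mesh-explorer.cpp change further down in
// this diff; 'my_mesh', the output prefix, and the precision are placeholders.
#include <fstream>
#include "mfem.hpp"
using namespace mfem;

void SavePartitionedMesh(Mesh &my_mesh, int num_parts)
{
   MeshPartitioner partitioner(my_mesh, num_parts);  // generates a partitioning
   MeshPart mesh_part;
   for (int i = 0; i < num_parts; i++)
   {
      partitioner.ExtractPart(i, mesh_part);
      std::ofstream omesh(MakeParFilename("my-mesh.", i));
      omesh.precision(16);
      mesh_part.Print(omesh);
   }
}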
+ + Mesh::Finalize(refine, fix_orientation); + +@@ -6213,6 +6216,7 @@ void ParMesh::ParPrint(ostream &os) const + { + os << "total_shared_faces " << sface_lface.Size() << '\n'; + } ++ os << "\n# group 0 has no shared entities\n"; + for (int gr = 1; gr < GetNGroups(); gr++) + { + { +diff --git a/mesh/pmesh.hpp b/mesh/pmesh.hpp +index 06f09dc0c..b6682defe 100644 +--- a/mesh/pmesh.hpp ++++ b/mesh/pmesh.hpp +@@ -108,6 +108,8 @@ protected: + // Convert the local 'meshgen' to a global one. + void ReduceMeshGen(); + ++ void GenerateBoundaryElements() override { /* TODO */ } ++ + // Determine sedge_ledge and sface_lface. + void FinalizeParTopo(); + +diff --git a/mesh/tetrahedron.cpp b/mesh/tetrahedron.cpp +index 0815fc0a7..c1b0ae6d8 100644 +--- a/mesh/tetrahedron.cpp ++++ b/mesh/tetrahedron.cpp +@@ -55,7 +55,7 @@ void Tetrahedron::Init(int ind1, int ind2, int ind3, int ind4, int attr, + } + + void Tetrahedron::ParseRefinementFlag(int refinement_edges[2], int &type, +- int &flag) ++ int &flag) const + { + int i, f = refinement_flag; + +@@ -136,9 +136,10 @@ void Tetrahedron::CreateRefinementFlag(int refinement_edges[2], int type, + refinement_flag |= refinement_edges[0]; + } + +-void Tetrahedron::GetMarkedFace(const int face, int *fv) ++void Tetrahedron::GetMarkedFace(const int face, int *fv) const + { +- int re[2], type, flag, *tv = this->indices; ++ int re[2], type, flag; ++ const int *tv = this->indices; + ParseRefinementFlag(re, type, flag); + switch (face) + { +diff --git a/mesh/tetrahedron.hpp b/mesh/tetrahedron.hpp +index ad018a037..ef8f36eb8 100644 +--- a/mesh/tetrahedron.hpp ++++ b/mesh/tetrahedron.hpp +@@ -58,12 +58,13 @@ public: + /// Return element's type. + virtual Type GetType() const { return Element::TETRAHEDRON; } + +- void ParseRefinementFlag(int refinement_edges[2], int &type, int &flag); ++ void ParseRefinementFlag(int refinement_edges[2], int &type, ++ int &flag) const; + void CreateRefinementFlag(int refinement_edges[2], int type, int flag = 0); + +- void GetMarkedFace(const int face, int *fv); ++ void GetMarkedFace(const int face, int *fv) const; + +- int GetRefinementFlag() { return refinement_flag; } ++ int GetRefinementFlag() const { return refinement_flag; } + + void SetRefinementFlag(int rf) { refinement_flag = rf; } + +diff --git a/miniapps/meshing/makefile b/miniapps/meshing/makefile +index 1ccec0455..e34a5637e 100644 +--- a/miniapps/meshing/makefile ++++ b/miniapps/meshing/makefile +@@ -123,7 +123,7 @@ clean-build: + rm -rf *.dSYM *.TVD.*breakpoints + + clean-exec: +- @rm -f mobius-strip.mesh klein-bottle.mesh mesh-explorer.mesh ++ @rm -f mobius-strip.mesh klein-bottle.mesh mesh-explorer.mesh* + @rm -f toroid-*.mesh twist-*.mesh trimmer.mesh reflected.mesh + @rm -f partitioning.txt shaper.mesh extruder.mesh + @rm -f optimized* perturbed* polar-nc.mesh +diff --git a/miniapps/meshing/mesh-explorer.cpp b/miniapps/meshing/mesh-explorer.cpp +index f05e18e83..67e8b1f65 100644 +--- a/miniapps/meshing/mesh-explorer.cpp ++++ b/miniapps/meshing/mesh-explorer.cpp +@@ -308,6 +308,7 @@ int main (int argc, char *argv[]) + partitioning = 0; + bdr_partitioning.SetSize(mesh->GetNBE()); + bdr_partitioning = 0; ++ np = 1; + } + else + { +@@ -382,7 +383,8 @@ int main (int argc, char *argv[]) + "f) Find physical point in reference space\n" + "p) Generate a partitioning\n" + "o) Reorder elements\n" +- "S) Save in MFEM format\n" ++ "S) Save in MFEM serial format\n" ++ "D) Save in MFEM parallel format using the current partitioning\n" + "V) Save in VTK format (only linear and quadratic 
meshes)\n" + "D) Save as a DataCollection\n" + "q) Quit\n" +@@ -971,9 +973,8 @@ int main (int argc, char *argv[]) + cin >> nxyz[2]; np *= nxyz[2]; + } + } +- int *part = mesh->CartesianPartitioning(nxyz); +- partitioning = Array(part, mesh->GetNE()); +- delete [] part; ++ partitioning.MakeRef(mesh->CartesianPartitioning(nxyz), ++ mesh->GetNE(), true); + recover_bdr_partitioning(mesh, partitioning, bdr_partitioning); + } + else if (pk == 's') +@@ -984,7 +985,7 @@ int main (int argc, char *argv[]) + partitioning.SetSize(mesh->GetNE()); + for (int i = 0; i < mesh->GetNE(); i++) + { +- partitioning[i] = i * np / mesh->GetNE(); ++ partitioning[i] = (long long)i * np / mesh->GetNE(); + } + recover_bdr_partitioning(mesh, partitioning, bdr_partitioning); + } +@@ -997,9 +998,8 @@ int main (int argc, char *argv[]) + } + cout << "Enter number of processors: " << flush; + cin >> np; +- int *part = mesh->GeneratePartitioning(np, part_method); +- partitioning = Array(part, mesh->GetNE()); +- delete [] part; ++ partitioning.MakeRef(mesh->GeneratePartitioning(np, part_method), ++ mesh->GetNE(), true); + recover_bdr_partitioning(mesh, partitioning, bdr_partitioning); + } + if (partitioning) +@@ -1197,6 +1197,25 @@ int main (int argc, char *argv[]) + cout << "New mesh file: " << omesh_file << endl; + } + ++ if (mk == 'D') ++ { ++ const char mesh_prefix[] = "mesh-explorer.mesh."; ++ MeshPartitioner partitioner(*mesh, np, partitioning); ++ MeshPart mesh_part; ++ int precision; ++ cout << "Enter desired precision: " << flush; ++ cin >> precision; ++ for (int i = 0; i < np; i++) ++ { ++ partitioner.ExtractPart(i, mesh_part); ++ ++ ofstream omesh(MakeParFilename(mesh_prefix, i)); ++ omesh.precision(precision); ++ mesh_part.Print(omesh); ++ } ++ cout << "New parallel mesh files: " << mesh_prefix << "" << endl; ++ } ++ + if (mk == 'V') + { + const char omesh_file[] = "mesh-explorer.vtk"; diff --git a/extern/patch/mfem/patch_mesh_vis_dev.diff b/extern/patch/mfem/patch_mesh_vis_dev.diff index 6720e3a711..092afab56d 100644 --- a/extern/patch/mfem/patch_mesh_vis_dev.diff +++ b/extern/patch/mfem/patch_mesh_vis_dev.diff @@ -1,542 +1,477 @@ -diff --git a/fem/datacollection.cpp b/fem/datacollection.cpp -index 0dc718b07..3d478edaa 100644 ---- a/fem/datacollection.cpp -+++ b/fem/datacollection.cpp -@@ -210,6 +210,11 @@ void DataCollection::Save() - { - SaveOneQField(it); - } -+ -+ MFEM_VERIFY(coeff_field_map.begin() == coeff_field_map.end() && -+ vcoeff_field_map.begin() == vcoeff_field_map.end(), -+ "Coefficient/VectorCoefficient output is not supported for " -+ "DataCollection class!"); - } - - void DataCollection::SaveMesh() -@@ -309,9 +314,9 @@ void DataCollection::SaveField(const std::string &field_name) - } - } - --void DataCollection::SaveQField(const std::string &q_field_name) -+void DataCollection::SaveQField(const std::string &field_name) - { -- QFieldMapIterator it = q_field_map.find(q_field_name); -+ QFieldMapIterator it = q_field_map.find(field_name); - if (it != q_field_map.end()) - { - SaveOneQField(it); -@@ -765,7 +770,8 @@ ParaViewDataCollection::ParaViewDataCollection(const std::string& - levels_of_detail(1), - pv_data_format(VTKFormat::BINARY), - high_order_output(false), -- restart_mode(false) -+ restart_mode(false), -+ bdr_output(false) - { - cycle = 0; // always include a valid cycle index in file names - -@@ -910,16 +916,19 @@ void ParaViewDataCollection::Save() - std::string vtu_prefix = col_path + "/" + GenerateVTUPath() + "/"; - - // Save the local part of the mesh and grid functions fields to the 
local -- // VTU file -+ // VTU file. Also save coefficient fields. - { - std::ofstream os(vtu_prefix + GenerateVTUFileName("proc", myid)); - os.precision(precision); - SaveDataVTU(os, levels_of_detail); - } - -- // Save the local part of the quadrature function fields -+ // Save the local part of the quadrature function fields. - for (const auto &qfield : q_field_map) - { -+ MFEM_VERIFY(!bdr_output, -+ "QuadratureFunction output is not supported for " -+ "ParaViewDataCollection on domain boundary!"); - const std::string &field_name = qfield.first; - std::ofstream os(vtu_prefix + GenerateVTUFileName(field_name, myid)); - qfield.second->SaveVTU(os, pv_data_format, GetCompressionLevel(), field_name); -@@ -935,7 +944,7 @@ void ParaViewDataCollection::Save() - std::ofstream pvtu_out(vtu_prefix + GeneratePVTUFileName("data")); - WritePVTUHeader(pvtu_out); - -- // Grid function fields -+ // Grid function fields and coefficient fields - pvtu_out << "\n"; - for (auto &field_it : field_map) - { -@@ -945,7 +954,24 @@ void ParaViewDataCollection::Save() - << "\" NumberOfComponents=\"" << vec_dim << "\" " - << "format=\"" << GetDataFormatString() << "\" />\n"; - } -+ for (auto &field_it : coeff_field_map) -+ { -+ int vec_dim = 1; -+ pvtu_out << "\n"; -+ } -+ for (auto &field_it : vcoeff_field_map) -+ { -+ int vec_dim = field_it.second->GetVDim(); -+ pvtu_out << "\n"; -+ } - pvtu_out << "\n"; -+ - // Element attributes - pvtu_out << "\n"; - pvtu_out << "\t\n"; - os << "\n"; -- mesh->PrintVTU(os,ref,pv_data_format,high_order_output,GetCompressionLevel()); -+ mesh->PrintVTU(os,ref,pv_data_format,high_order_output,GetCompressionLevel(), -+ bdr_output); - - // dump out the grid functions as point data - os << "\n"; -@@ -1050,8 +1077,23 @@ void ParaViewDataCollection::SaveDataVTU(std::ostream &os, int ref) - // iterate over all grid functions - for (FieldMapIterator it=field_map.begin(); it!=field_map.end(); ++it) - { -+ MFEM_VERIFY(!bdr_output, -+ "GridFunction output is not supported for " -+ "ParaViewDataCollection on domain boundary!"); - SaveGFieldVTU(os,ref,it); - } -+ // save the coefficient functions -+ // iterate over all Coefficient and VectorCoefficient functions -+ for (CoeffFieldMapIterator it=coeff_field_map.begin(); -+ it!=coeff_field_map.end(); ++it) -+ { -+ SaveCoeffFieldVTU(os,ref,it); -+ } -+ for (VCoeffFieldMapIterator it=vcoeff_field_map.begin(); -+ it!=vcoeff_field_map.end(); ++it) -+ { -+ SaveVCoeffFieldVTU(os,ref,it); -+ } - os << "\n"; - // close the mesh - os << "\n"; // close the piece open in the PrintVTU method -@@ -1073,7 +1115,6 @@ void ParaViewDataCollection::SaveGFieldVTU(std::ostream &os, int ref_, - << " format=\"" << GetDataFormatString() << "\" >" << '\n'; - if (vec_dim == 1) - { -- // scalar data - for (int i = 0; i < mesh->GetNE(); i++) - { - RefG = GlobGeometryRefiner.Refine( -@@ -1103,11 +1144,131 @@ void ParaViewDataCollection::SaveGFieldVTU(std::ostream &os, int ref_, - } - } - } -+ if (pv_data_format != VTKFormat::ASCII) -+ { -+ WriteBase64WithSizeAndClear(os, buf, GetCompressionLevel()); -+ } -+ os << "" << std::endl; -+} -+ -+void ParaViewDataCollection::SaveCoeffFieldVTU(std::ostream &os, int ref_, -+ const CoeffFieldMapIterator &it) -+{ -+ RefinedGeometry *RefG; -+ double val; -+ std::vector buf; -+ int vec_dim = 1; -+ os << "first -+ << "\" NumberOfComponents=\"" << vec_dim << "\"" -+ << " format=\"" << GetDataFormatString() << "\" >" << '\n'; -+ { -+ // scalar data -+ if (!bdr_output) -+ { -+ for (int i = 0; i < mesh->GetNE(); i++) -+ { -+ RefG = 
GlobGeometryRefiner.Refine( -+ mesh->GetElementBaseGeometry(i), ref_, 1); -+ -+ ElementTransformation *eltrans = mesh->GetElementTransformation(i); -+ const IntegrationRule *ir = &RefG->RefPts; -+ for (int j = 0; j < ir->GetNPoints(); j++) -+ { -+ const IntegrationPoint &ip = ir->IntPoint(j); -+ eltrans->SetIntPoint(&ip); -+ val = it->second->Eval(*eltrans, ip); -+ WriteBinaryOrASCII(os, buf, val, "\n", pv_data_format); -+ } -+ } -+ } -+ else -+ { -+ for (int i = 0; i < mesh->GetNBE(); i++) -+ { -+ RefG = GlobGeometryRefiner.Refine( -+ mesh->GetBdrElementBaseGeometry(i), ref_, 1); -+ -+ ElementTransformation *eltrans = mesh->GetBdrElementTransformation(i); -+ const IntegrationRule *ir = &RefG->RefPts; -+ for (int j = 0; j < ir->GetNPoints(); j++) -+ { -+ const IntegrationPoint &ip = ir->IntPoint(j); -+ eltrans->SetIntPoint(&ip); -+ val = it->second->Eval(*eltrans, ip); -+ WriteBinaryOrASCII(os, buf, val, "\n", pv_data_format); -+ } -+ } -+ } -+ } -+ if (pv_data_format != VTKFormat::ASCII) -+ { -+ WriteBase64WithSizeAndClear(os, buf, GetCompressionLevel()); -+ } -+ os << "" << std::endl; -+} - -- if (IsBinaryFormat()) -+void ParaViewDataCollection::SaveVCoeffFieldVTU(std::ostream &os, int ref_, -+ const VCoeffFieldMapIterator &it) -+{ -+ RefinedGeometry *RefG; -+ Vector val; -+ std::vector buf; -+ int vec_dim = it->second->GetVDim(); -+ os << "first -+ << "\" NumberOfComponents=\"" << vec_dim << "\"" -+ << " format=\"" << GetDataFormatString() << "\" >" << '\n'; - { -- WriteVTKEncodedCompressed(os,buf.data(),buf.size(),GetCompressionLevel()); -- os << '\n'; -+ // vector data -+ if (!bdr_output) -+ { -+ for (int i = 0; i < mesh->GetNE(); i++) -+ { -+ RefG = GlobGeometryRefiner.Refine( -+ mesh->GetElementBaseGeometry(i), ref_, 1); -+ -+ ElementTransformation *eltrans = mesh->GetElementTransformation(i); -+ const IntegrationRule *ir = &RefG->RefPts; -+ for (int j = 0; j < ir->GetNPoints(); j++) -+ { -+ const IntegrationPoint &ip = ir->IntPoint(j); -+ eltrans->SetIntPoint(&ip); -+ it->second->Eval(val, *eltrans, ip); -+ for (int jj = 0; jj < val.Size(); jj++) -+ { -+ WriteBinaryOrASCII(os, buf, val(jj), " ", pv_data_format); -+ } -+ if (pv_data_format == VTKFormat::ASCII) { os << '\n'; } -+ } -+ } -+ } -+ else -+ { -+ for (int i = 0; i < mesh->GetNBE(); i++) -+ { -+ RefG = GlobGeometryRefiner.Refine( -+ mesh->GetBdrElementBaseGeometry(i), ref_, 1); -+ -+ ElementTransformation *eltrans = mesh->GetBdrElementTransformation(i); -+ const IntegrationRule *ir = &RefG->RefPts; -+ for (int j = 0; j < ir->GetNPoints(); j++) -+ { -+ const IntegrationPoint &ip = ir->IntPoint(j); -+ eltrans->SetIntPoint(&ip); -+ it->second->Eval(val, *eltrans, ip); -+ for (int jj = 0; jj < val.Size(); jj++) -+ { -+ WriteBinaryOrASCII(os, buf, val(jj), " ", pv_data_format); -+ } -+ if (pv_data_format == VTKFormat::ASCII) { os << '\n'; } -+ } -+ } -+ } -+ } -+ if (pv_data_format != VTKFormat::ASCII) -+ { -+ WriteBase64WithSizeAndClear(os, buf, GetCompressionLevel()); - } - os << "" << std::endl; - } -@@ -1140,6 +1301,11 @@ void ParaViewDataCollection::SetCompression(bool compression_) - compression = compression_; - } - -+void ParaViewDataCollection::SetBoundaryOutput(bool bdr_output_) -+{ -+ bdr_output = bdr_output_; -+} -+ - void ParaViewDataCollection::UseRestartMode(bool restart_mode_) - { - restart_mode = restart_mode_; -diff --git a/fem/datacollection.hpp b/fem/datacollection.hpp -index c216188af..4a26cd1f8 100644 ---- a/fem/datacollection.hpp -+++ b/fem/datacollection.hpp -@@ -133,6 +133,10 @@ private: - - /// A 
collection of named QuadratureFunctions - typedef NamedFieldsMap QFieldMap; -+ -+ /// A collection of named Coefficients and VectorCoefficients -+ typedef NamedFieldsMap CoeffFieldMap; -+ typedef NamedFieldsMap VCoeffFieldMap; - public: - typedef GFieldMap::MapType FieldMapType; - typedef GFieldMap::iterator FieldMapIterator; -@@ -142,6 +146,14 @@ public: - typedef QFieldMap::iterator QFieldMapIterator; - typedef QFieldMap::const_iterator QFieldMapConstIterator; - -+ typedef CoeffFieldMap::MapType CoeffFieldMapType; -+ typedef CoeffFieldMap::iterator CoeffFieldMapIterator; -+ typedef CoeffFieldMap::const_iterator CoeffFieldMapConstIterator; -+ -+ typedef VCoeffFieldMap::MapType VCoeffFieldMapType; -+ typedef VCoeffFieldMap::iterator VCoeffFieldMapIterator; -+ typedef VCoeffFieldMap::const_iterator VCoeffFieldMapConstIterator; -+ - /// Format constants to be used with SetFormat(). - /** Derived classes can define their own format enumerations and override the - method SetFormat() to perform input validation. */ -@@ -169,6 +181,11 @@ protected: - /** A FieldMap mapping registered names to QuadratureFunction pointers. */ - QFieldMap q_field_map; - -+ /** A FieldMap mapping registered names to Coefficient and VectorCoefficient -+ pointers. */ -+ CoeffFieldMap coeff_field_map; -+ VCoeffFieldMap vcoeff_field_map; -+ - /// The (common) mesh for the collected fields - Mesh *mesh; - -@@ -249,15 +266,28 @@ public: - { field_map.Deregister(field_name, own_data); } - - /// Add a QuadratureFunction to the collection. -- virtual void RegisterQField(const std::string& q_field_name, -+ virtual void RegisterQField(const std::string& field_name, - QuadratureFunction *qf) -- { q_field_map.Register(q_field_name, qf, own_data); } -- -+ { q_field_map.Register(field_name, qf, own_data); } - - /// Remove a QuadratureFunction from the collection - virtual void DeregisterQField(const std::string& field_name) - { q_field_map.Deregister(field_name, own_data); } - -+ /// Add a Coefficient or VectorCoefficient to the collection. -+ virtual void RegisterCoeffField(const std::string& field_name, -+ Coefficient *coeff) -+ { coeff_field_map.Register(field_name, coeff, own_data); } -+ virtual void RegisterVCoeffField(const std::string& field_name, -+ VectorCoefficient *vcoeff) -+ { vcoeff_field_map.Register(field_name, vcoeff, own_data); } -+ -+ /// Remove a Coefficient or VectorCoefficient from the collection -+ virtual void DeregisterCoeffField(const std::string& field_name) -+ { coeff_field_map.Deregister(field_name, own_data); } -+ virtual void DeregisterVCoeffField(const std::string& field_name) -+ { vcoeff_field_map.Deregister(field_name, own_data); } -+ - /// Check if a grid function is part of the collection - bool HasField(const std::string& field_name) const - { return field_map.Has(field_name); } -@@ -280,13 +310,27 @@ public: - #endif - - /// Check if a QuadratureFunction with the given name is in the collection. -- bool HasQField(const std::string& q_field_name) const -- { return q_field_map.Has(q_field_name); } -+ bool HasQField(const std::string& field_name) const -+ { return q_field_map.Has(field_name); } - - /// Get a pointer to a QuadratureFunction in the collection. - /** Returns NULL if @a field_name is not in the collection. 
*/ -- QuadratureFunction *GetQField(const std::string& q_field_name) -- { return q_field_map.Get(q_field_name); } -+ QuadratureFunction *GetQField(const std::string& field_name) -+ { return q_field_map.Get(field_name); } -+ -+ /** Check if a Coefficient or VectorCoefficient with the given name is in -+ the collection. */ -+ bool HasCoeffField(const std::string& field_name) const -+ { return coeff_field_map.Has(field_name); } -+ bool HasVCoeffField(const std::string& field_name) const -+ { return vcoeff_field_map.Has(field_name); } -+ -+ /// Get a pointer to a Coefficient or VectorCoefficient in the collection. -+ /** Returns NULL if @a field_name is not in the collection. */ -+ Coefficient *GetCoeffField(const std::string& field_name) -+ { return coeff_field_map.Get(field_name); } -+ VectorCoefficient *GetVCoeffField(const std::string& field_name) -+ { return vcoeff_field_map.Get(field_name); } - - /// Get a const reference to the internal field map. - /** The keys in the map are the field names and the values are pointers to -@@ -300,13 +344,23 @@ public: - const QFieldMapType &GetQFieldMap() const - { return q_field_map.GetMap(); } - -+ /// Get a const reference to the internal coefficient-field map. -+ /** The keys in the map are the coefficient-field names and the values are -+ pointers to Coefficient%s or VectorCoefficient%s. */ -+ const CoeffFieldMapType &GetCoeffFieldMap() const -+ { return coeff_field_map.GetMap(); } -+ const VCoeffFieldMapType &GetVCoeffFieldMap() const -+ { return vcoeff_field_map.GetMap(); } -+ - /// Get a pointer to the mesh in the collection - Mesh *GetMesh() { return mesh; } -+ - /// Set/change the mesh associated with the collection - /** When passed a Mesh, assumes the serial case: MPI rank id is set to 0 and - MPI num_procs is set to 1. When passed a ParMesh, MPI info from the - ParMesh is used to set the DataCollection's MPI rank and num_procs. */ - virtual void SetMesh(Mesh *new_mesh); -+ - #ifdef MFEM_USE_MPI - /// Set/change the mesh associated with the collection. - /** For this case, @a comm is used to set the DataCollection's MPI rank id -@@ -369,7 +423,19 @@ public: - /// Save one field, assuming the collection directory already exists. - virtual void SaveField(const std::string &field_name); - /// Save one q-field, assuming the collection directory already exists. -- virtual void SaveQField(const std::string &q_field_name); -+ virtual void SaveQField(const std::string &field_name); -+ /** Save one coefficient-field, assuming the collection directory already -+ exists. */ -+ virtual void SaveCoeffField(const std::string &field_name) -+ { -+ MFEM_ABORT("SaveCoeffField not implemented for DataCollection class!"); -+ } -+ /** Save one coefficient-field, assuming the collection directory already -+ exists. */ -+ virtual void SaveVCoeffField(const std::string &field_name) -+ { -+ MFEM_ABORT("SaveVCoeffField not implemented for DataCollection class!"); -+ } - - /// Load the collection. Not implemented in the base class DataCollection. 
- virtual void Load(int cycle_ = 0); -@@ -512,12 +578,17 @@ private: - VTKFormat pv_data_format; - bool high_order_output; - bool restart_mode; -+ bool bdr_output; - - protected: - void WritePVTUHeader(std::ostream &out); - void WritePVTUFooter(std::ostream &out, const std::string &vtu_prefix); - void SaveDataVTU(std::ostream &out, int ref); - void SaveGFieldVTU(std::ostream& out, int ref_, const FieldMapIterator& it); -+ void SaveCoeffFieldVTU(std::ostream& out, int ref_, -+ const CoeffFieldMapIterator& it); -+ void SaveVCoeffFieldVTU(std::ostream& out, int ref_, -+ const VCoeffFieldMapIterator& it); - const char *GetDataFormatString() const; - const char *GetDataTypeString() const; - /// @brief If compression is enabled, return the compression level, otherwise -@@ -531,7 +602,6 @@ protected: - std::string GeneratePVTUFileName(const std::string &prefix); - std::string GeneratePVTUPath(); - -- - public: - /// Constructor. The collection name is used when saving the data. - /** If @a mesh_ is NULL, then the mesh can be set later by calling SetMesh(). -@@ -582,6 +652,10 @@ public: - /// by default). Reading high-order data requires ParaView 5.5 or later. - void SetHighOrderOutput(bool high_order_output_); - -+ /// Configures collection to save only fields evaluated on boundaries of -+ /// the mesh. -+ void SetBoundaryOutput(bool bdr_output_); -+ - /// Enable or disable restart mode. If restart is enabled, new writes will - /// preserve timestep metadata for any solutions prior to the currently - /// defined time. -diff --git a/mesh/mesh.cpp b/mesh/mesh.cpp -index 8cbd6618f..a1403e5d5 100644 ---- a/mesh/mesh.cpp -+++ b/mesh/mesh.cpp -@@ -10965,7 +10965,7 @@ void Mesh::PrintVTU(std::string fname, - VTKFormat format, - bool high_order_output, - int compression_level, -- bool bdr) -+ bool bdr_elements) - { - int ref = (high_order_output && Nodes) - ? Nodes->FESpace()->GetMaxElementOrder() : 1; -@@ -10979,7 +10979,7 @@ void Mesh::PrintVTU(std::string fname, - } - os << " byte_order=\"" << VTKByteOrder() << "\">\n"; - os << "\n"; -- PrintVTU(os, ref, format, high_order_output, compression_level, bdr); -+ PrintVTU(os, ref, format, high_order_output, compression_level, bdr_elements); - os << "\n"; // need to close the piece open in the PrintVTU method - os << "\n"; - os << "" << std::endl; -diff --git a/mesh/mesh.hpp b/mesh/mesh.hpp -index 29f75e666..208501345 100644 ---- a/mesh/mesh.hpp -+++ b/mesh/mesh.hpp -@@ -2090,7 +2090,7 @@ public: - VTKFormat format=VTKFormat::ASCII, - bool high_order_output=false, - int compression_level=0, -- bool bdr=false); -+ bool bdr_elements=false); - /** Print the boundary elements of the mesh in VTU format, and output the - boundary attributes as a data array (useful for boundary conditions). 
*/ - void PrintBdrVTU(std::string fname, -diff --git a/mesh/pmesh.cpp b/mesh/pmesh.cpp -index b0f28dc4d..6490793f3 100644 ---- a/mesh/pmesh.cpp -+++ b/mesh/pmesh.cpp -@@ -6301,7 +6301,7 @@ void ParMesh::PrintVTU(std::string pathname, - VTKFormat format, - bool high_order_output, - int compression_level, -- bool bdr) -+ bool bdr_elements) - { - int pad_digits_rank = 6; - DataCollection::create_directory(pathname, this, MyRank); -@@ -6361,7 +6361,8 @@ void ParMesh::PrintVTU(std::string pathname, - - std::string vtu_fname = pathname + "/" + fname + ".proc" - + to_padded_string(MyRank, pad_digits_rank); -- Mesh::PrintVTU(vtu_fname, format, high_order_output, compression_level, bdr); -+ Mesh::PrintVTU(vtu_fname, format, high_order_output, compression_level, -+ bdr_elements); - } - - int ParMesh::FindPoints(DenseMatrix& point_mat, Array& elem_id, -diff --git a/mesh/pmesh.hpp b/mesh/pmesh.hpp -index 27b97c028..e8e0955c8 100644 ---- a/mesh/pmesh.hpp -+++ b/mesh/pmesh.hpp -@@ -666,7 +666,7 @@ public: - VTKFormat format=VTKFormat::ASCII, - bool high_order_output=false, - int compression_level=0, -- bool bdr=false) override; -+ bool bdr_elements=false) override; - - /// Parallel version of Mesh::Load(). - void Load(std::istream &input, int generate_edges = 0, +diff --git a/fem/datacollection.cpp b/fem/datacollection.cpp +index 10e8b77b55..e37f6ce184 100644 +--- a/fem/datacollection.cpp ++++ b/fem/datacollection.cpp +@@ -310,9 +310,9 @@ void DataCollection::SaveField(const std::string &field_name) + } + } + +-void DataCollection::SaveQField(const std::string &q_field_name) ++void DataCollection::SaveQField(const std::string &field_name) + { +- QFieldMapIterator it = q_field_map.find(q_field_name); ++ QFieldMapIterator it = q_field_map.find(field_name); + if (it != q_field_map.end()) + { + SaveOneQField(it); +@@ -780,6 +780,11 @@ void ParaViewDataCollectionBase::SetHighOrderOutput(bool high_order_output_) + high_order_output = high_order_output_; + } + ++void ParaViewDataCollectionBase::SetBoundaryOutput(bool bdr_output_) ++{ ++ bdr_output = bdr_output_; ++} ++ + void ParaViewDataCollectionBase::SetCompressionLevel(int compression_level_) + { + MFEM_ASSERT(compression_level_ >= -1 && compression_level_ <= 9, +@@ -935,16 +940,19 @@ void ParaViewDataCollection::Save() + std::string vtu_prefix = col_path + "/" + GenerateVTUPath() + "/"; + + // Save the local part of the mesh and grid functions fields to the local +- // VTU file ++ // VTU file. Also save coefficient fields. + { + std::ofstream os(vtu_prefix + GenerateVTUFileName("proc", myid)); + os.precision(precision); + SaveDataVTU(os, levels_of_detail); + } + +- // Save the local part of the quadrature function fields ++ // Save the local part of the quadrature function fields. 
+ for (const auto &qfield : q_field_map) + { ++ MFEM_VERIFY(!bdr_output, ++ "QuadratureFunction output is not supported for " ++ "ParaViewDataCollection on domain boundary!"); + const std::string &field_name = qfield.first; + std::ofstream os(vtu_prefix + GenerateVTUFileName(field_name, myid)); + qfield.second->SaveVTU(os, pv_data_format, GetCompressionLevel(), field_name); +@@ -960,7 +968,7 @@ void ParaViewDataCollection::Save() + std::ofstream pvtu_out(vtu_prefix + GeneratePVTUFileName("data")); + WritePVTUHeader(pvtu_out); + +- // Grid function fields ++ // Grid function fields and coefficient fields + pvtu_out << "\n"; + for (auto &field_it : field_map) + { +@@ -971,7 +979,24 @@ void ParaViewDataCollection::Save() + << VTKComponentLabels(vec_dim) << " " + << "format=\"" << GetDataFormatString() << "\" />\n"; + } ++ for (auto &field_it : coeff_field_map) ++ { ++ int vec_dim = 1; ++ pvtu_out << "\n"; ++ } ++ for (auto &field_it : vcoeff_field_map) ++ { ++ int vec_dim = field_it.second->GetVDim(); ++ pvtu_out << "\n"; ++ } + pvtu_out << "\n"; ++ + // Element attributes + pvtu_out << "\n"; + pvtu_out << "\t\n"; + os << "\n"; +- mesh->PrintVTU(os,ref,pv_data_format,high_order_output,GetCompressionLevel()); ++ mesh->PrintVTU(os,ref,pv_data_format,high_order_output,GetCompressionLevel(), ++ bdr_output); + + // dump out the grid functions as point data + os << "\n"; +@@ -1077,8 +1103,21 @@ void ParaViewDataCollection::SaveDataVTU(std::ostream &os, int ref) + // iterate over all grid functions + for (FieldMapIterator it=field_map.begin(); it!=field_map.end(); ++it) + { ++ MFEM_VERIFY(!bdr_output, ++ "GridFunction output is not supported for " ++ "ParaViewDataCollection on domain boundary!"); + SaveGFieldVTU(os,ref,it); + } ++ // save the coefficient functions ++ // iterate over all Coefficient and VectorCoefficient functions ++ for (const auto &kv : coeff_field_map) ++ { ++ SaveCoeffFieldVTU(os, ref, kv.first, *kv.second); ++ } ++ for (const auto &kv : vcoeff_field_map) ++ { ++ SaveVCoeffFieldVTU(os, ref, kv.first, *kv.second); ++ } + os << "\n"; + // close the mesh + os << "\n"; // close the piece open in the PrintVTU method +@@ -1101,7 +1140,6 @@ void ParaViewDataCollection::SaveGFieldVTU(std::ostream &os, int ref_, + << "format=\"" << GetDataFormatString() << "\" >" << '\n'; + if (vec_dim == 1) + { +- // scalar data + for (int i = 0; i < mesh->GetNE(); i++) + { + RefG = GlobGeometryRefiner.Refine( +@@ -1131,11 +1169,131 @@ void ParaViewDataCollection::SaveGFieldVTU(std::ostream &os, int ref_, + } + } + } ++ if (pv_data_format != VTKFormat::ASCII) ++ { ++ WriteBase64WithSizeAndClear(os, buf, GetCompressionLevel()); ++ } ++ os << "" << std::endl; ++} + +- if (IsBinaryFormat()) ++void ParaViewDataCollection::SaveCoeffFieldVTU(std::ostream &os, int ref_, ++ const std::string &name, Coefficient &coeff) ++{ ++ RefinedGeometry *RefG; ++ real_t val; ++ std::vector buf; ++ int vec_dim = 1; ++ os << "" << '\n'; ++ { ++ // scalar data ++ if (!bdr_output) ++ { ++ for (int i = 0; i < mesh->GetNE(); i++) ++ { ++ RefG = GlobGeometryRefiner.Refine( ++ mesh->GetElementBaseGeometry(i), ref_, 1); ++ ++ ElementTransformation *eltrans = mesh->GetElementTransformation(i); ++ const IntegrationRule *ir = &RefG->RefPts; ++ for (int j = 0; j < ir->GetNPoints(); j++) ++ { ++ const IntegrationPoint &ip = ir->IntPoint(j); ++ eltrans->SetIntPoint(&ip); ++ val = coeff.Eval(*eltrans, ip); ++ WriteBinaryOrASCII(os, buf, val, "\n", pv_data_format); ++ } ++ } ++ } ++ else ++ { ++ for (int i = 0; i < mesh->GetNBE(); i++) ++ { 
++ RefG = GlobGeometryRefiner.Refine( ++ mesh->GetBdrElementBaseGeometry(i), ref_, 1); ++ ++ ElementTransformation *eltrans = mesh->GetBdrElementTransformation(i); ++ const IntegrationRule *ir = &RefG->RefPts; ++ for (int j = 0; j < ir->GetNPoints(); j++) ++ { ++ const IntegrationPoint &ip = ir->IntPoint(j); ++ eltrans->SetIntPoint(&ip); ++ val = coeff.Eval(*eltrans, ip); ++ WriteBinaryOrASCII(os, buf, val, "\n", pv_data_format); ++ } ++ } ++ } ++ } ++ if (pv_data_format != VTKFormat::ASCII) ++ { ++ WriteBase64WithSizeAndClear(os, buf, GetCompressionLevel()); ++ } ++ os << "" << std::endl; ++} ++ ++void ParaViewDataCollection::SaveVCoeffFieldVTU(std::ostream &os, int ref_, ++ const std::string &name, VectorCoefficient &coeff) ++{ ++ RefinedGeometry *RefG; ++ Vector val; ++ std::vector buf; ++ int vec_dim = coeff.GetVDim(); ++ os << "" << '\n'; ++ { ++ // vector data ++ if (!bdr_output) ++ { ++ for (int i = 0; i < mesh->GetNE(); i++) ++ { ++ RefG = GlobGeometryRefiner.Refine( ++ mesh->GetElementBaseGeometry(i), ref_, 1); ++ ++ ElementTransformation *eltrans = mesh->GetElementTransformation(i); ++ const IntegrationRule *ir = &RefG->RefPts; ++ for (int j = 0; j < ir->GetNPoints(); j++) ++ { ++ const IntegrationPoint &ip = ir->IntPoint(j); ++ eltrans->SetIntPoint(&ip); ++ coeff.Eval(val, *eltrans, ip); ++ for (int jj = 0; jj < val.Size(); jj++) ++ { ++ WriteBinaryOrASCII(os, buf, val(jj), " ", pv_data_format); ++ } ++ if (pv_data_format == VTKFormat::ASCII) { os << '\n'; } ++ } ++ } ++ } ++ else ++ { ++ for (int i = 0; i < mesh->GetNBE(); i++) ++ { ++ RefG = GlobGeometryRefiner.Refine( ++ mesh->GetBdrElementBaseGeometry(i), ref_, 1); ++ ++ ElementTransformation *eltrans = mesh->GetBdrElementTransformation(i); ++ const IntegrationRule *ir = &RefG->RefPts; ++ for (int j = 0; j < ir->GetNPoints(); j++) ++ { ++ const IntegrationPoint &ip = ir->IntPoint(j); ++ eltrans->SetIntPoint(&ip); ++ coeff.Eval(val, *eltrans, ip); ++ for (int jj = 0; jj < val.Size(); jj++) ++ { ++ WriteBinaryOrASCII(os, buf, val(jj), " ", pv_data_format); ++ } ++ if (pv_data_format == VTKFormat::ASCII) { os << '\n'; } ++ } ++ } ++ } ++ } ++ if (pv_data_format != VTKFormat::ASCII) + { +- WriteVTKEncodedCompressed(os,buf.data(),buf.size(),GetCompressionLevel()); +- os << '\n'; ++ WriteBase64WithSizeAndClear(os, buf, GetCompressionLevel()); + } + os << "" << std::endl; + } +diff --git a/fem/datacollection.hpp b/fem/datacollection.hpp +index a4a376f8b1..e82830304d 100644 +--- a/fem/datacollection.hpp ++++ b/fem/datacollection.hpp +@@ -133,6 +133,7 @@ private: + + /// A collection of named QuadratureFunctions + typedef NamedFieldsMap QFieldMap; ++ + public: + typedef GFieldMap::MapType FieldMapType; + typedef GFieldMap::iterator FieldMapIterator; +@@ -249,10 +250,9 @@ public: + { field_map.Deregister(field_name, own_data); } + + /// Add a QuadratureFunction to the collection. +- virtual void RegisterQField(const std::string& q_field_name, ++ virtual void RegisterQField(const std::string& field_name, + QuadratureFunction *qf) +- { q_field_map.Register(q_field_name, qf, own_data); } +- ++ { q_field_map.Register(field_name, qf, own_data); } + + /// Remove a QuadratureFunction from the collection + virtual void DeregisterQField(const std::string& field_name) +@@ -280,13 +280,13 @@ public: + #endif + + /// Check if a QuadratureFunction with the given name is in the collection. 
+- bool HasQField(const std::string& q_field_name) const +- { return q_field_map.Has(q_field_name); } ++ bool HasQField(const std::string& field_name) const ++ { return q_field_map.Has(field_name); } + + /// Get a pointer to a QuadratureFunction in the collection. + /** Returns NULL if @a field_name is not in the collection. */ +- QuadratureFunction *GetQField(const std::string& q_field_name) +- { return q_field_map.Get(q_field_name); } ++ QuadratureFunction *GetQField(const std::string& field_name) ++ { return q_field_map.Get(field_name); } + + /// Get a const reference to the internal field map. + /** The keys in the map are the field names and the values are pointers to +@@ -302,11 +302,13 @@ public: + + /// Get a pointer to the mesh in the collection + Mesh *GetMesh() { return mesh; } ++ + /// Set/change the mesh associated with the collection + /** When passed a Mesh, assumes the serial case: MPI rank id is set to 0 and + MPI num_procs is set to 1. When passed a ParMesh, MPI info from the + ParMesh is used to set the DataCollection's MPI rank and num_procs. */ + virtual void SetMesh(Mesh *new_mesh); ++ + #ifdef MFEM_USE_MPI + /// Set/change the mesh associated with the collection. + /** For this case, @a comm is used to set the DataCollection's MPI rank id +@@ -369,8 +371,7 @@ public: + /// Save one field, assuming the collection directory already exists. + virtual void SaveField(const std::string &field_name); + /// Save one q-field, assuming the collection directory already exists. +- virtual void SaveQField(const std::string &q_field_name); +- ++ virtual void SaveQField(const std::string &field_name); + /// Load the collection. Not implemented in the base class DataCollection. + virtual void Load(int cycle_ = 0); + +@@ -510,7 +511,9 @@ protected: + int compression_level = -1; + bool high_order_output = false; + bool restart_mode = false; ++ bool bdr_output = false; + VTKFormat pv_data_format = VTKFormat::BINARY; ++ + public: + ParaViewDataCollectionBase(const std::string &name, Mesh *mesh); + +@@ -543,6 +546,10 @@ public: + /// Reading high-order data requires ParaView 5.5 or later. + void SetHighOrderOutput(bool high_order_output_); + ++ /// @brief Configures collection to save only fields evaluated on boundaries of ++ /// the mesh. ++ void SetBoundaryOutput(bool bdr_output_); ++ + /// If compression is enabled, return the compression level, else return 0. + int GetCompressionLevel() const; + +@@ -564,8 +571,6 @@ public: + /// + /// If restart is enabled, new writes will preserve timestep metadata for any + /// solutions prior to the currently defined time. +- /// +- /// Initially, restart mode is disabled. + void UseRestartMode(bool restart_mode_); + }; + +@@ -575,11 +580,23 @@ class ParaViewDataCollection : public ParaViewDataCollectionBase + private: + std::fstream pvd_stream; + ++ /// A collection of named Coefficients and VectorCoefficients ++ using CoeffFieldMap = NamedFieldsMap; ++ using VCoeffFieldMap = NamedFieldsMap; ++ ++ /** A FieldMap mapping registered names to Coefficient and VectorCoefficient ++ pointers. 
*/ ++ CoeffFieldMap coeff_field_map; ++ VCoeffFieldMap vcoeff_field_map; + protected: + void WritePVTUHeader(std::ostream &out); + void WritePVTUFooter(std::ostream &out, const std::string &vtu_prefix); + void SaveDataVTU(std::ostream &out, int ref); + void SaveGFieldVTU(std::ostream& out, int ref_, const FieldMapIterator& it); ++ void SaveCoeffFieldVTU(std::ostream& out, int ref_, const std::string &name, ++ Coefficient &coeff); ++ void SaveVCoeffFieldVTU(std::ostream& out, int ref_, const std::string &name, ++ VectorCoefficient& coeff); + const char *GetDataFormatString() const; + const char *GetDataTypeString() const; + +@@ -598,6 +615,25 @@ public: + ParaViewDataCollection(const std::string& collection_name, + Mesh *mesh_ = nullptr); + ++ /// Get a const reference to the internal coefficient-field map. ++ const typename CoeffFieldMap::MapType &GetCoeffFieldMap() const ++ { return coeff_field_map.GetMap(); } ++ const typename VCoeffFieldMap::MapType &GetVCoeffFieldMap() const ++ { return vcoeff_field_map.GetMap(); } ++ ++ /// Add a Coefficient or VectorCoefficient to the collection. ++ void RegisterCoeffField(const std::string& field_name, Coefficient *coeff) ++ { coeff_field_map.Register(field_name, coeff, own_data); } ++ void RegisterVCoeffField(const std::string& field_name, ++ VectorCoefficient *vcoeff) ++ { vcoeff_field_map.Register(field_name, vcoeff, own_data); } ++ ++ /// Remove a Coefficient or VectorCoefficient from the collection ++ void DeregisterCoeffField(const std::string& field_name) ++ { coeff_field_map.Deregister(field_name, own_data); } ++ void DeregisterVCoeffField(const std::string& field_name) ++ { vcoeff_field_map.Deregister(field_name, own_data); } ++ + /// Save the collection - the directory name is constructed based on the + /// cycle value + void Save() override; +diff --git a/mesh/mesh.cpp b/mesh/mesh.cpp +index a1f8fd5b8c..4470765f9c 100644 +--- a/mesh/mesh.cpp ++++ b/mesh/mesh.cpp +@@ -12165,7 +12165,7 @@ void Mesh::PrintVTU(std::string fname, + VTKFormat format, + bool high_order_output, + int compression_level, +- bool bdr) ++ bool bdr_elements) + { + int ref = (high_order_output && Nodes) + ? Nodes->FESpace()->GetMaxElementOrder() : 1; +@@ -12179,7 +12179,7 @@ void Mesh::PrintVTU(std::string fname, + } + os << " byte_order=\"" << VTKByteOrder() << "\">\n"; + os << "\n"; +- PrintVTU(os, ref, format, high_order_output, compression_level, bdr); ++ PrintVTU(os, ref, format, high_order_output, compression_level, bdr_elements); + os << "\n"; // need to close the piece open in the PrintVTU method + os << "\n"; + os << "" << std::endl; +diff --git a/mesh/mesh.hpp b/mesh/mesh.hpp +index 945bf36f16..edc6ab14d3 100644 +--- a/mesh/mesh.hpp ++++ b/mesh/mesh.hpp +@@ -2560,7 +2560,7 @@ public: + VTKFormat format=VTKFormat::ASCII, + bool high_order_output=false, + int compression_level=0, +- bool bdr=false); ++ bool bdr_elements=false); + /** Print the boundary elements of the mesh in VTU format, and output the + boundary attributes as a data array (useful for boundary conditions). 
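[Illustration, not part of the patch] The hunks above add RegisterCoeffField(), RegisterVCoeffField(), the matching Deregister*/Get*FieldMap accessors, and SetBoundaryOutput() to mfem::ParaViewDataCollection. The following is a minimal usage sketch under stated assumptions, not a definitive recipe: the helper name, the field names, the collection name, and the existing ParMesh `pmesh` are illustrative, and only coefficient fields are registered because the patch rejects GridFunction and QuadratureFunction output when boundary-only output is enabled.

#include "mfem.hpp"
using namespace mfem;

// Hypothetical helper: write scalar/vector coefficients evaluated on the
// boundary elements of `pmesh`, using the API added by the patch above.
void WriteBoundaryCoefficients(ParMesh &pmesh)
{
   ConstantCoefficient one(1.0);               // scalar coefficient to visualize
   Vector dir(pmesh.SpaceDimension());
   dir = 0.0;
   dir(0) = 1.0;
   VectorConstantCoefficient e0(dir);          // vector coefficient to visualize

   ParaViewDataCollection paraview("boundary_fields", &pmesh);
   paraview.SetBoundaryOutput(true);           // save fields on boundary elements only
   paraview.RegisterCoeffField("one", &one);   // new Coefficient registration
   paraview.RegisterVCoeffField("e0", &e0);    // new VectorCoefficient registration
   paraview.SetLevelsOfDetail(1);
   paraview.SetDataFormat(VTKFormat::BINARY);
   paraview.SetCycle(0);
   paraview.SetTime(0.0);
   paraview.Save();
}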
*/ + void PrintBdrVTU(std::string fname, +diff --git a/mesh/pmesh.cpp b/mesh/pmesh.cpp +index f2cfef429d..c080aa9942 100644 +--- a/mesh/pmesh.cpp ++++ b/mesh/pmesh.cpp +@@ -6417,7 +6417,7 @@ void ParMesh::PrintVTU(std::string pathname, + VTKFormat format, + bool high_order_output, + int compression_level, +- bool bdr) ++ bool bdr_elements) + { + int pad_digits_rank = 6; + DataCollection::create_directory(pathname, this, MyRank); +@@ -6477,7 +6477,8 @@ void ParMesh::PrintVTU(std::string pathname, + + std::string vtu_fname = pathname + "/" + fname + ".proc" + + to_padded_string(MyRank, pad_digits_rank); +- Mesh::PrintVTU(vtu_fname, format, high_order_output, compression_level, bdr); ++ Mesh::PrintVTU(vtu_fname, format, high_order_output, compression_level, ++ bdr_elements); + } + + int ParMesh::FindPoints(DenseMatrix& point_mat, Array& elem_id, +diff --git a/mesh/pmesh.hpp b/mesh/pmesh.hpp +index 0c71e60ed9..d5e8bedee1 100644 +--- a/mesh/pmesh.hpp ++++ b/mesh/pmesh.hpp +@@ -788,7 +788,7 @@ public: + VTKFormat format=VTKFormat::ASCII, + bool high_order_output=false, + int compression_level=0, +- bool bdr=false) override; ++ bool bdr_elements=false) override; + + /// Parallel version of Mesh::Load(). + void Load(std::istream &input, int generate_edges = 0, diff --git a/extern/patch/mfem/patch_mfem_device_fixes.diff b/extern/patch/mfem/patch_mfem_device_fixes.diff index 448adba25f..d389db9b79 100644 --- a/extern/patch/mfem/patch_mfem_device_fixes.diff +++ b/extern/patch/mfem/patch_mfem_device_fixes.diff @@ -1,696 +1,696 @@ -diff --git a/general/device.cpp b/general/device.cpp -index ccee71cd7..f664f70c3 100644 ---- a/general/device.cpp -+++ b/general/device.cpp -@@ -9,14 +9,13 @@ - // terms of the BSD-3 license. We welcome feedback and contributions, see file - // CONTRIBUTING.md for details. - -+#include "device.hpp" - #include "forall.hpp" - #include "occa.hpp" - #ifdef MFEM_USE_CEED - #include "../fem/ceed/interface/util.hpp" - #endif - --#include --#include - #include - - namespace mfem -@@ -144,13 +143,11 @@ Device::Device() - } - } - -- - Device::~Device() - { - if ( device_env && !destroy_mm) { return; } - if (!device_env && destroy_mm && !mem_host_env) - { -- free(device_option); - #ifdef MFEM_USE_CEED - // Destroy FES -> CeedBasis, CeedElemRestriction hash table contents - for (auto entry : internal::ceed_basis_map) -@@ -169,7 +166,6 @@ Device::~Device() - mm.Destroy(); - } - Get().ngpu = -1; -- Get().mode = SEQUENTIAL; - Get().backends = Backend::CPU; - Get().host_mem_type = MemoryType::HOST; - Get().host_mem_class = MemoryClass::HOST; -@@ -189,6 +185,7 @@ void Device::Configure(const std::string &device, const int device_id) - } - - std::map bmap; -+ std::string device_option; - for (int i = 0; i < Backend::NUM_BACKENDS; i++) - { - bmap[internal::backend_name[i]] = internal::backend_list[i]; -@@ -200,21 +197,14 @@ void Device::Configure(const std::string &device, const int device_id) - end = (end != std::string::npos) ? end : device.size(); - const std::string bname = device.substr(beg, end - beg); - option = bname.find(':'); -- if (option==std::string::npos) // No option -- { -- const std::string backend = bname; -- std::map::iterator it = bmap.find(backend); -- MFEM_VERIFY(it != bmap.end(), "invalid backend name: '" << backend << '\''); -- Get().MarkBackend(it->second); -- } -- else -+ const std::string backend = (option != std::string::npos) ? 
-+ bname.substr(0, option) : bname; -+ std::map::iterator it = bmap.find(backend); -+ MFEM_VERIFY(it != bmap.end(), "Invalid backend name: '" << backend << '\''); -+ Get().MarkBackend(it->second); -+ if (option != std::string::npos) - { -- const std::string backend = bname.substr(0, option); -- const std::string boption = bname.substr(option+1); -- Get().device_option = strdup(boption.c_str()); -- std::map::iterator it = bmap.find(backend); -- MFEM_VERIFY(it != bmap.end(), "invalid backend name: '" << backend << '\''); -- Get().MarkBackend(it->second); -+ device_option += bname.substr(option); - } - if (end == device.size()) { break; } - beg = end + 1; -@@ -240,10 +230,10 @@ void Device::Configure(const std::string &device, const int device_id) - #endif - - // Perform setup. -- Get().Setup(device_id); -+ Get().Setup(device_option, device_id); - -- // Enable the device -- Enable(); -+ // Enable the device. -+ Get().UpdateMemoryTypeAndClass(device_option); - - // Copy all data members from the global 'singleton_device' into '*this'. - if (this != &Get()) { std::memcpy(this, &Get(), sizeof(Device)); } -@@ -252,30 +242,6 @@ void Device::Configure(const std::string &device, const int device_id) - destroy_mm = true; - } - --// static method --void Device::SetMemoryTypes(MemoryType h_mt, MemoryType d_mt) --{ -- // If the device and/or the MemoryTypes are configured through the -- // environment (variables 'MFEM_DEVICE', 'MFEM_MEMORY'), ignore calls to this -- // method. -- if (mem_host_env || mem_device_env || device_env) { return; } -- -- MFEM_VERIFY(!IsConfigured(), "the default MemoryTypes can only be set before" -- " Device construction and configuration"); -- MFEM_VERIFY(IsHostMemory(h_mt), -- "invalid host MemoryType, h_mt = " << (int)h_mt); -- MFEM_VERIFY(IsDeviceMemory(d_mt) || d_mt == h_mt, -- "invalid device MemoryType, d_mt = " << (int)d_mt -- << " (h_mt = " << (int)h_mt << ')'); -- -- Get().host_mem_type = h_mt; -- Get().device_mem_type = d_mt; -- mem_types_set = true; -- -- // h_mt and d_mt will be set as dual to each other during configuration by -- // the call mm.Configure(...) 
in UpdateMemoryTypeAndClass() --} -- - void Device::Print(std::ostream &os) - { - os << "Device configuration: "; -@@ -307,96 +273,53 @@ void Device::Print(std::ostream &os) - os << std::endl; - } - --void Device::UpdateMemoryTypeAndClass() -+// static method -+void Device::SetMemoryTypes(MemoryType h_mt, MemoryType d_mt) - { -- const bool debug = Device::Allows(Backend::DEBUG_DEVICE); -- -- const bool device = Device::Allows(Backend::DEVICE_MASK); -- --#ifdef MFEM_USE_UMPIRE -- // If MFEM has been compiled with Umpire support, use it as the default -- if (!mem_host_env && !mem_types_set) -- { -- host_mem_type = MemoryType::HOST_UMPIRE; -- if (!mem_device_env) -- { -- device_mem_type = MemoryType::HOST_UMPIRE; -- } -- } --#endif -- -- // Enable the device memory type -- if (device) -- { -- if (!mem_device_env) -- { -- if (mem_host_env) -- { -- switch (host_mem_type) -- { -- case MemoryType::HOST_UMPIRE: -- device_mem_type = MemoryType::DEVICE_UMPIRE; -- break; -- case MemoryType::HOST_DEBUG: -- device_mem_type = MemoryType::DEVICE_DEBUG; -- break; -- default: -- device_mem_type = MemoryType::DEVICE; -- } -- } -- else if (!mem_types_set) -- { --#ifndef MFEM_USE_UMPIRE -- device_mem_type = MemoryType::DEVICE; --#else -- device_mem_type = MemoryType::DEVICE_UMPIRE; --#endif -- } -- } -- device_mem_class = MemoryClass::DEVICE; -- } -- -- // Enable the UVM shortcut when requested -- if (device && device_option && !strcmp(device_option, "uvm")) -- { -- host_mem_type = MemoryType::MANAGED; -- device_mem_type = MemoryType::MANAGED; -- } -+ // If the device and/or the MemoryTypes are configured through the -+ // environment (variables 'MFEM_DEVICE', 'MFEM_MEMORY'), ignore calls to this -+ // method. -+ if (mem_host_env || mem_device_env || device_env) { return; } - -- // Enable the DEBUG mode when requested -- if (debug) -- { -- host_mem_type = MemoryType::HOST_DEBUG; -- device_mem_type = MemoryType::DEVICE_DEBUG; -- } -+ MFEM_VERIFY(!IsConfigured(), "The default MemoryTypes can only be set before" -+ " Device construction and configuration"); -+ MFEM_VERIFY(IsHostMemory(h_mt), -+ "Invalid host MemoryType, h_mt = " << (int)h_mt); -+ MFEM_VERIFY(IsDeviceMemory(d_mt) || d_mt == h_mt, -+ "Invalid device MemoryType, d_mt = " << (int)d_mt -+ << " (h_mt = " << (int)h_mt << ')'); - -- MFEM_VERIFY(!device || IsDeviceMemory(device_mem_type), -- "invalid device memory configuration!"); -+ Get().host_mem_type = h_mt; -+ Get().device_mem_type = d_mt; -+ mem_types_set = true; - -- // Update the memory manager with the new settings -- mm.Configure(host_mem_type, device_mem_type); -+ // h_mt and d_mt will be set as dual to each other during configuration by -+ // the call mm.Configure(...) in UpdateMemoryTypeAndClass(). 
- } - --void Device::Enable() -+// static method -+int Device::GetNumGPU() - { -- const bool accelerated = Get().backends & ~(Backend::CPU); -- if (accelerated) { Get().mode = Device::ACCELERATED;} -- Get().UpdateMemoryTypeAndClass(); --} -- -+ if (Get().ngpu >= 0) { return Get().ngpu; } - #ifdef MFEM_USE_CUDA --static void DeviceSetup(const int dev, int &ngpu) --{ -- ngpu = CuGetDeviceCount(); -- MFEM_VERIFY(ngpu > 0, "No CUDA device found!"); -- MFEM_GPU_CHECK(cudaSetDevice(dev)); --} -+ return CuGetDeviceCount(); -+#elif MFEM_USE_HIP -+ int ngpu; -+ MFEM_GPU_CHECK(hipGetDeviceCount(&ngpu)); -+ return ngpu; -+#else -+ MFEM_ABORT("Unable to query number of available devices without" -+ " MFEM_USE_CUDA or MFEM_USE_HIP!"); -+ return -1; - #endif -+} - - static void CudaDeviceSetup(const int dev, int &ngpu) - { - #ifdef MFEM_USE_CUDA -- DeviceSetup(dev, ngpu); -+ ngpu = CuGetDeviceCount(); -+ MFEM_VERIFY(ngpu > 0, "No CUDA device found!"); -+ MFEM_GPU_CHECK(cudaSetDevice(dev)); - #else - MFEM_CONTRACT_VAR(dev); - MFEM_CONTRACT_VAR(ngpu); -@@ -418,7 +341,7 @@ static void HipDeviceSetup(const int dev, int &ngpu) - static void RajaDeviceSetup(const int dev, int &ngpu) - { - #ifdef MFEM_USE_CUDA -- if (ngpu <= 0) { DeviceSetup(dev, ngpu); } -+ CudaDeviceSetup(dev, ngpu); - #elif defined(MFEM_USE_HIP) - HipDeviceSetup(dev, ngpu); - #else -@@ -443,7 +366,7 @@ static void OccaDeviceSetup(const int dev) - std::string mode("mode: 'CUDA', device_id : "); - internal::occaDevice.setup(mode.append(1,'0'+dev)); - #else -- MFEM_ABORT("the OCCA CUDA backend requires OCCA built with CUDA!"); -+ MFEM_ABORT("The OCCA CUDA backend requires OCCA built with CUDA!"); - #endif - } - else if (omp) -@@ -451,7 +374,7 @@ static void OccaDeviceSetup(const int dev) - #if OCCA_OPENMP_ENABLED - internal::occaDevice.setup("mode: 'OpenMP'"); - #else -- MFEM_ABORT("the OCCA OpenMP backend requires OCCA built with OpenMP!"); -+ MFEM_ABORT("The OCCA OpenMP backend requires OCCA built with OpenMP!"); - #endif - } - else -@@ -477,7 +400,7 @@ static void OccaDeviceSetup(const int dev) - occa::loadKernels("mfem"); - #else - MFEM_CONTRACT_VAR(dev); -- MFEM_ABORT("the OCCA backends require MFEM built with MFEM_USE_OCCA=YES"); -+ MFEM_ABORT("The OCCA backends require MFEM built with MFEM_USE_OCCA=YES"); - #endif - } - -@@ -502,80 +425,136 @@ static void CeedDeviceSetup(const char* ceed_spec) - #endif - } - --void Device::Setup(const int device_id) -+void Device::Setup(const std::string &device_option, const int device_id) - { -- MFEM_VERIFY(ngpu == -1, "the mfem::Device is already configured!"); -+ MFEM_VERIFY(ngpu == -1, "The mfem::Device is already configured!"); - - ngpu = 0; - dev = device_id; - #ifndef MFEM_USE_CUDA - MFEM_VERIFY(!Allows(Backend::CUDA_MASK), -- "the CUDA backends require MFEM built with MFEM_USE_CUDA=YES"); -+ "The CUDA backends require MFEM built with MFEM_USE_CUDA=YES"); - #endif - #ifndef MFEM_USE_HIP - MFEM_VERIFY(!Allows(Backend::HIP_MASK), -- "the HIP backends require MFEM built with MFEM_USE_HIP=YES"); -+ "The HIP backends require MFEM built with MFEM_USE_HIP=YES"); - #endif - #ifndef MFEM_USE_RAJA - MFEM_VERIFY(!Allows(Backend::RAJA_MASK), -- "the RAJA backends require MFEM built with MFEM_USE_RAJA=YES"); -+ "The RAJA backends require MFEM built with MFEM_USE_RAJA=YES"); - #endif - #ifndef MFEM_USE_OPENMP - MFEM_VERIFY(!Allows(Backend::OMP|Backend::RAJA_OMP), -- "the OpenMP and RAJA OpenMP backends require MFEM built with" -+ "The OpenMP and RAJA OpenMP backends require MFEM built with" - " 
MFEM_USE_OPENMP=YES"); - #endif - #ifndef MFEM_USE_CEED - MFEM_VERIFY(!Allows(Backend::CEED_MASK), -- "the CEED backends require MFEM built with MFEM_USE_CEED=YES"); --#else -- int ceed_cpu = Allows(Backend::CEED_CPU); -- int ceed_cuda = Allows(Backend::CEED_CUDA); -- int ceed_hip = Allows(Backend::CEED_HIP); -- MFEM_VERIFY(ceed_cpu + ceed_cuda + ceed_hip <= 1, -- "Only one CEED backend can be enabled at a time!"); -+ "The CEED backends require MFEM built with MFEM_USE_CEED=YES"); - #endif - if (Allows(Backend::CUDA)) { CudaDeviceSetup(dev, ngpu); } - if (Allows(Backend::HIP)) { HipDeviceSetup(dev, ngpu); } - if (Allows(Backend::RAJA_CUDA) || Allows(Backend::RAJA_HIP)) - { RajaDeviceSetup(dev, ngpu); } -- // The check for MFEM_USE_OCCA is in the function OccaDeviceSetup(). - if (Allows(Backend::OCCA_MASK)) { OccaDeviceSetup(dev); } -- if (Allows(Backend::CEED_CPU)) -+ if (Allows(Backend::CEED_MASK)) - { -- if (!device_option) -+ int ceed_cpu = Allows(Backend::CEED_CPU); -+ int ceed_cuda = Allows(Backend::CEED_CUDA); -+ int ceed_hip = Allows(Backend::CEED_HIP); -+ MFEM_VERIFY(ceed_cpu + ceed_cuda + ceed_hip == 1, -+ "Only one CEED backend can be enabled at a time!"); -+ -+ // NOTE: libCEED's /gpu/cuda/gen and /gpu/hip/gen backends are non- -+ // deterministic! -+ const char *ceed_spec_search = Allows(Backend::CEED_CPU) ? ":/cpu/self" : -+ (Allows(Backend::CEED_CUDA) ? ":/gpu/cuda" : -+ (Allows(Backend::CEED_HIP) ? ":/gpu/hip" : "")); -+ const char *ceed_spec_default = Allows(Backend::CEED_CPU) ? "/cpu/self" : -+ (Allows(Backend::CEED_CUDA) ? "/gpu/cuda/gen" : -+ (Allows(Backend::CEED_HIP) ? "/gpu/hip/gen" : "")); -+ std::string::size_type beg = device_option.find(ceed_spec_search), end; -+ if (beg == std::string::npos) - { -- CeedDeviceSetup("/cpu/self"); -+ CeedDeviceSetup(ceed_spec_default); - } - else - { -- CeedDeviceSetup(device_option); -+ end = device_option.find(':', beg + 1); -+ end = (end != std::string::npos) ? end : device_option.size(); -+ CeedDeviceSetup(device_option.substr(beg + 1, end - beg - 1).c_str()); - } - } -- if (Allows(Backend::CEED_CUDA)) -+ if (Allows(Backend::DEBUG_DEVICE)) { ngpu = 1; } -+} -+ -+void Device::UpdateMemoryTypeAndClass(const std::string &device_option) -+{ -+ const bool debug = Device::Allows(Backend::DEBUG_DEVICE); -+ const bool device = Device::Allows(Backend::DEVICE_MASK); -+ -+#ifdef MFEM_USE_UMPIRE -+ // If MFEM has been compiled with Umpire support, use it as the default -+ if (!mem_host_env && !mem_types_set) - { -- if (!device_option) -- { -- // NOTE: libCEED's /gpu/cuda/gen backend is non-deterministic! 
-- CeedDeviceSetup("/gpu/cuda/gen"); -- } -- else -+ host_mem_type = MemoryType::HOST_UMPIRE; -+ if (!mem_device_env) - { -- CeedDeviceSetup(device_option); -+ device_mem_type = MemoryType::HOST_UMPIRE; - } - } -- if (Allows(Backend::CEED_HIP)) -+#endif -+ -+ // Enable the device memory type -+ if (device) - { -- if (!device_option) -- { -- CeedDeviceSetup("/gpu/hip"); -- } -- else -+ if (!mem_device_env) - { -- CeedDeviceSetup(device_option); -+ if (mem_host_env) -+ { -+ switch (host_mem_type) -+ { -+ case MemoryType::HOST_UMPIRE: -+ device_mem_type = MemoryType::DEVICE_UMPIRE; -+ break; -+ case MemoryType::HOST_DEBUG: -+ device_mem_type = MemoryType::DEVICE_DEBUG; -+ break; -+ default: -+ device_mem_type = MemoryType::DEVICE; -+ } -+ } -+ else if (!mem_types_set) -+ { -+#ifndef MFEM_USE_UMPIRE -+ device_mem_type = MemoryType::DEVICE; -+#else -+ device_mem_type = MemoryType::DEVICE_UMPIRE; -+#endif -+ } - } -+ device_mem_class = MemoryClass::DEVICE; - } -- if (Allows(Backend::DEBUG_DEVICE)) { ngpu = 1; } -+ -+ // Enable the UVM shortcut when requested -+ if (device && device_option.find(":uvm") != std::string::npos) -+ { -+ host_mem_type = MemoryType::MANAGED; -+ device_mem_type = MemoryType::MANAGED; -+ } -+ -+ // Enable the DEBUG mode when requested -+ if (debug) -+ { -+ host_mem_type = MemoryType::HOST_DEBUG; -+ device_mem_type = MemoryType::DEVICE_DEBUG; -+ } -+ -+ MFEM_VERIFY(!device || IsDeviceMemory(device_mem_type), -+ "Invalid device memory configuration!"); -+ -+ // Update the memory manager with the new settings -+ mm.Configure(host_mem_type, device_mem_type); - } - --} // mfem -+} // namespace mfem -diff --git a/general/device.hpp b/general/device.hpp -index baa27397f..a2d89e22e 100644 ---- a/general/device.hpp -+++ b/general/device.hpp -@@ -14,6 +14,7 @@ - - #include "globals.hpp" - #include "mem_manager.hpp" -+#include - - namespace mfem - { -@@ -81,7 +82,6 @@ struct Backend - { - /// Number of backends: from (1 << 0) to (1 << (NUM_BACKENDS-1)). - NUM_BACKENDS = 15, -- - /// Biwise-OR of all CPU backends - CPU_MASK = CPU | RAJA_CPU | OCCA_CPU | CEED_CPU, - /// Biwise-OR of all CUDA backends -@@ -94,7 +94,6 @@ struct Backend - CEED_MASK = CEED_CPU | CEED_CUDA | CEED_HIP, - /// Biwise-OR of all device backends - DEVICE_MASK = CUDA_MASK | HIP_MASK | DEBUG_DEVICE, -- - /// Biwise-OR of all RAJA backends - RAJA_MASK = RAJA_CPU | RAJA_OMP | RAJA_CUDA | RAJA_HIP, - /// Biwise-OR of all OCCA backends -@@ -122,50 +121,44 @@ class Device - { - private: - friend class MemoryManager; -- enum MODES {SEQUENTIAL, ACCELERATED}; -- -- static bool device_env, mem_host_env, mem_device_env, mem_types_set; - static MFEM_EXPORT Device device_singleton; -+ static bool device_env, mem_host_env, mem_device_env, mem_types_set; - -- MODES mode = Device::SEQUENTIAL; - int dev = 0; ///< Device ID of the configured device. - int ngpu = -1; ///< Number of detected devices; -1: not initialized. -+ - /// Bitwise-OR of all configured backends. - unsigned long backends = Backend::CPU; -+ - /// Set to true during configuration, except in 'device_singleton'. - bool destroy_mm = false; - bool mpi_gpu_aware = false; - -- MemoryType host_mem_type = MemoryType::HOST; ///< Current Host MemoryType -- MemoryClass host_mem_class = MemoryClass::HOST; ///< Current Host MemoryClass -+ /// Current host MemoryType. -+ MemoryType host_mem_type = MemoryType::HOST; -+ /// Current host MemoryClass. -+ MemoryClass host_mem_class = MemoryClass::HOST; - -- /// Current Device MemoryType -+ /// Current device MemoryType. 
- MemoryType device_mem_type = MemoryType::HOST; -- /// Current Device MemoryClass -+ /// Current device MemoryClass. - MemoryClass device_mem_class = MemoryClass::HOST; - -- char *device_option = NULL; -- Device(Device const&); -- void operator=(Device const&); -- static Device& Get() { return device_singleton; } -- -- /// Setup switcher based on configuration settings -- void Setup(const int device_id = 0); -+ // Delete copy constructor and copy assignment. -+ Device(Device const &) = delete; -+ void operator=(Device const &) = delete; - -- void MarkBackend(Backend::Id b) { backends |= b; } -+ // Access the Device singleton. -+ static Device &Get() { return device_singleton; } - -- void UpdateMemoryTypeAndClass(); -+ /// Setup switcher based on configuration settings. -+ void Setup(const std::string &device_option, const int device_id); - -- /// Enable the use of the configured device in the code that follows. -- /** After this call MFEM classes will use the backend kernels whenever -- possible, transferring data automatically to the device, if necessary. -+ /// Configure host/device MemoryType/MemoryClass. -+ void UpdateMemoryTypeAndClass(const std::string &device_option); - -- If the only configured backend is the default host CPU one, the device -- will remain disabled. -- -- If the device is actually enabled, this method will also update the -- current host/device MemoryType and MemoryClass. */ -- static void Enable(); -+ /// Configure the backends to include @a b. -+ void MarkBackend(Backend::Id b) { backends |= b; } - - public: - /** @brief Default constructor. Unless Configure() is called later, the -@@ -182,16 +175,16 @@ public: - a program. - @note This object should be destroyed after all other MFEM objects that - use the Device are destroyed. */ -- Device(const std::string &device, const int dev = 0) -- { Configure(device, dev); } -+ Device(const std::string &device, const int device_id = 0) -+ { Configure(device, device_id); } - - /// Destructor. - ~Device(); - - /// Configure the Device backends. - /** The string parameter @a device must be a comma-separated list of backend -- string names (see below). The @a dev argument specifies the ID of the -- actual devices (e.g. GPU) to use. -+ string names (see below). The @a device_id argument specifies the ID of -+ the actual devices (e.g. GPU) to use. - * The available backends are described by the Backend class. - * The string name of a backend is the lowercase version of the - Backend::Id enumeration constant with '_' replaced by '-', e.g. the -@@ -219,8 +212,12 @@ public: - and evaluation of operators and enables the 'hip' backend to avoid - transfers between host and device. - * The 'debug' backend should not be combined with other device backends. -- */ -- void Configure(const std::string &device, const int dev = 0); -+ @note If the device is actually enabled, this method will also update the -+ current host/device MemoryType and MemoryClass. */ -+ void Configure(const std::string &device, const int device_id = 0); -+ -+ /// Print the configuration of the MFEM virtual device object. -+ void Print(std::ostream &out = mfem::out); - - /// Set the default host and device MemoryTypes, @a h_mt and @a d_mt. - /** The host and device MemoryTypes are also set to be dual to each other. -@@ -233,60 +230,64 @@ public: - the subsequent Device configuration. */ - static void SetMemoryTypes(MemoryType h_mt, MemoryType d_mt); - -- /// Print the configuration of the MFEM virtual device object. 
-- void Print(std::ostream &out = mfem::out); -- - /// Return true if Configure() has been called previously. -- static inline bool IsConfigured() { return Get().ngpu >= 0; } -+ static bool IsConfigured() { return Get().ngpu >= 0; } - - /// Return true if an actual device (e.g. GPU) has been configured. -- static inline bool IsAvailable() { return Get().ngpu > 0; } -+ static bool IsAvailable() { return Get().ngpu > 0; } - - /// Return true if any backend other than Backend::CPU is enabled. -- static inline bool IsEnabled() { return Get().mode == ACCELERATED; } -+ static bool IsEnabled() { return Get().backends & ~(Backend::CPU); } - - /// The opposite of IsEnabled(). -- static inline bool IsDisabled() { return !IsEnabled(); } -+ static bool IsDisabled() { return !IsEnabled(); } -+ -+ /// Get the device ID of the configured device. -+ static int GetId() { return Get().dev; } - -- /// Get the device id of the configured device. -- static inline int GetId() { return Get().dev; } -+ /// Get the number of available devices (may be called before configuration). -+ static int GetNumGPU(); - - /** @brief Return true if any of the backends in the backend mask, @a b_mask, - are allowed. */ - /** This method can be used with any of the Backend::Id constants, the - Backend::*_MASK, or combinations of those. */ -- static inline bool Allows(unsigned long b_mask) -+ static bool Allows(unsigned long b_mask) - { return Get().backends & b_mask; } - - /** @brief Get the current Host MemoryType. This is the MemoryType used by - most MFEM classes when allocating memory used on the host. - */ -- static inline MemoryType GetHostMemoryType() { return Get().host_mem_type; } -+ static MemoryType GetHostMemoryType() { return Get().host_mem_type; } - - /** @brief Get the current Host MemoryClass. This is the MemoryClass used - by most MFEM host Memory objects. */ -- static inline MemoryClass GetHostMemoryClass() { return Get().host_mem_class; } -+ static MemoryClass GetHostMemoryClass() { return Get().host_mem_class; } - - /** @brief Get the current Device MemoryType. This is the MemoryType used by - most MFEM classes when allocating memory to be used with device kernels. - */ -- static inline MemoryType GetDeviceMemoryType() { return Get().device_mem_type; } -+ static MemoryType GetDeviceMemoryType() { return Get().device_mem_type; } - - /// (DEPRECATED) Equivalent to GetDeviceMemoryType(). - /** @deprecated Use GetDeviceMemoryType() instead. */ -- static inline MemoryType GetMemoryType() { return Get().device_mem_type; } -+ static MemoryType GetMemoryType() { return Get().device_mem_type; } - - /** @brief Get the current Device MemoryClass. This is the MemoryClass used - by most MFEM device kernels to access Memory objects. */ -- static inline MemoryClass GetDeviceMemoryClass() { return Get().device_mem_class; } -+ static MemoryClass GetDeviceMemoryClass() { return Get().device_mem_class; } - - /// (DEPRECATED) Equivalent to GetDeviceMemoryClass(). - /** @deprecated Use GetDeviceMemoryClass() instead. */ -- static inline MemoryClass GetMemoryClass() { return Get().device_mem_class; } -+ static MemoryClass GetMemoryClass() { return Get().device_mem_class; } - -+ /** @brief Manually set the status of GPU-aware MPI flag for use in MPI -+ communication routines which have optimized implementations for device -+ buffers. */ - static void SetGPUAwareMPI(const bool force = true) - { Get().mpi_gpu_aware = force; } - -+ /// Get the status of GPU-aware MPI flag. 
- static bool GetGPUAwareMPI() { return Get().mpi_gpu_aware; } - }; - -@@ -298,7 +299,7 @@ public: - and ReadWrite(), while setting the device use flag in @a mem, if @a on_dev - is true. */ - template --MemoryClass GetMemoryClass(const Memory &mem, bool on_dev) -+inline MemoryClass GetMemoryClass(const Memory &mem, bool on_dev) - { - if (!on_dev) - { -@@ -362,6 +363,6 @@ inline T *HostReadWrite(Memory &mem, int size) - return mfem::ReadWrite(mem, size, false); - } - --} // mfem -+} // namespace mfem - - #endif // MFEM_DEVICE_HPP +diff --git a/general/device.cpp b/general/device.cpp +index ccee71cd7..f664f70c3 100644 +--- a/general/device.cpp ++++ b/general/device.cpp +@@ -9,14 +9,13 @@ + // terms of the BSD-3 license. We welcome feedback and contributions, see file + // CONTRIBUTING.md for details. + ++#include "device.hpp" + #include "forall.hpp" + #include "occa.hpp" + #ifdef MFEM_USE_CEED + #include "../fem/ceed/interface/util.hpp" + #endif + +-#include +-#include + #include + + namespace mfem +@@ -144,13 +143,11 @@ Device::Device() + } + } + +- + Device::~Device() + { + if ( device_env && !destroy_mm) { return; } + if (!device_env && destroy_mm && !mem_host_env) + { +- free(device_option); + #ifdef MFEM_USE_CEED + // Destroy FES -> CeedBasis, CeedElemRestriction hash table contents + for (auto entry : internal::ceed_basis_map) +@@ -169,7 +166,6 @@ Device::~Device() + mm.Destroy(); + } + Get().ngpu = -1; +- Get().mode = SEQUENTIAL; + Get().backends = Backend::CPU; + Get().host_mem_type = MemoryType::HOST; + Get().host_mem_class = MemoryClass::HOST; +@@ -189,6 +185,7 @@ void Device::Configure(const std::string &device, const int device_id) + } + + std::map bmap; ++ std::string device_option; + for (int i = 0; i < Backend::NUM_BACKENDS; i++) + { + bmap[internal::backend_name[i]] = internal::backend_list[i]; +@@ -200,21 +197,14 @@ void Device::Configure(const std::string &device, const int device_id) + end = (end != std::string::npos) ? end : device.size(); + const std::string bname = device.substr(beg, end - beg); + option = bname.find(':'); +- if (option==std::string::npos) // No option +- { +- const std::string backend = bname; +- std::map::iterator it = bmap.find(backend); +- MFEM_VERIFY(it != bmap.end(), "invalid backend name: '" << backend << '\''); +- Get().MarkBackend(it->second); +- } +- else ++ const std::string backend = (option != std::string::npos) ? ++ bname.substr(0, option) : bname; ++ std::map::iterator it = bmap.find(backend); ++ MFEM_VERIFY(it != bmap.end(), "Invalid backend name: '" << backend << '\''); ++ Get().MarkBackend(it->second); ++ if (option != std::string::npos) + { +- const std::string backend = bname.substr(0, option); +- const std::string boption = bname.substr(option+1); +- Get().device_option = strdup(boption.c_str()); +- std::map::iterator it = bmap.find(backend); +- MFEM_VERIFY(it != bmap.end(), "invalid backend name: '" << backend << '\''); +- Get().MarkBackend(it->second); ++ device_option += bname.substr(option); + } + if (end == device.size()) { break; } + beg = end + 1; +@@ -240,10 +230,10 @@ void Device::Configure(const std::string &device, const int device_id) + #endif + + // Perform setup. +- Get().Setup(device_id); ++ Get().Setup(device_option, device_id); + +- // Enable the device +- Enable(); ++ // Enable the device. ++ Get().UpdateMemoryTypeAndClass(device_option); + + // Copy all data members from the global 'singleton_device' into '*this'. 
+ if (this != &Get()) { std::memcpy(this, &Get(), sizeof(Device)); } +@@ -252,30 +242,6 @@ void Device::Configure(const std::string &device, const int device_id) + destroy_mm = true; + } + +-// static method +-void Device::SetMemoryTypes(MemoryType h_mt, MemoryType d_mt) +-{ +- // If the device and/or the MemoryTypes are configured through the +- // environment (variables 'MFEM_DEVICE', 'MFEM_MEMORY'), ignore calls to this +- // method. +- if (mem_host_env || mem_device_env || device_env) { return; } +- +- MFEM_VERIFY(!IsConfigured(), "the default MemoryTypes can only be set before" +- " Device construction and configuration"); +- MFEM_VERIFY(IsHostMemory(h_mt), +- "invalid host MemoryType, h_mt = " << (int)h_mt); +- MFEM_VERIFY(IsDeviceMemory(d_mt) || d_mt == h_mt, +- "invalid device MemoryType, d_mt = " << (int)d_mt +- << " (h_mt = " << (int)h_mt << ')'); +- +- Get().host_mem_type = h_mt; +- Get().device_mem_type = d_mt; +- mem_types_set = true; +- +- // h_mt and d_mt will be set as dual to each other during configuration by +- // the call mm.Configure(...) in UpdateMemoryTypeAndClass() +-} +- + void Device::Print(std::ostream &os) + { + os << "Device configuration: "; +@@ -307,96 +273,53 @@ void Device::Print(std::ostream &os) + os << std::endl; + } + +-void Device::UpdateMemoryTypeAndClass() ++// static method ++void Device::SetMemoryTypes(MemoryType h_mt, MemoryType d_mt) + { +- const bool debug = Device::Allows(Backend::DEBUG_DEVICE); +- +- const bool device = Device::Allows(Backend::DEVICE_MASK); +- +-#ifdef MFEM_USE_UMPIRE +- // If MFEM has been compiled with Umpire support, use it as the default +- if (!mem_host_env && !mem_types_set) +- { +- host_mem_type = MemoryType::HOST_UMPIRE; +- if (!mem_device_env) +- { +- device_mem_type = MemoryType::HOST_UMPIRE; +- } +- } +-#endif +- +- // Enable the device memory type +- if (device) +- { +- if (!mem_device_env) +- { +- if (mem_host_env) +- { +- switch (host_mem_type) +- { +- case MemoryType::HOST_UMPIRE: +- device_mem_type = MemoryType::DEVICE_UMPIRE; +- break; +- case MemoryType::HOST_DEBUG: +- device_mem_type = MemoryType::DEVICE_DEBUG; +- break; +- default: +- device_mem_type = MemoryType::DEVICE; +- } +- } +- else if (!mem_types_set) +- { +-#ifndef MFEM_USE_UMPIRE +- device_mem_type = MemoryType::DEVICE; +-#else +- device_mem_type = MemoryType::DEVICE_UMPIRE; +-#endif +- } +- } +- device_mem_class = MemoryClass::DEVICE; +- } +- +- // Enable the UVM shortcut when requested +- if (device && device_option && !strcmp(device_option, "uvm")) +- { +- host_mem_type = MemoryType::MANAGED; +- device_mem_type = MemoryType::MANAGED; +- } ++ // If the device and/or the MemoryTypes are configured through the ++ // environment (variables 'MFEM_DEVICE', 'MFEM_MEMORY'), ignore calls to this ++ // method. 
++ if (mem_host_env || mem_device_env || device_env) { return; } + +- // Enable the DEBUG mode when requested +- if (debug) +- { +- host_mem_type = MemoryType::HOST_DEBUG; +- device_mem_type = MemoryType::DEVICE_DEBUG; +- } ++ MFEM_VERIFY(!IsConfigured(), "The default MemoryTypes can only be set before" ++ " Device construction and configuration"); ++ MFEM_VERIFY(IsHostMemory(h_mt), ++ "Invalid host MemoryType, h_mt = " << (int)h_mt); ++ MFEM_VERIFY(IsDeviceMemory(d_mt) || d_mt == h_mt, ++ "Invalid device MemoryType, d_mt = " << (int)d_mt ++ << " (h_mt = " << (int)h_mt << ')'); + +- MFEM_VERIFY(!device || IsDeviceMemory(device_mem_type), +- "invalid device memory configuration!"); ++ Get().host_mem_type = h_mt; ++ Get().device_mem_type = d_mt; ++ mem_types_set = true; + +- // Update the memory manager with the new settings +- mm.Configure(host_mem_type, device_mem_type); ++ // h_mt and d_mt will be set as dual to each other during configuration by ++ // the call mm.Configure(...) in UpdateMemoryTypeAndClass(). + } + +-void Device::Enable() ++// static method ++int Device::GetNumGPU() + { +- const bool accelerated = Get().backends & ~(Backend::CPU); +- if (accelerated) { Get().mode = Device::ACCELERATED;} +- Get().UpdateMemoryTypeAndClass(); +-} +- ++ if (Get().ngpu >= 0) { return Get().ngpu; } + #ifdef MFEM_USE_CUDA +-static void DeviceSetup(const int dev, int &ngpu) +-{ +- ngpu = CuGetDeviceCount(); +- MFEM_VERIFY(ngpu > 0, "No CUDA device found!"); +- MFEM_GPU_CHECK(cudaSetDevice(dev)); +-} ++ return CuGetDeviceCount(); ++#elif MFEM_USE_HIP ++ int ngpu; ++ MFEM_GPU_CHECK(hipGetDeviceCount(&ngpu)); ++ return ngpu; ++#else ++ MFEM_ABORT("Unable to query number of available devices without" ++ " MFEM_USE_CUDA or MFEM_USE_HIP!"); ++ return -1; + #endif ++} + + static void CudaDeviceSetup(const int dev, int &ngpu) + { + #ifdef MFEM_USE_CUDA +- DeviceSetup(dev, ngpu); ++ ngpu = CuGetDeviceCount(); ++ MFEM_VERIFY(ngpu > 0, "No CUDA device found!"); ++ MFEM_GPU_CHECK(cudaSetDevice(dev)); + #else + MFEM_CONTRACT_VAR(dev); + MFEM_CONTRACT_VAR(ngpu); +@@ -418,7 +341,7 @@ static void HipDeviceSetup(const int dev, int &ngpu) + static void RajaDeviceSetup(const int dev, int &ngpu) + { + #ifdef MFEM_USE_CUDA +- if (ngpu <= 0) { DeviceSetup(dev, ngpu); } ++ CudaDeviceSetup(dev, ngpu); + #elif defined(MFEM_USE_HIP) + HipDeviceSetup(dev, ngpu); + #else +@@ -443,7 +366,7 @@ static void OccaDeviceSetup(const int dev) + std::string mode("mode: 'CUDA', device_id : "); + internal::occaDevice.setup(mode.append(1,'0'+dev)); + #else +- MFEM_ABORT("the OCCA CUDA backend requires OCCA built with CUDA!"); ++ MFEM_ABORT("The OCCA CUDA backend requires OCCA built with CUDA!"); + #endif + } + else if (omp) +@@ -451,7 +374,7 @@ static void OccaDeviceSetup(const int dev) + #if OCCA_OPENMP_ENABLED + internal::occaDevice.setup("mode: 'OpenMP'"); + #else +- MFEM_ABORT("the OCCA OpenMP backend requires OCCA built with OpenMP!"); ++ MFEM_ABORT("The OCCA OpenMP backend requires OCCA built with OpenMP!"); + #endif + } + else +@@ -477,7 +400,7 @@ static void OccaDeviceSetup(const int dev) + occa::loadKernels("mfem"); + #else + MFEM_CONTRACT_VAR(dev); +- MFEM_ABORT("the OCCA backends require MFEM built with MFEM_USE_OCCA=YES"); ++ MFEM_ABORT("The OCCA backends require MFEM built with MFEM_USE_OCCA=YES"); + #endif + } + +@@ -502,80 +425,136 @@ static void CeedDeviceSetup(const char* ceed_spec) + #endif + } + +-void Device::Setup(const int device_id) ++void Device::Setup(const std::string &device_option, const int device_id) + { +- 
MFEM_VERIFY(ngpu == -1, "the mfem::Device is already configured!"); ++ MFEM_VERIFY(ngpu == -1, "The mfem::Device is already configured!"); + + ngpu = 0; + dev = device_id; + #ifndef MFEM_USE_CUDA + MFEM_VERIFY(!Allows(Backend::CUDA_MASK), +- "the CUDA backends require MFEM built with MFEM_USE_CUDA=YES"); ++ "The CUDA backends require MFEM built with MFEM_USE_CUDA=YES"); + #endif + #ifndef MFEM_USE_HIP + MFEM_VERIFY(!Allows(Backend::HIP_MASK), +- "the HIP backends require MFEM built with MFEM_USE_HIP=YES"); ++ "The HIP backends require MFEM built with MFEM_USE_HIP=YES"); + #endif + #ifndef MFEM_USE_RAJA + MFEM_VERIFY(!Allows(Backend::RAJA_MASK), +- "the RAJA backends require MFEM built with MFEM_USE_RAJA=YES"); ++ "The RAJA backends require MFEM built with MFEM_USE_RAJA=YES"); + #endif + #ifndef MFEM_USE_OPENMP + MFEM_VERIFY(!Allows(Backend::OMP|Backend::RAJA_OMP), +- "the OpenMP and RAJA OpenMP backends require MFEM built with" ++ "The OpenMP and RAJA OpenMP backends require MFEM built with" + " MFEM_USE_OPENMP=YES"); + #endif + #ifndef MFEM_USE_CEED + MFEM_VERIFY(!Allows(Backend::CEED_MASK), +- "the CEED backends require MFEM built with MFEM_USE_CEED=YES"); +-#else +- int ceed_cpu = Allows(Backend::CEED_CPU); +- int ceed_cuda = Allows(Backend::CEED_CUDA); +- int ceed_hip = Allows(Backend::CEED_HIP); +- MFEM_VERIFY(ceed_cpu + ceed_cuda + ceed_hip <= 1, +- "Only one CEED backend can be enabled at a time!"); ++ "The CEED backends require MFEM built with MFEM_USE_CEED=YES"); + #endif + if (Allows(Backend::CUDA)) { CudaDeviceSetup(dev, ngpu); } + if (Allows(Backend::HIP)) { HipDeviceSetup(dev, ngpu); } + if (Allows(Backend::RAJA_CUDA) || Allows(Backend::RAJA_HIP)) + { RajaDeviceSetup(dev, ngpu); } +- // The check for MFEM_USE_OCCA is in the function OccaDeviceSetup(). + if (Allows(Backend::OCCA_MASK)) { OccaDeviceSetup(dev); } +- if (Allows(Backend::CEED_CPU)) ++ if (Allows(Backend::CEED_MASK)) + { +- if (!device_option) ++ int ceed_cpu = Allows(Backend::CEED_CPU); ++ int ceed_cuda = Allows(Backend::CEED_CUDA); ++ int ceed_hip = Allows(Backend::CEED_HIP); ++ MFEM_VERIFY(ceed_cpu + ceed_cuda + ceed_hip == 1, ++ "Only one CEED backend can be enabled at a time!"); ++ ++ // NOTE: libCEED's /gpu/cuda/gen and /gpu/hip/gen backends are non- ++ // deterministic! ++ const char *ceed_spec_search = Allows(Backend::CEED_CPU) ? ":/cpu/self" : ++ (Allows(Backend::CEED_CUDA) ? ":/gpu/cuda" : ++ (Allows(Backend::CEED_HIP) ? ":/gpu/hip" : "")); ++ const char *ceed_spec_default = Allows(Backend::CEED_CPU) ? "/cpu/self" : ++ (Allows(Backend::CEED_CUDA) ? "/gpu/cuda/gen" : ++ (Allows(Backend::CEED_HIP) ? "/gpu/hip/gen" : "")); ++ std::string::size_type beg = device_option.find(ceed_spec_search), end; ++ if (beg == std::string::npos) + { +- CeedDeviceSetup("/cpu/self"); ++ CeedDeviceSetup(ceed_spec_default); + } + else + { +- CeedDeviceSetup(device_option); ++ end = device_option.find(':', beg + 1); ++ end = (end != std::string::npos) ? 
end : device_option.size(); ++ CeedDeviceSetup(device_option.substr(beg + 1, end - beg - 1).c_str()); + } + } +- if (Allows(Backend::CEED_CUDA)) ++ if (Allows(Backend::DEBUG_DEVICE)) { ngpu = 1; } ++} ++ ++void Device::UpdateMemoryTypeAndClass(const std::string &device_option) ++{ ++ const bool debug = Device::Allows(Backend::DEBUG_DEVICE); ++ const bool device = Device::Allows(Backend::DEVICE_MASK); ++ ++#ifdef MFEM_USE_UMPIRE ++ // If MFEM has been compiled with Umpire support, use it as the default ++ if (!mem_host_env && !mem_types_set) + { +- if (!device_option) +- { +- // NOTE: libCEED's /gpu/cuda/gen backend is non-deterministic! +- CeedDeviceSetup("/gpu/cuda/gen"); +- } +- else ++ host_mem_type = MemoryType::HOST_UMPIRE; ++ if (!mem_device_env) + { +- CeedDeviceSetup(device_option); ++ device_mem_type = MemoryType::HOST_UMPIRE; + } + } +- if (Allows(Backend::CEED_HIP)) ++#endif ++ ++ // Enable the device memory type ++ if (device) + { +- if (!device_option) +- { +- CeedDeviceSetup("/gpu/hip"); +- } +- else ++ if (!mem_device_env) + { +- CeedDeviceSetup(device_option); ++ if (mem_host_env) ++ { ++ switch (host_mem_type) ++ { ++ case MemoryType::HOST_UMPIRE: ++ device_mem_type = MemoryType::DEVICE_UMPIRE; ++ break; ++ case MemoryType::HOST_DEBUG: ++ device_mem_type = MemoryType::DEVICE_DEBUG; ++ break; ++ default: ++ device_mem_type = MemoryType::DEVICE; ++ } ++ } ++ else if (!mem_types_set) ++ { ++#ifndef MFEM_USE_UMPIRE ++ device_mem_type = MemoryType::DEVICE; ++#else ++ device_mem_type = MemoryType::DEVICE_UMPIRE; ++#endif ++ } + } ++ device_mem_class = MemoryClass::DEVICE; + } +- if (Allows(Backend::DEBUG_DEVICE)) { ngpu = 1; } ++ ++ // Enable the UVM shortcut when requested ++ if (device && device_option.find(":uvm") != std::string::npos) ++ { ++ host_mem_type = MemoryType::MANAGED; ++ device_mem_type = MemoryType::MANAGED; ++ } ++ ++ // Enable the DEBUG mode when requested ++ if (debug) ++ { ++ host_mem_type = MemoryType::HOST_DEBUG; ++ device_mem_type = MemoryType::DEVICE_DEBUG; ++ } ++ ++ MFEM_VERIFY(!device || IsDeviceMemory(device_mem_type), ++ "Invalid device memory configuration!"); ++ ++ // Update the memory manager with the new settings ++ mm.Configure(host_mem_type, device_mem_type); + } + +-} // mfem ++} // namespace mfem +diff --git a/general/device.hpp b/general/device.hpp +index baa27397f..a2d89e22e 100644 +--- a/general/device.hpp ++++ b/general/device.hpp +@@ -14,6 +14,7 @@ + + #include "globals.hpp" + #include "mem_manager.hpp" ++#include + + namespace mfem + { +@@ -81,7 +82,6 @@ struct Backend + { + /// Number of backends: from (1 << 0) to (1 << (NUM_BACKENDS-1)). + NUM_BACKENDS = 15, +- + /// Biwise-OR of all CPU backends + CPU_MASK = CPU | RAJA_CPU | OCCA_CPU | CEED_CPU, + /// Biwise-OR of all CUDA backends +@@ -94,7 +94,6 @@ struct Backend + CEED_MASK = CEED_CPU | CEED_CUDA | CEED_HIP, + /// Biwise-OR of all device backends + DEVICE_MASK = CUDA_MASK | HIP_MASK | DEBUG_DEVICE, +- + /// Biwise-OR of all RAJA backends + RAJA_MASK = RAJA_CPU | RAJA_OMP | RAJA_CUDA | RAJA_HIP, + /// Biwise-OR of all OCCA backends +@@ -122,50 +121,44 @@ class Device + { + private: + friend class MemoryManager; +- enum MODES {SEQUENTIAL, ACCELERATED}; +- +- static bool device_env, mem_host_env, mem_device_env, mem_types_set; + static MFEM_EXPORT Device device_singleton; ++ static bool device_env, mem_host_env, mem_device_env, mem_types_set; + +- MODES mode = Device::SEQUENTIAL; + int dev = 0; ///< Device ID of the configured device. 
+ int ngpu = -1; ///< Number of detected devices; -1: not initialized. ++ + /// Bitwise-OR of all configured backends. + unsigned long backends = Backend::CPU; ++ + /// Set to true during configuration, except in 'device_singleton'. + bool destroy_mm = false; + bool mpi_gpu_aware = false; + +- MemoryType host_mem_type = MemoryType::HOST; ///< Current Host MemoryType +- MemoryClass host_mem_class = MemoryClass::HOST; ///< Current Host MemoryClass ++ /// Current host MemoryType. ++ MemoryType host_mem_type = MemoryType::HOST; ++ /// Current host MemoryClass. ++ MemoryClass host_mem_class = MemoryClass::HOST; + +- /// Current Device MemoryType ++ /// Current device MemoryType. + MemoryType device_mem_type = MemoryType::HOST; +- /// Current Device MemoryClass ++ /// Current device MemoryClass. + MemoryClass device_mem_class = MemoryClass::HOST; + +- char *device_option = NULL; +- Device(Device const&); +- void operator=(Device const&); +- static Device& Get() { return device_singleton; } +- +- /// Setup switcher based on configuration settings +- void Setup(const int device_id = 0); ++ // Delete copy constructor and copy assignment. ++ Device(Device const &) = delete; ++ void operator=(Device const &) = delete; + +- void MarkBackend(Backend::Id b) { backends |= b; } ++ // Access the Device singleton. ++ static Device &Get() { return device_singleton; } + +- void UpdateMemoryTypeAndClass(); ++ /// Setup switcher based on configuration settings. ++ void Setup(const std::string &device_option, const int device_id); + +- /// Enable the use of the configured device in the code that follows. +- /** After this call MFEM classes will use the backend kernels whenever +- possible, transferring data automatically to the device, if necessary. ++ /// Configure host/device MemoryType/MemoryClass. ++ void UpdateMemoryTypeAndClass(const std::string &device_option); + +- If the only configured backend is the default host CPU one, the device +- will remain disabled. +- +- If the device is actually enabled, this method will also update the +- current host/device MemoryType and MemoryClass. */ +- static void Enable(); ++ /// Configure the backends to include @a b. ++ void MarkBackend(Backend::Id b) { backends |= b; } + + public: + /** @brief Default constructor. Unless Configure() is called later, the +@@ -182,16 +175,16 @@ public: + a program. + @note This object should be destroyed after all other MFEM objects that + use the Device are destroyed. */ +- Device(const std::string &device, const int dev = 0) +- { Configure(device, dev); } ++ Device(const std::string &device, const int device_id = 0) ++ { Configure(device, device_id); } + + /// Destructor. + ~Device(); + + /// Configure the Device backends. + /** The string parameter @a device must be a comma-separated list of backend +- string names (see below). The @a dev argument specifies the ID of the +- actual devices (e.g. GPU) to use. ++ string names (see below). The @a device_id argument specifies the ID of ++ the actual devices (e.g. GPU) to use. + * The available backends are described by the Backend class. + * The string name of a backend is the lowercase version of the + Backend::Id enumeration constant with '_' replaced by '-', e.g. the +@@ -219,8 +212,12 @@ public: + and evaluation of operators and enables the 'hip' backend to avoid + transfers between host and device. + * The 'debug' backend should not be combined with other device backends. 
+- */ +- void Configure(const std::string &device, const int dev = 0); ++ @note If the device is actually enabled, this method will also update the ++ current host/device MemoryType and MemoryClass. */ ++ void Configure(const std::string &device, const int device_id = 0); ++ ++ /// Print the configuration of the MFEM virtual device object. ++ void Print(std::ostream &out = mfem::out); + + /// Set the default host and device MemoryTypes, @a h_mt and @a d_mt. + /** The host and device MemoryTypes are also set to be dual to each other. +@@ -233,60 +230,64 @@ public: + the subsequent Device configuration. */ + static void SetMemoryTypes(MemoryType h_mt, MemoryType d_mt); + +- /// Print the configuration of the MFEM virtual device object. +- void Print(std::ostream &out = mfem::out); +- + /// Return true if Configure() has been called previously. +- static inline bool IsConfigured() { return Get().ngpu >= 0; } ++ static bool IsConfigured() { return Get().ngpu >= 0; } + + /// Return true if an actual device (e.g. GPU) has been configured. +- static inline bool IsAvailable() { return Get().ngpu > 0; } ++ static bool IsAvailable() { return Get().ngpu > 0; } + + /// Return true if any backend other than Backend::CPU is enabled. +- static inline bool IsEnabled() { return Get().mode == ACCELERATED; } ++ static bool IsEnabled() { return Get().backends & ~(Backend::CPU); } + + /// The opposite of IsEnabled(). +- static inline bool IsDisabled() { return !IsEnabled(); } ++ static bool IsDisabled() { return !IsEnabled(); } ++ ++ /// Get the device ID of the configured device. ++ static int GetId() { return Get().dev; } + +- /// Get the device id of the configured device. +- static inline int GetId() { return Get().dev; } ++ /// Get the number of available devices (may be called before configuration). ++ static int GetNumGPU(); + + /** @brief Return true if any of the backends in the backend mask, @a b_mask, + are allowed. */ + /** This method can be used with any of the Backend::Id constants, the + Backend::*_MASK, or combinations of those. */ +- static inline bool Allows(unsigned long b_mask) ++ static bool Allows(unsigned long b_mask) + { return Get().backends & b_mask; } + + /** @brief Get the current Host MemoryType. This is the MemoryType used by + most MFEM classes when allocating memory used on the host. + */ +- static inline MemoryType GetHostMemoryType() { return Get().host_mem_type; } ++ static MemoryType GetHostMemoryType() { return Get().host_mem_type; } + + /** @brief Get the current Host MemoryClass. This is the MemoryClass used + by most MFEM host Memory objects. */ +- static inline MemoryClass GetHostMemoryClass() { return Get().host_mem_class; } ++ static MemoryClass GetHostMemoryClass() { return Get().host_mem_class; } + + /** @brief Get the current Device MemoryType. This is the MemoryType used by + most MFEM classes when allocating memory to be used with device kernels. + */ +- static inline MemoryType GetDeviceMemoryType() { return Get().device_mem_type; } ++ static MemoryType GetDeviceMemoryType() { return Get().device_mem_type; } + + /// (DEPRECATED) Equivalent to GetDeviceMemoryType(). + /** @deprecated Use GetDeviceMemoryType() instead. */ +- static inline MemoryType GetMemoryType() { return Get().device_mem_type; } ++ static MemoryType GetMemoryType() { return Get().device_mem_type; } + + /** @brief Get the current Device MemoryClass. This is the MemoryClass used + by most MFEM device kernels to access Memory objects. 
*/ +- static inline MemoryClass GetDeviceMemoryClass() { return Get().device_mem_class; } ++ static MemoryClass GetDeviceMemoryClass() { return Get().device_mem_class; } + + /// (DEPRECATED) Equivalent to GetDeviceMemoryClass(). + /** @deprecated Use GetDeviceMemoryClass() instead. */ +- static inline MemoryClass GetMemoryClass() { return Get().device_mem_class; } ++ static MemoryClass GetMemoryClass() { return Get().device_mem_class; } + ++ /** @brief Manually set the status of GPU-aware MPI flag for use in MPI ++ communication routines which have optimized implementations for device ++ buffers. */ + static void SetGPUAwareMPI(const bool force = true) + { Get().mpi_gpu_aware = force; } + ++ /// Get the status of GPU-aware MPI flag. + static bool GetGPUAwareMPI() { return Get().mpi_gpu_aware; } + }; + +@@ -298,7 +299,7 @@ public: + and ReadWrite(), while setting the device use flag in @a mem, if @a on_dev + is true. */ + template +-MemoryClass GetMemoryClass(const Memory &mem, bool on_dev) ++inline MemoryClass GetMemoryClass(const Memory &mem, bool on_dev) + { + if (!on_dev) + { +@@ -362,6 +363,6 @@ inline T *HostReadWrite(Memory &mem, int size) + return mfem::ReadWrite(mem, size, false); + } + +-} // mfem ++} // namespace mfem + + #endif // MFEM_DEVICE_HPP diff --git a/extern/patch/mfem/patch_ncmesh_interior_boundary_dev.diff b/extern/patch/mfem/patch_ncmesh_interior_boundary_dev.diff index 45f62f1089..63e576090f 100644 --- a/extern/patch/mfem/patch_ncmesh_interior_boundary_dev.diff +++ b/extern/patch/mfem/patch_ncmesh_interior_boundary_dev.diff @@ -1,6082 +1,6082 @@ -diff --git a/CHANGELOG b/CHANGELOG -index cd5f1333f..d2f362c06 100644 ---- a/CHANGELOG -+++ b/CHANGELOG -@@ -15,6 +15,8 @@ Discretization improvements - --------------------------- - - Introduced support for higher order non conformal Nedelec elements on - simplices in ParMesh. -+- Introduced support for internal boundary elements in nonconformal adapted -+ meshes. - - Miscellaneous - ------------- -diff --git a/fem/bilinearform.cpp b/fem/bilinearform.cpp -index 6eae233bc..caaafaac8 100644 ---- a/fem/bilinearform.cpp -+++ b/fem/bilinearform.cpp -@@ -1072,7 +1072,8 @@ void BilinearForm::EliminateEssentialBCFromDofs( - void BilinearForm::EliminateEssentialBCFromDofs (const Array &ess_dofs, - DiagonalPolicy dpolicy) - { -- MFEM_ASSERT(ess_dofs.Size() == height, "incorrect dof Array size"); -+ MFEM_ASSERT(ess_dofs.Size() == height, -+ "incorrect dof Array size: " << ess_dofs.Size() << ' ' << height); - - for (int i = 0; i < ess_dofs.Size(); i++) - if (ess_dofs[i] < 0) -@@ -1084,7 +1085,8 @@ void BilinearForm::EliminateEssentialBCFromDofs (const Array &ess_dofs, - void BilinearForm::EliminateEssentialBCFromDofsDiag (const Array &ess_dofs, - double value) - { -- MFEM_ASSERT(ess_dofs.Size() == height, "incorrect dof Array size"); -+ MFEM_ASSERT(ess_dofs.Size() == height, -+ "incorrect dof Array size: " << ess_dofs.Size() << ' ' << height); - - for (int i = 0; i < ess_dofs.Size(); i++) - if (ess_dofs[i] < 0) -diff --git a/fem/fespace.cpp b/fem/fespace.cpp -index 660fec17a..1462adc81 100644 ---- a/fem/fespace.cpp -+++ b/fem/fespace.cpp -@@ -503,13 +503,11 @@ void FiniteElementSpace::BuildDofToArrays() - } - } - --static void mark_dofs(const Array &dofs, Array &mark_array) -+static void MarkDofs(const Array &dofs, Array &mark_array) - { -- for (int i = 0; i < dofs.Size(); i++) -+ for (auto d : dofs) - { -- int k = dofs[i]; -- if (k < 0) { k = -1 - k; } -- mark_array[k] = -1; -+ mark_array[d >= 0 ? 
d : -1 - d] = -1; - } - } - -@@ -517,11 +515,9 @@ void FiniteElementSpace::GetEssentialVDofs(const Array &bdr_attr_is_ess, - Array &ess_vdofs, - int component) const - { -- Array vdofs, dofs; -- -+ Array dofs; - ess_vdofs.SetSize(GetVSize()); - ess_vdofs = 0; -- - for (int i = 0; i < GetNBE(); i++) - { - if (bdr_attr_is_ess[GetBdrAttribute(i)-1]) -@@ -529,16 +525,14 @@ void FiniteElementSpace::GetEssentialVDofs(const Array &bdr_attr_is_ess, - if (component < 0) - { - // Mark all components. -- GetBdrElementVDofs(i, vdofs); -- mark_dofs(vdofs, ess_vdofs); -+ GetBdrElementVDofs(i, dofs); - } - else - { - GetBdrElementDofs(i, dofs); -- for (int d = 0; d < dofs.Size(); d++) -- { dofs[d] = DofToVDof(dofs[d], component); } -- mark_dofs(dofs, ess_vdofs); -+ for (auto &d : dofs) { d = DofToVDof(d, component); } - } -+ MarkDofs(dofs, ess_vdofs); - } - } - -@@ -546,38 +540,47 @@ void FiniteElementSpace::GetEssentialVDofs(const Array &bdr_attr_is_ess, - // local DOFs affected by boundary elements on other processors - if (Nonconforming()) - { -- Array bdr_verts, bdr_edges; -- mesh->ncmesh->GetBoundaryClosure(bdr_attr_is_ess, bdr_verts, bdr_edges); -- -- for (int i = 0; i < bdr_verts.Size(); i++) -+ Array bdr_verts, bdr_edges, bdr_faces; -+ mesh->ncmesh->GetBoundaryClosure(bdr_attr_is_ess, bdr_verts, bdr_edges, -+ bdr_faces); -+ for (auto v : bdr_verts) -+ { -+ if (component < 0) -+ { -+ GetVertexVDofs(v, dofs); -+ } -+ else -+ { -+ GetVertexDofs(v, dofs); -+ for (auto &d : dofs) { d = DofToVDof(d, component); } -+ } -+ MarkDofs(dofs, ess_vdofs); -+ } -+ for (auto e : bdr_edges) - { - if (component < 0) - { -- GetVertexVDofs(bdr_verts[i], vdofs); -- mark_dofs(vdofs, ess_vdofs); -+ GetEdgeVDofs(e, dofs); - } - else - { -- GetVertexDofs(bdr_verts[i], dofs); -- for (int d = 0; d < dofs.Size(); d++) -- { dofs[d] = DofToVDof(dofs[d], component); } -- mark_dofs(dofs, ess_vdofs); -+ GetEdgeDofs(e, dofs); -+ for (auto &d : dofs) { d = DofToVDof(d, component); } - } -+ MarkDofs(dofs, ess_vdofs); - } -- for (int i = 0; i < bdr_edges.Size(); i++) -+ for (auto f : bdr_faces) - { - if (component < 0) - { -- GetEdgeVDofs(bdr_edges[i], vdofs); -- mark_dofs(vdofs, ess_vdofs); -+ GetEntityVDofs(2, f, dofs); - } - else - { -- GetEdgeDofs(bdr_edges[i], dofs); -- for (int d = 0; d < dofs.Size(); d++) -- { dofs[d] = DofToVDof(dofs[d], component); } -- mark_dofs(dofs, ess_vdofs); -+ GetEntityDofs(2, f, dofs); -+ for (auto &d : dofs) { d = DofToVDof(d, component); } - } -+ MarkDofs(dofs, ess_vdofs); - } - } - } -@@ -596,6 +599,30 @@ void FiniteElementSpace::GetEssentialTrueDofs(const Array &bdr_attr_is_ess, - else - { - R->BooleanMult(ess_vdofs, ess_tdofs); -+#ifdef MFEM_DEBUG -+ // Verify that in boolean arithmetic: P^T ess_dofs = R ess_dofs -+ Array ess_tdofs2(ess_tdofs.Size()); -+ GetConformingProlongation()->BooleanMultTranspose(ess_vdofs, ess_tdofs2); -+ -+ int counter = 0; -+ std::string error_msg = "failed dof: "; -+ for (int i = 0; i < ess_tdofs2.Size(); ++i) -+ { -+ if (bool(ess_tdofs[i]) != bool(ess_tdofs2[i])) -+ { -+ error_msg += std::to_string(i) += "(R "; -+ error_msg += std::to_string(bool(ess_tdofs[i])) += " P^T "; -+ error_msg += std::to_string(bool(ess_tdofs2[i])) += ") "; -+ counter++; -+ } -+ } -+ -+ MFEM_ASSERT(R->Height() == GetConformingProlongation()->Width(), "!"); -+ MFEM_ASSERT(R->Width() == GetConformingProlongation()->Height(), "!"); -+ MFEM_ASSERT(R->Width() == ess_vdofs.Size(), "!"); -+ MFEM_VERIFY(counter == 0, "internal MFEM error: counter = " << counter -+ << ' ' << error_msg); -+#endif - } 
- MarkerToList(ess_tdofs, ess_tdof_list); - } -@@ -944,6 +971,15 @@ int FiniteElementSpace::GetEntityDofs(int entity, int index, Array &dofs, - } - } - -+int FiniteElementSpace::GetEntityVDofs(int entity, int index, Array &dofs, -+ Geometry::Type master_geom, -+ int variant) const -+{ -+ int n = GetEntityDofs(entity, index, dofs, master_geom, variant); -+ DofsToVDofs(dofs); -+ return n; -+} -+ - void FiniteElementSpace::BuildConformingInterpolation() const - { - #ifdef MFEM_USE_MPI -diff --git a/fem/fespace.hpp b/fem/fespace.hpp -index 0fd44b613..cd0f861a0 100644 ---- a/fem/fespace.hpp -+++ b/fem/fespace.hpp -@@ -383,6 +383,10 @@ protected: - int GetEntityDofs(int entity, int index, Array &dofs, - Geometry::Type master_geom = Geometry::INVALID, - int variant = 0) const; -+ /// Helper to get vertex, edge or face VDOFs (entity=0,1,2 resp.). -+ int GetEntityVDofs(int entity, int index, Array &dofs, -+ Geometry::Type master_geom = Geometry::INVALID, -+ int variant = 0) const; - - // Get degenerate face DOFs: see explanation in method implementation. - int GetDegenerateFaceDofs(int index, Array &dofs, -@@ -840,6 +844,7 @@ public: - /// @brief Returns the indices of the degrees of freedom for the specified - /// face, including the DOFs for the edges and the vertices of the face. - /// -+ /// - /// In variable order spaces, multiple variants of DOFs can be returned. - /// See GetEdgeDofs() for more details. - /// @return Order of the selected variant, or -1 if there are no more -diff --git a/fem/gridfunc.cpp b/fem/gridfunc.cpp -index 52620452f..310d8d704 100644 ---- a/fem/gridfunc.cpp -+++ b/fem/gridfunc.cpp -@@ -2125,8 +2125,8 @@ void GridFunction::AccumulateAndCountBdrValues( - Vector vals; - Mesh *mesh = fes->GetMesh(); - NCMesh *ncmesh = mesh->ncmesh; -- Array bdr_edges, bdr_vertices; -- ncmesh->GetBoundaryClosure(attr, bdr_vertices, bdr_edges); -+ Array bdr_edges, bdr_vertices, bdr_faces; -+ ncmesh->GetBoundaryClosure(attr, bdr_vertices, bdr_edges, bdr_faces); - - for (i = 0; i < bdr_edges.Size(); i++) - { -@@ -2232,8 +2232,8 @@ void GridFunction::AccumulateAndCountBdrTangentValues( - { - Mesh *mesh = fes->GetMesh(); - NCMesh *ncmesh = mesh->ncmesh; -- Array bdr_edges, bdr_vertices; -- ncmesh->GetBoundaryClosure(bdr_attr, bdr_vertices, bdr_edges); -+ Array bdr_edges, bdr_vertices, bdr_faces; -+ ncmesh->GetBoundaryClosure(bdr_attr, bdr_vertices, bdr_edges, bdr_faces); - - for (int i = 0; i < bdr_edges.Size(); i++) - { -diff --git a/fem/gridfunc.hpp b/fem/gridfunc.hpp -index 245d00078..50d7c1105 100644 ---- a/fem/gridfunc.hpp -+++ b/fem/gridfunc.hpp -@@ -586,6 +586,10 @@ public: - return ComputeLpError(infinity(), exsol, NULL, NULL, irs); - } - -+ virtual double ComputeL1Error(Coefficient *exsol[], -+ const IntegrationRule *irs[] = NULL) const -+ { return ComputeW11Error(*exsol, NULL, 1, NULL, irs); } -+ - virtual double ComputeL1Error(Coefficient &exsol, - const IntegrationRule *irs[] = NULL) const - { return ComputeLpError(1.0, exsol, NULL, irs); } -diff --git a/fem/pfespace.cpp b/fem/pfespace.cpp -index 8761e1489..3fa9fe5c7 100644 ---- a/fem/pfespace.cpp -+++ b/fem/pfespace.cpp -@@ -90,20 +90,19 @@ ParNURBSExtension *ParFiniteElementSpace::MakeLocalNURBSext( - void ParFiniteElementSpace::ParInit(ParMesh *pm) - { - pmesh = pm; -- pncmesh = NULL; -+ pncmesh = nullptr; - - MyComm = pmesh->GetComm(); - NRanks = pmesh->GetNRanks(); - MyRank = pmesh->GetMyRank(); - -- gcomm = NULL; -+ gcomm = nullptr; - -- P = NULL; -- Pconf = NULL; -+ P = nullptr; -+ Pconf = nullptr; - nonconf_P = false; -- 
Rconf = NULL; -- R = NULL; -- -+ Rconf = nullptr; -+ R = nullptr; - num_face_nbr_dofs = -1; - - if (NURBSext && !pNURBSext()) -@@ -519,7 +518,7 @@ void ParFiniteElementSpace::GetBdrElementDofs(int i, Array &dofs, - int ParFiniteElementSpace::GetFaceDofs(int i, Array &dofs, - int variant) const - { -- if (face_dof && variant == 0) -+ if (face_dof != nullptr && variant == 0) - { - face_dof->GetRow(i, dofs); - return fec->GetOrder(); -@@ -1039,18 +1038,28 @@ void ParFiniteElementSpace::GetEssentialTrueDofs(const Array - #ifdef MFEM_DEBUG - // Verify that in boolean arithmetic: P^T ess_dofs = R ess_dofs. - Array true_ess_dofs2(true_ess_dofs.Size()); -- HypreParMatrix *Pt = Dof_TrueDof_Matrix()->Transpose(); -+ auto Pt = std::unique_ptr(Dof_TrueDof_Matrix()->Transpose()); -+ - const int *ess_dofs_data = ess_dofs.HostRead(); - Pt->BooleanMult(1, ess_dofs_data, 0, true_ess_dofs2); -- delete Pt; - int counter = 0; - const int *ted = true_ess_dofs.HostRead(); -+ std::string error_msg = "failed dof: "; - for (int i = 0; i < true_ess_dofs.Size(); i++) - { -- if (bool(ted[i]) != bool(true_ess_dofs2[i])) { counter++; } -+ if (bool(ted[i]) != bool(true_ess_dofs2[i])) -+ { -+ error_msg += std::to_string(i) += "(R "; -+ error_msg += std::to_string(bool(ted[i])) += " P^T "; -+ error_msg += std::to_string(bool(true_ess_dofs2[i])) += ") "; -+ ++counter; -+ } - } -+ MFEM_ASSERT(R->Height() == P->Width(), "!"); -+ MFEM_ASSERT(R->Width() == P->Height(), "!"); -+ MFEM_ASSERT(R->Width() == ess_dofs.Size(), "!"); - MFEM_VERIFY(counter == 0, "internal MFEM error: counter = " << counter -- << ", rank = " << MyRank); -+ << ", rank = " << MyRank << ", " << error_msg); - #endif - - MarkerToList(true_ess_dofs, ess_tdof_list); -@@ -1945,8 +1954,7 @@ struct PMatrixRow - elems.reserve(elems.size() + other.elems.size()); - for (const PMatrixElement &oei : other.elems) - { -- elems.push_back( -- PMatrixElement(oei.column, oei.stride, coef * oei.value)); -+ elems.emplace_back(oei.column, oei.stride, coef * oei.value); - } - } - -@@ -2022,7 +2030,7 @@ public: - void AddRow(int entity, int index, int edof, GroupId group, - const PMatrixRow &row) - { -- rows.push_back(RowInfo(entity, index, edof, group, row)); -+ rows.emplace_back(entity, index, edof, group, row); - } - - const std::vector& GetRows() const { return rows; } -@@ -2038,8 +2046,8 @@ protected: - ParNCMesh *pncmesh; - const FiniteElementCollection* fec; - -- virtual void Encode(int rank); -- virtual void Decode(int); -+ void Encode(int rank) override; -+ void Decode(int) override; - }; - - void NeighborRowMessage::Encode(int rank) -@@ -2158,6 +2166,11 @@ void NeighborRowMessage::Decode(int rank) - int fo = pncmesh->GetFaceOrientation(id.index); - ind = fec->DofOrderForOrientation(geom, fo); - } -+ // P2 tri faces have dofs that must be processed in pairs, as the doftransformation -+ // is not diagonal. 
-+ const bool process_dof_pairs = (ent == 2 && -+ fec->GetContType() == FiniteElementCollection::TANGENTIAL -+ && !Geometry::IsTensorProduct(geom)); - - #ifdef MFEM_DEBUG_PMATRIX - mfem::out << "Rank " << pncmesh->MyRank << " receiving from " << rank -@@ -2177,7 +2190,7 @@ void NeighborRowMessage::Decode(int rank) - - // Create a row for this entity, recording the index of the mesh - // element -- rows.push_back(RowInfo(ent, id.index, edof, group_ids[gi++])); -+ rows.emplace_back(ent, id.index, edof, group_ids[gi++]); - rows.back().row.read(stream, s); - - #ifdef MFEM_DEBUG_PMATRIX -@@ -2187,8 +2200,7 @@ void NeighborRowMessage::Decode(int rank) - << std::endl; - #endif - -- if (ent == 2 && fec->GetContType() == FiniteElementCollection::TANGENTIAL -- && !Geometry::IsTensorProduct(geom)) -+ if (process_dof_pairs) - { - // ND face dofs need to be processed together, as the transformation - // is given by a 2x2 matrix, so we manually apply an extra increment -@@ -2209,8 +2221,10 @@ void NeighborRowMessage::Decode(int rank) - // there is no hidden copying that could result in a dangling - // reference. - auto &first_row = rows.back().row; -+ - // This is the first "fundamental unit" used in the transformation. - const auto initial_first_row = first_row; -+ - // Extract the next dof too, and apply any dof order transformation - // expected. - const MeshId &next_id = ids[++i]; -@@ -2226,15 +2240,34 @@ void NeighborRowMessage::Decode(int rank) - edof = -1 - edof; - s *= -1.0; - } -- rows.push_back(RowInfo(ent, next_id.index, edof, group_ids[gi++])); -+ -+ rows.emplace_back(ent, next_id.index, edof, group_ids[gi++]); - rows.back().row.read(stream, s); - auto &second_row = rows.back().row; - - // This is the second "fundamental unit" used in the transformation. - const auto initial_second_row = second_row; -+ -+ // Transform the received dofs by the primal transform. This is -+ // because within mfem as a face is visited its orientation is -+ // asigned to match the element that visited it first. Thus on -+ // processor boundaries, the transform will always be identity -+ // going into the element. However, the sending processor also -+ // thought the face orientation was zero, so it has sent the -+ // information in a different orientation. To map onto the local -+ // orientation definition, extract the orientation of the sending -+ // rank (the lower rank face defines the orientation fo), then -+ // apply the transform to the dependencies. The action of this -+ // transform on the dependencies is performed by adding scaled -+ // versions of the original two rows (which by the mfem assumption -+ // of face orientation, represent the identity transform). -+ MFEM_ASSERT(fo != 2 && fo != 4, -+ "This code branch is ambiguous for face orientations 2 and 4." -+ " Please report this mesh for further testing.\n"); - const double *T = - ND_DofTransformation::GetFaceTransform(fo).GetData(); - -+ // Remove the identity matrix from the transformation. 
- first_row.AddRow(initial_first_row, T[0] - 1.0); - first_row.AddRow(initial_second_row, T[2]); - second_row.AddRow(initial_first_row, T[1]); -@@ -2410,7 +2443,7 @@ int ParFiniteElementSpace - - if (master_dofs.Size() == 0) { continue; } - -- const FiniteElement* fe = fec->FiniteElementForGeometry(mf.Geom()); -+ const FiniteElement * const fe = fec->FiniteElementForGeometry(mf.Geom()); - if (fe == nullptr) { continue; } - - switch (mf.Geom()) -@@ -2439,7 +2472,6 @@ int ParFiniteElementSpace - } - } - } -- - deps.Finalize(); - } - -@@ -2568,15 +2600,15 @@ int ParFiniteElementSpace - - // big container for all messages we send (the list is for iterations) - std::list send_msg; -- send_msg.push_back(NeighborRowMessage::Map()); -+ send_msg.emplace_back(); - - // put identity in P and R for true DOFs, set ldof_ltdof - for (int dof = 0, tdof = 0; dof < ndofs; dof++) - { - if (finalized[dof]) - { -- pmatrix[dof].elems.push_back( -- PMatrixElement(my_tdof_offset + vdim_factor*tdof, tdof_stride, 1.)); -+ pmatrix[dof].elems.emplace_back(my_tdof_offset + vdim_factor*tdof, tdof_stride, -+ 1.); - - // prepare messages to neighbors with identity rows - if (dof_group[dof] != 0) -@@ -2620,7 +2652,7 @@ int ParFiniteElementSpace - // prepare a new round of send buffers - if (send_msg.back().size()) - { -- send_msg.push_back(NeighborRowMessage::Map()); -+ send_msg.emplace_back(); - } - - // check for incoming messages, receive PMatrixRows -diff --git a/fem/pfespace.hpp b/fem/pfespace.hpp -index 72029be56..e74f66622 100644 ---- a/fem/pfespace.hpp -+++ b/fem/pfespace.hpp -@@ -124,8 +124,8 @@ private: - void GetGhostVertexDofs(const MeshId &id, Array &dofs) const; - void GetGhostEdgeDofs(const MeshId &edge_id, Array &dofs) const; - void GetGhostFaceDofs(const MeshId &face_id, Array &dofs) const; -- - void GetGhostDofs(int entity, const MeshId &id, Array &dofs) const; -+ - /// Return the dofs associated with the interior of the given mesh entity. - void GetBareDofs(int entity, int index, Array &dofs) const; - -diff --git a/fem/pgridfunc.cpp b/fem/pgridfunc.cpp -index 38b57a3e1..773221029 100644 ---- a/fem/pgridfunc.cpp -+++ b/fem/pgridfunc.cpp -@@ -736,7 +736,8 @@ void ParGridFunction::ProjectBdrCoefficientTangent(VectorCoefficient &vcoeff, - { - MFEM_ASSERT(pfes->GetLocalTDofNumber(i) == -1 || - bool(values_counter[i]) == bool(ess_vdofs_marker[i]), -- "internal error"); -+ "internal error: " << pfes->GetLocalTDofNumber(i) << ' ' << bool( -+ values_counter[i])); - } - #endif - } -diff --git a/fem/pgridfunc.hpp b/fem/pgridfunc.hpp -index bc422a260..041dc1c98 100644 ---- a/fem/pgridfunc.hpp -+++ b/fem/pgridfunc.hpp -@@ -112,12 +112,12 @@ public: - - ParFiniteElementSpace *ParFESpace() const { return pfes; } - -- virtual void Update(); -+ void Update() override; - - /// Associate a new FiniteElementSpace with the ParGridFunction. - /** The ParGridFunction is resized using the SetSize() method. The new space - @a f is expected to be a ParFiniteElementSpace. */ -- virtual void SetSpace(FiniteElementSpace *f); -+ void SetSpace(FiniteElementSpace *f) override; - - /// Associate a new parallel space with the ParGridFunction. - void SetSpace(ParFiniteElementSpace *f); -@@ -130,7 +130,7 @@ public: - ParGridFunction and sets the pointer @a v as external data in the - ParGridFunction. The new space @a f is expected to be a - ParFiniteElementSpace. 
*/ -- virtual void MakeRef(FiniteElementSpace *f, double *v); -+ void MakeRef(FiniteElementSpace *f, double *v) override; - - /** @brief Make the ParGridFunction reference external data on a new - ParFiniteElementSpace. */ -@@ -147,7 +147,7 @@ public: - expected to be a ParFiniteElementSpace. - @note This version of the method will also perform bounds checks when - the build option MFEM_DEBUG is enabled. */ -- virtual void MakeRef(FiniteElementSpace *f, Vector &v, int v_offset); -+ void MakeRef(FiniteElementSpace *f, Vector &v, int v_offset) override; - - /** @brief Make the ParGridFunction reference external data on a new - ParFiniteElementSpace. */ -@@ -166,7 +166,7 @@ public: - void AddDistribute(double a, const Vector &tv) { AddDistribute(a, &tv); } - - /// Set the GridFunction from the given true-dof vector. -- virtual void SetFromTrueDofs(const Vector &tv) { Distribute(tv); } -+ void SetFromTrueDofs(const Vector &tv) override { Distribute(tv); } - - /// Short semantic for Distribute() - ParGridFunction &operator=(const HypreParVector &tv) -@@ -209,26 +209,26 @@ public: - const Vector &FaceNbrData() const { return face_nbr_data; } - - // Redefine to handle the case when i is a face-neighbor element -- virtual double GetValue(int i, const IntegrationPoint &ip, -- int vdim = 1) const; -+ double GetValue(int i, const IntegrationPoint &ip, -+ int vdim = 1) const override; - double GetValue(ElementTransformation &T) - { return GetValue(T, T.GetIntPoint()); } - - // Redefine to handle the case when T describes a face-neighbor element -- virtual double GetValue(ElementTransformation &T, const IntegrationPoint &ip, -- int comp = 0, Vector *tr = NULL) const; -+ double GetValue(ElementTransformation &T, const IntegrationPoint &ip, -+ int comp = 0, Vector *tr = NULL) const override; - -- virtual void GetVectorValue(int i, const IntegrationPoint &ip, -- Vector &val) const; -+ void GetVectorValue(int i, const IntegrationPoint &ip, -+ Vector &val) const override; - - // Redefine to handle the case when T describes a face-neighbor element -- virtual void GetVectorValue(ElementTransformation &T, -- const IntegrationPoint &ip, -- Vector &val, Vector *tr = NULL) const; -+ void GetVectorValue(ElementTransformation &T, -+ const IntegrationPoint &ip, -+ Vector &val, Vector *tr = NULL) const override; - - /** @brief For each vdof, counts how many elements contain the vdof, - as containment is determined by FiniteElementSpace::GetElementVDofs(). */ -- virtual void CountElementsPerVDof(Array &elem_per_vdof) const; -+ void CountElementsPerVDof(Array &elem_per_vdof) const override; - - /// Parallel version of GridFunction::GetDerivative(); see its documentation. - void GetDerivative(int comp, int der_comp, ParGridFunction &der); -@@ -237,112 +237,111 @@ public: - freedom of element @a el. If @a el is greater than or equal to the number - of local elements, it will be interpreted as a shifted index of a face - neighbor element. */ -- virtual void GetElementDofValues(int el, Vector &dof_vals) const; -+ void GetElementDofValues(int el, Vector &dof_vals) const override; - - using GridFunction::ProjectCoefficient; -- virtual void ProjectCoefficient(Coefficient &coeff); -+ void ProjectCoefficient(Coefficient &coeff) override; - - using GridFunction::ProjectDiscCoefficient; - /** @brief Project a discontinuous vector coefficient as a grid function on - a continuous finite element space. The values in shared dofs are - determined from the element with maximal attribute. 
*/ -- virtual void ProjectDiscCoefficient(VectorCoefficient &coeff); -+ void ProjectDiscCoefficient(VectorCoefficient &coeff) override; - -- virtual void ProjectDiscCoefficient(Coefficient &coeff, AvgType type); -+ void ProjectDiscCoefficient(Coefficient &coeff, AvgType type) override; - -- virtual void ProjectDiscCoefficient(VectorCoefficient &vcoeff, AvgType type); -+ void ProjectDiscCoefficient(VectorCoefficient &vcoeff, AvgType type) override; - - using GridFunction::ProjectBdrCoefficient; - - // Only the values in the master are guaranteed to be correct! -- virtual void ProjectBdrCoefficient(VectorCoefficient &vcoeff, -- Array &attr) -+ void ProjectBdrCoefficient(VectorCoefficient &vcoeff, -+ Array &attr) override - { ProjectBdrCoefficient(NULL, &vcoeff, attr); } - - // Only the values in the master are guaranteed to be correct! -- virtual void ProjectBdrCoefficient(Coefficient *coeff[], Array &attr) -+ void ProjectBdrCoefficient(Coefficient *coeff[], Array &attr) override - { ProjectBdrCoefficient(coeff, NULL, attr); } - - // Only the values in the master are guaranteed to be correct! -- virtual void ProjectBdrCoefficientTangent(VectorCoefficient &vcoeff, -- Array &bdr_attr); -+ void ProjectBdrCoefficientTangent(VectorCoefficient &vcoeff, -+ Array &bdr_attr) override; - -- virtual double ComputeL1Error(Coefficient *exsol[], -- const IntegrationRule *irs[] = NULL) const -+ double ComputeL1Error(Coefficient *exsol[], -+ const IntegrationRule *irs[] = NULL) const override - { - return GlobalLpNorm(1.0, GridFunction::ComputeW11Error( - *exsol, NULL, 1, NULL, irs), pfes->GetComm()); - } - -- virtual double ComputeL1Error(Coefficient &exsol, -- const IntegrationRule *irs[] = NULL) const -+ double ComputeL1Error(Coefficient &exsol, -+ const IntegrationRule *irs[] = NULL) const override - { return ComputeLpError(1.0, exsol, NULL, irs); } - -- virtual double ComputeL1Error(VectorCoefficient &exsol, -- const IntegrationRule *irs[] = NULL) const -+ double ComputeL1Error(VectorCoefficient &exsol, -+ const IntegrationRule *irs[] = NULL) const override - { return ComputeLpError(1.0, exsol, NULL, NULL, irs); } - -- virtual double ComputeL2Error(Coefficient *exsol[], -- const IntegrationRule *irs[] = NULL, -- const Array *elems = NULL) const -+ double ComputeL2Error(Coefficient *exsol[], -+ const IntegrationRule *irs[] = NULL, -+ const Array *elems = NULL) const override - { - return GlobalLpNorm(2.0, GridFunction::ComputeL2Error(exsol, irs, elems), - pfes->GetComm()); - } - -- virtual double ComputeL2Error(Coefficient &exsol, -- const IntegrationRule *irs[] = NULL, -- const Array *elems = NULL) const -+ double ComputeL2Error(Coefficient &exsol, -+ const IntegrationRule *irs[] = NULL, -+ const Array *elems = NULL) const override - { - return GlobalLpNorm(2.0, GridFunction::ComputeL2Error(exsol, irs, elems), - pfes->GetComm()); - } - - -- virtual double ComputeL2Error(VectorCoefficient &exsol, -- const IntegrationRule *irs[] = NULL, -- const Array *elems = NULL) const -+ double ComputeL2Error(VectorCoefficient &exsol, -+ const IntegrationRule *irs[] = NULL, -+ const Array *elems = NULL) const override - { - return GlobalLpNorm(2.0, GridFunction::ComputeL2Error(exsol, irs, elems), - pfes->GetComm()); - } - - /// Returns ||grad u_ex - grad u_h||_L2 for H1 or L2 elements -- virtual double ComputeGradError(VectorCoefficient *exgrad, -- const IntegrationRule *irs[] = NULL) const -+ double ComputeGradError(VectorCoefficient *exgrad, -+ const IntegrationRule *irs[] = NULL) const override - { - return 
GlobalLpNorm(2.0, GridFunction::ComputeGradError(exgrad,irs), - pfes->GetComm()); - } - - /// Returns ||curl u_ex - curl u_h||_L2 for ND elements -- virtual double ComputeCurlError(VectorCoefficient *excurl, -- const IntegrationRule *irs[] = NULL) const -+ double ComputeCurlError(VectorCoefficient *excurl, -+ const IntegrationRule *irs[] = NULL) const override - { - return GlobalLpNorm(2.0, GridFunction::ComputeCurlError(excurl,irs), - pfes->GetComm()); - } - - /// Returns ||div u_ex - div u_h||_L2 for RT elements -- virtual double ComputeDivError(Coefficient *exdiv, -- const IntegrationRule *irs[] = NULL) const -+ double ComputeDivError(Coefficient *exdiv, -+ const IntegrationRule *irs[] = NULL) const override - { - return GlobalLpNorm(2.0, GridFunction::ComputeDivError(exdiv,irs), - pfes->GetComm()); - } - - /// Returns the Face Jumps error for L2 elements -- virtual double ComputeDGFaceJumpError(Coefficient *exsol, -- Coefficient *ell_coeff, -- JumpScaling jump_scaling, -- const IntegrationRule *irs[]=NULL) -- const; -+ double ComputeDGFaceJumpError(Coefficient *exsol, -+ Coefficient *ell_coeff, -+ JumpScaling jump_scaling, -+ const IntegrationRule *irs[]=NULL) const override; - - /// Returns either the H1-seminorm or the DG Face Jumps error or both - /// depending on norm_type = 1, 2, 3 -- virtual double ComputeH1Error(Coefficient *exsol, VectorCoefficient *exgrad, -- Coefficient *ell_coef, double Nu, -- int norm_type) const -+ double ComputeH1Error(Coefficient *exsol, VectorCoefficient *exgrad, -+ Coefficient *ell_coef, double Nu, -+ int norm_type) const override - { - return GlobalLpNorm(2.0, - GridFunction::ComputeH1Error(exsol,exgrad,ell_coef, -@@ -352,56 +351,56 @@ public: - - /// Returns the error measured in H1-norm for H1 elements or in "broken" - /// H1-norm for L2 elements -- virtual double ComputeH1Error(Coefficient *exsol, VectorCoefficient *exgrad, -- const IntegrationRule *irs[] = NULL) const -+ double ComputeH1Error(Coefficient *exsol, VectorCoefficient *exgrad, -+ const IntegrationRule *irs[] = NULL) const override - { - return GlobalLpNorm(2.0, GridFunction::ComputeH1Error(exsol,exgrad,irs), - pfes->GetComm()); - } - - /// Returns the error measured H(div)-norm for RT elements -- virtual double ComputeHDivError(VectorCoefficient *exsol, -- Coefficient *exdiv, -- const IntegrationRule *irs[] = NULL) const -+ double ComputeHDivError(VectorCoefficient *exsol, -+ Coefficient *exdiv, -+ const IntegrationRule *irs[] = NULL) const override - { - return GlobalLpNorm(2.0, GridFunction::ComputeHDivError(exsol,exdiv,irs), - pfes->GetComm()); - } - - /// Returns the error measured H(curl)-norm for ND elements -- virtual double ComputeHCurlError(VectorCoefficient *exsol, -- VectorCoefficient *excurl, -- const IntegrationRule *irs[] = NULL) const -+ double ComputeHCurlError(VectorCoefficient *exsol, -+ VectorCoefficient *excurl, -+ const IntegrationRule *irs[] = NULL) const override - { - return GlobalLpNorm(2.0, - GridFunction::ComputeHCurlError(exsol,excurl,irs), - pfes->GetComm()); - } - -- virtual double ComputeMaxError(Coefficient *exsol[], -- const IntegrationRule *irs[] = NULL) const -+ double ComputeMaxError(Coefficient *exsol[], -+ const IntegrationRule *irs[] = NULL) const override - { - return GlobalLpNorm(infinity(), - GridFunction::ComputeMaxError(exsol, irs), - pfes->GetComm()); - } - -- virtual double ComputeMaxError(Coefficient &exsol, -- const IntegrationRule *irs[] = NULL) const -+ double ComputeMaxError(Coefficient &exsol, -+ const IntegrationRule *irs[] = NULL) 
const override - { - return ComputeLpError(infinity(), exsol, NULL, irs); - } - -- virtual double ComputeMaxError(VectorCoefficient &exsol, -- const IntegrationRule *irs[] = NULL) const -+ double ComputeMaxError(VectorCoefficient &exsol, -+ const IntegrationRule *irs[] = NULL) const override - { - return ComputeLpError(infinity(), exsol, NULL, NULL, irs); - } - -- virtual double ComputeLpError(const double p, Coefficient &exsol, -- Coefficient *weight = NULL, -- const IntegrationRule *irs[] = NULL, -- const Array *elems = NULL) const -+ double ComputeLpError(const double p, Coefficient &exsol, -+ Coefficient *weight = NULL, -+ const IntegrationRule *irs[] = NULL, -+ const Array *elems = NULL) const override - { - return GlobalLpNorm(p, GridFunction::ComputeLpError(p, exsol, weight, irs, - elems), pfes->GetComm()); -@@ -410,23 +409,23 @@ public: - /** When given a vector weight, compute the pointwise (scalar) error as the - dot product of the vector error with the vector weight. Otherwise, the - scalar error is the l_2 norm of the vector error. */ -- virtual double ComputeLpError(const double p, VectorCoefficient &exsol, -- Coefficient *weight = NULL, -- VectorCoefficient *v_weight = NULL, -- const IntegrationRule *irs[] = NULL) const -+ double ComputeLpError(const double p, VectorCoefficient &exsol, -+ Coefficient *weight = NULL, -+ VectorCoefficient *v_weight = NULL, -+ const IntegrationRule *irs[] = NULL) const override - { - return GlobalLpNorm(p, GridFunction::ComputeLpError( - p, exsol, weight, v_weight, irs), pfes->GetComm()); - } - -- virtual void ComputeFlux(BilinearFormIntegrator &blfi, -- GridFunction &flux, -- bool wcoef = true, int subdomain = -1); -+ void ComputeFlux(BilinearFormIntegrator &blfi, -+ GridFunction &flux, -+ bool wcoef = true, int subdomain = -1) override; - - /** Save the local portion of the ParGridFunction. This differs from the - serial GridFunction::Save in that it takes into account the signs of - the local dofs. */ -- virtual void Save(std::ostream &out) const; -+ void Save(std::ostream &out) const override; - - /// Save the ParGridFunction to a single file (written using MPI rank 0). The - /// given @a precision will be used for ASCII output. -@@ -435,7 +434,7 @@ public: - /// Save the ParGridFunction to files (one for each MPI rank). The files will - /// be given suffixes according to the MPI rank. The given @a precision will - /// be used for ASCII output. -- virtual void Save(const char *fname, int precision=16) const; -+ void Save(const char *fname, int precision=16) const override; - - /// Returns a GridFunction on MPI rank @a save_rank that does not have any - /// duplication of vertices/nodes at processor boundaries. -@@ -452,15 +451,16 @@ public: - /** Save the local portion of the ParGridFunction. This differs from the - serial GridFunction::Save in that it takes into account the signs of - the local dofs. 
*/ -- virtual void Save( -+ void Save( - adios2stream &out, const std::string &variable_name, -- const adios2stream::data_type type = adios2stream::data_type::point_data) const; -+ const adios2stream::data_type type = adios2stream::data_type::point_data) const -+ override; - #endif - - /// Merge the local grid functions - void SaveAsOne(std::ostream &out = mfem::out) const; - -- virtual ~ParGridFunction() { } -+ virtual ~ParGridFunction() = default; - }; - - -diff --git a/general/communication.hpp b/general/communication.hpp -index 46d4f9f21..a8fb8a1f6 100644 ---- a/general/communication.hpp -+++ b/general/communication.hpp -@@ -561,8 +561,8 @@ struct VarMessage - } - - protected: -- virtual void Encode(int rank) {} -- virtual void Decode(int rank) {} -+ virtual void Encode(int rank) = 0; -+ virtual void Decode(int rank) = 0; - }; - - -diff --git a/general/hash.hpp b/general/hash.hpp -index 288d51288..b517172aa 100644 ---- a/general/hash.hpp -+++ b/general/hash.hpp -@@ -335,6 +335,8 @@ public: - - iterator begin() { return iterator(Base::begin()); } - iterator end() { return iterator(); } -+ const_iterator begin() const { return const_iterator(Base::cbegin()); } -+ const_iterator end() const { return const_iterator(); } - - const_iterator cbegin() const { return const_iterator(Base::cbegin()); } - const_iterator cend() const { return const_iterator(); } -diff --git a/mesh/element.hpp b/mesh/element.hpp -index f1b003cae..ccd72724a 100644 ---- a/mesh/element.hpp -+++ b/mesh/element.hpp -@@ -57,12 +57,15 @@ public: - /// Set element's attribute. - inline void SetAttribute(const int attr) { attribute = attr; } - -- /// Set the indices the element according to the input. -- virtual void SetVertices(const int *ind); -- -- /// Returns element's vertices. -+ /// Get the indices defining the vertices - virtual void GetVertices(Array &v) const = 0; - -+ /// Set the indices defining the vertices -+ virtual void SetVertices(const Array &v) = 0; -+ -+ /// Set the indices the element according to the input. -+ virtual void SetVertices(const int *ind) = 0; -+ - /// @note The returned array should NOT be deleted by the caller. - virtual int *GetVertices() = 0; - -diff --git a/mesh/hexahedron.cpp b/mesh/hexahedron.cpp -index beeab3b6a..e86e209c1 100644 ---- a/mesh/hexahedron.cpp -+++ b/mesh/hexahedron.cpp -@@ -43,10 +43,18 @@ Hexahedron::Hexahedron(int ind1, int ind2, int ind3, int ind4, - void Hexahedron::GetVertices(Array &v) const - { - v.SetSize(8); -- for (int i = 0; i < 8; i++) -- { -- v[i] = indices[i]; -- } -+ std::copy(indices, indices + 8, v.begin()); -+} -+ -+void Hexahedron::SetVertices(const Array &v) -+{ -+ MFEM_ASSERT(v.Size() == 8, "!"); -+ std::copy(v.begin(), v.end(), indices); -+} -+ -+void Hexahedron::SetVertices(const int *ind) -+{ -+ std::copy(ind, ind + 8, indices); - } - - TriLinear3DFiniteElement HexahedronFE; -diff --git a/mesh/hexahedron.hpp b/mesh/hexahedron.hpp -index a8186c0c8..450cac0ce 100644 ---- a/mesh/hexahedron.hpp -+++ b/mesh/hexahedron.hpp -@@ -37,35 +37,42 @@ public: - int ind5, int ind6, int ind7, int ind8, int attr = 1); - - /// Return element's type -- Type GetType() const { return Element::HEXAHEDRON; } -+ Type GetType() const override { return Element::HEXAHEDRON; } - -- /// Returns the indices of the element's vertices. 
-- virtual void GetVertices(Array &v) const; -+ /// Get the indices defining the vertices -+ void GetVertices(Array &v) const override; - -- virtual int *GetVertices() { return indices; } -+ /// Set the indices defining the vertices -+ void SetVertices(const Array &v) override; - -- virtual int GetNVertices() const { return 8; } -+ /// @note The returned array should NOT be deleted by the caller. -+ int * GetVertices () override { return indices; } - -- virtual int GetNEdges() const { return 12; } -+ /// Set the vertices according to the given input. -+ void SetVertices(const int *ind) override; - -- virtual const int *GetEdgeVertices(int ei) const -+ int GetNVertices() const override { return 8; } -+ -+ int GetNEdges() const override { return 12; } -+ -+ const int *GetEdgeVertices(int ei) const override - { return geom_t::Edges[ei]; } - - /// @deprecated Use GetNFaces(void) and GetNFaceVertices(int) instead. -- MFEM_DEPRECATED virtual int GetNFaces(int &nFaceVertices) const -+ MFEM_DEPRECATED int GetNFaces(int &nFaceVertices) const override - { nFaceVertices = 4; return 6; } - -- virtual int GetNFaces() const { return 6; } -+ int GetNFaces() const override { return 6; } - -- virtual int GetNFaceVertices(int) const { return 4; } -+ int GetNFaceVertices(int) const override { return 4; } - -- virtual const int *GetFaceVertices(int fi) const -+ const int *GetFaceVertices(int fi) const override - { return geom_t::FaceVert[fi]; } - -- virtual Element *Duplicate(Mesh *m) const -+ Element *Duplicate(Mesh *m) const override - { return new Hexahedron(indices, attribute); } - -- virtual ~Hexahedron() { } -+ virtual ~Hexahedron() = default; - }; - - extern MFEM_EXPORT class TriLinear3DFiniteElement HexahedronFE; -diff --git a/mesh/mesh.cpp b/mesh/mesh.cpp -index 5f82de812..bf2fce576 100644 ---- a/mesh/mesh.cpp -+++ b/mesh/mesh.cpp -@@ -1415,6 +1415,7 @@ Geometry::Type Mesh::GetFaceGeometry(int Face) const - } - // ghost face - const int nc_face_id = faces_info[Face].NCFace; -+ - MFEM_ASSERT(nc_face_id >= 0, "parent ghost faces are not supported"); - return faces[nc_faces_info[nc_face_id].MasterFace]->GetGeometryType(); - } -@@ -1889,9 +1890,9 @@ int Mesh::AddBdrPoint(int v, int attr) - - void Mesh::GenerateBoundaryElements() - { -- for (int i = 0; i < boundary.Size(); i++) -+ for (auto &b : boundary) - { -- FreeElement(boundary[i]); -+ FreeElement(b); - } - - if (Dim == 3) -@@ -1902,9 +1903,9 @@ void Mesh::GenerateBoundaryElements() - - // count the 'NumOfBdrElements' - NumOfBdrElements = 0; -- for (int i = 0; i < faces_info.Size(); i++) -+ for (const auto &fi : faces_info) - { -- if (faces_info[i].Elem2No < 0) { NumOfBdrElements++; } -+ if (fi.Elem2No < 0) { ++NumOfBdrElements; } - } - - // Add the boundary elements -@@ -4403,7 +4404,7 @@ Mesh::Mesh(Mesh *orig_mesh, int ref_factor, int ref_type) - MakeRefined_(*orig_mesh, ref_factors, ref_type); - } - --void Mesh::MakeRefined_(Mesh &orig_mesh, const Array ref_factors, -+void Mesh::MakeRefined_(Mesh &orig_mesh, const Array &ref_factors, - int ref_type) - { - SetEmpty(); -@@ -6189,22 +6190,22 @@ int Mesh::CheckBdrElementOrientation(bool fix_it) - { - // swap vertices 0 and 1 so that we don't change the marked edge: - // (0,1,2) -> (1,0,2) -- mfem::Swap(bv[0], bv[1]); -+ mfem::Swap(bv[0], bv[1]); - if (bel_to_edge) - { - int *be = bel_to_edge->GetRow(i); -- mfem::Swap(be[1], be[2]); -+ mfem::Swap(be[1], be[2]); - } - break; - } - case Element::QUADRILATERAL: - { -- mfem::Swap(bv[0], bv[2]); -+ mfem::Swap(bv[0], bv[2]); - if (bel_to_edge) - { - int *be 
= bel_to_edge->GetRow(i); -- mfem::Swap(be[0], be[1]); -- mfem::Swap(be[2], be[3]); -+ mfem::Swap(be[0], be[1]); -+ mfem::Swap(be[2], be[3]); - } - break; - } -@@ -6997,26 +6998,27 @@ void Mesh::AddQuadFaceElement(int lf, int gf, int el, - - void Mesh::GenerateFaces() - { -- int i, nfaces = GetNumFaces(); -+ int nfaces = GetNumFaces(); - -- for (i = 0; i < faces.Size(); i++) -+ for (auto &f : faces) - { -- FreeElement(faces[i]); -+ FreeElement(f); - } - - // (re)generate the interior faces and the info for them - faces.SetSize(nfaces); - faces_info.SetSize(nfaces); -- for (i = 0; i < nfaces; i++) -+ for (int i = 0; i < nfaces; ++i) - { - faces[i] = NULL; - faces_info[i].Elem1No = -1; - faces_info[i].NCFace = -1; - } -- for (i = 0; i < NumOfElements; i++) -+ -+ Array v; -+ for (int i = 0; i < NumOfElements; ++i) - { -- const int *v = elements[i]->GetVertices(); -- const int *ef; -+ elements[i]->GetVertices(v); - if (Dim == 1) - { - AddPointFaceElement(0, v[0], i); -@@ -7024,7 +7026,7 @@ void Mesh::GenerateFaces() - } - else if (Dim == 2) - { -- ef = el_to_edge->GetRow(i); -+ const int * const ef = el_to_edge->GetRow(i); - const int ne = elements[i]->GetNEdges(); - for (int j = 0; j < ne; j++) - { -@@ -7034,7 +7036,7 @@ void Mesh::GenerateFaces() - } - else - { -- ef = el_to_face->GetRow(i); -+ const int * const ef = el_to_face->GetRow(i); - switch (GetElementType(i)) - { - case Element::TETRAHEDRON: -@@ -7100,9 +7102,9 @@ void Mesh::GenerateNCFaceInfo() - { - MFEM_VERIFY(ncmesh, "missing NCMesh."); - -- for (int i = 0; i < faces_info.Size(); i++) -+ for (auto &x : faces_info) - { -- faces_info[i].NCFace = -1; -+ x.NCFace = -1; - } - - const NCMesh::NCList &list = -@@ -7114,9 +7116,8 @@ void Mesh::GenerateNCFaceInfo() - int nfaces = GetNumFaces(); - - // add records for master faces -- for (int i = 0; i < list.masters.Size(); i++) -+ for (const NCMesh::Master &master : list.masters) - { -- const NCMesh::Master &master = list.masters[i]; - if (master.index >= nfaces) { continue; } - - FaceInfo &master_fi = faces_info[master.index]; -@@ -7128,10 +7129,8 @@ void Mesh::GenerateNCFaceInfo() - } - - // add records for slave faces -- for (int i = 0; i < list.slaves.Size(); i++) -+ for (const NCMesh::Slave &slave : list.slaves) - { -- const NCMesh::Slave &slave = list.slaves[i]; -- - if (slave.index < 0 || // degenerate slave face - slave.index >= nfaces || // ghost slave - slave.master >= nfaces) // has ghost master -@@ -7222,7 +7221,7 @@ STable3D *Mesh::GetFacesTable() - - STable3D *Mesh::GetElementToFaceTable(int ret_ftbl) - { -- int i, *v; -+ Array v; - STable3D *faces_tbl; - - if (el_to_face != NULL) -@@ -7231,9 +7230,9 @@ STable3D *Mesh::GetElementToFaceTable(int ret_ftbl) - } - el_to_face = new Table(NumOfElements, 6); // must be 6 for hexahedra - faces_tbl = new STable3D(NumOfVertices); -- for (i = 0; i < NumOfElements; i++) -+ for (int i = 0; i < NumOfElements; i++) - { -- v = elements[i]->GetVertices(); -+ elements[i]->GetVertices(v); - switch (GetElementType(i)) - { - case Element::TETRAHEDRON: -@@ -7297,9 +7296,10 @@ STable3D *Mesh::GetElementToFaceTable(int ret_ftbl) - el_to_face->Finalize(); - NumOfFaces = faces_tbl->NumberOfElements(); - be_to_face.SetSize(NumOfBdrElements); -- for (i = 0; i < NumOfBdrElements; i++) -+ -+ for (int i = 0; i < NumOfBdrElements; i++) - { -- v = boundary[i]->GetVertices(); -+ boundary[i]->GetVertices(v); - switch (GetBdrElementType(i)) - { - case Element::TRIANGLE: -diff --git a/mesh/mesh.hpp b/mesh/mesh.hpp -index a6957720a..81324399b 100644 ---- 
a/mesh/mesh.hpp -+++ b/mesh/mesh.hpp -@@ -592,7 +592,7 @@ protected: - void Make1D(int n, double sx = 1.0); - - /// Internal function used in Mesh::MakeRefined -- void MakeRefined_(Mesh &orig_mesh, const Array ref_factors, -+ void MakeRefined_(Mesh &orig_mesh, const Array &ref_factors, - int ref_type); - - /// Initialize vertices/elements/boundary/tables from a nonconforming mesh. -@@ -1698,7 +1698,7 @@ public: - }; - - /** @brief This structure is used as a human readable output format that -- decipheres the information contained in Mesh::FaceInfo when using the -+ deciphers the information contained in Mesh::FaceInfo when using the - Mesh::GetFaceInformation() method. - - The element indices in this structure don't need further processing, -diff --git a/mesh/ncmesh.cpp b/mesh/ncmesh.cpp -index ecb5fb90b..5c57adc4a 100644 ---- a/mesh/ncmesh.cpp -+++ b/mesh/ncmesh.cpp -@@ -19,6 +19,36 @@ - - #include "ncmesh_tables.hpp" - -+ -+namespace -+{ -+/** -+ * @brief Base case of convenience variadic max function. -+ * -+ * @tparam T Base type -+ * @param arg Recursion base value -+ * @return T value to max over -+ */ -+template -+T max(T&& arg) -+{ -+ return arg; -+} -+/** -+ * @brief Convenience variadic max function. -+ * -+ * @tparam T Base Type -+ * @tparam Ts Parameter pack of other types -+ * @param arg Singular argument -+ * @param args Pack of arguments -+ * @return T maximum value -+ */ -+template -+T max(T arg, Ts... args) -+{ -+ return std::max(std::forward(arg), max(args...)); -+} -+} // namespace - namespace mfem - { - -@@ -2449,18 +2479,21 @@ void NCMesh::GetMeshComponents(Mesh &mesh) const - // left uninitialized here; they will be initialized later by the Mesh from - // Nodes -- here we just make sure mesh.vertices has the correct size. - -- for (int i = 0; i < mesh.NumOfElements; i++) -+ for (auto &elem : mesh.elements) - { -- mesh.FreeElement(mesh.elements[i]); -+ mesh.FreeElement(elem); - } - mesh.elements.SetSize(0); - -- for (int i = 0; i < mesh.NumOfBdrElements; i++) -+ for (auto &elem : mesh.boundary) - { -- mesh.FreeElement(mesh.boundary[i]); -+ mesh.FreeElement(elem); - } - mesh.boundary.SetSize(0); - -+ // Save off boundary face vertices to make boundary elements later. -+ std::map> unique_boundary_faces; -+ - // create an mfem::Element for each leaf Element - for (int i = 0; i < NElements; i++) - { -@@ -2478,65 +2511,83 @@ void NCMesh::GetMeshComponents(Mesh &mesh) const - elem->GetVertices()[j] = nodes[node[j]].vert_index; - } - -- // create boundary elements -- // TODO: use boundary_faces? 
-- for (int k = 0; k < gi.nf; k++) -+ // Loop over faces and collect those marked as boundaries -+ for (int k = 0; k < gi.nf; ++k) - { -- const int* fv = gi.faces[k]; - const int nfv = gi.nfv[k]; -- const Face* face = faces.Find(node[fv[0]], node[fv[1]], -- node[fv[2]], node[fv[3]]); -- if (face->Boundary()) -+ const int * const fv = gi.faces[k]; -+ const auto id = faces.FindId(node[fv[0]], node[fv[1]], node[fv[2]], -+ node[fv[3]]); -+ if (id >= 0 && faces[id].Boundary()) - { -- if ((nc_elem.geom == Geometry::CUBE) || -- ((nc_elem.geom == Geometry::PRISM || -- nc_elem.geom == Geometry::PYRAMID) && nfv == 4)) -- { -- auto* quad = (Quadrilateral*) mesh.NewElement(Geometry::SQUARE); -- quad->SetAttribute(face->attribute); -- for (int j = 0; j < 4; j++) -- { -- quad->GetVertices()[j] = nodes[node[fv[j]]].vert_index; -- } -- mesh.boundary.Append(quad); -- } -- else if (nc_elem.geom == Geometry::PRISM || -- nc_elem.geom == Geometry::PYRAMID || -- nc_elem.geom == Geometry::TETRAHEDRON) -+ const auto &face = faces[id]; -+ if (face.elem[0] >= 0 && face.elem[1] >= 0 && -+ nc_elem.rank != std::min(elements[face.elem[0]].rank, -+ elements[face.elem[1]].rank)) - { -- MFEM_ASSERT(nfv == 3, ""); -- auto* tri = (Triangle*) mesh.NewElement(Geometry::TRIANGLE); -- tri->SetAttribute(face->attribute); -- for (int j = 0; j < 3; j++) -- { -- tri->GetVertices()[j] = nodes[node[fv[j]]].vert_index; -- } -- mesh.boundary.Append(tri); -+ // This is a conformal internal face, but this element is not the lowest -+ // ranking attached processor, thus not the owner of the face. -+ // Consequently, we do not add this face to avoid double -+ // counting. -+ continue; - } -- else if (nc_elem.geom == Geometry::SQUARE || -- nc_elem.geom == Geometry::TRIANGLE) -+ -+ // Add in all boundary faces that are actual boundaries or not masters of another face. -+ // The fv[2] in the edge split is on purpose. -+ if ((nfv == 4 && -+ QuadFaceNotMaster(node[fv[0]], node[fv[1]], node[fv[2]], node[fv[3]])) -+ || (nfv == 3 && TriFaceNotMaster(node[fv[0]], node[fv[1]], node[fv[2]])) -+ || (nfv == 2 && -+ EdgeSplitLevel(node[fv[0]], node[fv[2]] /* [2] not an error */) == 0)) - { -- auto* segment = (Segment*) mesh.NewElement(Geometry::SEGMENT); -- segment->SetAttribute(face->attribute); -- for (int j = 0; j < 2; j++) -+ // This face has no split faces below, it is conformal or a -+ // slave. -+ unique_boundary_faces[id].SetSize(nfv); -+ for (int v = 0; v < nfv; ++v) - { -- segment->GetVertices()[j] = nodes[node[fv[2*j]]].vert_index; -+ // Using a map overwrites if a face is visited twice. -+ // The nfv==2 is necessary because faces of 2D are storing the -+ // second index in the 2 slot, not the 1 slot. -+ unique_boundary_faces[id][v] = nodes[node[fv[(nfv==2) ? 
2*v : v]]].vert_index; - } -- mesh.boundary.Append(segment); -- } -- else -- { -- MFEM_ASSERT(nc_elem.geom == Geometry::SEGMENT, ""); -- auto* point = (mfem::Point*) mesh.NewElement(Geometry::POINT); -- point->SetAttribute(face->attribute); -- point->GetVertices()[0] = nodes[node[fv[0]]].vert_index; -- mesh.boundary.Append(point); - } - } - } - } -+ -+ auto geom_from_nfv = [](int nfv) -+ { -+ switch (nfv) -+ { -+ case 1: return Geometry::POINT; -+ case 2: return Geometry::SEGMENT; -+ case 3: return Geometry::TRIANGLE; -+ case 4: return Geometry::SQUARE; -+ } -+ return Geometry::INVALID; -+ }; -+ -+ for (const auto &fv : unique_boundary_faces) -+ { -+ const auto f = fv.first; -+ const auto &v = fv.second; -+ const auto &face = faces.At(f); -+ -+ auto geom = geom_from_nfv(v.Size()); -+ -+ MFEM_ASSERT(geom != Geometry::INVALID, -+ "nfv: " << v.Size() << -+ " does not match a valid face geometry: Quad, Tri, Segment, Point"); -+ -+ // Add a new boundary element, with matching attribute and vertices -+ mesh.boundary.Append(mesh.NewElement(geom)); -+ auto * const be = mesh.boundary.Last(); -+ be->SetAttribute(face.attribute); -+ be->SetVertices(v); -+ } - } - -+ - void NCMesh::OnMeshUpdated(Mesh *mesh) - { - //// PART 1: pull indices of regular edges/faces from the Mesh -@@ -2651,13 +2702,14 @@ void NCMesh::OnMeshUpdated(Mesh *mesh) - for (int j = 0; j < gi.nf; j++) - { - const int *fv = gi.faces[j]; -- Face* face = faces.Find(el.node[fv[0]], el.node[fv[1]], -- el.node[fv[2]], el.node[fv[3]]); -- MFEM_ASSERT(face, "face not found!"); -+ int fid = faces.FindId(el.node[fv[0]], el.node[fv[1]], -+ el.node[fv[2]], el.node[fv[3]]); -+ MFEM_ASSERT(fid >= 0, "face not found!"); -+ auto &face = faces[fid]; - -- if (face->index < 0) -+ if (face.index < 0) - { -- face->index = NFaces + (nghosts++); -+ face.index = NFaces + (nghosts++); - - // store the face geometry - static const Geometry::Type types[5] = -@@ -2665,7 +2717,7 @@ void NCMesh::OnMeshUpdated(Mesh *mesh) - Geometry::INVALID, Geometry::INVALID, - Geometry::SEGMENT, Geometry::TRIANGLE, Geometry::SQUARE - }; -- face_geom[face->index] = types[gi.nfv[j]]; -+ face_geom[face.index] = types[gi.nfv[j]]; - } - } - } -@@ -2741,7 +2793,7 @@ bool NCMesh::TriFaceSplit(int v1, int v2, int v3, int mid[3]) const - - if (mid) { mid[0] = e1, mid[1] = e2, mid[2] = e3; } - -- // NOTE: face (v1, v2, v3) still needs to be checked -+ // This is necessary but not sufficient to determine if a face has been split. - return true; - } - -@@ -3157,6 +3209,7 @@ void NCMesh::BuildFaceList() - int fgeom = (node[3] >= 0) ? Geometry::SQUARE : Geometry::TRIANGLE; - - Face &fa = faces[face]; -+ bool is_master = false; - if (fa.elem[0] >= 0 && fa.elem[1] >= 0) - { - // this is a conforming face, add it to the list -@@ -3183,6 +3236,7 @@ void NCMesh::BuildFaceList() - if (sb < se) - { - // found slaves, so this is a master face; add it to the list -+ is_master = true; - face_list.masters.Append( - Master(fa.index, elem, j, fgeom, sb, se)); - -@@ -3194,7 +3248,8 @@ void NCMesh::BuildFaceList() - } - } - -- if (fa.Boundary()) { boundary_faces.Append(face); } -+ // To support internal boundaries can only insert non-master faces. 
-+ if (fa.Boundary() && !is_master) { boundary_faces.Append(face); } - } - } - -@@ -3270,20 +3325,22 @@ void NCMesh::BuildEdgeList() - // tell ParNCMesh about the edge - ElementSharesEdge(elem, j, enode); - -- // (2D only, store boundary faces) -- if (Dim <= 2) -- { -- int face = faces.FindId(node[0], node[0], node[1], node[1]); -- MFEM_ASSERT(face >= 0, "face not found!"); -- if (faces[face].Boundary()) { boundary_faces.Append(face); } -- } -- - // store element/local for later - edge_element[nd.edge_index] = elem; - edge_local[nd.edge_index] = j; - - // skip slave edges here, they will be reached from their masters -- if (GetEdgeMaster(enode) >= 0) { continue; } -+ if (GetEdgeMaster(enode) >= 0) -+ { -+ // (2D only, store internal boundary faces) -+ if (Dim <= 2) -+ { -+ int face = faces.FindId(node[0], node[0], node[1], node[1]); -+ MFEM_ASSERT(face >= 0, "face not found!"); -+ if (faces[face].Boundary()) { boundary_faces.Append(face); } -+ } -+ continue; -+ } - - // have we already processed this edge? skip if yes - if (processed_edges[enode]) { continue; } -@@ -3316,6 +3373,13 @@ void NCMesh::BuildEdgeList() - { - // no slaves, this is a conforming edge - edge_list.conforming.Append(MeshId(nd.edge_index, elem, j)); -+ // (2D only, store boundary faces) -+ if (Dim <= 2) -+ { -+ int face = faces.FindId(node[0], node[0], node[1], node[1]); -+ MFEM_ASSERT(face >= 0, "face not found!"); -+ if (faces[face].Boundary()) { boundary_faces.Append(face); } -+ } - } - } - } -@@ -3477,7 +3541,6 @@ NCMesh::NCList::BuildIndex() const - inv_index.emplace(slaves[i].index, std::make_pair(MeshIdType::SLAVE, i)); - } - } -- - MFEM_ASSERT(inv_index.size() > 0, - "Empty inverse index, member lists must be populated before BuildIndex is called!"); - } -@@ -5195,22 +5258,23 @@ void NCMesh::FindFaceNodes(int face, int node[4]) - } - - void NCMesh::GetBoundaryClosure(const Array &bdr_attr_is_ess, -- Array &bdr_vertices, Array &bdr_edges) -+ Array &bdr_vertices, Array &bdr_edges, -+ Array &bdr_faces) - { - bdr_vertices.SetSize(0); - bdr_edges.SetSize(0); -+ bdr_faces.SetSize(0); - - if (Dim == 3) - { - GetFaceList(); // make sure 'boundary_faces' is up to date - -- for (int i = 0; i < boundary_faces.Size(); i++) -+ for (int f : boundary_faces) - { -- int face = boundary_faces[i]; -- if (bdr_attr_is_ess[faces[face].attribute - 1]) -+ if (bdr_attr_is_ess[faces[f].attribute - 1]) - { - int node[4]; -- FindFaceNodes(face, node); -+ FindFaceNodes(f, node); - int nfv = (node[3] < 0) ? 
3 : 4; - - for (int j = 0; j < nfv; j++) -@@ -5228,6 +5292,17 @@ void NCMesh::GetBoundaryClosure(const Array &bdr_attr_is_ess, - bdr_edges.Append(nodes[enode].edge_index); - } - } -+ -+ // If the face is a slave face, collect any non-ghost master face -+ const Face &face = faces[f]; -+ -+ const auto id_and_type = GetFaceList().GetMeshIdAndType(face.index); -+ if (id_and_type.type == NCList::MeshIdType::SLAVE) -+ { -+ // A slave face must mark any masters -+ const auto &slave_face_id = static_cast(*id_and_type.id); -+ bdr_faces.Append(slave_face_id.master); -+ } - } - } - } -@@ -5235,36 +5310,38 @@ void NCMesh::GetBoundaryClosure(const Array &bdr_attr_is_ess, - { - GetEdgeList(); // make sure 'boundary_faces' is up to date - -- for (int i = 0; i < boundary_faces.Size(); i++) -+ for (int f : boundary_faces) - { -- int face = boundary_faces[i]; -- Face &fc = faces[face]; -- if (bdr_attr_is_ess[fc.attribute - 1]) -+ Face &face = faces[f]; -+ if (bdr_attr_is_ess[face.attribute - 1]) -+ { -+ bdr_vertices.Append(nodes[face.p1].vert_index); -+ bdr_vertices.Append(nodes[face.p3].vert_index); -+ } -+ -+ const auto id_and_type = GetEdgeList().GetMeshIdAndType(face.index); -+ if (id_and_type.type == NCList::MeshIdType::SLAVE) - { -- bdr_vertices.Append(nodes[fc.p1].vert_index); -- bdr_vertices.Append(nodes[fc.p3].vert_index); -+ // A slave face must mark any masters -+ const auto &slave_edge_id = static_cast(*id_and_type.id); -+ bdr_edges.Append(slave_edge_id.master); - } - } - } - -- bdr_vertices.Sort(); -- bdr_vertices.Unique(); -- -- bdr_edges.Sort(); -- bdr_edges.Unique(); --} -+ // Filter, sort and unique an array, so it contains only local unique values. -+ auto FilterSortUnique = [](Array &v, int N) -+ { -+ // Perform the O(N) filter before the O(NlogN) sort. -+ // begin -> it is only entries < N. 
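The lambda that follows is the erase-remove idiom restricted to local (non-ghost) indices: entries greater than or equal to N are partitioned out first, only the kept prefix is sorted, and duplicates are then dropped. The same pattern reappears in ParNCMesh::GetBoundaryClosure further below. A standalone equivalent on std::vector<int> (illustrative only, since the code here operates on MFEM's Array container):

#include <algorithm>
#include <cassert>
#include <vector>

// Drop ghost indices (>= N), then sort and deduplicate the remaining local
// ones in place.
void FilterSortUnique(std::vector<int> &v, int N)
{
   // O(n) filter first: move all entries < N to the front of the vector.
   auto last = std::remove_if(v.begin(), v.end(), [N](int i) { return i >= N; });
   // O(n log n) sort of the kept range only.
   std::sort(v.begin(), last);
   // Drop duplicates and shrink to the unique local entries.
   v.erase(std::unique(v.begin(), last), v.end());
}

int main()
{
   std::vector<int> v = {7, 2, 9, 2, 5, 7, 12};
   FilterSortUnique(v, 10);   // 12 plays the role of a ghost index here
   assert((v == std::vector<int>{2, 5, 7, 9}));
   return 0;
}

The Array-based lambda below does the same thing, using SetSize with the distance to std::unique as the new length.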
-+ auto it = std::remove_if(v.begin(), v.end(), [N](int i) { return i >= N; }); -+ std::sort(v.begin(), it); -+ v.SetSize(std::distance(v.begin(), std::unique(v.begin(), it))); -+ }; - --static int max4(int a, int b, int c, int d) --{ -- return std::max(std::max(a, b), std::max(c, d)); --} --static int max6(int a, int b, int c, int d, int e, int f) --{ -- return std::max(max4(a, b, c, d), std::max(e, f)); --} --static int max8(int a, int b, int c, int d, int e, int f, int g, int h) --{ -- return std::max(max4(a, b, c, d), max4(e, f, g, h)); -+ FilterSortUnique(bdr_vertices, NVertices); -+ FilterSortUnique(bdr_edges, NEdges); -+ FilterSortUnique(bdr_faces, NFaces); - } - - int NCMesh::EdgeSplitLevel(int vn1, int vn2) const -@@ -5280,15 +5357,13 @@ int NCMesh::TriFaceSplitLevel(int vn1, int vn2, int vn3) const - if (TriFaceSplit(vn1, vn2, vn3, mid) && - faces.FindId(vn1, vn2, vn3) < 0) - { -- return 1 + max4(TriFaceSplitLevel(vn1, mid[0], mid[2]), -- TriFaceSplitLevel(mid[0], vn2, mid[1]), -- TriFaceSplitLevel(mid[2], mid[1], vn3), -- TriFaceSplitLevel(mid[0], mid[1], mid[2])); -- } -- else // not split -- { -- return 0; -+ return 1 + max(TriFaceSplitLevel(vn1, mid[0], mid[2]), -+ TriFaceSplitLevel(mid[0], vn2, mid[1]), -+ TriFaceSplitLevel(mid[2], mid[1], vn3), -+ TriFaceSplitLevel(mid[0], mid[1], mid[2])); - } -+ -+ return 0; // not split - } - - void NCMesh::QuadFaceSplitLevel(int vn1, int vn2, int vn3, int vn4, -@@ -5318,6 +5393,13 @@ void NCMesh::QuadFaceSplitLevel(int vn1, int vn2, int vn3, int vn4, - } - } - -+int NCMesh::QuadFaceSplitLevel(int vn1, int vn2, int vn3, int vn4) const -+{ -+ int h_level, v_level; -+ QuadFaceSplitLevel(vn1, vn2, vn3, vn4, h_level, v_level); -+ return h_level + v_level; -+} -+ - void NCMesh::CountSplits(int elem, int splits[3]) const - { - const Element &el = elements[elem]; -@@ -5354,57 +5436,52 @@ void NCMesh::CountSplits(int elem, int splits[3]) const - - if (el.Geom() == Geometry::CUBE) - { -- splits[0] = max8(flevel[0][0], flevel[1][0], flevel[3][0], flevel[5][0], -- elevel[0], elevel[2], elevel[4], elevel[6]); -+ splits[0] = max(flevel[0][0], flevel[1][0], flevel[3][0], flevel[5][0], -+ elevel[0], elevel[2], elevel[4], elevel[6]); - -- splits[1] = max8(flevel[0][1], flevel[2][0], flevel[4][0], flevel[5][1], -- elevel[1], elevel[3], elevel[5], elevel[7]); -+ splits[1] = max(flevel[0][1], flevel[2][0], flevel[4][0], flevel[5][1], -+ elevel[1], elevel[3], elevel[5], elevel[7]); - -- splits[2] = max8(flevel[1][1], flevel[2][1], flevel[3][1], flevel[4][1], -- elevel[8], elevel[9], elevel[10], elevel[11]); -+ splits[2] = max(flevel[1][1], flevel[2][1], flevel[3][1], flevel[4][1], -+ elevel[8], elevel[9], elevel[10], elevel[11]); - } - else if (el.Geom() == Geometry::PRISM) - { -- splits[0] = splits[1] = -- std::max( -- max6(flevel[0][0], flevel[1][0], 0, -- flevel[2][0], flevel[3][0], flevel[4][0]), -- max6(elevel[0], elevel[1], elevel[2], -- elevel[3], elevel[4], elevel[5])); -+ splits[0] = splits[1] = max(flevel[0][0], flevel[1][0], 0, -+ flevel[2][0], flevel[3][0], flevel[4][0], -+ elevel[0], elevel[1], elevel[2], -+ elevel[3], elevel[4], elevel[5]); - -- splits[2] = max6(flevel[2][1], flevel[3][1], flevel[4][1], -- elevel[6], elevel[7], elevel[8]); -+ splits[2] = max(flevel[2][1], flevel[3][1], flevel[4][1], -+ elevel[6], elevel[7], elevel[8]); - } - else if (el.Geom() == Geometry::PYRAMID) - { -- splits[0] = std::max( -- max6(flevel[0][0], flevel[1][0], 0, -- flevel[2][0], flevel[3][0], flevel[4][0]), -- max8(elevel[0], elevel[1], elevel[2], -- 
elevel[3], elevel[4], elevel[5], -- elevel[6], elevel[7])); -+ splits[0] = max(flevel[0][0], flevel[1][0], 0, -+ flevel[2][0], flevel[3][0], flevel[4][0], -+ elevel[0], elevel[1], elevel[2], -+ elevel[3], elevel[4], elevel[5], -+ elevel[6], elevel[7]); - - splits[1] = splits[0]; - splits[2] = splits[0]; - } - else if (el.Geom() == Geometry::TETRAHEDRON) - { -- splits[0] = std::max( -- max4(flevel[0][0], flevel[1][0], flevel[2][0], flevel[3][0]), -- max6(elevel[0], elevel[1], elevel[2], -- elevel[3], elevel[4], elevel[5])); -+ splits[0] = max(flevel[0][0], flevel[1][0], flevel[2][0], flevel[3][0], -+ elevel[0], elevel[1], elevel[2], elevel[3], elevel[4], elevel[5]); - - splits[1] = splits[0]; - splits[2] = splits[0]; - } - else if (el.Geom() == Geometry::SQUARE) - { -- splits[0] = std::max(elevel[0], elevel[2]); -- splits[1] = std::max(elevel[1], elevel[3]); -+ splits[0] = max(elevel[0], elevel[2]); -+ splits[1] = max(elevel[1], elevel[3]); - } - else if (el.Geom() == Geometry::TRIANGLE) - { -- splits[0] = std::max(elevel[0], std::max(elevel[1], elevel[2])); -+ splits[0] = max(elevel[0], elevel[1], elevel[2]); - splits[1] = splits[0]; - } - else -@@ -6377,17 +6454,17 @@ void NCMesh::DebugDump(std::ostream &os) const - - // dump faces - os << faces.Size() << "\n"; -- for (auto face = faces.cbegin(); face != faces.cend(); ++face) -+ for (const auto &face : faces) - { -- int elem = face->elem[0]; -- if (elem < 0) { elem = face->elem[1]; } -+ int elem = face.elem[0]; -+ if (elem < 0) { elem = face.elem[1]; } - MFEM_ASSERT(elem >= 0, ""); - const Element &el = elements[elem]; - - int lf = find_local_face(el.Geom(), -- find_node(el, face->p1), -- find_node(el, face->p2), -- find_node(el, face->p3)); -+ find_node(el, face.p1), -+ find_node(el, face.p2), -+ find_node(el, face.p3)); - - const int* fv = GI[el.Geom()].faces[lf]; - const int nfv = GI[el.Geom()].nfv[lf]; -@@ -6397,7 +6474,7 @@ void NCMesh::DebugDump(std::ostream &os) const - { - os << " " << el.node[fv[i]]; - } -- //os << " # face " << face.index() << ", index " << face->index << "\n"; -+ //os << " # face " << face.index() << ", index " << face.index << "\n"; - os << "\n"; - } - } -diff --git a/mesh/ncmesh.hpp b/mesh/ncmesh.hpp -index 8ac50d342..b004cf43e 100644 ---- a/mesh/ncmesh.hpp -+++ b/mesh/ncmesh.hpp -@@ -293,6 +293,7 @@ public: - mutable std::unordered_map> inv_index; - }; - -+ - /// Return the current list of conforming and nonconforming faces. - const NCList& GetFaceList() - { -@@ -392,11 +393,13 @@ public: - /** Get a list of vertices (2D/3D) and edges (3D) that coincide with boundary - elements with the specified attributes (marked in 'bdr_attr_is_ess'). - In 3D this function also reveals "hidden" boundary edges. In parallel it -- helps identifying boundary vertices/edges affected by non-local boundary -- elements. */ -+ helps identifying boundary vertices/edges/faces affected by non-local boundary -+ elements. Hidden faces can occur for an internal boundary coincident to a processor -+ boundary. -+ */ - virtual void GetBoundaryClosure(const Array &bdr_attr_is_ess, - Array &bdr_vertices, -- Array &bdr_edges); -+ Array &bdr_edges, Array &bdr_faces); - - /// Return element geometry type. @a index is the Mesh element number. - Geometry::Type GetElementGeometry(int index) const -@@ -456,7 +459,6 @@ protected: // non-public interface for the Mesh class - by calling Mesh::SetCurvature or otherwise setting the Nodes. 
*/ - void MakeTopologyOnly() { coordinates.DeleteAll(); } - -- - protected: // implementation - - int Dim, spaceDim; ///< dimensions of the elements and the vertex coordinates -@@ -594,7 +596,6 @@ protected: // implementation - - Table element_vertex; ///< leaf-element to vertex table, see FindSetNeighbors - -- - /// Update the leaf elements indices in leaf_elements - void UpdateLeafElements(); - -@@ -712,10 +713,79 @@ protected: // implementation - - mfem::Element* NewMeshElement(int geom) const; - -- int QuadFaceSplitType(int v1, int v2, int v3, int v4, int mid[5] -+ /** -+ * @brief Given a quad face defined by four vertices, establish which edges -+ * of this face have been split, and if so optionally return the mid points -+ * of those edges. -+ * -+ * @param n1 The first node defining the face -+ * @param n2 The second node defining the face -+ * @param n3 The third node defining the face -+ * @param n4 The fourth node defining the face -+ * @param mid optional return of the edge mid points. -+ * @return int 0 -- no split, 1 -- "vertical" split, 2 -- "horizontal" split -+ */ -+ int QuadFaceSplitType(int n1, int n2, int n3, int n4, int mid[5] - = NULL /*optional output of mid-edge nodes*/) const; - -- bool TriFaceSplit(int v1, int v2, int v3, int mid[3] = NULL) const; -+ /** -+ * @brief Given a tri face defined by three vertices, establish whether the -+ * edges that make up this face have been split, and if so optionally return -+ * the midpoints. -+ * @details This is a necessary condition for this face to have been split, -+ * but is not sufficient. Consider a triangle attached to three refined -+ * triangles, in this scenario all edges can be split but this face not be -+ * split. In this case, it is necessary to check if there is a face made up -+ * of the returned midpoint nodes. -+ * -+ * @param n1 The first node defining the face -+ * @param n2 The second node defining the face -+ * @param n3 The third node defining the face -+ * @param mid optional return of the edge mid points. -+ * @return true Splits for all edges have been found -+ * @return false -+ */ -+ bool TriFaceSplit(int n1, int n2, int n3, int mid[3] = NULL) const; -+ -+ /** -+ * @brief Determine if a Triangle face is not a master -+ * @details This check requires looking for the edges making up the triangle -+ * being split, if nodes exist at their midpoints, and there are vertices at -+ * them, this implies the face COULD be split. To determine if it is, we then -+ * check whether these midpoints have all been connected, this is required to -+ * discriminate between an internal master face surrounded by nonconformal -+ * refinements and a conformal boundary face surrounded by refinements. -+ * -+ * @param n1 The first node defining the face -+ * @param n2 The second node defining the face -+ * @param n3 The third node defining the face -+ * @return true The face is not a master -+ * @return false The face is a master -+ */ -+ inline bool TriFaceNotMaster(int n1, int n2, int n3) const -+ { -+ int mid[3]; -+ return !TriFaceSplit(n1, n2, n3, mid) // The edges aren't split -+ // OR none of the midpoints are connected. 
-+ || (nodes.FindId(mid[0], mid[1]) < 0 && -+ nodes.FindId(mid[0], mid[2]) < 0 && -+ nodes.FindId(mid[1], mid[2]) < 0); -+ } -+ -+ /** -+ * @brief Determine if a Quad face is not a master -+ * -+ * @param n1 The first node defining the face -+ * @param n2 The second node defining the face -+ * @param n3 The third node defining the face -+ * @param n4 The fourth node defining the face -+ * @return true The quad face is not a master -+ * @return false The quad face is a master -+ */ -+ inline bool QuadFaceNotMaster(int n1, int n2, int n3, int n4) const -+ { -+ return QuadFaceSplitType(n1, n2, n3, n4) == 0; -+ } - - void ForceRefinement(int vn1, int vn2, int vn3, int vn4); - -@@ -792,7 +862,6 @@ protected: // implementation - virtual void ElementSharesEdge(int elem, int local, int enode) {} // ParNCMesh - virtual void ElementSharesVertex(int elem, int local, int vnode) {} // ParNCMesh - -- - // neighbors / element_vertex table - - /** Return all vertex-, edge- and face-neighbors of a set of elements. -@@ -981,9 +1050,7 @@ protected: // implementation - void InitDerefTransforms(); - void SetDerefMatrixCodes(int parent, Array &fine_coarse); - -- - // vertex temporary data, used by GetMeshComponents -- - struct TmpVertex - { - bool valid, visited; -@@ -1002,10 +1069,56 @@ protected: // implementation - - void FindFaceNodes(int face, int node[4]); - -+ /** -+ * @brief Return the number of splits of this edge that have occurred in the -+ * NCMesh. If zero, this means the segment is not the master of any other segments. -+ * -+ * @param vn1 The first vertex making up the segment -+ * @param vn2 The second vertex making up the segment -+ * @return int The depth of splits of this segment that are present in the mesh. -+ */ - int EdgeSplitLevel(int vn1, int vn2) const; -+ /** -+ * @brief Return the number of splits of this triangle that have occurred in -+ * the NCMesh. If zero, this means the triangle is neither split, nor the -+ * master of a split face. -+ * -+ * @param vn1 The first vertex making up the triangle -+ * @param vn2 The second vertex making up the triangle -+ * @param vn3 The third vertex making up the triangle -+ * @return int The depth of splits of this triangle that are present in the mesh. -+ */ - int TriFaceSplitLevel(int vn1, int vn2, int vn3) const; -+ /** -+ * @brief Computes the number of horizontal and vertical splits of this quad -+ * that have occurred in the NCMesh. If zero, this means the quad is not -+ * the master of any other quad. -+ * -+ * @param vn1 The first vertex making up the quad -+ * @param vn2 The second vertex making up the quad -+ * @param vn3 The third vertex making up the quad -+ * @param vn4 The fourth vertex making up the quad -+ * @param h_level The number of "horizontal" splits of the quad -+ * @param v_level The number of "vertical" splits of the quad -+ */ - void QuadFaceSplitLevel(int vn1, int vn2, int vn3, int vn4, - int& h_level, int& v_level) const; -+ /** -+ * @brief Returns the total number of splits of this quad that have occurred -+ * in the NCMesh. If zero, this means the quad is not -+ * the master of any other quad. -+ * @details This is a convenience wrapper that sums the horizontal and -+ * vertical levels from the full method. -+ * -+ * @param vn1 The first vertex making up the quad -+ * @param vn2 The second vertex making up the quad -+ * @param vn3 The third vertex making up the quad -+ * @param vn4 The fourth vertex making up the quad -+ * @return int The depth of splits of this triangle that are present in the -+ * mesh. 
NB: An isotropic refinement has a level of 2, one horizontal split, -+ * followed by a vertical split. -+ */ -+ int QuadFaceSplitLevel(int vn1, int vn2, int vn3, int vn4) const; - - void CountSplits(int elem, int splits[3]) const; - void GetLimitRefinements(Array &refinements, int max_level); -@@ -1042,7 +1155,6 @@ protected: // implementation - /// Load the deprecated MFEM mesh v1.1 format for backward compatibility. - void LoadLegacyFormat(std::istream &input, int &curved, int &is_nc); - -- - // geometry - - /// This holds in one place the constants about the geometries we support -diff --git a/mesh/pmesh.cpp b/mesh/pmesh.cpp -index 47a091c04..967b448fc 100644 ---- a/mesh/pmesh.cpp -+++ b/mesh/pmesh.cpp -@@ -364,11 +364,8 @@ int ParMesh::BuildLocalVertices(const mfem::Mesh &mesh, - int ParMesh::BuildLocalElements(const Mesh& mesh, const int* partitioning, - const Array& vert_global_local) - { -- int nelems = 0; -- for (int i = 0; i < mesh.GetNE(); i++) -- { -- if (partitioning[i] == MyRank) { nelems++; } -- } -+ const int nelems = std::count_if(partitioning, -+ partitioning + mesh.GetNE(), [this](int i) { return i == MyRank;}); - - elements.SetSize(nelems); - -@@ -387,7 +384,7 @@ int ParMesh::BuildLocalElements(const Mesh& mesh, const int* partitioning, - { - v[j] = vert_global_local[v[j]]; - } -- element_counter++; -+ ++element_counter; - } - } - -@@ -400,7 +397,6 @@ int ParMesh::BuildLocalBoundary(const Mesh& mesh, const int* partitioning, - Table*& edge_element) - { - int nbdry = 0; -- - if (mesh.NURBSext) - { - activeBdrElem.SetSize(mesh.GetNBE()); -@@ -2102,7 +2098,7 @@ void ParMesh::ExchangeFaceNbrData() - - if (Nonconforming()) - { -- // with ParNCMesh we can set up face neighbors mostly without communication -+ // With ParNCMesh we can set up face neighbors mostly without communication. - pncmesh->GetFaceNeighbors(*this); - have_face_nbr_data = true; - -diff --git a/mesh/pncmesh.cpp b/mesh/pncmesh.cpp -index cd6625e9c..169c5c6bb 100644 ---- a/mesh/pncmesh.cpp -+++ b/mesh/pncmesh.cpp -@@ -577,41 +577,81 @@ void ParNCMesh::CalcFaceOrientations() - face_orient.SetSize(NFaces); - face_orient = 0; - -- for (auto face = faces.begin(); face != faces.end(); ++face) -+ for (auto face : faces) - { -- if (face->elem[0] >= 0 && face->elem[1] >= 0 && face->index < NFaces) -+ if (face.elem[0] >= 0 && face.elem[1] >= 0 && face.index < NFaces) - { -- Element *e1 = &elements[face->elem[0]]; -- Element *e2 = &elements[face->elem[1]]; -+ Element *e1 = &elements[face.elem[0]]; -+ Element *e2 = &elements[face.elem[1]]; - - if (e1->rank == e2->rank) { continue; } - if (e1->rank > e2->rank) { std::swap(e1, e2); } - -- face_orient[face->index] = get_face_orientation(*face, *e1, *e2); -+ face_orient[face.index] = get_face_orientation(face, *e1, *e2); - } - } - } - - void ParNCMesh::GetBoundaryClosure(const Array &bdr_attr_is_ess, - Array &bdr_vertices, -- Array &bdr_edges) -+ Array &bdr_edges, Array &bdr_faces) - { -- NCMesh::GetBoundaryClosure(bdr_attr_is_ess, bdr_vertices, bdr_edges); -+ NCMesh::GetBoundaryClosure(bdr_attr_is_ess, bdr_vertices, bdr_edges, bdr_faces); - -- int i, j; -- // filter out ghost vertices -- for (i = j = 0; i < bdr_vertices.Size(); i++) -+ if (Dim == 3) - { -- if (bdr_vertices[i] < NVertices) { bdr_vertices[j++] = bdr_vertices[i]; } -+ // Mark masters of shared slave boundary faces as essential boundary faces. Some -+ // master faces may only have slave children. 
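In other words, when a master face on a processor interface is represented locally only through its slaves, an essential marking found on a slave's boundary face has to be pushed up to the master index. A reduced sketch of that propagation (plain C++ stand-ins, not ParNCMesh's types):

#include <cassert>
#include <vector>

struct Slave  { int index; bool essential_boundary; };
struct Master { int index; int slaves_begin; int slaves_end; };

std::vector<int> EssentialMasters(const std::vector<Master> &masters,
                                  const std::vector<Slave> &slaves)
{
   std::vector<int> bdr_faces;
   for (const auto &m : masters)
   {
      for (int s = m.slaves_begin; s < m.slaves_end; ++s)
      {
         // index < 0 marks a degenerate (edge-face) constraint; skip it, just
         // as the sf.index < 0 guard does in the loop below.
         if (slaves[s].index < 0 || !slaves[s].essential_boundary) { continue; }
         bdr_faces.push_back(m.index);   // the master inherits the marking
         break;
      }
   }
   return bdr_faces;
}

int main()
{
   const std::vector<Slave> slaves = {{0, false}, {1, true}, {-1, true}, {2, false}};
   const std::vector<Master> masters = {{10, 0, 2}, {11, 2, 4}};
   assert((EssentialMasters(masters, slaves) == std::vector<int>{10}));
   return 0;
}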
-+ for (const auto &mf : shared_faces.masters) -+ { -+ if (elements[mf.element].rank != MyRank) { continue; } -+ for (int j = mf.slaves_begin; j < mf.slaves_end; j++) -+ { -+ const auto &sf = GetFaceList().slaves[j]; -+ if (sf.index < 0) -+ { -+ // Edge-face constraint. Skip this edge. -+ continue; -+ } -+ Face *face = GetFace(elements[sf.element], sf.local); -+ if (face && face->Boundary() && bdr_attr_is_ess[face->attribute - 1]) -+ { -+ bdr_faces.Append(mf.index); -+ } -+ } -+ } - } -- bdr_vertices.SetSize(j); -- -- // filter out ghost edges -- for (i = j = 0; i < bdr_edges.Size(); i++) -+ else if (Dim == 2) - { -- if (bdr_edges[i] < NEdges) { bdr_edges[j++] = bdr_edges[i]; } -+ // Mark masters of shared slave boundary edges as essential boundary edges. Some -+ // master edges may only have slave children. -+ for (const auto &me : shared_edges.masters) -+ { -+ if (elements[me.element].rank != MyRank) { continue; } -+ for (int j = me.slaves_begin; j < me.slaves_end; j++) -+ { -+ const auto &se = GetEdgeList().slaves[j]; -+ Face *face = GetFace(elements[se.element], se.local); -+ if (face && face->Boundary() && bdr_attr_is_ess[face->attribute - 1]) -+ { -+ bdr_edges.Append(me.index); -+ } -+ } -+ } - } -- bdr_edges.SetSize(j); -+ -+ // Filter, sort and unique an array, so it contains only local unique values. -+ auto FilterSortUnique = [](Array &v, int N) -+ { -+ // Perform the O(N) filter before the O(NlogN) sort. -+ auto local = std::remove_if(v.begin(), v.end(), [N](int i) { return i >= N; }); -+ std::sort(v.begin(), local); -+ v.SetSize(std::distance(v.begin(), std::unique(v.begin(), local))); -+ }; -+ -+ FilterSortUnique(bdr_vertices, NVertices); -+ FilterSortUnique(bdr_edges, NEdges); -+ FilterSortUnique(bdr_faces, NFaces); - } - - -@@ -698,9 +738,9 @@ static void set_to_array(const std::set &set, Array &array) - { - array.Reserve(set.size()); - array.SetSize(0); -- for (std::set::iterator it = set.begin(); it != set.end(); ++it) -+ for (auto x : set) - { -- array.Append(*it); -+ array.Append(x); - } - } - -@@ -789,8 +829,10 @@ void ParNCMesh::GetConformingSharedStructures(ParMesh &pmesh) - for (int ent = 0; ent < Dim; ent++) - { - GetSharedList(ent); -- MFEM_VERIFY(entity_conf_group[ent].Size(), "internal error"); -- MFEM_VERIFY(entity_elem_local[ent].Size(), "internal error"); -+ MFEM_VERIFY(entity_conf_group[ent].Size() || -+ pmesh.GetNE() == 0, "Non empty partitions must be connected"); -+ MFEM_VERIFY(entity_elem_local[ent].Size() || -+ pmesh.GetNE() == 0, "Non empty partitions must be connected"); - } - } - -@@ -1119,7 +1161,7 @@ void ParNCMesh::GetFaceNeighbors(ParMesh &pmesh) - bool sloc = (sfe.rank == MyRank); - bool mloc = (mfe.rank == MyRank); - if (sloc == mloc // both or neither face is owned by this processor -- || sf.index < 0) // the face is degenerate (i.e. a face-edge constraint) -+ || sf.index < 0) // the face is degenerate (i.e. 
a edge-face constraint) - { - continue; - } -@@ -1307,8 +1349,6 @@ void ParNCMesh::GetFaceNeighbors(ParMesh &pmesh) - MPI_Waitall(int(send_requests.size()), send_requests.data(), status.data()); - } - } -- -- - // NOTE: this function skips ParMesh::send_face_nbr_vertices and - // ParMesh::face_nbr_vertices_offset, these are not used outside of ParMesh - } -@@ -1322,7 +1362,6 @@ void ParNCMesh::ClearAuxPM() - aux_pm_store.DeleteAll(); - } - -- - //// Prune, Refine, Derefine /////////////////////////////////////////////////// - - bool ParNCMesh::PruneTree(int elem) -@@ -1953,10 +1992,9 @@ void ParNCMesh::RedistributeElements(Array &new_ranks, int target_elements, - NeighborElementRankMessage::RecvAll(recv_ghost_ranks, MyComm); - - // read new ranks for the ghost layer from messages received -- NeighborElementRankMessage::Map::iterator it; -- for (it = recv_ghost_ranks.begin(); it != recv_ghost_ranks.end(); ++it) -+ for (auto &kv : recv_ghost_ranks) - { -- NeighborElementRankMessage &msg = it->second; -+ NeighborElementRankMessage &msg = kv.second; - for (int i = 0; i < msg.Size(); i++) - { - int ghost_index = elements[msg.elements[i]].index; -@@ -2483,9 +2521,8 @@ void ParNCMesh::AdjustMeshIds(Array ids[], int rank) - - // find vertices/edges of master faces shared with 'rank', and modify their - // MeshIds so their element/local matches the element of the master face -- for (int i = 0; i < shared_faces.masters.Size(); i++) -+ for (const MeshId &face_id : shared_faces.masters) - { -- const MeshId &face_id = shared_faces.masters[i]; - if (contains_rank[entity_pmat_group[2][face_id.index]]) - { - int v[4], e[4], eo[4], pos, k; -diff --git a/mesh/pncmesh.hpp b/mesh/pncmesh.hpp -index df5fb929e..d33f5cbbe 100644 ---- a/mesh/pncmesh.hpp -+++ b/mesh/pncmesh.hpp -@@ -229,10 +229,11 @@ public: - const Table &deref_table); - - /** Extension of NCMesh::GetBoundaryClosure. Filters out ghost vertices and -- ghost edges from 'bdr_vertices' and 'bdr_edges'. */ -+ ghost edges from 'bdr_vertices' and 'bdr_edges', and uncovers hidden internal -+ boundary faces. */ - void GetBoundaryClosure(const Array &bdr_attr_is_ess, - Array &bdr_vertices, -- Array &bdr_edges) override; -+ Array &bdr_edges, Array &bdr_faces) override; - - /// Save memory by releasing all non-essential and cached data. - void Trim() override; -@@ -258,8 +259,6 @@ protected: // interface for ParMesh - /** Populate face neighbor members of ParMesh from the ghost layer, without - communication. */ - void GetFaceNeighbors(class ParMesh &pmesh); -- -- - protected: // implementation - - MPI_Comm MyComm; -diff --git a/mesh/point.cpp b/mesh/point.cpp -index ecf6a4dd0..473655b11 100644 ---- a/mesh/point.cpp -+++ b/mesh/point.cpp -@@ -21,12 +21,24 @@ Point::Point( const int *ind, int attr ) : Element(Geometry::POINT) - indices[0] = ind[0]; - } - --void Point::GetVertices( Array &v ) const -+void Point::GetVertices(Array &v) const - { -- v.SetSize( 1 ); -+ v.SetSize(1); - v[0] = indices[0]; - } - -+void Point::SetVertices(const Array &v) -+{ -+ MFEM_ASSERT(v.Size() == 1, "!"); -+ indices[0] = v[0]; -+} -+ -+ -+void Point::SetVertices(const int *ind) -+{ -+ indices[0] = ind[0]; -+} -+ - PointFiniteElement PointFE; - - } -diff --git a/mesh/point.hpp b/mesh/point.hpp -index f154e205e..be00c9c84 100644 ---- a/mesh/point.hpp -+++ b/mesh/point.hpp -@@ -33,33 +33,40 @@ public: - Point( const int *ind, int attr = -1 ); - - /// Return element's type. 
-- virtual Type GetType() const { return Element::POINT; } -+ Type GetType() const override { return Element::POINT; } - -- /// Returns the indices of the element's vertices. -- virtual void GetVertices( Array &v ) const; -+ /// Get the indices defining the vertices -+ void GetVertices(Array &v) const override; - -- virtual int * GetVertices () { return indices; } -+ /// Set the indices defining the vertices -+ void SetVertices(const Array &v) override; - -- virtual int GetNVertices() const { return 1; } -+ /// @note The returned array should NOT be deleted by the caller. -+ int * GetVertices () override { return indices; } - -- virtual int GetNEdges() const { return (0); } -+ /// Set the vertices according to the given input. -+ void SetVertices(const int *ind) override; - -- virtual const int *GetEdgeVertices(int ei) const { return NULL; } -+ int GetNVertices() const override { return 1; } -+ -+ int GetNEdges() const override { return (0); } -+ -+ const int *GetEdgeVertices(int ei) const override { return NULL; } - - /// @deprecated Use GetNFaces(void) and GetNFaceVertices(int) instead. -- MFEM_DEPRECATED virtual int GetNFaces(int &nFaceVertices) const -+ MFEM_DEPRECATED int GetNFaces(int &nFaceVertices) const override - { nFaceVertices = 0; return 0; } - -- virtual int GetNFaces() const { return 0; } -+ int GetNFaces() const override { return 0; } - -- virtual int GetNFaceVertices(int) const { return 0; } -+ int GetNFaceVertices(int) const override { return 0; } - -- virtual const int *GetFaceVertices(int fi) const { return NULL; } -+ const int *GetFaceVertices(int fi) const override { return NULL; } - -- virtual Element *Duplicate(Mesh *m) const -+ Element *Duplicate(Mesh *m) const override - { return new Point (indices, attribute); } - -- virtual ~Point() { } -+ virtual ~Point() = default; - }; - - class PointFiniteElement; -diff --git a/mesh/pyramid.cpp b/mesh/pyramid.cpp -index d67841564..f64f2afe9 100644 ---- a/mesh/pyramid.cpp -+++ b/mesh/pyramid.cpp -@@ -48,10 +48,13 @@ void Pyramid::SetVertices(const int *ind) - void Pyramid::GetVertices(Array &v) const - { - v.SetSize(5); -- for (int i = 0; i < 5; i++) -- { -- v[i] = indices[i]; -- } -+ std::copy(indices, indices + 5, v.begin()); -+} -+ -+void Pyramid::SetVertices(const Array &v) -+{ -+ MFEM_ASSERT(v.Size() == 5, "!"); -+ std::copy(v.begin(), v.end(), indices); - } - - int Pyramid::GetNFaces(int &nFaceVertices) const -diff --git a/mesh/pyramid.hpp b/mesh/pyramid.hpp -index 8e171a31d..adcc540ed 100644 ---- a/mesh/pyramid.hpp -+++ b/mesh/pyramid.hpp -@@ -37,38 +37,42 @@ public: - int attr = 1); - - /// Return element's type. -- virtual Type GetType() const { return Element::PYRAMID; } -+ Type GetType() const override { return Element::PYRAMID; } - -- /// Set the vertices according to the given input. -- virtual void SetVertices(const int *ind); -+ /// Get the indices defining the vertices -+ void GetVertices(Array &v) const override; -+ -+ /// Set the indices defining the vertices -+ void SetVertices(const Array &v) override; - -- /// Returns the indices of the element's vertices. -- virtual void GetVertices(Array &v) const; -+ /// @note The returned array should NOT be deleted by the caller. -+ int * GetVertices () override { return indices; } - -- virtual int *GetVertices() { return indices; } -+ /// Set the vertices according to the given input. 
-+ void SetVertices(const int *ind) override; - -- virtual int GetNVertices() const { return 5; } -+ int GetNVertices() const override { return 5; } - -- virtual int GetNEdges() const { return 8; } -+ int GetNEdges() const override { return 8; } - -- virtual const int *GetEdgeVertices(int ei) const -+ const int *GetEdgeVertices(int ei) const override - { return geom_t::Edges[ei]; } - - /// @deprecated Use GetNFaces(void) and GetNFaceVertices(int) instead. -- MFEM_DEPRECATED virtual int GetNFaces(int &nFaceVertices) const; -+ MFEM_DEPRECATED int GetNFaces(int &nFaceVertices) const override; - -- virtual int GetNFaces() const { return 5; } -+ int GetNFaces() const override { return 5; } - -- virtual int GetNFaceVertices(int fi) const -+ int GetNFaceVertices(int fi) const override - { return ( ( fi < 1 ) ? 4 : 3); } - -- virtual const int *GetFaceVertices(int fi) const -+ const int *GetFaceVertices(int fi) const override - { return geom_t::FaceVert[fi]; } - -- virtual Element *Duplicate(Mesh *m) const -+ Element *Duplicate(Mesh *m) const override - { return new Pyramid(indices, attribute); } - -- virtual ~Pyramid() { } -+ virtual ~Pyramid() = default; - }; - - extern class LinearPyramidFiniteElement PyramidFE; -diff --git a/mesh/quadrilateral.cpp b/mesh/quadrilateral.cpp -index 1a69cf179..29fa3bbe1 100644 ---- a/mesh/quadrilateral.cpp -+++ b/mesh/quadrilateral.cpp -@@ -37,19 +37,20 @@ Quadrilateral::Quadrilateral( int ind1, int ind2, int ind3, int ind4, - - void Quadrilateral::SetVertices(const int *ind) - { -- for (int i=0; i<4; i++) -- { -- indices[i] = ind[i]; -- } -+ std::copy(ind, ind + 4, indices); - } - --void Quadrilateral::GetVertices( Array &v ) const -+void Quadrilateral::GetVertices(Array &v) const - { -- v.SetSize( 4 ); -- for (int i=0; i<4; i++) -- { -- v[i] = indices[i]; -- } -+ v.SetSize(4); -+ std::copy(indices, indices + 4, v.begin()); -+} -+ -+ -+void Quadrilateral::SetVertices(const Array &v) -+{ -+ MFEM_ASSERT(v.Size() == 4, "!"); -+ std::copy(v.begin(), v.end(), indices); - } - - BiLinear2DFiniteElement QuadrilateralFE; -diff --git a/mesh/quadrilateral.hpp b/mesh/quadrilateral.hpp -index 9f6b9a442..70fcbfdcc 100644 ---- a/mesh/quadrilateral.hpp -+++ b/mesh/quadrilateral.hpp -@@ -36,37 +36,41 @@ public: - Quadrilateral(int ind1, int ind2, int ind3, int ind4, int attr = 1); - - /// Return element's type -- Type GetType() const { return Element::QUADRILATERAL; } -+ Type GetType() const override { return Element::QUADRILATERAL; } - -- /// Set the vertices according to the given input. -- virtual void SetVertices(const int *ind); -+ /// Get the indices defining the vertices -+ void GetVertices(Array &v) const override; -+ -+ /// Set the indices defining the vertices -+ void SetVertices(const Array &v) override; - -- /// Returns the indices of the element's vertices. -- virtual void GetVertices(Array &v) const; -+ /// @note The returned array should NOT be deleted by the caller. -+ int * GetVertices () override { return indices; } - -- virtual int *GetVertices() { return indices; } -+ /// Set the vertices according to the given input. 
-+ void SetVertices(const int *ind) override; - -- virtual int GetNVertices() const { return 4; } -+ int GetNVertices() const override { return 4; } - -- virtual int GetNEdges() const { return (4); } -+ int GetNEdges() const override { return (4); } - -- virtual const int *GetEdgeVertices(int ei) const -+ const int *GetEdgeVertices(int ei) const override - { return geom_t::Edges[ei]; } - - /// @deprecated Use GetNFaces(void) and GetNFaceVertices(int) instead. -- MFEM_DEPRECATED virtual int GetNFaces(int &nFaceVertices) const -+ MFEM_DEPRECATED int GetNFaces(int &nFaceVertices) const override - { nFaceVertices = 0; return 0; } - -- virtual int GetNFaces() const { return 0; } -+ int GetNFaces() const override { return 0; } - -- virtual int GetNFaceVertices(int) const { return 0; } -+ int GetNFaceVertices(int) const override { return 0; } - -- virtual const int *GetFaceVertices(int fi) const { return NULL; } -+ const int *GetFaceVertices(int fi) const override { return NULL; } - -- virtual Element *Duplicate(Mesh *m) const -+ Element *Duplicate(Mesh *m) const override - { return new Quadrilateral(indices, attribute); } - -- virtual ~Quadrilateral() { } -+ virtual ~Quadrilateral() = default; - }; - - extern MFEM_EXPORT class BiLinear2DFiniteElement QuadrilateralFE; -diff --git a/mesh/segment.cpp b/mesh/segment.cpp -index 717245907..910614770 100644 ---- a/mesh/segment.cpp -+++ b/mesh/segment.cpp -@@ -37,13 +37,16 @@ void Segment::SetVertices(const int *ind) - indices[1] = ind[1]; - } - --void Segment::GetVertices( Array &v ) const -+void Segment::GetVertices(Array &v) const - { -- v.SetSize( 2 ); -- for (int i=0; i<2; i++) -- { -- v[i] = indices[i]; -- } -+ v.SetSize(2); -+ std::copy(indices, indices + 2, v.begin()); -+} -+ -+void Segment::SetVertices(const Array &v) -+{ -+ MFEM_ASSERT(v.Size() == 2, "!"); -+ std::copy(v.begin(), v.end(), indices); - } - - Linear1DFiniteElement SegmentFE; -diff --git a/mesh/segment.hpp b/mesh/segment.hpp -index 6ca918758..aafc4909f 100644 ---- a/mesh/segment.hpp -+++ b/mesh/segment.hpp -@@ -35,37 +35,41 @@ public: - /// Constructs triangle by specifying the indices and the attribute. - Segment(int ind1, int ind2, int attr = 1); - -- /// Set the indices the element according to the input. -- virtual void SetVertices(const int *ind); -- - /// Return element's type. -- virtual Type GetType() const { return Element::SEGMENT; } -+ Type GetType() const override { return Element::SEGMENT; } -+ -+ /// Get the indices defining the vertices -+ void GetVertices(Array &v) const override; -+ -+ /// Set the indices defining the vertices -+ void SetVertices(const Array &v) override; - -- /// Returns the indices of the element's vertices. -- virtual void GetVertices(Array &v) const; -+ /// @note The returned array should NOT be deleted by the caller. -+ int * GetVertices () override { return indices; } - -- virtual int *GetVertices() { return indices; } -+ /// Set the vertices according to the given input. -+ void SetVertices(const int *ind) override; - -- virtual int GetNVertices() const { return 2; } -+ int GetNVertices() const override { return 2; } - -- virtual int GetNEdges() const { return (0); } -+ int GetNEdges() const override { return 0; } - -- virtual const int *GetEdgeVertices(int ei) const { return NULL; } -+ const int *GetEdgeVertices(int ei) const override { return NULL; } - - /// @deprecated Use GetNFaces(void) and GetNFaceVertices(int) instead. 
-- MFEM_DEPRECATED virtual int GetNFaces(int &nFaceVertices) const -+ MFEM_DEPRECATED int GetNFaces(int &nFaceVertices) const override - { nFaceVertices = 0; return 0; } - -- virtual int GetNFaces() const { return 0; } -+ int GetNFaces() const override { return 0; } - -- virtual int GetNFaceVertices(int) const { return 0; } -+ int GetNFaceVertices(int) const override { return 0; } - -- virtual const int *GetFaceVertices(int fi) const { return NULL; } -+ const int *GetFaceVertices(int fi) const override { return NULL; } - -- virtual Element *Duplicate(Mesh *m) const -+ Element *Duplicate(Mesh *m) const override - { return new Segment(indices, attribute); } - -- virtual ~Segment() { } -+ virtual ~Segment() = default; - }; - - class Linear1DFiniteElement; -diff --git a/mesh/tetrahedron.cpp b/mesh/tetrahedron.cpp -index c1b0ae6d8..133a69b41 100644 ---- a/mesh/tetrahedron.cpp -+++ b/mesh/tetrahedron.cpp -@@ -341,10 +341,13 @@ void Tetrahedron::GetPointMatrix(unsigned transform, DenseMatrix &pm) - void Tetrahedron::GetVertices(Array &v) const - { - v.SetSize(4); -- for (int i = 0; i < 4; i++) -- { -- v[i] = indices[i]; -- } -+ std::copy(indices, indices + 4, v.begin()); -+} -+ -+void Tetrahedron::SetVertices(const Array &v) -+{ -+ MFEM_ASSERT(v.Size() == 4, "!"); -+ std::copy(v.begin(), v.end(), indices); - } - - Element *Tetrahedron::Duplicate(Mesh *m) const -diff --git a/mesh/tetrahedron.hpp b/mesh/tetrahedron.hpp -index ef8f36eb8..157c75895 100644 ---- a/mesh/tetrahedron.hpp -+++ b/mesh/tetrahedron.hpp -@@ -56,7 +56,7 @@ public: - int ref_flag = 0); - - /// Return element's type. -- virtual Type GetType() const { return Element::TETRAHEDRON; } -+ Type GetType() const override { return Element::TETRAHEDRON; } - - void ParseRefinementFlag(int refinement_edges[2], int &type, - int &flag) const; -@@ -69,10 +69,7 @@ public: - void SetRefinementFlag(int rf) { refinement_flag = rf; } - - /// Return 1 if the element needs refinement in order to get conforming mesh. -- virtual int NeedRefinement(HashTable &v_to_v) const; -- -- /// Set the vertices according to the given input. -- virtual void SetVertices(const int *ind); -+ int NeedRefinement(HashTable &v_to_v) const override; - - /** Reorder the vertices so that the longest edge is from vertex 0 - to vertex 1. If called it should be once from the mesh constructor, -@@ -81,42 +78,49 @@ public: - void MarkEdge(const DSTable &v_to_v, const Array &length, - const Array &length2); - -- virtual void ResetTransform(int tr) { transform = tr; } -- virtual unsigned GetTransform() const { return transform; } -+ void ResetTransform(int tr) override { transform = tr; } -+ unsigned GetTransform() const override { return transform; } - - /// Add 'tr' to the current chain of coarse-fine transformations. -- virtual void PushTransform(int tr) -+ void PushTransform(int tr) override - { transform = (transform << 3) | (tr + 1); } - - /// Calculate point matrix corresponding to a chain of transformations. - static void GetPointMatrix(unsigned transform, DenseMatrix &pm); - -- /// Returns the indices of the element's vertices. -- virtual void GetVertices(Array &v) const; -+ /// Get the indices defining the vertices -+ void GetVertices(Array &v) const override; -+ -+ /// Set the indices defining the vertices -+ void SetVertices(const Array &v) override; - -- virtual int *GetVertices() { return indices; } -+ /// @note The returned array should NOT be deleted by the caller. 
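The sweep from virtual to override across these element headers is mechanical, but it is what lets the compiler reject a declaration that silently stops overriding the base class. A minimal standalone illustration (hypothetical types, not the real MFEM Element hierarchy):

struct ElementBase
{
   virtual int GetNFaces() const { return 0; }
   virtual ~ElementBase() = default;
};

struct WedgeLike : ElementBase
{
   int GetNFaces() const override { return 5; }   // matches the base: OK
   // int GetNFaces() override { return 5; }      // missing const: would not compile
};

int main()
{
   WedgeLike w;
   const ElementBase &e = w;
   return (e.GetNFaces() == 5) ? 0 : 1;   // dispatches to the derived override
}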
-+ int * GetVertices () override { return indices; } -+ -+ /// Set the vertices according to the given input. -+ void SetVertices(const int *ind) override; - -- virtual int GetNVertices() const { return 4; } -+ int GetNVertices() const override { return 4; } - -- virtual int GetNEdges() const { return (6); } -+ int GetNEdges() const override { return (6); } - -- virtual const int *GetEdgeVertices(int ei) const -+ const int *GetEdgeVertices(int ei) const override - { return geom_t::Edges[ei]; } - - /// @deprecated Use GetNFaces(void) and GetNFaceVertices(int) instead. -- MFEM_DEPRECATED virtual int GetNFaces(int &nFaceVertices) const -+ MFEM_DEPRECATED int GetNFaces(int &nFaceVertices) const override - { nFaceVertices = 3; return 4; } - -- virtual int GetNFaces() const { return 4; } -+ int GetNFaces() const override { return 4; } - -- virtual int GetNFaceVertices(int) const { return 3; } -+ int GetNFaceVertices(int) const override { return 3; } - -- virtual const int *GetFaceVertices(int fi) const -+ const int *GetFaceVertices(int fi) const override - { return geom_t::FaceVert[fi]; } - -- virtual Element *Duplicate(Mesh *m) const; -+ Element *Duplicate(Mesh *m) const override; - -- virtual ~Tetrahedron() { } -+ virtual ~Tetrahedron() = default; - }; - - // Defined in fe.cpp to ensure construction before 'mfem::Geometries'. -diff --git a/mesh/triangle.cpp b/mesh/triangle.cpp -index abd2b4379..80d11f4b6 100644 ---- a/mesh/triangle.cpp -+++ b/mesh/triangle.cpp -@@ -155,10 +155,13 @@ void Triangle::GetPointMatrix(unsigned transform, DenseMatrix &pm) - void Triangle::GetVertices(Array &v) const - { - v.SetSize(3); -- for (int i = 0; i < 3; i++) -- { -- v[i] = indices[i]; -- } -+ std::copy(indices, indices + 3, v.begin()); -+} -+ -+void Triangle::SetVertices(const Array &v) -+{ -+ MFEM_ASSERT(v.Size() == 3, "!"); -+ std::copy(v.begin(), v.end(), indices); - } - - // @cond DOXYGEN_SKIP -diff --git a/mesh/triangle.hpp b/mesh/triangle.hpp -index 49fb4fe99..480c04118 100644 ---- a/mesh/triangle.hpp -+++ b/mesh/triangle.hpp -@@ -39,13 +39,10 @@ public: - Triangle(int ind1, int ind2, int ind3, int attr = 1); - - /// Return element's type. -- virtual Type GetType() const { return Element::TRIANGLE; } -+ Type GetType() const override { return Element::TRIANGLE; } - - /// Return 1 if the element needs refinement in order to get conforming mesh. -- virtual int NeedRefinement(HashTable &v_to_v) const; -- -- /// Set the vertices according to the given input. -- virtual void SetVertices(const int *ind); -+ int NeedRefinement(HashTable &v_to_v) const override; - - /** Reorder the vertices so that the longest edge is from vertex 0 - to vertex 1. If called it should be once from the mesh constructor, -@@ -59,43 +56,51 @@ public: - static void MarkEdge(int *indices, const DSTable &v_to_v, - const Array &length, const Array &length2); - -- virtual void ResetTransform(int tr) { transform = tr; } -- virtual unsigned GetTransform() const { return transform; } -+ void ResetTransform(int tr) override { transform = tr; } -+ unsigned GetTransform() const override { return transform; } - - /// Add 'tr' to the current chain of coarse-fine transformations. -- virtual void PushTransform(int tr) -+ void PushTransform(int tr) override - { transform = (transform << 3) | (tr + 1); } - - /// Calculate point matrix corresponding to a chain of transformations. - static void GetPointMatrix(unsigned transform, DenseMatrix &pm); - -- /// Returns the indices of the element's vertices. 
-- virtual void GetVertices(Array &v) const; -+ /// Get the indices defining the vertices -+ void GetVertices(Array &v) const override; -+ -+ /// Set the indices defining the vertices -+ void SetVertices(const Array &v) override; -+ -+ /// @note The returned array should NOT be deleted by the caller. -+ int * GetVertices () override { return indices; } -+ -+ /// Set the vertices according to the given input. -+ void SetVertices(const int *ind) override; - -- virtual int *GetVertices() { return indices; } - -- virtual int GetNVertices() const { return 3; } -+ int GetNVertices() const override { return 3; } - -- virtual int GetNEdges() const { return (3); } -+ int GetNEdges() const override { return (3); } - -- virtual const int *GetEdgeVertices(int ei) const -+ const int *GetEdgeVertices(int ei) const override - { return geom_t::Edges[ei]; } - - /// @deprecated Use GetNFaces(void) and GetNFaceVertices(int) instead. -- MFEM_DEPRECATED virtual int GetNFaces(int &nFaceVertices) const -+ MFEM_DEPRECATED int GetNFaces(int &nFaceVertices) const override - { nFaceVertices = 0; return 0; } - -- virtual int GetNFaces() const { return 0; } -+ int GetNFaces() const override { return 0; } - -- virtual int GetNFaceVertices(int) const { return 0; } -+ int GetNFaceVertices(int) const override { return 0; } - -- virtual const int *GetFaceVertices(int fi) const -+ const int *GetFaceVertices(int fi) const override - { MFEM_ABORT("not implemented"); return NULL; } - -- virtual Element *Duplicate(Mesh *m) const -+ Element *Duplicate(Mesh *m) const override - { return new Triangle(indices, attribute); } - -- virtual ~Triangle() { } -+ virtual ~Triangle() = default; - }; - - // Defined in fe.cpp to ensure construction before 'mfem::Geometries'. -diff --git a/mesh/wedge.cpp b/mesh/wedge.cpp -index 898da7653..b1aea933d 100644 ---- a/mesh/wedge.cpp -+++ b/mesh/wedge.cpp -@@ -50,10 +50,13 @@ void Wedge::SetVertices(const int *ind) - void Wedge::GetVertices(Array &v) const - { - v.SetSize(6); -- for (int i = 0; i < 6; i++) -- { -- v[i] = indices[i]; -- } -+ std::copy(indices, indices + 6, v.begin()); -+} -+ -+void Wedge::SetVertices(const Array &v) -+{ -+ MFEM_ASSERT(v.Size() == 6, "!"); -+ std::copy(v.begin(), v.end(), indices); - } - - int Wedge::GetNFaces(int &nFaceVertices) const -diff --git a/mesh/wedge.hpp b/mesh/wedge.hpp -index fb8583f8e..2eae6d104 100644 ---- a/mesh/wedge.hpp -+++ b/mesh/wedge.hpp -@@ -37,38 +37,42 @@ public: - int attr = 1); - - /// Return element's type. -- virtual Type GetType() const { return Element::WEDGE; } -+ Type GetType() const override { return Element::WEDGE; } - -- /// Set the vertices according to the given input. -- virtual void SetVertices(const int *ind); -+ /// Get the indices defining the vertices -+ void GetVertices(Array &v) const override; -+ -+ /// Set the indices defining the vertices -+ void SetVertices(const Array &v) override; - -- /// Returns the indices of the element's vertices. -- virtual void GetVertices(Array &v) const; -+ /// @note The returned array should NOT be deleted by the caller. -+ int * GetVertices () override { return indices; } - -- virtual int *GetVertices() { return indices; } -+ /// Set the vertices according to the given input. 
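Each element class also gains an Array-based SetVertices overload, and both Get/SetVertices are now written with std::copy over the fixed-size index storage instead of hand-written loops. The shape of that pattern in isolation (illustrative struct, not the MFEM classes):

#include <algorithm>
#include <cassert>

// Hypothetical stand-in for a six-vertex element.
struct WedgeIndices
{
   int indices[6];
   void SetVertices(const int *ind) { std::copy(ind, ind + 6, indices); }
   void GetVertices(int *v) const { std::copy(indices, indices + 6, v); }
};

int main()
{
   const int in[6] = {3, 1, 4, 1, 5, 9};
   int out[6] = {};
   WedgeIndices w;
   w.SetVertices(in);
   w.GetVertices(out);
   assert(std::equal(in, in + 6, out));
   return 0;
}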
-+ void SetVertices(const int *ind) override; - -- virtual int GetNVertices() const { return 6; } -+ int GetNVertices() const override { return 6; } - -- virtual int GetNEdges() const { return 9; } -+ int GetNEdges() const override { return 9; } - -- virtual const int *GetEdgeVertices(int ei) const -+ const int *GetEdgeVertices(int ei) const override - { return geom_t::Edges[ei]; } - - /// @deprecated Use GetNFaces(void) and GetNFaceVertices(int) instead. -- MFEM_DEPRECATED virtual int GetNFaces(int &nFaceVertices) const; -+ MFEM_DEPRECATED int GetNFaces(int &nFaceVertices) const override; - -- virtual int GetNFaces() const { return 5; } -+ int GetNFaces() const override { return 5; } - -- virtual int GetNFaceVertices(int fi) const -+ int GetNFaceVertices(int fi) const override - { return (fi < 2) ? 3 : 4; } - -- virtual const int *GetFaceVertices(int fi) const -+ const int *GetFaceVertices(int fi) const override - { return geom_t::FaceVert[fi]; } - -- virtual Element *Duplicate(Mesh *m) const -+ Element *Duplicate(Mesh *m) const override - { return new Wedge(indices, attribute); } - -- virtual ~Wedge() { } -+ virtual ~Wedge() = default; - }; - - extern MFEM_EXPORT class LinearWedgeFiniteElement WedgeFE; -diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt -index dc0e9fea8..678abb706 100644 ---- a/tests/unit/CMakeLists.txt -+++ b/tests/unit/CMakeLists.txt -@@ -43,6 +43,7 @@ set(UNIT_TESTS_SRCS - linalg/test_operator.cpp - linalg/test_vector.cpp - mesh/test_face_orientations.cpp -+ mesh/mesh_test_utils.cpp - mesh/test_fms.cpp - mesh/test_mesh.cpp - mesh/test_ncmesh.cpp -diff --git a/tests/unit/mesh/mesh_test_utils.cpp b/tests/unit/mesh/mesh_test_utils.cpp -new file mode 100644 -index 000000000..65fb2e01d ---- /dev/null -+++ b/tests/unit/mesh/mesh_test_utils.cpp -@@ -0,0 +1,207 @@ -+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced -+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files -+// LICENSE and NOTICE for details. LLNL-CODE-806117. -+// -+// This file is part of the MFEM library. For more information and source code -+// availability visit https://mfem.org. -+// -+// MFEM is free software; you can redistribute it and/or modify it under the -+// terms of the BSD-3 license. We welcome feedback and contributions, see file -+// CONTRIBUTING.md for details. 
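The new test utility translation unit starting here defines the Poisson helpers the internal-boundary tests rely on. Besides checking the linear-system residual, they assert that homogeneous essential boundary conditions are enforced exactly at every constrained true dof; that check in isolation looks like the following sketch (plain C++ with illustrative containers, not the MFEM Vector/Array types used below):

#include <cassert>
#include <vector>

// Given the true-dof values of the solution and the list of essential true
// dofs, homogeneous Dirichlet data requires every listed entry to be exactly
// zero.
bool SatisfiesHomogeneousBC(const std::vector<double> &true_dofs,
                            const std::vector<int> &ess_tdof_list)
{
   for (int dof : ess_tdof_list)
   {
      if (true_dofs[dof] != 0.0) { return false; }
   }
   return true;
}

int main()
{
   const std::vector<double> x = {0.0, 0.3, 0.0, -0.7};
   assert(SatisfiesHomogeneousBC(x, {0, 2}));
   assert(!SatisfiesHomogeneousBC(x, {1, 3}));
   return 0;
}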
-+ -+#include "mesh_test_utils.hpp" -+ -+namespace mfem -+{ -+ -+int CheckPoisson(Mesh &mesh, int order, int disabled_boundary_attribute) -+{ -+ constexpr int dim = 3; -+ -+ H1_FECollection fec(order, dim); -+ FiniteElementSpace fes(&mesh, &fec); -+ -+ GridFunction sol(&fes); -+ -+ ConstantCoefficient one(1.0); -+ BilinearForm a(&fes); -+ a.AddDomainIntegrator(new DiffusionIntegrator(one)); -+ a.Assemble(); -+ -+ LinearForm b(&fes); -+ b.AddDomainIntegrator(new DomainLFIntegrator(one)); -+ b.Assemble(); -+ -+ // Add in essential boundary conditions -+ Array ess_tdof_list; -+ REQUIRE(mesh.bdr_attributes.Max() > 0); -+ -+ // Mark all boundaries essential -+ Array bdr_attr_is_ess(mesh.bdr_attributes.Max()); -+ bdr_attr_is_ess = 1; -+ if (disabled_boundary_attribute >= 0) -+ { -+ bdr_attr_is_ess[mesh.bdr_attributes.Find(disabled_boundary_attribute)] = 0; -+ } -+ -+ fes.GetEssentialTrueDofs(bdr_attr_is_ess, ess_tdof_list); -+ REQUIRE(ess_tdof_list.Size() > 0); -+ -+ ConstantCoefficient zero(0.0); -+ sol.ProjectCoefficient(zero); -+ Vector B, X; -+ OperatorPtr A; -+ a.FormLinearSystem(ess_tdof_list, sol, b, A, X, B); -+ -+ // Solve the system -+ CG(*A, B, X, 2, 1000, 1e-20, 0.0); -+ -+ // Recover the solution -+ a.RecoverFEMSolution(X, b, sol); -+ -+ // Check that X solves the system A X = B. -+ A->AddMult(X, B, -1.0); -+ auto residual_norm = B.Norml2(); -+ bool satisfy_system = residual_norm < 1e-10; -+ CAPTURE(residual_norm); -+ CHECK(satisfy_system); -+ -+ bool satisfy_bc = true; -+ Vector tvec; -+ sol.GetTrueDofs(tvec); -+ for (auto dof : ess_tdof_list) -+ { -+ if (tvec[dof] != 0.0) -+ { -+ satisfy_bc = false; -+ break; -+ } -+ } -+ CHECK(satisfy_bc); -+ return ess_tdof_list.Size(); -+}; -+ -+#ifdef MFEM_USE_MPI -+ -+void CheckPoisson(ParMesh &pmesh, int order, -+ int disabled_boundary_attribute) -+{ -+ constexpr int dim = 3; -+ -+ H1_FECollection fec(order, dim); -+ ParFiniteElementSpace pfes(&pmesh, &fec); -+ -+ ParGridFunction sol(&pfes); -+ -+ ConstantCoefficient one(1.0); -+ ParBilinearForm a(&pfes); -+ a.AddDomainIntegrator(new DiffusionIntegrator(one)); -+ a.Assemble(); -+ ParLinearForm b(&pfes); -+ b.AddDomainIntegrator(new DomainLFIntegrator(one)); -+ b.Assemble(); -+ -+ // Add in essential boundary conditions -+ Array ess_tdof_list; -+ REQUIRE(pmesh.bdr_attributes.Max() > 0); -+ -+ Array bdr_attr_is_ess(pmesh.bdr_attributes.Max()); -+ bdr_attr_is_ess = 1; -+ if (disabled_boundary_attribute >= 0) -+ { -+ CAPTURE(disabled_boundary_attribute); -+ bdr_attr_is_ess[pmesh.bdr_attributes.Find(disabled_boundary_attribute)] = 0; -+ } -+ -+ pfes.GetEssentialTrueDofs(bdr_attr_is_ess, ess_tdof_list); -+ int num_ess_dof = ess_tdof_list.Size(); -+ MPI_Allreduce(MPI_IN_PLACE, &num_ess_dof, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); -+ REQUIRE(num_ess_dof > 0); -+ -+ -+ ConstantCoefficient zero(0.0); -+ sol.ProjectCoefficient(zero); -+ Vector B, X; -+ OperatorPtr A; -+ const bool copy_interior = true; // interior(sol) --> interior(X) -+ a.FormLinearSystem(ess_tdof_list, sol, b, A, X, B, copy_interior); -+ -+ // Solve the system -+ CGSolver cg(MPI_COMM_WORLD); -+ // HypreBoomerAMG preconditioner; -+ cg.SetMaxIter(2000); -+ cg.SetRelTol(1e-12); -+ cg.SetPrintLevel(0); -+ cg.SetOperator(*A); -+ // cg.SetPreconditioner(preconditioner); -+ cg.Mult(B, X); -+ // Recover the solution -+ a.RecoverFEMSolution(X, b, sol); -+ -+ // Check that X solves the system A X = B. 
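The check works by forming the residual in place: AddMult with coefficient -1 overwrites B with B - A X, so a tiny l2 norm of B afterwards certifies the solve. The same idiom on a concrete 2x2 system (standalone, purely for illustration):

#include <cassert>
#include <cmath>
#include <vector>

int main()
{
   // A 2x2 symmetric system whose exact solution is x = (1, 2).
   const double A[2][2] = {{4.0, 1.0}, {1.0, 3.0}};
   const std::vector<double> x = {1.0, 2.0};
   std::vector<double> b = {6.0, 7.0};

   // b += (-1) * A*x, i.e. the "A->AddMult(X, B, -1.0)" pattern used here.
   for (int i = 0; i < 2; ++i)
   {
      for (int j = 0; j < 2; ++j) { b[i] -= A[i][j] * x[j]; }
   }

   const double residual_norm = std::sqrt(b[0] * b[0] + b[1] * b[1]);
   assert(residual_norm < 1e-12);
   return 0;
}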
-+ A->AddMult(X, B, -1.0); -+ auto residual_norm = B.Norml2(); -+ bool satisfy_system = residual_norm < 1e-10; -+ CAPTURE(residual_norm); -+ CHECK(satisfy_system); -+ -+ // Initialize the bdr_dof to be checked -+ Vector tvec; -+ sol.GetTrueDofs(tvec); -+ bool satisfy_bc = true; -+ for (auto dof : ess_tdof_list) -+ { -+ if (tvec[dof] != 0.0) -+ { -+ satisfy_bc = false; -+ break; -+ } -+ } -+ CHECK(satisfy_bc); -+}; -+ -+std::unique_ptr CheckParMeshNBE(Mesh &smesh, -+ const std::unique_ptr &partition) -+{ -+ auto pmesh = std::unique_ptr(new ParMesh(MPI_COMM_WORLD, smesh, -+ partition.get())); -+ -+ int nbe = pmesh->GetNBE(); -+ MPI_Allreduce(MPI_IN_PLACE, &nbe, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); -+ -+ CHECK(nbe == smesh.GetNBE()); -+ return pmesh; -+}; -+ -+bool CheckFaceInternal(ParMesh& pmesh, int f, -+ const std::map &local_to_shared) -+{ -+ int e1, e2; -+ pmesh.GetFaceElements(f, &e1, &e2); -+ int inf1, inf2, ncface; -+ pmesh.GetFaceInfos(f, &inf1, &inf2, &ncface); -+ -+ if (e2 < 0 && inf2 >=0) -+ { -+ // Shared face on processor boundary -> Need to discover the neighbor -+ // attributes -+ auto FET = pmesh.GetSharedFaceTransformations(local_to_shared.at(f)); -+ -+ if (FET->Elem1->Attribute != FET->Elem2->Attribute && f < pmesh.GetNumFaces()) -+ { -+ // shared face on domain attribute boundary, which this rank owns -+ return true; -+ } -+ } -+ -+ if (e2 >= 0 && pmesh.GetAttribute(e1) != pmesh.GetAttribute(e2)) -+ { -+ // local face on domain attribute boundary -+ return true; -+ } -+ return false; -+}; -+ -+#endif -+ -+} // namespace mfem -diff --git a/tests/unit/mesh/mesh_test_utils.hpp b/tests/unit/mesh/mesh_test_utils.hpp -new file mode 100644 -index 000000000..e4088a788 ---- /dev/null -+++ b/tests/unit/mesh/mesh_test_utils.hpp -@@ -0,0 +1,78 @@ -+// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced -+// at the Lawrence Livermore National Laboratory. All Rights reserved. See files -+// LICENSE and NOTICE for details. LLNL-CODE-806117. -+// -+// This file is part of the MFEM library. For more information and source code -+// availability visit https://mfem.org. -+// -+// MFEM is free software; you can redistribute it and/or modify it under the -+// terms of the BSD-3 license. We welcome feedback and contributions, see file -+// CONTRIBUTING.md for details. -+ -+#ifndef MFEM_MESH_TEST_UTILS -+#define MFEM_MESH_TEST_UTILS -+ -+ -+#include "mfem.hpp" -+#include "unit_tests.hpp" -+ -+namespace mfem -+{ -+ -+/** -+ * @brief Helper function for performing an H1 Poisson solve on a serial mesh, with -+ * homogeneous essential boundary conditions. Optionally can disable a boundary. -+ * -+ * @param mesh The SERIAL mesh to perform the Poisson solve on -+ * @param order The polynomial order of the basis -+ * @param disabled_boundary_attribute Optional boundary attribute to NOT apply -+ * homogeneous Dirichlet boundary condition on. Default of -1 means no boundary -+ * is disabled. -+ * @return int The number of DOF that are fixed by the essential boundary condition. -+ */ -+int CheckPoisson(Mesh &mesh, int order, int disabled_boundary_attribute = -1); -+ -+#ifdef MFEM_USE_MPI -+ -+/** -+ * @brief Helper function for performing an H1 Poisson solve on a parallel mesh, with -+ * homogeneous essential boundary conditions. Optionally can disable a boundary. 
-+ * -+ * @param mesh The PARALLEL mesh to perform the Poisson solve on -+ * @param order The polynomial order of the basis -+ * @param disabled_boundary_attribute Optional boundary attribute to NOT apply -+ * homogeneous Dirichlet boundary condition on. Default of -1 means no boundary -+ * is disabled. -+ * @return int The number of DOF that are fixed by the essential boundary condition. -+ */ -+void CheckPoisson(ParMesh &pmesh, int order, -+ int disabled_boundary_attribute = -1); -+ -+/** -+ * @brief Check that a Parmesh generates the same number of boundary elements as -+ * the serial mesh. -+ * -+ * @param smesh Serial mesh to be built from and compared against -+ * @param partition Optional partition -+ * @return std::unique_ptr Pointer to the mesh in question. -+ */ -+std::unique_ptr CheckParMeshNBE(Mesh &smesh, -+ const std::unique_ptr &partition = nullptr); -+ -+/** -+ * @brief Helper function to track if a face index is internal -+ * -+ * @param pmesh The mesh containing the face -+ * @param f The face index -+ * @param local_to_shared A map from local faces to shared faces -+ * @return true the face is between domain attributes (and owned by this rank) -+ * @return false the face is not between domain attributes or not owned by this rank -+ */ -+bool CheckFaceInternal(ParMesh& pmesh, int f, -+ const std::map &local_to_shared); -+ -+#endif -+ -+} // namespace mfem -+ -+#endif // MFEM_MESH_TEST_UTILS -\ No newline at end of file -diff --git a/tests/unit/mesh/test_ncmesh.cpp b/tests/unit/mesh/test_ncmesh.cpp -index c1bb54e13..c38ab18b2 100644 ---- a/tests/unit/mesh/test_ncmesh.cpp -+++ b/tests/unit/mesh/test_ncmesh.cpp -@@ -10,6 +10,7 @@ - // CONTRIBUTING.md for details. - - #include "mfem.hpp" -+#include "mesh_test_utils.hpp" - #include "unit_tests.hpp" - - #include -@@ -18,6 +19,34 @@ namespace mfem - - constexpr double EPS = 1e-10; - -+// Helper to count H1 essential dofs for a given order with a given attribute -+template -+int CountEssentialDof(Mesh &mesh, int order, int attribute) -+{ -+ constexpr int dim = 3; -+ FECollection fec(order, dim); -+ FiniteElementSpace fes(&mesh, &fec); -+ -+ Array bdr_attr_is_ess(mesh.bdr_attributes.Max()); -+ bdr_attr_is_ess = 0; -+ bdr_attr_is_ess[mesh.bdr_attributes.Find(attribute)] = 1; -+ -+ if (TDOF) -+ { -+ Array ess_tdof_list; -+ fes.GetEssentialTrueDofs(bdr_attr_is_ess, ess_tdof_list); -+ return ess_tdof_list.Size(); -+ } -+ else -+ { -+ // VDOF -+ Array ess_vdof_marker, vdof_list; -+ fes.GetEssentialVDofs(bdr_attr_is_ess, ess_vdof_marker); -+ fes.MarkerToList(ess_vdof_marker, vdof_list); -+ return vdof_list.Size(); -+ } -+}; -+ - // Test case: Verify that a conforming mesh yields the same norm for the - // assembled diagonal with PA when using the standard (conforming) - // Mesh vs. the corresponding (non-conforming) NCMesh. 
-@@ -114,7 +143,6 @@ TEST_CASE("NCMesh PA diagonal", "[NCMesh]") - - } // test case - -- - TEST_CASE("NCMesh 3D Refined Volume", "[NCMesh]") - { - auto mesh_fname = GENERATE("../../data/ref-tetrahedron.mesh", -@@ -146,7 +174,6 @@ TEST_CASE("NCMesh 3D Refined Volume", "[NCMesh]") - REQUIRE(summed_volume == MFEM_Approx(original_volume)); - } // test case - -- - TEST_CASE("NCMesh 3D Derefined Volume", "[NCMesh]") - { - auto mesh_fname = GENERATE("../../data/ref-tetrahedron.mesh", -@@ -176,6 +203,134 @@ TEST_CASE("NCMesh 3D Derefined Volume", "[NCMesh]") - REQUIRE(derefined_volume == MFEM_Approx(original_volume)); - } // test case - -+// Helper to create a mesh of a tet with four face neighbor tets and internal boundary between -+Mesh StarMesh() -+{ -+ const int nnode = 4 + 4; -+ const int nelem = 5; -+ -+ Mesh mesh(3, nnode, nelem); -+ -+ // central tet -+ mesh.AddVertex(0.0, 0.0, 0.0); -+ mesh.AddVertex(1.0, 0.0, 0.0); -+ mesh.AddVertex(0.0, 1.0, 0.0); -+ mesh.AddVertex(0.0, 0.0, 1.0); -+ -+ mesh.AddVertex( 1.0, 1.0, 1.0); // opposite 0 -+ mesh.AddVertex(-1.0, 0.0, 0.0); // opposite 1 -+ mesh.AddVertex( 0.0, -1.0, 0.0); // opposite 2 -+ mesh.AddVertex( 0.0, 0.0, -1.0); // opposite 3 -+ -+ mesh.AddTet(0, 1, 2, 3, 1); // central -+ mesh.AddTet(4, 1, 2, 3, 2); // opposite 0 -+ mesh.AddTet(0, 5, 2, 3, 3); // opposite 1 -+ mesh.AddTet(0, 1, 6, 3, 4); // opposite 2 -+ mesh.AddTet(0, 1, 2, 7, 5); // opposite 3 -+ -+ mesh.FinalizeTopology(); -+ mesh.Finalize(true, true); -+ -+ // Introduce internal boundary elements -+ const int new_attribute = mesh.bdr_attributes.Max() + 1; -+ Array original_boundary_vertices; -+ for (int f = 0; f < mesh.GetNumFaces(); ++f) -+ { -+ int e1, e2; -+ mesh.GetFaceElements(f, &e1, &e2); -+ if (e1 >= 0 && e2 >= 0 && mesh.GetAttribute(e1) != mesh.GetAttribute(e2)) -+ { -+ // This is the internal face between attributes. -+ auto *new_elem = mesh.GetFace(f)->Duplicate(&mesh); -+ new_elem->SetAttribute(new_attribute); -+ new_elem->GetVertices(original_boundary_vertices); -+ mesh.AddBdrElement(new_elem); -+ } -+ } -+ mesh.SetAttributes(); -+ mesh.FinalizeTopology(); -+ mesh.Finalize(true, true); -+ -+ return mesh; -+} -+ -+Mesh DividingPlaneMesh(bool tet_mesh = true, bool split = true) -+{ -+ auto mesh = Mesh("../../data/ref-cube.mesh"); -+ { -+ Array refs; -+ refs.Append(Refinement(0, Refinement::X)); -+ mesh.GeneralRefinement(refs); -+ } -+ delete mesh.ncmesh; -+ mesh.ncmesh = nullptr; -+ mesh.FinalizeTopology(); -+ mesh.Finalize(true, true); -+ -+ mesh.SetAttribute(0, 1); -+ mesh.SetAttribute(1, split ? 2 : 1); -+ -+ // Introduce internal boundary elements -+ const int new_attribute = mesh.bdr_attributes.Max() + 1; -+ for (int f = 0; f < mesh.GetNumFaces(); ++f) -+ { -+ int e1, e2; -+ mesh.GetFaceElements(f, &e1, &e2); -+ if (e1 >= 0 && e2 >= 0 && mesh.GetAttribute(e1) != mesh.GetAttribute(e2)) -+ { -+ // This is the internal face between attributes. -+ auto *new_elem = mesh.GetFace(f)->Duplicate(&mesh); -+ new_elem->SetAttribute(new_attribute); -+ mesh.AddBdrElement(new_elem); -+ } -+ } -+ if (tet_mesh) -+ { -+ mesh = Mesh::MakeSimplicial(mesh); -+ } -+ mesh.FinalizeTopology(); -+ mesh.Finalize(true, true); -+ return mesh; -+} -+ -+// Define a pair of tet with a shared triangle in the y-z plane. 
-+// Vary the vertex ordering to achieve the 3 possible odd orientations -+Mesh OrientedTriFaceMesh(int orientation, bool add_extbdr = false) -+{ -+ REQUIRE((orientation == 1 || orientation == 3 || orientation == 5)); -+ -+ Mesh mesh(3, 4, 2); -+ mesh.AddVertex(-1.0, 0.0, 0.0); -+ mesh.AddVertex(0.0, 0.0, 0.0); -+ mesh.AddVertex(0.0, 1.0, 0.0); -+ mesh.AddVertex(0.0, 0.0, 1.0); -+ -+ // opposing vertex -+ mesh.AddVertex(1.0, 0.0, 0.0); -+ -+ mesh.AddTet(0, 1, 2, 3, 1); -+ -+ switch (orientation) -+ { -+ case 1: -+ mesh.AddTet(4,2,1,3,2); break; -+ case 3: -+ mesh.AddTet(4,3,2,1,2); break; -+ case 5: -+ mesh.AddTet(4,1,3,2,2); break; -+ } -+ -+ mesh.FinalizeTopology(add_extbdr); -+ mesh.SetAttributes(); -+ -+ auto *bdr = new Triangle(1,2,3, -+ mesh.bdr_attributes.Size() == 0 ? 1 : mesh.bdr_attributes.Max() + 1); -+ mesh.AddBdrElement(bdr); -+ -+ mesh.FinalizeTopology(false); -+ mesh.Finalize(); -+ return mesh; -+}; - - #ifdef MFEM_USE_MPI - -@@ -297,7 +452,6 @@ TEST_CASE("pNCMesh PA diagonal", "[Parallel], [NCMesh]") - } - } // test case - -- - // Given a parallel and a serial mesh, perform an L2 projection and check the - // solutions match exactly. - std::array CheckL2Projection(ParMesh& pmesh, Mesh& smesh, int order, -@@ -618,7 +772,6 @@ TEST_CASE("EdgeFaceConstraint", "[Parallel], [NCMesh]") - } - } - ParMesh pmesh(MPI_COMM_WORLD, smesh, partition.get()); -- - { - constexpr int dim = 3; - constexpr int order = 1; -@@ -630,7 +783,6 @@ TEST_CASE("EdgeFaceConstraint", "[Parallel], [NCMesh]") - const auto parallel_ntdof = pfes.GlobalTrueVSize(); - CHECK(serial_ntdof == parallel_ntdof); - } -- - for (int order = 1; order <= 4; order++) - { - CAPTURE(order); -@@ -1026,6 +1178,2607 @@ TEST_CASE("GetVectorValueInFaceNeighborElement", "[Parallel], [NCMesh]") - } - } - --#endif // MFEM_USE_MPI -+TEST_CASE("TetCornerRefines", "[Parallel], [NCMesh]") -+{ -+ auto p = GENERATE(1,2,3); -+ CAPTURE(p); -+ -+ auto smesh = Mesh("../../data/ref-tetrahedron.mesh"); -+ -+ REQUIRE(smesh.GetNBE() == 4); -+ -+ Array refs; -+ refs.Append(Refinement(0, Refinement::X)); -+ smesh.GeneralRefinement(refs); -+ -+ // Now have a pair of elements, make the second element a different -+ // attribute. -+ smesh.SetAttribute(0, 1); -+ smesh.SetAttribute(1, 2); -+ -+ REQUIRE(smesh.GetNBE() == 2 * 3); -+ -+ smesh.FinalizeTopology(); -+ smesh.Finalize(); -+ -+ // Introduce an internal boundary element -+ const int new_attribute = smesh.bdr_attributes.Max() + 1; -+ for (int f = 0; f < smesh.GetNumFaces(); ++f) -+ { -+ int e1, e2; -+ smesh.GetFaceElements(f, &e1, &e2); -+ if (e1 >= 0 && e2 >= 0 && smesh.GetAttribute(e1) != smesh.GetAttribute(e2)) -+ { -+ // This is the internal face between attributes. 
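-+         // Duplicate the face as a boundary element carrying a new attribute, so the
-+         // internal interface can be marked (or excluded) when applying essential BCs.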
-+ auto *new_elem = smesh.GetFace(f)->Duplicate(&smesh); -+ new_elem->SetAttribute(new_attribute); -+ smesh.AddBdrElement(new_elem); -+ break; -+ } -+ } -+ smesh.FinalizeTopology(); // Finalize to build relevant tables -+ smesh.Finalize(); -+ -+ // Exactly one boundary element must be added -+ REQUIRE(smesh.GetNBE() == 2 * 3 + 1); -+ -+ smesh.EnsureNCMesh(true); -+ -+ auto pmesh = CheckParMeshNBE(smesh); -+ -+ pmesh->FinalizeTopology(); -+ pmesh->Finalize(); -+ pmesh->ExchangeFaceNbrData(); -+ -+ REQUIRE(pmesh->Nonconforming()); -+ -+ std::map local_to_shared; -+ for (int i = 0; i < pmesh->GetNSharedFaces(); ++i) -+ { -+ local_to_shared[pmesh->GetSharedFace(i)] = i; -+ } -+ -+ // Count the number of internal faces via the boundary elements -+ int num_internal = 0; -+ for (int n = 0; n < pmesh->GetNBE(); ++n) -+ { -+ int f, o; -+ pmesh->GetBdrElementFace(n, &f, &o); -+ if (CheckFaceInternal(*pmesh, f, local_to_shared)) -+ { -+ ++num_internal; -+ } -+ } -+ -+ MPI_Allreduce(MPI_IN_PLACE, &num_internal, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); -+ CHECK(num_internal == 1); -+ -+ CheckPoisson(*pmesh, p, pmesh->bdr_attributes.Max()); -+ CheckPoisson(*pmesh, p); -+} -+ -+// Count the number of essential dofs on a ParMesh. -+template -+int CountEssentialDof(ParMesh &mesh, int order, int attribute) -+{ -+ constexpr int dim = 3; -+ FECollection fec(order, dim); -+ ParFiniteElementSpace pfes(&mesh, &fec); -+ -+ Array bdr_attr_is_ess(mesh.bdr_attributes.Max()); -+ bdr_attr_is_ess = 0; -+ bdr_attr_is_ess[mesh.bdr_attributes.Find(attribute)] = 1; -+ -+ Array ess_tdof_list; -+ pfes.GetEssentialTrueDofs(bdr_attr_is_ess, ess_tdof_list); -+ if (TDOF) -+ { -+ pfes.GetEssentialTrueDofs(bdr_attr_is_ess, ess_tdof_list); -+ return ess_tdof_list.Size(); -+ } -+ else -+ { -+ // VDOF -+ Array ess_vdof_marker, vdof_list; -+ pfes.GetEssentialVDofs(bdr_attr_is_ess, ess_vdof_marker); -+ pfes.MarkerToList(ess_vdof_marker, vdof_list); -+ return vdof_list.Size(); -+ } -+}; -+ -+template -+int ParCountEssentialDof(ParMesh &mesh, int order, int attribute) -+{ -+ auto num_essential_dof = CountEssentialDof(mesh, order, -+ attribute); -+ MPI_Allreduce(MPI_IN_PLACE, &num_essential_dof, 1, MPI_INT, MPI_SUM, -+ MPI_COMM_WORLD); -+ return num_essential_dof; -+}; -+ -+TEST_CASE("InteriorBoundaryReferenceTets", "[Parallel], [NCMesh]") -+{ -+ constexpr auto seed = 314159; -+ srand(seed); -+ auto p = 1;//GENERATE(1,2,3); -+ CAPTURE(p); -+ -+ auto smesh = Mesh("../../data/ref-tetrahedron.mesh"); -+ -+ REQUIRE(smesh.GetNBE() == 4); -+ -+ Array refs; -+ refs.Append(Refinement(0, Refinement::X)); -+ smesh.GeneralRefinement(refs); -+ -+ // Now have a pair of elements, make the second element a different -+ // attribute. -+ smesh.SetAttribute(0, 1); -+ smesh.SetAttribute(1, 2); -+ -+ REQUIRE(smesh.GetNBE() == 2 * 3); -+ -+ smesh.FinalizeTopology(); -+ smesh.Finalize(); -+ -+ // Introduce an internal boundary element -+ const int new_attribute = smesh.bdr_attributes.Max() + 1; -+ for (int f = 0; f < smesh.GetNumFaces(); ++f) -+ { -+ int e1, e2; -+ smesh.GetFaceElements(f, &e1, &e2); -+ if (e1 >= 0 && e2 >= 0 && smesh.GetAttribute(e1) != smesh.GetAttribute(e2)) -+ { -+ // This is the internal face between attributes. 
-+ auto *new_elem = smesh.GetFace(f)->Duplicate(&smesh); -+ new_elem->SetAttribute(new_attribute); -+ smesh.AddBdrElement(new_elem); -+ break; -+ } -+ } -+ smesh.FinalizeTopology(); // Finalize to build relevant tables -+ smesh.Finalize(); -+ -+ // Exactly one boundary element must be added -+ REQUIRE(smesh.GetNBE() == 2 * 3 + 1); -+ -+ smesh.EnsureNCMesh(true); -+ -+ auto pmesh = CheckParMeshNBE(smesh); -+ -+ pmesh->FinalizeTopology(); -+ pmesh->Finalize(); -+ pmesh->ExchangeFaceNbrData(); -+ -+ REQUIRE(pmesh->Nonconforming()); -+ -+ std::map local_to_shared; -+ for (int i = 0; i < pmesh->GetNSharedFaces(); ++i) -+ { -+ local_to_shared[pmesh->GetSharedFace(i)] = i; -+ } -+ -+ // Count the number of internal faces via the boundary elements -+ int num_internal = 0; -+ for (int n = 0; n < pmesh->GetNBE(); ++n) -+ { -+ int f, o; -+ pmesh->GetBdrElementFace(n, &f, &o); -+ if (CheckFaceInternal(*pmesh, f, local_to_shared)) -+ { -+ ++num_internal; -+ } -+ } -+ -+ MPI_Allreduce(MPI_IN_PLACE, &num_internal, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); -+ CHECK(num_internal == 1); -+ -+ CheckPoisson(*pmesh, p, pmesh->bdr_attributes.Max()); -+ CheckPoisson(*pmesh, p); -+ -+ int num_initial_ess_tdof = CountEssentialDof(*pmesh, p, -+ smesh.bdr_attributes.Max()); -+ if (Mpi::Root()) -+ { -+ REQUIRE(num_initial_ess_tdof > 0); -+ } -+ // Level of refinement difference across the processor boundary from root zero to the -+ // others -+ auto ref_level = 1;//GENERATE(1,2,3); -+ auto refined_attribute = 2;//GENERATE(1,2); -+ CAPTURE(ref_level); -+ CAPTURE(refined_attribute); -+ -+ Mesh modified_smesh(smesh); -+ for (int r = 0; r < ref_level; r++) -+ { -+ Array el_to_refine; -+ for (int n = 0; n < modified_smesh.GetNE(); n++) -+ { -+ if (modified_smesh.GetAttribute(n) == refined_attribute) -+ { -+ el_to_refine.Append(n); -+ } -+ } -+ modified_smesh.GeneralRefinement(el_to_refine); -+ } -+ -+ // There should now be some internal boundary elements, where there was one -+ // before. -+ CHECK(modified_smesh.GetNBE() == 3 /* external boundaries of unrefined */ -+ + std::pow(4, ref_level) /* internal boundaries */ -+ + (3 * std::pow(4, ref_level)) /* external boundaries of refined */); -+ -+ // Force the partition to have the edge case of a parent and child being -+ // divided across the processor boundary. This necessitates the -+ // GhostBoundaryElement treatment. -+ auto partition = std::unique_ptr(new int[modified_smesh.GetNE()]); -+ for (int i = 0; i < modified_smesh.GetNE(); i++) -+ { -+ // Randomly assign to any processor but zero. -+ partition[i] = Mpi::WorldSize() > 1 ? 1 + rand() % (Mpi::WorldSize() - 1) : 0; -+ } -+ if (Mpi::WorldSize() > 0) -+ { -+ // Make sure rank 0 has the non-refined attribute. This ensures it will have -+ // a parent face with only ghost children. -+ const int unrefined_attribute = refined_attribute == 1 ? 2 : 1; -+ Array root_element; -+ for (int n = 0; n < modified_smesh.GetNE(); n++) -+ { -+ if (modified_smesh.GetAttribute(n) == unrefined_attribute) -+ { -+ root_element.Append(n); -+ } -+ } -+ REQUIRE(root_element.Size() == 1); -+ partition[root_element[0]] = 0; -+ } -+ -+ pmesh = CheckParMeshNBE(modified_smesh, partition); -+ pmesh->Finalize(); -+ pmesh->FinalizeTopology(); -+ pmesh->ExchangeFaceNbrData(); -+ -+ // return; -+ auto check_faces = [&]() -+ { -+ // repopulate the local to shared map. 
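-+      // The shared-face numbering changes whenever the ParMesh is rebuilt or rebalanced,
-+      // so the map must be refreshed before calling CheckFaceInternal.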
-+ local_to_shared.clear(); -+ for (int i = 0; i < pmesh->GetNSharedFaces(); ++i) -+ { -+ local_to_shared[pmesh->GetSharedFace(i)] = i; -+ } -+ -+ // Count the number of internal faces via the boundary elements -+ num_internal = 0; -+ for (int n = 0; n < pmesh->GetNBE(); ++n) -+ { -+ int f, o; -+ pmesh->GetBdrElementFace(n, &f, &o); -+ if (CheckFaceInternal(*pmesh, f, local_to_shared)) -+ { -+ ++num_internal; -+ } -+ } -+ MPI_Allreduce(MPI_IN_PLACE, &num_internal, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); -+ CHECK(num_internal == std::pow(4, ref_level)); -+ CheckPoisson(*pmesh, p, smesh.bdr_attributes.Max()); -+ CheckPoisson(*pmesh, p); -+ }; -+ -+ -+ check_faces(); -+ pmesh->Rebalance(); -+ pmesh->ExchangeFaceNbrData(); -+ check_faces(); -+} -+ -+TEST_CASE("InteriorBoundaryInlineTetRefines", "[Parallel], [NCMesh]") -+{ -+ int p = GENERATE(1,2); -+ CAPTURE(p); -+ -+ auto smesh = Mesh("../../data/inline-tet.mesh"); -+ smesh.FinalizeTopology(); -+ smesh.Finalize(); -+ -+ // Mark even and odd elements with different attributes -+ auto num_attributes = 3; -+ for (int i = 0; i < smesh.GetNE(); ++i) -+ { -+ smesh.SetAttribute(i, (i % num_attributes) + 1); -+ } -+ -+ smesh.SetAttributes(); -+ int initial_nbe = smesh.GetNBE(); -+ -+ // Introduce internal boundary elements -+ const int new_attribute = smesh.bdr_attributes.Max() + 1; -+ for (int f = 0; f < smesh.GetNumFaces(); ++f) -+ { -+ int e1, e2; -+ smesh.GetFaceElements(f, &e1, &e2); -+ if (e1 >= 0 && e2 >= 0 && smesh.GetAttribute(e1) != smesh.GetAttribute(e2)) -+ { -+ // This is the internal face between attributes. -+ auto *new_elem = smesh.GetFace(f)->Duplicate(&smesh); -+ new_elem->SetAttribute(new_attribute); -+ smesh.AddBdrElement(new_elem); -+ } -+ } -+ -+ smesh.FinalizeTopology(); // Finalize to build relevant tables -+ smesh.Finalize(); -+ -+ smesh.EnsureNCMesh(true); -+ -+ // Boundary elements must've been added to make the test valid -+ int num_internal_serial = smesh.GetNBE() - initial_nbe; -+ REQUIRE(num_internal_serial > 0); -+ -+ auto partition = std::unique_ptr(new int[smesh.GetNE()]); -+ -+ for (int i = 0; i < smesh.GetNE(); ++i) -+ { -+ partition[i] = i % Mpi::WorldSize(); // checkerboard partition -+ } -+ -+ auto pmesh = CheckParMeshNBE(smesh, partition); -+ -+ pmesh->FinalizeTopology(); -+ pmesh->Finalize(); -+ pmesh->ExchangeFaceNbrData(); -+ -+ std::map local_to_shared; -+ for (int i = 0; i < pmesh->GetNSharedFaces(); ++i) -+ { -+ local_to_shared[pmesh->GetSharedFace(i)] = i; -+ } -+ -+ // Count the number of internal faces via the boundary elements -+ int num_internal = 0; -+ for (int n = 0; n < pmesh->GetNBE(); ++n) -+ { -+ int f, o; -+ pmesh->GetBdrElementFace(n, &f, &o); -+ if (CheckFaceInternal(*pmesh, f, local_to_shared)) -+ { -+ ++num_internal; -+ } -+ } -+ -+ MPI_Allreduce(MPI_IN_PLACE, &num_internal, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); -+ -+ CHECK(num_internal == num_internal_serial); -+ -+ CheckPoisson(*pmesh, p, new_attribute); -+ CheckPoisson(*pmesh, p); -+ -+ // Mark all elements of a given attribute for refinement to a given depth. 
-+ auto ref_level = GENERATE(1,2); -+ auto marked_attribute = GENERATE(1,2,3); -+ REQUIRE(marked_attribute <= num_attributes); -+ CAPTURE(ref_level); -+ CAPTURE(marked_attribute); -+ for (int r = 0; r < ref_level; r++) -+ { -+ Array elem_to_refine; -+ for (int i = 0; i < smesh.GetNE(); ++i) -+ { -+ if (smesh.GetAttribute(i) == marked_attribute) -+ { -+ elem_to_refine.Append(i); -+ } -+ } -+ smesh.GeneralRefinement(elem_to_refine); -+ } -+ -+ pmesh = CheckParMeshNBE(smesh); -+ pmesh->FinalizeTopology(); -+ pmesh->Finalize(); -+ pmesh->ExchangeFaceNbrData(); -+ -+ // Count the number of internal boundary elements -+ num_internal_serial = 0; -+ for (int n = 0; n < smesh.GetNBE(); ++n) -+ { -+ int f, o; -+ smesh.GetBdrElementFace(n, &f, &o); -+ int e1, e2; -+ smesh.GetFaceElements(f, &e1, &e2); -+ if (e1 >= 0 && e2 >= 0 && smesh.GetAttribute(e1) != smesh.GetAttribute(e2)) -+ { -+ ++num_internal_serial; -+ } -+ } -+ -+ auto check_faces = [&]() -+ { -+ // repopulate the local to shared map. -+ local_to_shared.clear(); -+ for (int i = 0; i < pmesh->GetNSharedFaces(); ++i) -+ { -+ local_to_shared[pmesh->GetSharedFace(i)] = i; -+ } -+ -+ num_internal = 0; -+ for (int n = 0; n < pmesh->GetNBE(); ++n) -+ { -+ int f, o; -+ pmesh->GetBdrElementFace(n, &f, &o); -+ if (CheckFaceInternal(*pmesh, f, local_to_shared)) -+ { -+ ++num_internal; -+ } -+ } -+ MPI_Allreduce(MPI_IN_PLACE, &num_internal, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); -+ -+ CHECK(num_internal == num_internal_serial); -+ -+ CheckPoisson(*pmesh, p, pmesh->bdr_attributes.Max()); -+ CheckPoisson(*pmesh, p); -+ }; -+ -+ check_faces(); -+ pmesh->Rebalance(); -+ pmesh->ExchangeFaceNbrData(); -+ check_faces(); -+} -+ -+TEST_CASE("InteriorBoundaryReferenceCubes", "[Parallel], [NCMesh]") -+{ -+ auto p = GENERATE(1,2,3); -+ CAPTURE(p); -+ -+ auto smesh = Mesh("../../data/ref-cube.mesh"); -+ smesh.EnsureNCMesh(); -+ -+ REQUIRE(smesh.GetNBE() == 6); -+ -+ Array refs; -+ refs.Append(Refinement(0, Refinement::X)); -+ smesh.GeneralRefinement(refs); -+ -+ // Now have a pair of elements, make the second element a different -+ // attribute. -+ smesh.SetAttribute(0, 1); -+ smesh.SetAttribute(1, 2); -+ -+ REQUIRE(smesh.GetNBE() == 2 * 5); -+ -+ // Throw away the NCMesh, will restart NC later. -+ delete smesh.ncmesh; -+ smesh.ncmesh = nullptr; -+ -+ smesh.FinalizeTopology(); -+ smesh.Finalize(); -+ -+ // Introduce an internal boundary element -+ const int new_attribute = smesh.bdr_attributes.Max() + 1; -+ for (int f = 0; f < smesh.GetNumFaces(); ++f) -+ { -+ int e1, e2; -+ smesh.GetFaceElements(f, &e1, &e2); -+ if (e1 >= 0 && e2 >= 0 && smesh.GetAttribute(e1) != smesh.GetAttribute(e2)) -+ { -+ // This is the internal face between attributes. 
-+ auto *new_elem = smesh.GetFace(f)->Duplicate(&smesh); -+ new_elem->SetAttribute(new_attribute); -+ smesh.AddBdrElement(new_elem); -+ break; -+ } -+ } -+ smesh.FinalizeTopology(); // Finalize to build relevant tables -+ smesh.Finalize(); -+ -+ // Exactly one boundary element must be added -+ REQUIRE(smesh.GetNBE() == 2 * 5 + 1); -+ -+ auto pmesh = CheckParMeshNBE(smesh); -+ -+ pmesh->FinalizeTopology(); -+ pmesh->Finalize(); -+ pmesh->ExchangeFaceNbrData(); -+ -+ REQUIRE(pmesh->Conforming()); -+ -+ std::map local_to_shared; -+ for (int i = 0; i < pmesh->GetNSharedFaces(); ++i) -+ { -+ local_to_shared[pmesh->GetSharedFace(i)] = i; -+ } -+ -+ // Count the number of internal faces via the boundary elements -+ int num_internal = 0; -+ for (int n = 0; n < pmesh->GetNBE(); ++n) -+ { -+ int f, o; -+ pmesh->GetBdrElementFace(n, &f, &o); -+ if (CheckFaceInternal(*pmesh, f, local_to_shared)) -+ { -+ ++num_internal; -+ } -+ } -+ -+ MPI_Allreduce(MPI_IN_PLACE, &num_internal, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); -+ CHECK(num_internal == 1); -+ -+ CheckPoisson(*pmesh, p, pmesh->bdr_attributes.Max()); -+ CheckPoisson(*pmesh, p); -+ -+ for (int refined_elem : {0, 1}) -+ { -+ // Now NC refine one of the attached elements, this should result in 4 -+ // internal boundary elements. -+ Array el_to_refine; -+ el_to_refine.Append(refined_elem); -+ -+ Mesh modified_smesh(smesh); -+ modified_smesh.GeneralRefinement(el_to_refine); -+ -+ // There should now be four internal boundary elements, where there was one -+ // before. -+ CHECK(modified_smesh.GetNBE() == 5 /* external boundaries of unrefined */ -+ + 4 /* internal boundaries */ -+ + (5 * 4) /* external boundaries of refined */); -+ -+ // Force the partition to have the edge case of a parent and child being -+ // divided across the processor boundary. This necessitates the -+ // GhostBoundaryElement treatment. -+ auto partition = std::unique_ptr(new int[modified_smesh.GetNE()]); -+ srand(314159); -+ for (int i = 0; i < modified_smesh.GetNE(); ++i) -+ { -+ // Randomly assign to any processor but zero. -+ partition[i] = Mpi::WorldSize() > 1 ? 1 + rand() % (Mpi::WorldSize() - 1) : 0; -+ } -+ if (Mpi::WorldSize() > 0) -+ { -+ // Make sure on rank 1 there is a parent face with only ghost child -+ // faces. This can cause issues with higher order dofs being uncontrolled. -+ partition[refined_elem == 0 ? modified_smesh.GetNE() - 1 : 0] = 0; -+ } -+ -+ pmesh = CheckParMeshNBE(modified_smesh, partition); -+ pmesh->Finalize(); -+ pmesh->FinalizeTopology(); -+ pmesh->ExchangeFaceNbrData(); -+ -+ auto check_faces = [&]() -+ { -+ // repopulate the local to shared map. 
-+ local_to_shared.clear(); -+ for (int i = 0; i < pmesh->GetNSharedFaces(); ++i) -+ { -+ local_to_shared[pmesh->GetSharedFace(i)] = i; -+ } -+ -+ // Count the number of internal faces via the boundary elements -+ num_internal = 0; -+ for (int n = 0; n < pmesh->GetNBE(); ++n) -+ { -+ int f, o; -+ pmesh->GetBdrElementFace(n, &f, &o); -+ if (CheckFaceInternal(*pmesh, f, local_to_shared)) -+ { -+ ++num_internal; -+ } -+ } -+ MPI_Allreduce(MPI_IN_PLACE, &num_internal, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); -+ CHECK(num_internal == 4); -+ -+ CAPTURE(refined_elem); -+ CheckPoisson(*pmesh, p, smesh.bdr_attributes.Max()); -+ CheckPoisson(*pmesh, p); -+ }; -+ -+ check_faces(); -+ pmesh->Rebalance(); -+ pmesh->ExchangeFaceNbrData(); -+ check_faces(); -+ } -+} -+ -+TEST_CASE("InteriorBoundaryInlineHexRefines", "[Parallel], [NCMesh]") -+{ -+ int p = GENERATE(1,2,3); -+ CAPTURE(p); -+ -+ auto smesh = Mesh("../../data/inline-hex.mesh"); -+ smesh.FinalizeTopology(); -+ smesh.Finalize(); -+ -+ // Mark even and odd elements with different attributes -+ for (int i = 0; i < smesh.GetNE(); ++i) -+ { -+ smesh.SetAttribute(i, (i % 2) + 1); -+ } -+ -+ smesh.SetAttributes(); -+ int initial_nbe = smesh.GetNBE(); -+ -+ // Introduce internal boundary elements -+ const int new_attribute = smesh.bdr_attributes.Max() + 1; -+ for (int f = 0; f < smesh.GetNumFaces(); ++f) -+ { -+ int e1, e2; -+ smesh.GetFaceElements(f, &e1, &e2); -+ if (e1 >= 0 && e2 >= 0 && smesh.GetAttribute(e1) != smesh.GetAttribute(e2)) -+ { -+ // This is the internal face between attributes. -+ auto *new_elem = smesh.GetFace(f)->Duplicate(&smesh); -+ new_elem->SetAttribute(new_attribute); -+ smesh.AddBdrElement(new_elem); -+ } -+ } -+ -+ smesh.FinalizeTopology(); // Finalize to build relevant tables -+ smesh.Finalize(); -+ -+ // Boundary elements must've been added to make the test valid -+ int num_internal_serial = smesh.GetNBE() - initial_nbe; -+ REQUIRE(num_internal_serial > 0); -+ -+ auto partition = std::unique_ptr(new int[smesh.GetNE()]); -+ -+ for (int i = 0; i < smesh.GetNE(); ++i) -+ { -+ partition[i] = i % Mpi::WorldSize(); // checkerboard partition -+ } -+ -+ auto pmesh = CheckParMeshNBE(smesh, partition); -+ -+ pmesh->FinalizeTopology(); -+ pmesh->Finalize(); -+ pmesh->ExchangeFaceNbrData(); -+ -+ std::map local_to_shared; -+ for (int i = 0; i < pmesh->GetNSharedFaces(); ++i) -+ { -+ local_to_shared[pmesh->GetSharedFace(i)] = i; -+ } -+ -+ // Count the number of internal faces via the boundary elements -+ int num_internal = 0; -+ for (int n = 0; n < pmesh->GetNBE(); ++n) -+ { -+ int f, o; -+ pmesh->GetBdrElementFace(n, &f, &o); -+ if (CheckFaceInternal(*pmesh, f, local_to_shared)) -+ { -+ ++num_internal; -+ } -+ } -+ -+ MPI_Allreduce(MPI_IN_PLACE, &num_internal, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); -+ -+ CHECK(num_internal == num_internal_serial); -+ -+ CheckPoisson(*pmesh, p, new_attribute); -+ CheckPoisson(*pmesh, p); -+ -+ // Mark every third element for refinement -+ Array elem_to_refine; -+ const int factor = 3; -+ for (int i = 0; i < smesh.GetNE()/factor; ++i) -+ { -+ elem_to_refine.Append(factor * i); -+ } -+ smesh.GeneralRefinement(elem_to_refine); -+ -+ pmesh = CheckParMeshNBE(smesh); -+ pmesh->FinalizeTopology(); -+ pmesh->Finalize(); -+ pmesh->ExchangeFaceNbrData(); -+ -+ // repopulate the local to shared map. 
-+ local_to_shared.clear(); -+ for (int i = 0; i < pmesh->GetNSharedFaces(); ++i) -+ { -+ local_to_shared[pmesh->GetSharedFace(i)] = i; -+ } -+ -+ // Count the number of internal boundary elements -+ num_internal_serial = 0; -+ for (int n = 0; n < smesh.GetNBE(); ++n) -+ { -+ int f, o; -+ smesh.GetBdrElementFace(n, &f, &o); -+ int e1, e2; -+ smesh.GetFaceElements(f, &e1, &e2); -+ if (e1 >= 0 && e2 >= 0 && smesh.GetAttribute(e1) != smesh.GetAttribute(e2)) -+ { -+ ++num_internal_serial; -+ } -+ } -+ -+ num_internal = 0; -+ for (int n = 0; n < pmesh->GetNBE(); ++n) -+ { -+ int f, o; -+ pmesh->GetBdrElementFace(n, &f, &o); -+ if (CheckFaceInternal(*pmesh, f, local_to_shared)) -+ { -+ ++num_internal; -+ } -+ } -+ MPI_Allreduce(MPI_IN_PLACE, &num_internal, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); -+ -+ CHECK(num_internal == num_internal_serial); -+ -+ CheckPoisson(*pmesh, p, pmesh->bdr_attributes.Max()); -+ CheckPoisson(*pmesh, p); -+} -+ -+TEST_CASE("ParMeshInternalBoundaryStarMesh", "[Parallel], [NCMesh]") -+{ -+ auto smesh = StarMesh(); -+ smesh.EnsureNCMesh(true); -+ -+ if (Mpi::WorldSize() < 5) { return;} -+ -+ auto partition = std::unique_ptr(new int[5]); -+ for (int i = 0; i < 5; i++) -+ { -+ partition[i] = i; -+ } -+ auto pmesh = CheckParMeshNBE(smesh, partition); -+ pmesh->FinalizeTopology(); -+ pmesh->Finalize(); -+ pmesh->ExchangeFaceNbrData(); -+ -+ REQUIRE(pmesh->Nonconforming()); -+ -+ std::map local_to_shared; -+ for (int i = 0; i < pmesh->GetNSharedFaces(); ++i) -+ { -+ local_to_shared[pmesh->GetSharedFace(i)] = i; -+ } -+ -+ // Count the number of internal faces via the boundary elements -+ int num_internal = 0; -+ for (int n = 0; n < pmesh->GetNBE(); ++n) -+ { -+ int f, o; -+ pmesh->GetBdrElementFace(n, &f, &o); -+ if (CheckFaceInternal(*pmesh, f, local_to_shared)) -+ { -+ ++num_internal; -+ } -+ } -+ -+ const int rank = Mpi::WorldRank(); -+ SECTION("Unrefined") -+ { -+ MPI_Allreduce(MPI_IN_PLACE, &num_internal, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); -+ CHECK(num_internal == 4); -+ -+ CHECK(CountEssentialDof(*pmesh, 1, -+ smesh.bdr_attributes.Max()) == (rank == 0 ? 4 : 0)); -+ CHECK(CountEssentialDof(*pmesh, 2, -+ smesh.bdr_attributes.Max()) == (rank == 0 ? 4 + 6 : 0)); -+ CHECK(CountEssentialDof(*pmesh, 3, -+ smesh.bdr_attributes.Max()) == (rank == 0 ? 4 + 6*2 + 4*1 : 0)); -+ CHECK(CountEssentialDof(*pmesh, 4, -+ smesh.bdr_attributes.Max()) == (rank == 0 ? 4 + 6*3 + 4*3 : 0)); -+ -+ CHECK(CountEssentialDof(*pmesh, 1, -+ smesh.bdr_attributes.Max()) == (rank == 0 ? 6 : 0)); -+ CHECK(CountEssentialDof(*pmesh, 2, -+ smesh.bdr_attributes.Max()) == (rank == 0 ? 20 : 0)); -+ CHECK(CountEssentialDof(*pmesh, 3, -+ smesh.bdr_attributes.Max()) == (rank == 0 ? 42 : 0)); -+ CHECK(CountEssentialDof(*pmesh, 4, -+ smesh.bdr_attributes.Max()) == (rank == 0 ? 72 : 0)); -+ CHECK(pmesh->GetNBE() == (rank == 0 ? 4 : (rank < 5 ? 3 : 0))); -+ } -+ -+ SECTION("Refinement") -+ { -+ // Refining an element attached to the core should not change the number of essential -+ // DOF, or the owner of them. 
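-+      // The refinement is confined to one side of the internal boundary, so the coarse
-+      // (master) face keeps control of the constrained dofs and the true-dof count on
-+      // the interface is unchanged.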
-+ -+ const int refined_attribute = GENERATE(1,2,3,4,5); // equal to rank of owner + 1 -+ int ref_level = GENERATE(0, 1, 2, 3); -+ for (int r = 0; r < ref_level; r++) -+ { -+ Array el_to_refine; -+ for (int n = 0; n < pmesh->GetNE(); n++) -+ { -+ if (pmesh->GetAttribute(n) == refined_attribute) -+ { -+ el_to_refine.Append(n); -+ } -+ } -+ pmesh->GeneralRefinement(el_to_refine); -+ } -+ pmesh->ExchangeFaceNbrData(); -+ -+ CAPTURE(rank); -+ CAPTURE(refined_attribute); -+ CAPTURE(ref_level); -+ CHECK(pmesh->GetNE() == (rank == refined_attribute - 1 ? std::pow(8, -+ ref_level) : 1)); -+ CHECK(pmesh->GetNBE() == (rank == refined_attribute - 1 -+ ? std::pow(4, ref_level + 1) -+ : (ref_level == 0 && rank == 0 ? 4 : 3))); -+ -+ // Refining on only one side of the boundary face should not change the number of -+ // essential true dofs, which should match the number within the original face. -+ CHECK(ParCountEssentialDof(*pmesh, 1, -+ smesh.bdr_attributes.Max()) == 4); -+ CHECK(ParCountEssentialDof(*pmesh, 2, -+ smesh.bdr_attributes.Max()) == 4 + 6); -+ CHECK(ParCountEssentialDof(*pmesh, 3, -+ smesh.bdr_attributes.Max()) == 4 + 6*2 + 4*1); -+ CHECK(ParCountEssentialDof(*pmesh, 4, -+ smesh.bdr_attributes.Max()) == 4 + 6*3 + 4*3); -+ -+ CHECK(ParCountEssentialDof(*pmesh, 1, -+ smesh.bdr_attributes.Max()) == (rank == 0 ? 6 : 0)); -+ CHECK(ParCountEssentialDof(*pmesh, 2, -+ smesh.bdr_attributes.Max()) == (rank == 0 ? 20 : 0)); -+ CHECK(ParCountEssentialDof(*pmesh, 3, -+ smesh.bdr_attributes.Max()) == (rank == 0 ? 42 : 0)); -+ CHECK(ParCountEssentialDof(*pmesh, 4, -+ smesh.bdr_attributes.Max()) == (rank == 0 ? 72 : 0)); -+ } -+} -+ -+TEST_CASE("ParDividingPlaneMesh", "[Parallel], [NCMesh]") -+{ -+ auto RefineAttribute = [](Mesh& mesh, int attr, int ref_level) -+ { -+ for (int r = 0; r < ref_level; r++) -+ { -+ Array el_to_refine; -+ for (int n = 0; n < mesh.GetNE(); n++) -+ { -+ if (mesh.GetAttribute(n) == attr) -+ { -+ el_to_refine.Append(n); -+ } -+ } -+ mesh.GeneralRefinement(el_to_refine); -+ } -+ }; -+ -+ SECTION("Hex") -+ { -+ auto mesh = DividingPlaneMesh(false); -+ mesh.EnsureNCMesh(true); -+ -+ CHECK(mesh.GetNBE() == 2 * 5 + 1); -+ CHECK(mesh.GetNE() == 2); -+ -+ SECTION("H1Hex") -+ { -+ mesh.UniformRefinement(); -+ CHECK(CountEssentialDof(mesh, 1, -+ mesh.bdr_attributes.Max()) == 3*3); -+ CHECK(CountEssentialDof(mesh, 2, -+ mesh.bdr_attributes.Max()) == 5*5); -+ CHECK(CountEssentialDof(mesh, 3, -+ mesh.bdr_attributes.Max()) == 7*7); -+ CHECK(CountEssentialDof(mesh, 4, -+ mesh.bdr_attributes.Max()) == 9*9); -+ -+ auto attr = GENERATE(1,2); -+ auto ref_level = GENERATE(1,2); -+ RefineAttribute(mesh, attr, ref_level); -+ -+ CHECK(CountEssentialDof(mesh, 1, -+ mesh.bdr_attributes.Max()) == 3*3); -+ CHECK(CountEssentialDof(mesh, 2, -+ mesh.bdr_attributes.Max()) == 5*5); -+ CHECK(CountEssentialDof(mesh, 3, -+ mesh.bdr_attributes.Max()) == 7*7); -+ CHECK(CountEssentialDof(mesh, 4, -+ mesh.bdr_attributes.Max()) == 9*9); -+ } -+ } -+ -+ SECTION("Tet") -+ { -+ auto mesh = DividingPlaneMesh(true, true); -+ mesh.EnsureNCMesh(true); -+ auto pmesh = CheckParMeshNBE(mesh); -+ pmesh->FinalizeTopology(); -+ pmesh->Finalize(); -+ pmesh->ExchangeFaceNbrData(); -+ -+ CHECK(pmesh->bdr_attributes.Max() == mesh.bdr_attributes.Max()); -+ -+ auto attr = GENERATE(1,2); -+ auto ref_level = GENERATE(1,2); -+ CAPTURE(attr); -+ CAPTURE(ref_level); -+ -+ const int initial_num_vert = 4; -+ const int initial_num_edge = 5; -+ const int initial_num_face = 2; -+ SECTION("H1Tet") -+ { -+ CHECK(ParCountEssentialDof(*pmesh, 1, -+ 
mesh.bdr_attributes.Max()) == initial_num_vert); -+ CHECK(ParCountEssentialDof(*pmesh, 2, -+ mesh.bdr_attributes.Max()) == initial_num_vert + initial_num_edge); -+ CHECK(ParCountEssentialDof(*pmesh, 3, -+ mesh.bdr_attributes.Max()) == initial_num_vert + 2*initial_num_edge + -+ initial_num_face); -+ CHECK(ParCountEssentialDof(*pmesh, 4, -+ mesh.bdr_attributes.Max()) == initial_num_vert + 3*initial_num_edge + -+ 3*initial_num_face); -+ -+ RefineAttribute(*pmesh, attr, ref_level); -+ -+ CHECK(ParCountEssentialDof(*pmesh, 1, -+ mesh.bdr_attributes.Max()) == initial_num_vert); -+ CHECK(ParCountEssentialDof(*pmesh, 2, -+ mesh.bdr_attributes.Max()) == initial_num_vert + initial_num_edge); -+ CHECK(ParCountEssentialDof(*pmesh, 3, -+ mesh.bdr_attributes.Max()) == initial_num_vert + 2*initial_num_edge + -+ initial_num_face); -+ CHECK(ParCountEssentialDof(*pmesh, 4, -+ mesh.bdr_attributes.Max()) == initial_num_vert + 3*initial_num_edge + -+ 3*initial_num_face); -+ } -+ -+ SECTION("NDTet") -+ { -+ CHECK(ParCountEssentialDof(*pmesh, 1, -+ mesh.bdr_attributes.Max()) == 5); -+ CHECK(ParCountEssentialDof(*pmesh, 2, -+ mesh.bdr_attributes.Max()) == 14); -+ CHECK(ParCountEssentialDof(*pmesh, 3, -+ mesh.bdr_attributes.Max()) == 27); -+ CHECK(ParCountEssentialDof(*pmesh, 4, -+ mesh.bdr_attributes.Max()) == 44); -+ -+ RefineAttribute(*pmesh, attr, ref_level); -+ CHECK(ParCountEssentialDof(*pmesh, 1, -+ mesh.bdr_attributes.Max()) == 5); -+ CHECK(ParCountEssentialDof(*pmesh, 2, -+ mesh.bdr_attributes.Max()) == 14); -+ CHECK(ParCountEssentialDof(*pmesh, 3, -+ mesh.bdr_attributes.Max()) == 27); -+ CHECK(ParCountEssentialDof(*pmesh, 4, -+ mesh.bdr_attributes.Max()) == 44); -+ } -+ } -+} -+ -+TEST_CASE("ParTetFaceFlips", "[Parallel], [NCMesh]") -+{ -+ /* -+ 1. Define an ND space, and project a smooth non polynomial function onto the space. -+ 2. Compute y-z components in the face, and check that they are equal when evaluated -+ from either side of the face. Tangential continuity of the ND space should ensure -+ they are identical, if orientations are correctly accounted for. -+ 3. Mark the mesh as NC, build a new FESpace, and repeat. There should be no change as -+ the faces are "conformal" though they are within the NC structure. -+ 3. Partition the mesh, create a ParFESpace and repeat the above. There should be no -+ difference in conformal parallel. -+ 4. Construct the ParMesh from the NCMesh and repeat. As above, there should be no -+ change. -+ 5. Perform NC refinement on one side of the internal face, the number of conformal dof -+ in the face will not change, so there should also be no difference. This will be -+ complicated by ensuring the slave evaluations are at the same points. -+ */ -+ -+ auto orientation = GENERATE(1,3,5); -+ auto smesh = OrientedTriFaceMesh(orientation); -+ smesh.EnsureNodes(); -+ -+ CHECK(smesh.GetNBE() == 1); -+ -+ // A smooth function in each vector component -+ constexpr int order = 3, dim = 3, quadrature_order = 4; -+ constexpr double kappa = 2 * M_PI; -+ auto E_exact = [=](const Vector &x, Vector &E) -+ { -+ E(0) = cos(kappa * x(1)); -+ E(1) = cos(kappa * x(2)); -+ E(2) = cos(kappa * x(0)); -+ }; -+ VectorFunctionCoefficient E_coeff(dim, E_exact); -+ -+ // Helper for evaluating the ND grid function on either side of the first conformal shared face. -+ // Specific to the pair of tet mesh described above, but can be generalized. 
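-+   // The shared triangle sits in the y-z plane with its normal along x, so only the
-+   // tangential (y and z) components are compared; a mishandled odd face orientation
-+   // would flip or permute them and break the equality.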
-+ auto CheckParallelNDConformal = [&](ParMesh &mesh) -+ { -+ ND_FECollection fe_collection(order, dim); -+ ParFiniteElementSpace fe_space(&mesh, &fe_collection); -+ ParGridFunction E(&fe_space); -+ -+ E.ProjectCoefficient(E_coeff); -+ E.ExchangeFaceNbrData(); -+ -+ auto *P = fe_space.GetProlongationMatrix(); -+ if (P != nullptr) -+ { -+ // Projection does not respect the non-conformal constraints. -+ // Extract the true (conformal) and prolongate to get the NC respecting projection. -+ auto E_true = E.GetTrueVector(); -+ P->Mult(E_true, E); -+ E.ExchangeFaceNbrData(); -+ } -+ ParGridFunction * const coords = dynamic_cast -+ (mesh.GetNodes()); -+ -+ const auto &ir = IntRules.Get(Geometry::Type::TRIANGLE, quadrature_order); -+ IntegrationRule left_eir(ir.GetNPoints()), -+ right_eir(ir.GetNPoints()); // element integration rules -+ -+ for (int n = 0; n < mesh.GetNBE(); n++) -+ { -+ auto f = mesh.GetBdrElementFaceIndex(n); -+ -+ auto finfo = mesh.GetFaceInformation(f); -+ auto &face_element_transform = finfo.IsShared() -+ ? *mesh.GetSharedFaceTransformationsByLocalIndex(f, true) -+ : *mesh.GetFaceElementTransformations(f); -+ -+ face_element_transform.Loc1.Transform(ir, left_eir); -+ face_element_transform.Loc2.Transform(ir, right_eir); -+ -+ constexpr double tol = 1e-14; -+ REQUIRE(left_eir.GetNPoints() == ir.GetNPoints()); -+ REQUIRE(right_eir.GetNPoints() == ir.GetNPoints()); -+ Vector left_val, right_val; -+ for (int i = 0; i < ir.GetNPoints(); i++) -+ { -+ face_element_transform.Elem1->SetIntPoint(&left_eir[i]); -+ coords->GetVectorValue(*face_element_transform.Elem1, left_eir[i], left_val); -+ coords->GetVectorValue(*face_element_transform.Elem1, left_eir[i], right_val); -+ REQUIRE(std::abs(left_val(0) - right_val(0)) < tol); -+ REQUIRE(std::abs(left_val(1) - right_val(1)) < tol); -+ REQUIRE(std::abs(left_val(2) - right_val(2)) < tol); -+ E.GetVectorValue(*face_element_transform.Elem1, left_eir[i], left_val); -+ -+ face_element_transform.Elem2->SetIntPoint(&right_eir[i]); -+ E.GetVectorValue(*face_element_transform.Elem2, right_eir[i], right_val); -+ -+ // Check that the second and third rows agree. -+ // The y and z should agree as the normal is in the x direction -+ CHECK(std::abs(left_val(1) - right_val(1)) < tol); -+ CHECK(std::abs(left_val(2) - right_val(2)) < tol); -+ } -+ } -+ -+ return fe_space.GlobalTrueVSize(); -+ }; -+ -+ SECTION("Conformal") -+ { -+ auto partition_flag = GENERATE(false, true); -+ CAPTURE(partition_flag); -+ auto partition = std::unique_ptr(new int[2]); -+ if (Mpi::WorldSize() > 1) -+ { -+ partition[0] = partition_flag ? 0 : 1; partition[1] = partition_flag ? 1 : 0; -+ } -+ else -+ { -+ partition[0] = 0; partition[1] = 0; -+ } -+ auto pmesh = CheckParMeshNBE(smesh); -+ pmesh->Finalize(); -+ pmesh->ExchangeFaceNbrData(); -+ -+ CheckParallelNDConformal(*pmesh); -+ } -+ -+ SECTION("ConformalSerialUniformRefined") -+ { -+ smesh.UniformRefinement(); -+ auto pmesh = CheckParMeshNBE(smesh); -+ pmesh->Finalize(); -+ pmesh->ExchangeFaceNbrData(); -+ -+ CheckParallelNDConformal(*pmesh); -+ } -+ -+ SECTION("ConformalParallelUniformRefined") -+ { -+ auto partition_flag = GENERATE(false, true); -+ CAPTURE(partition_flag); -+ auto partition = std::unique_ptr(new int[2]); -+ if (Mpi::WorldSize() > 1) -+ { -+ partition[0] = partition_flag ? 0 : 1; partition[1] = partition_flag ? 
1 : 0; -+ } -+ else -+ { -+ partition[0] = 0; partition[1] = 0; -+ } -+ auto pmesh = CheckParMeshNBE(smesh); -+ pmesh->UniformRefinement(); -+ pmesh->Finalize(); -+ pmesh->ExchangeFaceNbrData(); -+ CheckParallelNDConformal(*pmesh); -+ } -+ -+ SECTION("Nonconformal") -+ { -+ auto partition_flag = GENERATE(false, true); -+ CAPTURE(partition_flag); -+ auto partition = std::unique_ptr(new int[2]); -+ if (Mpi::WorldSize() > 1) -+ { -+ partition[0] = partition_flag ? 0 : 1; partition[1] = partition_flag ? 1 : 0; -+ } -+ else -+ { -+ partition[0] = 0; partition[1] = 0; -+ } -+ smesh.EnsureNCMesh(true); -+ auto pmesh = CheckParMeshNBE(smesh); -+ pmesh->Finalize(); -+ pmesh->ExchangeFaceNbrData(); -+ -+ CheckParallelNDConformal(*pmesh); -+ } -+ -+ SECTION("NonconformalSerialUniformRefined") -+ { -+ smesh.UniformRefinement(); -+ smesh.EnsureNCMesh(true); -+ auto pmesh = CheckParMeshNBE(smesh); -+ pmesh->Finalize(); -+ pmesh->ExchangeFaceNbrData(); -+ -+ CheckParallelNDConformal(*pmesh); -+ } -+ -+ SECTION("NonconformalSerialRefined") -+ { -+ smesh.EnsureNCMesh(true); -+ int ref_level = GENERATE(1, 2); -+ for (int r = 0; r < ref_level; r++) -+ { -+ Array el_to_refine; -+ for (int n = 0; n < smesh.GetNE(); n++) -+ { -+ if (smesh.GetAttribute(n) == 2) -+ { -+ el_to_refine.Append(n); -+ } -+ } -+ smesh.GeneralRefinement(el_to_refine); -+ } -+ auto pmesh = CheckParMeshNBE(smesh); -+ pmesh->Finalize(); -+ pmesh->ExchangeFaceNbrData(); -+ -+ CheckParallelNDConformal(*pmesh); -+ } -+ -+ SECTION("NonconformalParallelUniformRefined") -+ { -+ auto partition_flag = GENERATE(false, true); -+ CAPTURE(partition_flag); -+ auto partition = std::unique_ptr(new int[2]); -+ if (Mpi::WorldSize() > 1) -+ { -+ partition[0] = partition_flag ? 0 : 1; partition[1] = partition_flag ? 1 : 0; -+ } -+ else -+ { -+ partition[0] = 0; partition[1] = 0; -+ } -+ smesh.EnsureNCMesh(true); -+ auto pmesh = CheckParMeshNBE(smesh); -+ pmesh->UniformRefinement(); -+ pmesh->Finalize(); -+ pmesh->ExchangeFaceNbrData(); -+ -+ CheckParallelNDConformal(*pmesh); -+ } -+ -+ SECTION("NonconformalParallelRefined") -+ { -+ auto partition_flag = GENERATE(false, true); -+ CAPTURE(partition_flag); -+ auto partition = std::unique_ptr(new int[2]); -+ if (Mpi::WorldSize() > 1) -+ { -+ partition[0] = partition_flag ? 0 : 1; partition[1] = partition_flag ? 1 : 0; -+ } -+ else -+ { -+ partition[0] = 0; partition[1] = 0; -+ } -+ smesh.EnsureNCMesh(true); -+ auto pmesh = CheckParMeshNBE(smesh); -+ int ref_level = GENERATE(1, 2); -+ for (int r = 0; r < ref_level; r++) -+ { -+ Array el_to_refine; -+ for (int n = 0; n < pmesh->GetNE(); n++) -+ { -+ if (pmesh->GetAttribute(n) == 2) -+ { -+ el_to_refine.Append(n); -+ } -+ } -+ pmesh->GeneralRefinement(el_to_refine); -+ } -+ pmesh->Finalize(); -+ pmesh->ExchangeFaceNbrData(); -+ -+ CheckParallelNDConformal(*pmesh); -+ } -+ -+ SECTION("NonconformalLevelTwoRefined") -+ { -+ smesh.EnsureNCMesh(true); -+ smesh.UniformRefinement(); -+ Array el_to_refine(1); -+ for (int n = 0; n < smesh.GetNE(); n++) -+ { -+ if (smesh.GetAttribute(n) == 2) -+ { -+ CAPTURE(n); -+ Mesh smesh2(smesh); -+ el_to_refine[0] = n; -+ smesh2.GeneralRefinement(el_to_refine); -+ for (int m = 0; m < smesh2.GetNE(); m++) -+ { -+ if (smesh2.GetAttribute(m) == 2) -+ { -+ CAPTURE(m); -+ Mesh smesh3(smesh2); -+ el_to_refine[0] = m; -+ smesh3.GeneralRefinement(el_to_refine); -+ CheckParallelNDConformal(*CheckParMeshNBE(smesh3)); -+ } -+ } -+ } -+ } -+ } -+ -+} -+ -+// Helper to check the identity PR = I on a ParFiniteElementSpace. 
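-+// R is the true-dof restriction matrix and P the conforming prolongation; for any
-+// true-dof vector t the round trip R * (P * t) must return t exactly, i.e. R P = I on
-+// true dofs, even with nonconforming constraints present.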
-+void CheckRPIdentity(const ParFiniteElementSpace& pfespace) -+{ -+ const SparseMatrix *R = pfespace.GetRestrictionMatrix(); -+ HypreParMatrix *P = pfespace.Dof_TrueDof_Matrix(); -+ -+ REQUIRE(R != nullptr); -+ REQUIRE(P != nullptr); -+ -+ HypreParMatrix *hR = new HypreParMatrix( -+ pfespace.GetComm(), pfespace.GlobalTrueVSize(), -+ pfespace.GlobalVSize(), pfespace.GetTrueDofOffsets(), -+ pfespace.GetDofOffsets(), -+ const_cast(R)); // Non owning so cast is ok -+ -+ REQUIRE(hR->Height() == P->Width()); -+ REQUIRE(hR->Width() == P->Height()); -+ -+ REQUIRE(hR != nullptr); -+ HypreParMatrix *I = ParMult(hR, P); -+ -+ // Square matrix so the "diag" is the only bit we need. -+ SparseMatrix diag; -+ I->GetDiag(diag); -+ for (int i = 0; i < diag.Height(); i++) -+ for (int j = 0; j < diag.Width(); j++) -+ { -+ // cast to const to force a zero return rather than an abort. -+ CHECK(const_cast(diag)(i, j) == (i == j ? 1.0 : 0.0)); -+ } -+ -+ delete hR; -+ delete I; -+} -+ -+TEST_CASE("Parallel RP=I", "[Parallel], [NCMesh]") -+{ -+ const int order = GENERATE(1, 2, 3); -+ CAPTURE(order); -+ const int dim = 3; -+ -+ SECTION("Hex") -+ { -+ // Split the hex into a pair, then isotropically refine one of them. -+ Mesh smesh("../../data/ref-cube.mesh"); -+ Array refinements(1); -+ refinements[0].index = 0; -+ refinements[0].ref_type = Refinement::X; -+ smesh.GeneralRefinement(refinements); -+ refinements[0].ref_type = Refinement::XYZ; -+ smesh.GeneralRefinement(refinements); -+ ParMesh mesh(MPI_COMM_WORLD, smesh); -+ SECTION("ND") -+ { -+ ND_FECollection fec(order, dim); -+ ParFiniteElementSpace fespace(&mesh, &fec); -+ CheckRPIdentity(fespace); -+ } -+ SECTION("H1") -+ { -+ H1_FECollection fec(order, dim); -+ ParFiniteElementSpace fespace(&mesh, &fec); -+ CheckRPIdentity(fespace); -+ } -+ } -+ -+ SECTION("Tet") -+ { -+ // Split the hex into a pair, then isotropically refine one of them. -+ Mesh smesh("../../data/ref-tetrahedron.mesh"); -+ Array refinements(1); -+ refinements[0].index = 0; -+ refinements[0].ref_type = Refinement::X; -+ smesh.GeneralRefinement(refinements); -+ bool use_nc = GENERATE(false, true); -+ smesh.EnsureNCMesh(use_nc); -+ refinements[0].ref_type = Refinement::XYZ; -+ smesh.GeneralRefinement(refinements); -+ smesh.EnsureNCMesh(true); // Always checking NC -+ ParMesh mesh(MPI_COMM_WORLD, smesh); -+ SECTION("ND") -+ { -+ ND_FECollection fec(order, dim); -+ ParFiniteElementSpace fespace(&mesh, &fec); -+ CheckRPIdentity(fespace); -+ } -+ SECTION("H1") -+ { -+ H1_FECollection fec(order, dim); -+ ParFiniteElementSpace fespace(&mesh, &fec); -+ CheckRPIdentity(fespace); -+ } -+ } -+} -+ -+#endif // MFEM_USE_MPI -+ -+TEST_CASE("ReferenceCubeInternalBoundaries", "[NCMesh]") -+{ -+ auto p = GENERATE(1,2,3); -+ CAPTURE(p); -+ -+ auto smesh = Mesh("../../data/ref-cube.mesh"); -+ -+ CheckPoisson(smesh, p); -+ -+ smesh.EnsureNCMesh(); -+ Array refs; -+ refs.Append(Refinement(0, Refinement::X)); -+ smesh.GeneralRefinement(refs); -+ -+ // Now have a pair of elements, make the second element a different -+ // attribute. -+ smesh.SetAttribute(1, 2); -+ -+ REQUIRE(smesh.GetNBE() == 2 * 5); -+ -+ delete smesh.ncmesh; -+ smesh.ncmesh = nullptr; -+ -+ // Introduce an internal boundary element -+ for (int f = 0; f < smesh.GetNumFaces(); ++f) -+ { -+ int e1, e2; -+ smesh.GetFaceElements(f, &e1, &e2); -+ if (e1 >= 0 && e2 >= 0 && smesh.GetAttribute(e1) != smesh.GetAttribute(e2)) -+ { -+ // This is the internal face between attributes. 
-+ auto *new_elem = smesh.GetFace(f)->Duplicate(&smesh); -+ new_elem->SetAttribute(7); -+ smesh.AddBdrElement(new_elem); -+ } -+ } -+ -+ smesh.FinalizeTopology(); -+ smesh.Finalize(); -+ -+ // Exactly one boundary element must be added -+ CHECK(smesh.GetNBE() == 2 * 5 + 1); -+ -+ smesh.EnsureNCMesh(); -+ CHECK(smesh.GetNBE() == 2 * 5 + 1); -+ -+ int without_internal, with_internal; -+ with_internal = CheckPoisson(smesh, p); // Include the internal boundary -+ without_internal = CheckPoisson(smesh, p, -+ smesh.bdr_attributes.Max()); // Exclude the internal boundary -+ -+ switch (p) -+ { -+ case 1: -+ CHECK(with_internal == without_internal); break; -+ case 2: -+ CHECK(with_internal == without_internal + 1); break; -+ case 3: -+ CHECK(with_internal == without_internal + 4); break; -+ } -+ -+ auto ref_type = char(GENERATE(//Refinement::Y, Refinement::Z, Refinement::YZ, -+ Refinement::XYZ)); -+ -+ for (auto ref : {0,1}) -+ { -+ refs[0].index = ref; -+ -+ auto ssmesh = Mesh(smesh); -+ -+ CAPTURE(ref_type); -+ -+ // Now NC refine one of the attached elements, this should result in 2 -+ // internal boundary elements. -+ refs[0].ref_type = ref_type; -+ -+ ssmesh.GeneralRefinement(refs); -+ -+ // There should now be four internal boundary elements, where there was one -+ // before. -+ if (ref_type == 2 /* Y */ || ref_type == 4 /* Z */) -+ { -+ CHECK(ssmesh.GetNBE() == 5 /* external boundaries of unrefined element */ -+ + 2 /* internal boundaries */ -+ + (2 * 4) /* external boundaries of refined elements */); -+ } -+ else if (ref_type == 6) -+ { -+ CHECK(ssmesh.GetNBE() == 5 /* external boundaries of unrefined element */ -+ + 4 /* internal boundaries */ -+ + (4 * 3) /* external boundaries of refined elements */); -+ } -+ else if (ref_type == 7) -+ { -+ CHECK(ssmesh.GetNBE() == 5 /* external boundaries of unrefined element */ -+ + 4 /* internal boundaries */ -+ + (4 * 3 + 4 * 2) /* external boundaries of refined elements */); -+ } -+ else -+ { -+ MFEM_ABORT("!"); -+ } -+ -+ // Count the number of internal boundary elements -+ int num_internal = 0; -+ for (int n = 0; n < ssmesh.GetNBE(); ++n) -+ { -+ int f, o; -+ ssmesh.GetBdrElementFace(n, &f, &o); -+ int e1, e2; -+ ssmesh.GetFaceElements(f, &e1, &e2); -+ if (e1 >= 0 && e2 >= 0 && ssmesh.GetAttribute(e1) != ssmesh.GetAttribute(e2)) -+ { -+ ++num_internal; -+ } -+ } -+ CHECK(num_internal == (ref_type <= 4 ? 2 : 4)); -+ -+ ssmesh.FinalizeTopology(); -+ ssmesh.Finalize(); -+ -+ without_internal = CheckPoisson(ssmesh, p, -+ ssmesh.bdr_attributes.Max()); // Exclude the internal boundary -+ with_internal = CheckPoisson(ssmesh, p); // Include the internal boundary -+ -+ // All slaves dofs that are introduced on the face are constrained by -+ // the master dofs, thus the additional constraints on the internal -+ // boundary are purely on the master face, which matches the initial -+ // unrefined case. -+ switch (p) -+ { -+ case 1: -+ CHECK(with_internal == without_internal); break; -+ case 2: -+ CHECK(with_internal == without_internal + 1); break; -+ case 3: -+ CHECK(with_internal == without_internal + 4); break; -+ } -+ } -+} -+ -+TEST_CASE("RefinedCubesInternalBoundaries", "[NCMesh]") -+{ -+ auto p = GENERATE(1,2,3); -+ CAPTURE(p); -+ -+ auto smesh = Mesh("../../data/ref-cube.mesh"); -+ smesh.EnsureNCMesh(); -+ Array refs; -+ refs.Append(Refinement(0, Refinement::X)); -+ smesh.GeneralRefinement(refs); -+ -+ // Now have a pair of elements, make the second element a different -+ // attribute. 
-+ smesh.SetAttribute(1, 2); -+ -+ REQUIRE(smesh.GetNBE() == 2 * 5); -+ -+ delete smesh.ncmesh; -+ smesh.ncmesh = nullptr; -+ -+ smesh.UniformRefinement(); -+ -+ // Introduce four internal boundary elements -+ for (int f = 0; f < smesh.GetNumFaces(); ++f) -+ { -+ int e1, e2; -+ smesh.GetFaceElements(f, &e1, &e2); -+ if (e1 >= 0 && e2 >= 0 && smesh.GetAttribute(e1) != smesh.GetAttribute(e2)) -+ { -+ // This is the internal face between attributes. -+ auto *new_elem = smesh.GetFace(f)->Duplicate(&smesh); -+ new_elem->SetAttribute(7); -+ smesh.AddBdrElement(new_elem); -+ } -+ } -+ -+ smesh.FinalizeTopology(); -+ smesh.Finalize(); -+ -+ // Exactly four boundary elements must be added -+ CHECK(smesh.GetNBE() == 2 * 5 * 4 + 4); -+ -+ smesh.EnsureNCMesh(); -+ CHECK(smesh.GetNBE() == 2 * 5 * 4 + 4); -+ -+ int without_internal = CheckPoisson(smesh, p, -+ 7); // Exclude the internal boundary -+ int with_internal = CheckPoisson(smesh, p); // Include the internal boundary -+ -+ switch (p) -+ { -+ case 1: -+ CHECK(with_internal == without_internal + 1); break; -+ case 2: -+ CHECK(with_internal == without_internal + 3 * 3); break; -+ case 3: -+ CHECK(with_internal == without_internal + 5 * 5); break; -+ } -+ -+ // Mark all elements on one side of the attribute boundary to refine -+ refs.DeleteAll(); -+ for (int n = 0; n < smesh.GetNE(); ++n) -+ { -+ if (smesh.GetAttribute(n) == 2) -+ { -+ refs.Append(Refinement{n, Refinement::XYZ}); -+ } -+ } -+ -+ smesh.GeneralRefinement(refs); -+ -+ smesh.FinalizeTopology(); -+ smesh.Finalize(); -+ -+ // There should now be 16 internal boundary elements, where there were 4 before -+ -+ CHECK(smesh.GetNBE() == 5 * 4 /* external boundaries of unrefined domain */ -+ + 4 * 4 /* internal boundaries */ -+ + 5 * 16 /* external boundaries of refined elements */); -+ -+ -+ // Count the number of internal boundary elements -+ int num_internal = 0; -+ for (int n = 0; n < smesh.GetNBE(); ++n) -+ { -+ int f, o; -+ smesh.GetBdrElementFace(n, &f, &o); -+ int e1, e2; -+ smesh.GetFaceElements(f, &e1, &e2); -+ if (e1 >= 0 && e2 >= 0 && smesh.GetAttribute(e1) != smesh.GetAttribute(e2)) -+ { -+ ++num_internal; -+ } -+ } -+ CHECK(num_internal == 16); -+ -+ -+ without_internal = CheckPoisson(smesh, p, -+ smesh.bdr_attributes.Max()); // Exclude the internal boundary -+ with_internal = CheckPoisson(smesh, p); // Include the internal boundary -+ -+ switch (p) -+ { -+ case 1: -+ CHECK(with_internal == without_internal + 1); break; -+ case 2: -+ CHECK(with_internal == without_internal + 3 * 3); break; -+ case 3: -+ CHECK(with_internal == without_internal + 5 * 5); break; -+ } -+} -+ -+TEST_CASE("ReferenceTetInternalBoundaries", "[NCMesh]") -+{ -+ auto p = GENERATE(1,2,3); -+ CAPTURE(p); -+ -+ auto smesh = Mesh("../../data/ref-tetrahedron.mesh"); -+ Array refs; -+ refs.Append(Refinement(0, Refinement::X)); -+ smesh.GeneralRefinement(refs); -+ -+ // Now have a pair of elements, make the second element a different -+ // attribute. -+ smesh.SetAttribute(1, 2); -+ -+ REQUIRE(smesh.GetNE() == 2); -+ REQUIRE(smesh.GetNBE() == 2 * 3); -+ -+ // Introduce an internal boundary element -+ for (int f = 0; f < smesh.GetNumFaces(); ++f) -+ { -+ int e1, e2; -+ smesh.GetFaceElements(f, &e1, &e2); -+ if (e1 >= 0 && e2 >= 0 && smesh.GetAttribute(e1) != smesh.GetAttribute(e2)) -+ { -+ // This is the internal face between attributes. 
-+ auto *new_elem = smesh.GetFace(f)->Duplicate(&smesh); -+ new_elem->SetAttribute(5); -+ smesh.AddBdrElement(new_elem); -+ } -+ } -+ -+ // Exactly one boundary element must be added -+ CHECK(smesh.GetNBE() == 2 * 3 + 1); -+ -+ smesh.EnsureNCMesh(true); -+ -+ // Still exactly one boundary element must be added -+ CHECK(smesh.GetNBE() == 2 * 3 + 1); -+ -+ smesh.FinalizeTopology(); -+ smesh.Finalize(); -+ -+ auto without_internal = CheckPoisson(smesh, p, -+ 5); // Exclude the internal boundary -+ auto with_internal = CheckPoisson(smesh, p); // Include the internal boundary -+ -+ switch (p) -+ { -+ case 1: -+ CHECK(with_internal == without_internal); break; -+ case 2: -+ CHECK(with_internal == without_internal); break; -+ case 3: -+ CHECK(with_internal == without_internal + 1); break; -+ } -+ -+ // Now NC refine one of the attached elements, this should result in 2 -+ // internal boundary elements. -+ for (int ref : {0, 1}) -+ { -+ refs[0].index = ref; -+ refs[0].ref_type = Refinement::XYZ; -+ auto ssmesh = Mesh(smesh); -+ ssmesh.GeneralRefinement(refs); -+ -+ // There should now be four internal boundary elements, where there was one -+ // before. -+ CHECK(ssmesh.GetNBE() == 3 /* external boundaries of unrefined element */ -+ + 4 /* internal boundaries */ -+ + (3 * 4) /* external boundaries of refined element */); -+ -+ // Count the number of internal boundary elements -+ int num_internal = 0; -+ for (int n = 0; n < ssmesh.GetNBE(); ++n) -+ { -+ int f, o; -+ ssmesh.GetBdrElementFace(n, &f, &o); -+ int e1, e2; -+ ssmesh.GetFaceElements(f, &e1, &e2); -+ if (e1 >= 0 && e2 >= 0 && ssmesh.GetAttribute(e1) != ssmesh.GetAttribute(e2)) -+ { -+ ++num_internal; -+ } -+ } -+ CHECK(num_internal == 4); -+ -+ without_internal = CheckPoisson(ssmesh, p, 5); // Exclude the internal boundary -+ with_internal = CheckPoisson(ssmesh, p); // Include the internal boundary -+ -+ switch (p) -+ { -+ case 1: -+ CHECK(with_internal == without_internal); break; -+ case 2: -+ CHECK(with_internal == without_internal); break; -+ case 3: -+ CHECK(with_internal == without_internal + 1); break; -+ } -+ } -+} -+ -+TEST_CASE("RefinedTetsInternalBoundaries", "[NCMesh]") -+{ -+ auto p = GENERATE(1,2,3); -+ CAPTURE(p); -+ -+ auto smesh = Mesh("../../data/ref-tetrahedron.mesh"); -+ Array refs; -+ refs.Append(Refinement(0, Refinement::X)); -+ smesh.GeneralRefinement(refs); -+ -+ // Now have a pair of elements, make the second element a different -+ // attribute. -+ smesh.SetAttribute(1, 2); -+ -+ REQUIRE(smesh.GetNE() == 2); -+ REQUIRE(smesh.GetNBE() == 2 * 3); -+ -+ smesh.UniformRefinement(); -+ -+ CHECK(smesh.GetNBE() == 2 * 3 * 4); -+ -+ // Introduce internal boundary elements -+ for (int f = 0; f < smesh.GetNumFaces(); ++f) -+ { -+ int e1, e2; -+ smesh.GetFaceElements(f, &e1, &e2); -+ if (e1 >= 0 && e2 >= 0 && smesh.GetAttribute(e1) != smesh.GetAttribute(e2)) -+ { -+ // This is the internal face between attributes. 
-+ auto *new_elem = smesh.GetFace(f)->Duplicate(&smesh); -+ new_elem->SetAttribute(5); -+ smesh.AddBdrElement(new_elem); -+ } -+ } -+ -+ // Exactly four boundary elements must be added -+ CHECK(smesh.GetNBE() == 2 * 3 * 4 + 4); -+ -+ smesh.EnsureNCMesh(true); -+ -+ // Still exactly one boundary element must be added -+ CHECK(smesh.GetNBE() == 2 * 3 * 4 + 4); -+ -+ smesh.FinalizeTopology(); -+ smesh.Finalize(); -+ -+ auto without_internal = CheckPoisson(smesh, p, -+ 5); // Exclude the internal boundary -+ auto with_internal = CheckPoisson(smesh, p); // Include the internal boundary -+ -+ switch (p) -+ { -+ case 1: -+ CHECK(with_internal == without_internal); break; -+ case 2: -+ CHECK(with_internal == without_internal + 3); break; -+ case 3: -+ CHECK(with_internal == without_internal + 10); break; -+ } -+ -+ // Now NC refine all elements with the 2 attribute. -+ -+ // Mark all elements on one side of the attribute boundary to refine -+ refs.DeleteAll(); -+ for (int n = 0; n < smesh.GetNE(); ++n) -+ { -+ if (smesh.GetAttribute(n) == 2) -+ { -+ refs.Append(Refinement{n, Refinement::XYZ}); -+ } -+ } -+ -+ smesh.GeneralRefinement(refs); -+ -+ // There should now be four internal boundary elements, where there was one -+ // before. -+ CHECK(smesh.GetNBE() == 3 * 4 /* external boundaries of unrefined elements */ -+ + 4 * 4 /* internal boundaries */ -+ + (3 * 4 * 4) /* external boundaries of refined elements */); -+ -+ // Count the number of internal boundary elements -+ int num_internal = 0; -+ for (int n = 0; n < smesh.GetNBE(); ++n) -+ { -+ int f, o; -+ smesh.GetBdrElementFace(n, &f, &o); -+ int e1, e2; -+ smesh.GetFaceElements(f, &e1, &e2); -+ if (e1 >= 0 && e2 >= 0 && smesh.GetAttribute(e1) != smesh.GetAttribute(e2)) -+ { -+ ++num_internal; -+ } -+ } -+ CHECK(num_internal == 4 * 4); -+ -+ without_internal = CheckPoisson(smesh, p, 5); // Exclude the internal boundary -+ with_internal = CheckPoisson(smesh, p); // Include the internal boundary -+ -+ switch (p) -+ { -+ case 1: -+ CHECK(with_internal == without_internal); break; -+ case 2: -+ CHECK(with_internal == without_internal + 3); break; -+ case 3: -+ CHECK(with_internal == without_internal + 10); break; -+ } -+} -+ -+TEST_CASE("PoissonOnReferenceCubeNC", "[NCMesh]") -+{ -+ auto smesh = Mesh("../../data/ref-cube.mesh"); -+ smesh.EnsureNCMesh(); -+ Array refs(1); -+ refs[0].index = 0; -+ refs[0].ref_type = Refinement::X; -+ smesh.GeneralRefinement(refs); -+ -+ // Now have two elements. -+ smesh.FinalizeTopology(); -+ smesh.Finalize(); -+ -+ auto p = GENERATE(1, 2, 3); -+ CAPTURE(p); -+ -+ // Check that Poisson can be solved on the domain -+ CheckPoisson(smesh, p); -+ -+ auto ref_type = char(GENERATE(Refinement::X, Refinement::Y, Refinement::Z, -+ Refinement::XY, Refinement::XZ, Refinement::YZ, -+ Refinement::XYZ)); -+ CAPTURE(ref_type); -+ for (auto refined_elem : {0}) // The left or the right element -+ { -+ refs[0].index = refined_elem; -+ auto ssmesh = Mesh(smesh); -+ -+ // Now NC refine one of the attached elements -+ refs[0].ref_type = ref_type; -+ -+ ssmesh.GeneralRefinement(refs); -+ ssmesh.FinalizeTopology(); -+ ssmesh.Finalize(); -+ -+ CAPTURE(refined_elem); -+ CheckPoisson(ssmesh, p); -+ } -+} -+ -+TEST_CASE("PoissonOnReferenceTetNC", "[NCMesh]") -+{ -+ auto smesh = Mesh("../../data/ref-tetrahedron.mesh"); -+ -+ auto p = GENERATE(1, 2, 3); -+ CAPTURE(p); -+ -+ CheckPoisson(smesh, p); -+ -+ Array refs(1); -+ refs[0].index = 0; -+ refs[0].ref_type = Refinement::X; -+ -+ smesh.GeneralRefinement(refs); -+ -+ // Now have two elements. 
-+ smesh.FinalizeTopology(); -+ smesh.Finalize(); -+ -+ // Check that Poisson can be solved on the pair of tets -+ CheckPoisson(smesh, p); -+ -+ auto nc = GENERATE(false, true); -+ CAPTURE(nc); -+ -+ smesh.EnsureNCMesh(GENERATE(false, true)); -+ -+ for (auto refined_elem : {0, 1}) -+ { -+ auto ssmesh = Mesh(smesh); -+ -+ refs[0].index = refined_elem; -+ refs[0].ref_type = Refinement::XYZ; -+ -+ ssmesh.GeneralRefinement(refs); -+ ssmesh.FinalizeTopology(); -+ ssmesh.Finalize(); -+ -+ CAPTURE(refined_elem); -+ CheckPoisson(ssmesh, p); -+ } -+} -+ -+TEST_CASE("TetBoundaryRefinement", "[NCMesh]") -+{ -+ auto smesh = Mesh("../../data/ref-tetrahedron.mesh"); -+ -+ smesh.FinalizeTopology(); -+ smesh.Finalize(true); -+ smesh.UniformRefinement(); -+ -+ smesh.EnsureNCMesh(true); -+ -+ CHECK(smesh.GetNBE() == 4 * 4); -+ -+ // Loop over elements and mark for refinement if any vertices match the -+ // original -+ auto refine_corners = [&]() -+ { -+ Array vertices, elements; -+ // reference vertices of (0,0,0), (1,0,0), (0,1,0), (0,0,1) are [0,3] -+ auto original_vert = [](int i) { return i >= 0 && i <= 3; }; -+ for (int n = 0; n < smesh.GetNE(); ++n) -+ { -+ smesh.GetElementVertices(n, vertices); -+ if (std::any_of(vertices.begin(), vertices.end(), original_vert)) -+ { -+ elements.Append(n); -+ } -+ } -+ -+ smesh.GeneralRefinement(elements); -+ smesh.FinalizeTopology(); -+ smesh.Finalize(); -+ }; -+ -+ constexpr int max_ref_levels = 4; -+ for (int r = 0; r < max_ref_levels; r++) -+ { -+ refine_corners(); -+ CHECK(smesh.GetNBE() == 4 * (4 + 3 * 3 * (r + 1))); -+ } -+} -+ -+TEST_CASE("TetInternalBoundaryRefinement", "[NCMesh]") -+{ -+ auto smesh = Mesh("../../data/ref-tetrahedron.mesh"); -+ -+ REQUIRE(smesh.GetNBE() == 4); -+ -+ { -+ Array refs; -+ refs.Append(Refinement(0, Refinement::X)); -+ smesh.GeneralRefinement(refs); -+ } -+ -+ // Now have a pair of elements, make the second element a different -+ // attribute. -+ smesh.SetAttribute(0, 1); -+ smesh.SetAttribute(1, 2); -+ -+ // Introduce an internal boundary element -+ const int new_attribute = smesh.bdr_attributes.Max() + 1; -+ Array original_boundary_vertices; -+ for (int f = 0; f < smesh.GetNumFaces(); ++f) -+ { -+ int e1, e2; -+ smesh.GetFaceElements(f, &e1, &e2); -+ if (e1 >= 0 && e2 >= 0 && smesh.GetAttribute(e1) != smesh.GetAttribute(e2)) -+ { -+ // This is the internal face between attributes. 
-+ auto *new_elem = smesh.GetFace(f)->Duplicate(&smesh); -+ new_elem->SetAttribute(new_attribute); -+ new_elem->GetVertices(original_boundary_vertices); -+ smesh.AddBdrElement(new_elem); -+ break; -+ } -+ } -+ -+ smesh.FinalizeTopology(); -+ smesh.Finalize(true); -+ smesh.UniformRefinement(); -+ smesh.EnsureNCMesh(true); -+ -+ CHECK(smesh.GetNBE() == (2*3 + 1) * 4); -+ -+ CHECK(CountEssentialDof(smesh, 1, -+ smesh.bdr_attributes.Max()) == 6); -+ CHECK(CountEssentialDof(smesh, 2, -+ smesh.bdr_attributes.Max()) == 6 + 3 * 3); -+ CHECK(CountEssentialDof(smesh, 3, -+ smesh.bdr_attributes.Max()) == 10 + 3 * 6); -+ -+ int refined_attribute = GENERATE(1,2); -+ int ref_level = GENERATE(1,2,3); -+ for (int r = 0; r < ref_level; r++) -+ { -+ Array el_to_refine; -+ for (int n = 0; n < smesh.GetNE(); n++) -+ { -+ if (smesh.GetAttribute(n) == refined_attribute) -+ { -+ el_to_refine.Append(n); -+ } -+ } -+ smesh.GeneralRefinement(el_to_refine); -+ } -+ -+ // Refining on only one side of the boundary face should not change the number of -+ // essential true dofs -+ CHECK(CountEssentialDof(smesh, 1, -+ smesh.bdr_attributes.Max()) == 6); -+ CHECK(CountEssentialDof(smesh, 2, -+ smesh.bdr_attributes.Max()) == 6 + 3 * 3); -+ CHECK(CountEssentialDof(smesh, 3, -+ smesh.bdr_attributes.Max()) == 10 + 3 * 6); -+ -+ // The number of boundary faces should have increased. -+ CHECK(smesh.GetNBE() == 3 * 4 + (3 + 1) * std::pow(4, 1+ref_level)); -+} -+ -+TEST_CASE("TetInternalBoundaryStarMesh", "[NCMesh]") -+{ -+ auto smesh = StarMesh(); -+ smesh.EnsureNCMesh(true); -+ -+ -+ SECTION("Unrefined") -+ { -+ CHECK(smesh.GetNBE() == 4 * 3 + 4); -+ CHECK(CountEssentialDof(smesh, 1, -+ smesh.bdr_attributes.Max()) == 4); -+ CHECK(CountEssentialDof(smesh, 2, -+ smesh.bdr_attributes.Max()) == 4 + 6); -+ CHECK(CountEssentialDof(smesh, 3, -+ smesh.bdr_attributes.Max()) == 4 + 6*2 + 4*1); -+ CHECK(CountEssentialDof(smesh, 4, -+ smesh.bdr_attributes.Max()) == 4 + 6*3 + 4*3); -+ -+ CHECK(CountEssentialDof(smesh, 1, -+ smesh.bdr_attributes.Max()) == 6); -+ CHECK(CountEssentialDof(smesh, 2, -+ smesh.bdr_attributes.Max()) == 20); -+ CHECK(CountEssentialDof(smesh, 3, -+ smesh.bdr_attributes.Max()) == 42); -+ CHECK(CountEssentialDof(smesh, 4, -+ smesh.bdr_attributes.Max()) == 72); -+ } -+ -+ SECTION("Refined") -+ { -+ int refined_attribute = GENERATE(1,2,3,4,5); -+ int ref_level = GENERATE(1,2,3); -+ for (int r = 0; r < ref_level; r++) -+ { -+ Array el_to_refine; -+ for (int n = 0; n < smesh.GetNE(); n++) -+ { -+ if (smesh.GetAttribute(n) == refined_attribute) -+ { -+ el_to_refine.Append(n); -+ } -+ } -+ smesh.GeneralRefinement(el_to_refine); -+ } -+ -+ // Refining on only one side of the boundary face should not change the number of -+ // essential true dofs -+ CHECK(CountEssentialDof(smesh, 1, -+ smesh.bdr_attributes.Max()) == 4); -+ CHECK(CountEssentialDof(smesh, 2, -+ smesh.bdr_attributes.Max()) == 4 + 6); -+ CHECK(CountEssentialDof(smesh, 3, -+ smesh.bdr_attributes.Max()) == 4 + 6*2 + 4*1); -+ CHECK(CountEssentialDof(smesh, 4, -+ smesh.bdr_attributes.Max()) == 4 + 6*3 + 4*3); -+ -+ CHECK(CountEssentialDof(smesh, 1, -+ smesh.bdr_attributes.Max()) == 6); -+ CHECK(CountEssentialDof(smesh, 2, -+ smesh.bdr_attributes.Max()) == 6 * 2 + 4 * 2); // 2 per edge, 2 per face -+ CHECK(CountEssentialDof(smesh, 3, -+ smesh.bdr_attributes.Max()) == 42); -+ CHECK(CountEssentialDof(smesh, 4, -+ smesh.bdr_attributes.Max()) == 72); -+ -+ // The number of boundary faces should have increased. 
-+ CHECK(smesh.GetNBE() == 3 * 4 + 4 * std::pow(4,ref_level)); -+ } -+} -+ -+TEST_CASE("DividingPlaneMesh", "[NCMesh]") -+{ -+ auto RefineAttribute = [](Mesh& mesh, int attr, int ref_level) -+ { -+ for (int r = 0; r < ref_level; r++) -+ { -+ Array el_to_refine; -+ for (int n = 0; n < mesh.GetNE(); n++) -+ { -+ if (mesh.GetAttribute(n) == attr) -+ { -+ el_to_refine.Append(n); -+ } -+ } -+ mesh.GeneralRefinement(el_to_refine); -+ } -+ }; -+ -+ SECTION("Hex") -+ { -+ auto mesh = DividingPlaneMesh(false); -+ mesh.EnsureNCMesh(true); -+ -+ CHECK(mesh.GetNBE() == 2 * 5 + 1); -+ CHECK(mesh.GetNE() == 2); -+ -+ auto attr = GENERATE(1,2); -+ auto ref_level = GENERATE(1,2); -+ -+ const int num_vert = ref_level == 1 ? 5*5 : 9*9; -+ const int num_edge = ref_level == 1 ? 2*4*5 : 2*8*9; -+ const int num_face = ref_level == 1 ? 4*4 : 8*8; -+ -+ SECTION("H1Hex") -+ { -+ mesh.UniformRefinement(); -+ CHECK(CountEssentialDof(mesh, 1, -+ mesh.bdr_attributes.Max()) == 3*3); -+ CHECK(CountEssentialDof(mesh, 2, -+ mesh.bdr_attributes.Max()) == 5*5); -+ CHECK(CountEssentialDof(mesh, 3, -+ mesh.bdr_attributes.Max()) == 7*7); -+ CHECK(CountEssentialDof(mesh, 1, -+ mesh.bdr_attributes.Max()) == 3*3); -+ CHECK(CountEssentialDof(mesh, 2, -+ mesh.bdr_attributes.Max()) == 5*5); -+ CHECK(CountEssentialDof(mesh, 3, -+ mesh.bdr_attributes.Max()) == 7*7); -+ -+ RefineAttribute(mesh, attr, ref_level); -+ -+ CHECK(CountEssentialDof(mesh, 1, -+ mesh.bdr_attributes.Max()) == 3*3); -+ CHECK(CountEssentialDof(mesh, 2, -+ mesh.bdr_attributes.Max()) == 5*5); -+ CHECK(CountEssentialDof(mesh, 3, -+ mesh.bdr_attributes.Max()) == 7*7); -+ -+ // Add the slave face dofs, then subtract off the vertex dofs which are double -+ // counted due to being shared. -+ CHECK(CountEssentialDof(mesh, 1, -+ mesh.bdr_attributes.Max()) == 3*3 + num_vert - 3*3); -+ CHECK(CountEssentialDof(mesh, 2, -+ mesh.bdr_attributes.Max()) == 5*5 + num_vert + num_edge + num_face - 3*3); -+ CHECK(CountEssentialDof(mesh, 3, -+ mesh.bdr_attributes.Max()) == 7*7 + num_vert + 2*num_edge + 4*num_face - 3*3); -+ -+ } -+ -+ SECTION("NDHex") -+ { -+ CHECK(CountEssentialDof(mesh, 1, -+ mesh.bdr_attributes.Max()) == 4); -+ CHECK(CountEssentialDof(mesh, 2, -+ mesh.bdr_attributes.Max()) == 4*2 + 2*2); -+ CHECK(CountEssentialDof(mesh, 3, -+ mesh.bdr_attributes.Max()) == 4*3 + 2*2*3); -+ CHECK(CountEssentialDof(mesh, 1, -+ mesh.bdr_attributes.Max()) == 4); -+ CHECK(CountEssentialDof(mesh, 2, -+ mesh.bdr_attributes.Max()) == 4*2 + 2*2); -+ CHECK(CountEssentialDof(mesh, 3, -+ mesh.bdr_attributes.Max()) == 4*3 + 2*2*3); -+ -+ mesh.UniformRefinement(); -+ const int initial_num_edge = 12; -+ const int initial_num_face = 4; -+ CHECK(CountEssentialDof(mesh, 1, -+ mesh.bdr_attributes.Max()) == initial_num_edge); -+ CHECK(CountEssentialDof(mesh, 2, -+ mesh.bdr_attributes.Max()) == initial_num_edge*2 + initial_num_face*2*2); -+ CHECK(CountEssentialDof(mesh, 3, -+ mesh.bdr_attributes.Max()) == initial_num_edge*3 + initial_num_face*2*2*3); -+ CHECK(CountEssentialDof(mesh, 1, -+ mesh.bdr_attributes.Max()) == initial_num_edge); -+ CHECK(CountEssentialDof(mesh, 2, -+ mesh.bdr_attributes.Max()) == initial_num_edge*2 + initial_num_face*2*2); -+ CHECK(CountEssentialDof(mesh, 3, -+ mesh.bdr_attributes.Max()) == initial_num_edge*3 + initial_num_face*2*2*3); -+ -+ RefineAttribute(mesh, attr, ref_level); -+ CHECK(CountEssentialDof(mesh, 1, -+ mesh.bdr_attributes.Max()) == initial_num_edge); -+ CHECK(CountEssentialDof(mesh, 2, -+ mesh.bdr_attributes.Max()) == initial_num_edge*2 + initial_num_face*2*2); 
-+ CHECK(CountEssentialDof(mesh, 3, -+ mesh.bdr_attributes.Max()) == initial_num_edge*3 + initial_num_face*2*2*3); -+ CHECK(CountEssentialDof(mesh, 1, -+ mesh.bdr_attributes.Max()) == (num_edge+initial_num_edge)); -+ CHECK(CountEssentialDof(mesh, 2, -+ mesh.bdr_attributes.Max()) == (num_edge+initial_num_edge)*2 + -+ (num_face+initial_num_face)*2*2); -+ CHECK(CountEssentialDof(mesh, 3, -+ mesh.bdr_attributes.Max()) == (num_edge+initial_num_edge)*3 + -+ (num_face+initial_num_face)*2*2*3); -+ } -+ } -+ -+ SECTION("Tet") -+ { -+ auto mesh = DividingPlaneMesh(true); -+ mesh.EnsureNCMesh(true); -+ -+ CHECK(mesh.GetNBE() == 2 * 5 * 2 + 2); -+ CHECK(mesh.GetNE() == 2 * 6); -+ -+ auto attr = GENERATE(1,2); -+ auto ref_level = GENERATE(1,2); -+ CAPTURE(attr); -+ CAPTURE(ref_level); -+ -+ const int initial_num_vert = 4; -+ const int initial_num_edge = 5; -+ const int initial_num_face = 2; -+ -+ const int num_vert = ref_level == 1 ? 9 : 25; -+ const int num_edge = ref_level == 1 ? 16 : 56; -+ const int num_face = ref_level == 1 ? 8 : 32; -+ -+ SECTION("H1Tet") -+ { -+ CHECK(CountEssentialDof(mesh, 1, -+ mesh.bdr_attributes.Max()) == initial_num_vert); -+ CHECK(CountEssentialDof(mesh, 2, -+ mesh.bdr_attributes.Max()) == initial_num_vert + initial_num_edge); -+ CHECK(CountEssentialDof(mesh, 3, -+ mesh.bdr_attributes.Max()) == initial_num_vert + 2*initial_num_edge + -+ initial_num_face); -+ CHECK(CountEssentialDof(mesh, 4, -+ mesh.bdr_attributes.Max()) == initial_num_vert + 3*initial_num_edge + -+ 3*initial_num_face); -+ CHECK(CountEssentialDof(mesh, 1, -+ mesh.bdr_attributes.Max()) == initial_num_vert); -+ CHECK(CountEssentialDof(mesh, 2, -+ mesh.bdr_attributes.Max()) == initial_num_vert + initial_num_edge); -+ CHECK(CountEssentialDof(mesh, 3, -+ mesh.bdr_attributes.Max()) == initial_num_vert + 2*initial_num_edge + -+ initial_num_face); -+ CHECK(CountEssentialDof(mesh, 4, -+ mesh.bdr_attributes.Max()) == initial_num_vert + 3*initial_num_edge + -+ 3*initial_num_face); -+ -+ RefineAttribute(mesh, attr, ref_level); -+ -+ CHECK(CountEssentialDof(mesh, 1, -+ mesh.bdr_attributes.Max()) == initial_num_vert); -+ CHECK(CountEssentialDof(mesh, 2, -+ mesh.bdr_attributes.Max()) == initial_num_vert + initial_num_edge); -+ CHECK(CountEssentialDof(mesh, 3, -+ mesh.bdr_attributes.Max()) == initial_num_vert + 2*initial_num_edge + -+ initial_num_face); -+ CHECK(CountEssentialDof(mesh, 4, -+ mesh.bdr_attributes.Max()) == initial_num_vert + 3*initial_num_edge + -+ 3*initial_num_face); -+ CHECK(CountEssentialDof(mesh, 1, -+ mesh.bdr_attributes.Max()) == num_vert); -+ CHECK(CountEssentialDof(mesh, 2, -+ mesh.bdr_attributes.Max()) == num_vert + num_edge + initial_num_edge); -+ CHECK(CountEssentialDof(mesh, 3, -+ mesh.bdr_attributes.Max()) == num_vert + 2*num_edge + num_face + -+ 2*initial_num_edge + initial_num_face); -+ CHECK(CountEssentialDof(mesh, 4, -+ mesh.bdr_attributes.Max()) == num_vert + 3*num_edge + 3*num_face + -+ 3*initial_num_edge + 3*initial_num_face); -+ } -+ -+ SECTION("NDTet") -+ { -+ CHECK(CountEssentialDof(mesh, 1, -+ mesh.bdr_attributes.Max()) == 5); -+ CHECK(CountEssentialDof(mesh, 2, -+ mesh.bdr_attributes.Max()) == 14); -+ CHECK(CountEssentialDof(mesh, 3, -+ mesh.bdr_attributes.Max()) == 27); -+ CHECK(CountEssentialDof(mesh, 4, -+ mesh.bdr_attributes.Max()) == 44); -+ CHECK(CountEssentialDof(mesh, 1, -+ mesh.bdr_attributes.Max()) == 5); -+ CHECK(CountEssentialDof(mesh, 2, -+ mesh.bdr_attributes.Max()) == 14); -+ CHECK(CountEssentialDof(mesh, 3, -+ mesh.bdr_attributes.Max()) == 27); -+ 
CHECK(CountEssentialDof(mesh, 4, -+ mesh.bdr_attributes.Max()) == 44); -+ -+ RefineAttribute(mesh, attr, ref_level); -+ -+ CHECK(CountEssentialDof(mesh, 1, -+ mesh.bdr_attributes.Max()) == 5); -+ CHECK(CountEssentialDof(mesh, 2, -+ mesh.bdr_attributes.Max()) == 14); -+ CHECK(CountEssentialDof(mesh, 3, -+ mesh.bdr_attributes.Max()) == 27); -+ CHECK(CountEssentialDof(mesh, 4, -+ mesh.bdr_attributes.Max()) == 44); -+ -+ CHECK(CountEssentialDof(mesh, 1, -+ mesh.bdr_attributes.Max()) == 5 + num_edge); -+ CHECK(CountEssentialDof(mesh, 2, -+ mesh.bdr_attributes.Max()) == 14 + 2 * num_edge + 2*num_face); -+ CHECK(CountEssentialDof(mesh, 3, -+ mesh.bdr_attributes.Max()) == 27 + 3 * num_edge + 2*3*num_face); -+ } -+ } -+} -+ -+ -+TEST_CASE("TetFaceFlips", "[NCMesh]") -+{ -+ auto orientation = GENERATE(1,3,5); -+ CAPTURE(orientation); -+ auto smesh = OrientedTriFaceMesh(orientation, true); -+ -+ // A smooth function in each vector component -+ constexpr int order = 3, dim = 3, quadrature_order = 4; -+ constexpr double kappa = 2 * M_PI; -+ auto E_exact = [=](const Vector &x, Vector &E) -+ { -+ E(0) = cos(kappa * x(1)); -+ E(1) = cos(kappa * x(2)); -+ E(2) = cos(kappa * x(0)); -+ }; -+ VectorFunctionCoefficient E_coeff(dim, E_exact); -+ -+ auto CheckSerialNDConformal = [&](Mesh &mesh, int num_essential_tdof, -+ int num_essential_vdof) -+ { -+ ND_FECollection fe_collection(order, dim); -+ FiniteElementSpace fe_space(&mesh, &fe_collection); -+ GridFunction E(&fe_space); -+ -+ E.ProjectCoefficient(E_coeff); -+ -+ auto *P = fe_space.GetProlongationMatrix(); -+ if (P != nullptr) -+ { -+ // Projection does not respect the non-conformal constraints. -+ // Extract the true (conformal) and prolongate to get the NC respecting projection. -+ auto E_true = E.GetTrueVector(); -+ P->Mult(E_true, E); -+ } -+ mesh.EnsureNodes(); -+ GridFunction * const coords = mesh.GetNodes(); -+ -+ const auto &ir = IntRules.Get(Geometry::Type::TRIANGLE, quadrature_order); -+ IntegrationRule left_eir(ir.GetNPoints()), -+ right_eir(ir.GetNPoints()); // element integration rules -+ -+ Array bdr_attr_is_ess = mesh.bdr_attributes, tdof_list; -+ bdr_attr_is_ess = 0; -+ bdr_attr_is_ess.Last() = 1; -+ fe_space.GetEssentialTrueDofs(bdr_attr_is_ess, tdof_list); -+ -+ Array ess_vdof_marker, vdof_list; -+ fe_space.GetEssentialVDofs(bdr_attr_is_ess, ess_vdof_marker); -+ fe_space.MarkerToList(ess_vdof_marker, vdof_list); -+ -+ CHECK(num_essential_tdof == tdof_list.Size()); -+ if (num_essential_vdof != -1) -+ { -+ CHECK(num_essential_vdof == vdof_list.Size()); -+ } -+ -+ for (int n = 0; n < mesh.GetNBE(); n++) -+ { -+ // NOTE: only works for internal boundaries -+ if (bdr_attr_is_ess[mesh.GetBdrAttribute(n) - 1]) -+ { -+ auto f = mesh.GetBdrElementFaceIndex(n); -+ auto &face_element_transform = *mesh.GetFaceElementTransformations(f); -+ -+ if (face_element_transform.Elem2 == nullptr) -+ { -+ // not internal, nothing to check. 
-+ continue; -+ } -+ -+ face_element_transform.Loc1.Transform(ir, left_eir); -+ face_element_transform.Loc2.Transform(ir, right_eir); -+ -+ constexpr double tol = 1e-14; -+ REQUIRE(left_eir.GetNPoints() == ir.GetNPoints()); -+ REQUIRE(right_eir.GetNPoints() == ir.GetNPoints()); -+ Vector left_val, right_val; -+ for (int i = 0; i < ir.GetNPoints(); i++) -+ { -+ face_element_transform.Elem1->SetIntPoint(&left_eir[i]); -+ coords->GetVectorValue(*face_element_transform.Elem1, left_eir[i], left_val); -+ coords->GetVectorValue(*face_element_transform.Elem1, left_eir[i], right_val); -+ REQUIRE(std::abs(left_val(0) - right_val(0)) < tol); -+ REQUIRE(std::abs(left_val(1) - right_val(1)) < tol); -+ REQUIRE(std::abs(left_val(2) - right_val(2)) < tol); -+ E.GetVectorValue(*face_element_transform.Elem1, left_eir[i], left_val); -+ -+ face_element_transform.Elem2->SetIntPoint(&right_eir[i]); -+ E.GetVectorValue(*face_element_transform.Elem2, right_eir[i], right_val); -+ -+ // Check that the second and third rows agree. -+ // The y and z should agree as the normal is in the x direction -+ CHECK(std::abs(left_val(1) - right_val(1)) < tol); -+ CHECK(std::abs(left_val(2) - right_val(2)) < tol); -+ } -+ } -+ } -+ }; -+ -+ SECTION("Conformal") -+ { -+ const int ntdof = 3*3 + 3*2; -+ const int nvdof = ntdof; -+ CheckSerialNDConformal(smesh, ntdof, nvdof); -+ } -+ -+ SECTION("Nonconformal") -+ { -+ smesh.EnsureNCMesh(true); -+ const int ntdof = 3*3 + 3*2; -+ const int nvdof = ntdof; -+ CheckSerialNDConformal(smesh, ntdof, nvdof); -+ } -+ -+ SECTION("ConformalUniformRefined") -+ { -+ smesh.UniformRefinement(); -+ const int ntdof = 9*3 + 4*3*2; -+ const int nvdof = ntdof; -+ CheckSerialNDConformal(smesh, ntdof, nvdof); -+ } -+ -+ SECTION("NonconformalUniformRefined") -+ { -+ smesh.EnsureNCMesh(true); -+ smesh.UniformRefinement(); -+ const int ntdof = 9*3 + 4*3*2; -+ const int nvdof = ntdof; -+ CheckSerialNDConformal(smesh, ntdof, nvdof); -+ } -+ -+ SECTION("NonconformalRefined") -+ { -+ smesh.EnsureNCMesh(true); -+ int ref_level = GENERATE(1, 2); -+ CAPTURE(ref_level); -+ for (int r = 0; r < ref_level; r++) -+ { -+ Array el_to_refine; -+ for (int n = 0; n < smesh.GetNE(); n++) -+ { -+ if (smesh.GetAttribute(n) == 2) -+ { -+ el_to_refine.Append(n); -+ } -+ } -+ smesh.GeneralRefinement(el_to_refine); -+ } -+ const int ntdof = 3*3 + 3*2; -+ const int nvdof = ntdof + (ref_level == 1 ? 
9*3 + 4*3*2 : 30*3 + 16*3*2); -+ CheckSerialNDConformal(smesh, ntdof, nvdof); -+ } -+ -+ SECTION("NonconformalLevelTwoRefined") -+ { -+ smesh.EnsureNCMesh(true); -+ Array el_to_refine; -+ -+ smesh.UniformRefinement(); -+ -+ const int ntdof = 9*3 + 4*3*2; -+ el_to_refine.SetSize(1); -+ -+ auto n = GENERATE(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15); -+ auto m = GENERATE(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22); -+ -+ if (n < smesh.GetNE() && smesh.GetAttribute(n) == 2) -+ { -+ el_to_refine[0] = n; -+ CAPTURE(n); -+ smesh.GeneralRefinement(el_to_refine); -+ CheckSerialNDConformal(smesh, ntdof, -1); -+ -+ if (smesh.GetAttribute(m) == 2) -+ { -+ el_to_refine[0] = m; -+ CAPTURE(m); -+ smesh.GeneralRefinement(el_to_refine); -+ CheckSerialNDConformal(smesh, ntdof, -1); -+ } -+ } -+ -+ } -+} -+ -+TEST_CASE("RP=I", "[NCMesh]") -+{ -+ auto CheckFESpace = [](const FiniteElementSpace& fespace) -+ { -+ auto * const R = fespace.GetConformingRestriction(); -+ auto * const P = fespace.GetConformingProlongation(); -+ -+ REQUIRE(R != nullptr); -+ REQUIRE(P != nullptr); -+ -+ // Vector notation -+ Vector e_i(R->Height()), e_j(P->Width()); -+ Vector Rrow(R->Width()), Pcol(P->Height()); -+ for (int i = 0; i < R->Height(); i++) -+ { -+ e_i = 0.0; -+ e_i(i) = 1.0; -+ R->MultTranspose(e_i, Rrow); -+ for (int j = 0; j < P->Width(); j++) -+ { -+ e_j = 0.0; -+ e_j(j) = 1.0; -+ P->Mult(e_j, Pcol); -+ -+ CHECK(Rrow * Pcol == (i == j ? 1.0 : 0.0)); -+ } -+ } -+ -+ // Index notation -+ CHECK(R->Height() == P->Width()); -+ CHECK(R->Width() == P->Height()); -+ for (int i = 0; i < R->Height(); i++) -+ for (int j = 0; j < P->Width(); j++) -+ { -+ double dot = 0.0; -+ for (int k = 0; k < R->Width(); k++) -+ { -+ dot += (*R)(i,k)*(*P)(k,j); -+ } -+ CHECK(dot == (i == j ? 1.0 : 0.0)); -+ } -+ }; -+ -+ SECTION("Hex") -+ { -+ const int dim = 3; -+ const int order = GENERATE(1, 2); -+ // Split the hex into a pair, then isotropically refine one of them. -+ Mesh mesh("../../data/ref-cube.mesh"); -+ Array refinements(1); -+ refinements[0].index = 0; -+ refinements[0].ref_type = Refinement::X; -+ mesh.GeneralRefinement(refinements); -+ refinements[0].ref_type = Refinement::XYZ; -+ mesh.GeneralRefinement(refinements); -+ SECTION("ND") -+ { -+ ND_FECollection fec(order, dim); -+ FiniteElementSpace fespace(&mesh, &fec); -+ CheckFESpace(fespace); -+ } -+ SECTION("H1") -+ { -+ H1_FECollection fec(order, dim); -+ FiniteElementSpace fespace(&mesh, &fec); -+ CheckFESpace(fespace); -+ } -+ } -+ -+ SECTION("Tet") -+ { -+ const int dim = 3; -+ const int order = GENERATE(1, 2); -+ // Split the hex into a pair, then isotropically refine one of them. -+ Mesh mesh("../../data/ref-tetrahedron.mesh"); -+ Array refinements(1); -+ refinements[0].index = 0; -+ refinements[0].ref_type = Refinement::X; -+ mesh.GeneralRefinement(refinements); -+ mesh.EnsureNCMesh(true); -+ refinements[0].ref_type = Refinement::XYZ; -+ mesh.GeneralRefinement(refinements); -+ SECTION("ND") -+ { -+ ND_FECollection fec(order, dim); -+ FiniteElementSpace fespace(&mesh, &fec); -+ CheckFESpace(fespace); -+ } -+ SECTION("H1") -+ { -+ H1_FECollection fec(order, dim); -+ FiniteElementSpace fespace(&mesh, &fec); -+ CheckFESpace(fespace); -+ } -+ } -+} - - } // namespace mfem +diff --git a/CHANGELOG b/CHANGELOG +index cd5f1333f..d2f362c06 100644 +--- a/CHANGELOG ++++ b/CHANGELOG +@@ -15,6 +15,8 @@ Discretization improvements + --------------------------- + - Introduced support for higher order non conformal Nedelec elements on + simplices in ParMesh. 
++- Introduced support for internal boundary elements in nonconformal adapted
++ meshes.
+
+ Miscellaneous
+ -------------
+diff --git a/fem/bilinearform.cpp b/fem/bilinearform.cpp
+index 6eae233bc..caaafaac8 100644
+--- a/fem/bilinearform.cpp
++++ b/fem/bilinearform.cpp
+@@ -1072,7 +1072,8 @@ void BilinearForm::EliminateEssentialBCFromDofs(
+ void BilinearForm::EliminateEssentialBCFromDofs (const Array<int> &ess_dofs,
+ DiagonalPolicy dpolicy)
+ {
+- MFEM_ASSERT(ess_dofs.Size() == height, "incorrect dof Array size");
++ MFEM_ASSERT(ess_dofs.Size() == height,
++ "incorrect dof Array size: " << ess_dofs.Size() << ' ' << height);
+
+ for (int i = 0; i < ess_dofs.Size(); i++)
+ if (ess_dofs[i] < 0)
+@@ -1084,7 +1085,8 @@ void BilinearForm::EliminateEssentialBCFromDofs (const Array<int> &ess_dofs,
+ void BilinearForm::EliminateEssentialBCFromDofsDiag (const Array<int> &ess_dofs,
+ double value)
+ {
+- MFEM_ASSERT(ess_dofs.Size() == height, "incorrect dof Array size");
++ MFEM_ASSERT(ess_dofs.Size() == height,
++ "incorrect dof Array size: " << ess_dofs.Size() << ' ' << height);
+
+ for (int i = 0; i < ess_dofs.Size(); i++)
+ if (ess_dofs[i] < 0)
+diff --git a/fem/fespace.cpp b/fem/fespace.cpp
+index 660fec17a..1462adc81 100644
+--- a/fem/fespace.cpp
++++ b/fem/fespace.cpp
+@@ -503,13 +503,11 @@ void FiniteElementSpace::BuildDofToArrays()
+ }
+ }
+
+-static void mark_dofs(const Array<int> &dofs, Array<int> &mark_array)
++static void MarkDofs(const Array<int> &dofs, Array<int> &mark_array)
+ {
+- for (int i = 0; i < dofs.Size(); i++)
++ for (auto d : dofs)
+ {
+- int k = dofs[i];
+- if (k < 0) { k = -1 - k; }
+- mark_array[k] = -1;
++ mark_array[d >= 0 ? d : -1 - d] = -1;
+ }
+ }
+
+@@ -517,11 +515,9 @@ void FiniteElementSpace::GetEssentialVDofs(const Array<int> &bdr_attr_is_ess,
+ Array<int> &ess_vdofs,
+ int component) const
+ {
+- Array<int> vdofs, dofs;
+-
++ Array<int> dofs;
+ ess_vdofs.SetSize(GetVSize());
+ ess_vdofs = 0;
+-
+ for (int i = 0; i < GetNBE(); i++)
+ {
+ if (bdr_attr_is_ess[GetBdrAttribute(i)-1])
+ {
+ if (component < 0)
+ {
+ // Mark all components.
+- GetBdrElementVDofs(i, vdofs); +- mark_dofs(vdofs, ess_vdofs); ++ GetBdrElementVDofs(i, dofs); + } + else + { + GetBdrElementDofs(i, dofs); +- for (int d = 0; d < dofs.Size(); d++) +- { dofs[d] = DofToVDof(dofs[d], component); } +- mark_dofs(dofs, ess_vdofs); ++ for (auto &d : dofs) { d = DofToVDof(d, component); } + } ++ MarkDofs(dofs, ess_vdofs); + } + } + +@@ -546,38 +540,47 @@ void FiniteElementSpace::GetEssentialVDofs(const Array &bdr_attr_is_ess, + // local DOFs affected by boundary elements on other processors + if (Nonconforming()) + { +- Array bdr_verts, bdr_edges; +- mesh->ncmesh->GetBoundaryClosure(bdr_attr_is_ess, bdr_verts, bdr_edges); +- +- for (int i = 0; i < bdr_verts.Size(); i++) ++ Array bdr_verts, bdr_edges, bdr_faces; ++ mesh->ncmesh->GetBoundaryClosure(bdr_attr_is_ess, bdr_verts, bdr_edges, ++ bdr_faces); ++ for (auto v : bdr_verts) ++ { ++ if (component < 0) ++ { ++ GetVertexVDofs(v, dofs); ++ } ++ else ++ { ++ GetVertexDofs(v, dofs); ++ for (auto &d : dofs) { d = DofToVDof(d, component); } ++ } ++ MarkDofs(dofs, ess_vdofs); ++ } ++ for (auto e : bdr_edges) + { + if (component < 0) + { +- GetVertexVDofs(bdr_verts[i], vdofs); +- mark_dofs(vdofs, ess_vdofs); ++ GetEdgeVDofs(e, dofs); + } + else + { +- GetVertexDofs(bdr_verts[i], dofs); +- for (int d = 0; d < dofs.Size(); d++) +- { dofs[d] = DofToVDof(dofs[d], component); } +- mark_dofs(dofs, ess_vdofs); ++ GetEdgeDofs(e, dofs); ++ for (auto &d : dofs) { d = DofToVDof(d, component); } + } ++ MarkDofs(dofs, ess_vdofs); + } +- for (int i = 0; i < bdr_edges.Size(); i++) ++ for (auto f : bdr_faces) + { + if (component < 0) + { +- GetEdgeVDofs(bdr_edges[i], vdofs); +- mark_dofs(vdofs, ess_vdofs); ++ GetEntityVDofs(2, f, dofs); + } + else + { +- GetEdgeDofs(bdr_edges[i], dofs); +- for (int d = 0; d < dofs.Size(); d++) +- { dofs[d] = DofToVDof(dofs[d], component); } +- mark_dofs(dofs, ess_vdofs); ++ GetEntityDofs(2, f, dofs); ++ for (auto &d : dofs) { d = DofToVDof(d, component); } + } ++ MarkDofs(dofs, ess_vdofs); + } + } + } +@@ -596,6 +599,30 @@ void FiniteElementSpace::GetEssentialTrueDofs(const Array &bdr_attr_is_ess, + else + { + R->BooleanMult(ess_vdofs, ess_tdofs); ++#ifdef MFEM_DEBUG ++ // Verify that in boolean arithmetic: P^T ess_dofs = R ess_dofs ++ Array ess_tdofs2(ess_tdofs.Size()); ++ GetConformingProlongation()->BooleanMultTranspose(ess_vdofs, ess_tdofs2); ++ ++ int counter = 0; ++ std::string error_msg = "failed dof: "; ++ for (int i = 0; i < ess_tdofs2.Size(); ++i) ++ { ++ if (bool(ess_tdofs[i]) != bool(ess_tdofs2[i])) ++ { ++ error_msg += std::to_string(i) += "(R "; ++ error_msg += std::to_string(bool(ess_tdofs[i])) += " P^T "; ++ error_msg += std::to_string(bool(ess_tdofs2[i])) += ") "; ++ counter++; ++ } ++ } ++ ++ MFEM_ASSERT(R->Height() == GetConformingProlongation()->Width(), "!"); ++ MFEM_ASSERT(R->Width() == GetConformingProlongation()->Height(), "!"); ++ MFEM_ASSERT(R->Width() == ess_vdofs.Size(), "!"); ++ MFEM_VERIFY(counter == 0, "internal MFEM error: counter = " << counter ++ << ' ' << error_msg); ++#endif + } + MarkerToList(ess_tdofs, ess_tdof_list); + } +@@ -944,6 +971,15 @@ int FiniteElementSpace::GetEntityDofs(int entity, int index, Array &dofs, + } + } + ++int FiniteElementSpace::GetEntityVDofs(int entity, int index, Array &dofs, ++ Geometry::Type master_geom, ++ int variant) const ++{ ++ int n = GetEntityDofs(entity, index, dofs, master_geom, variant); ++ DofsToVDofs(dofs); ++ return n; ++} ++ + void FiniteElementSpace::BuildConformingInterpolation() const + { + #ifdef MFEM_USE_MPI +diff 
--git a/fem/fespace.hpp b/fem/fespace.hpp +index 0fd44b613..cd0f861a0 100644 +--- a/fem/fespace.hpp ++++ b/fem/fespace.hpp +@@ -383,6 +383,10 @@ protected: + int GetEntityDofs(int entity, int index, Array &dofs, + Geometry::Type master_geom = Geometry::INVALID, + int variant = 0) const; ++ /// Helper to get vertex, edge or face VDOFs (entity=0,1,2 resp.). ++ int GetEntityVDofs(int entity, int index, Array &dofs, ++ Geometry::Type master_geom = Geometry::INVALID, ++ int variant = 0) const; + + // Get degenerate face DOFs: see explanation in method implementation. + int GetDegenerateFaceDofs(int index, Array &dofs, +@@ -840,6 +844,7 @@ public: + /// @brief Returns the indices of the degrees of freedom for the specified + /// face, including the DOFs for the edges and the vertices of the face. + /// ++ /// + /// In variable order spaces, multiple variants of DOFs can be returned. + /// See GetEdgeDofs() for more details. + /// @return Order of the selected variant, or -1 if there are no more +diff --git a/fem/gridfunc.cpp b/fem/gridfunc.cpp +index 52620452f..310d8d704 100644 +--- a/fem/gridfunc.cpp ++++ b/fem/gridfunc.cpp +@@ -2125,8 +2125,8 @@ void GridFunction::AccumulateAndCountBdrValues( + Vector vals; + Mesh *mesh = fes->GetMesh(); + NCMesh *ncmesh = mesh->ncmesh; +- Array bdr_edges, bdr_vertices; +- ncmesh->GetBoundaryClosure(attr, bdr_vertices, bdr_edges); ++ Array bdr_edges, bdr_vertices, bdr_faces; ++ ncmesh->GetBoundaryClosure(attr, bdr_vertices, bdr_edges, bdr_faces); + + for (i = 0; i < bdr_edges.Size(); i++) + { +@@ -2232,8 +2232,8 @@ void GridFunction::AccumulateAndCountBdrTangentValues( + { + Mesh *mesh = fes->GetMesh(); + NCMesh *ncmesh = mesh->ncmesh; +- Array bdr_edges, bdr_vertices; +- ncmesh->GetBoundaryClosure(bdr_attr, bdr_vertices, bdr_edges); ++ Array bdr_edges, bdr_vertices, bdr_faces; ++ ncmesh->GetBoundaryClosure(bdr_attr, bdr_vertices, bdr_edges, bdr_faces); + + for (int i = 0; i < bdr_edges.Size(); i++) + { +diff --git a/fem/gridfunc.hpp b/fem/gridfunc.hpp +index 245d00078..50d7c1105 100644 +--- a/fem/gridfunc.hpp ++++ b/fem/gridfunc.hpp +@@ -586,6 +586,10 @@ public: + return ComputeLpError(infinity(), exsol, NULL, NULL, irs); + } + ++ virtual double ComputeL1Error(Coefficient *exsol[], ++ const IntegrationRule *irs[] = NULL) const ++ { return ComputeW11Error(*exsol, NULL, 1, NULL, irs); } ++ + virtual double ComputeL1Error(Coefficient &exsol, + const IntegrationRule *irs[] = NULL) const + { return ComputeLpError(1.0, exsol, NULL, irs); } +diff --git a/fem/pfespace.cpp b/fem/pfespace.cpp +index 8761e1489..3fa9fe5c7 100644 +--- a/fem/pfespace.cpp ++++ b/fem/pfespace.cpp +@@ -90,20 +90,19 @@ ParNURBSExtension *ParFiniteElementSpace::MakeLocalNURBSext( + void ParFiniteElementSpace::ParInit(ParMesh *pm) + { + pmesh = pm; +- pncmesh = NULL; ++ pncmesh = nullptr; + + MyComm = pmesh->GetComm(); + NRanks = pmesh->GetNRanks(); + MyRank = pmesh->GetMyRank(); + +- gcomm = NULL; ++ gcomm = nullptr; + +- P = NULL; +- Pconf = NULL; ++ P = nullptr; ++ Pconf = nullptr; + nonconf_P = false; +- Rconf = NULL; +- R = NULL; +- ++ Rconf = nullptr; ++ R = nullptr; + num_face_nbr_dofs = -1; + + if (NURBSext && !pNURBSext()) +@@ -519,7 +518,7 @@ void ParFiniteElementSpace::GetBdrElementDofs(int i, Array &dofs, + int ParFiniteElementSpace::GetFaceDofs(int i, Array &dofs, + int variant) const + { +- if (face_dof && variant == 0) ++ if (face_dof != nullptr && variant == 0) + { + face_dof->GetRow(i, dofs); + return fec->GetOrder(); +@@ -1039,18 +1038,28 @@ void 
ParFiniteElementSpace::GetEssentialTrueDofs(const Array + #ifdef MFEM_DEBUG + // Verify that in boolean arithmetic: P^T ess_dofs = R ess_dofs. + Array true_ess_dofs2(true_ess_dofs.Size()); +- HypreParMatrix *Pt = Dof_TrueDof_Matrix()->Transpose(); ++ auto Pt = std::unique_ptr(Dof_TrueDof_Matrix()->Transpose()); ++ + const int *ess_dofs_data = ess_dofs.HostRead(); + Pt->BooleanMult(1, ess_dofs_data, 0, true_ess_dofs2); +- delete Pt; + int counter = 0; + const int *ted = true_ess_dofs.HostRead(); ++ std::string error_msg = "failed dof: "; + for (int i = 0; i < true_ess_dofs.Size(); i++) + { +- if (bool(ted[i]) != bool(true_ess_dofs2[i])) { counter++; } ++ if (bool(ted[i]) != bool(true_ess_dofs2[i])) ++ { ++ error_msg += std::to_string(i) += "(R "; ++ error_msg += std::to_string(bool(ted[i])) += " P^T "; ++ error_msg += std::to_string(bool(true_ess_dofs2[i])) += ") "; ++ ++counter; ++ } + } ++ MFEM_ASSERT(R->Height() == P->Width(), "!"); ++ MFEM_ASSERT(R->Width() == P->Height(), "!"); ++ MFEM_ASSERT(R->Width() == ess_dofs.Size(), "!"); + MFEM_VERIFY(counter == 0, "internal MFEM error: counter = " << counter +- << ", rank = " << MyRank); ++ << ", rank = " << MyRank << ", " << error_msg); + #endif + + MarkerToList(true_ess_dofs, ess_tdof_list); +@@ -1945,8 +1954,7 @@ struct PMatrixRow + elems.reserve(elems.size() + other.elems.size()); + for (const PMatrixElement &oei : other.elems) + { +- elems.push_back( +- PMatrixElement(oei.column, oei.stride, coef * oei.value)); ++ elems.emplace_back(oei.column, oei.stride, coef * oei.value); + } + } + +@@ -2022,7 +2030,7 @@ public: + void AddRow(int entity, int index, int edof, GroupId group, + const PMatrixRow &row) + { +- rows.push_back(RowInfo(entity, index, edof, group, row)); ++ rows.emplace_back(entity, index, edof, group, row); + } + + const std::vector& GetRows() const { return rows; } +@@ -2038,8 +2046,8 @@ protected: + ParNCMesh *pncmesh; + const FiniteElementCollection* fec; + +- virtual void Encode(int rank); +- virtual void Decode(int); ++ void Encode(int rank) override; ++ void Decode(int) override; + }; + + void NeighborRowMessage::Encode(int rank) +@@ -2158,6 +2166,11 @@ void NeighborRowMessage::Decode(int rank) + int fo = pncmesh->GetFaceOrientation(id.index); + ind = fec->DofOrderForOrientation(geom, fo); + } ++ // P2 tri faces have dofs that must be processed in pairs, as the doftransformation ++ // is not diagonal. ++ const bool process_dof_pairs = (ent == 2 && ++ fec->GetContType() == FiniteElementCollection::TANGENTIAL ++ && !Geometry::IsTensorProduct(geom)); + + #ifdef MFEM_DEBUG_PMATRIX + mfem::out << "Rank " << pncmesh->MyRank << " receiving from " << rank +@@ -2177,7 +2190,7 @@ void NeighborRowMessage::Decode(int rank) + + // Create a row for this entity, recording the index of the mesh + // element +- rows.push_back(RowInfo(ent, id.index, edof, group_ids[gi++])); ++ rows.emplace_back(ent, id.index, edof, group_ids[gi++]); + rows.back().row.read(stream, s); + + #ifdef MFEM_DEBUG_PMATRIX +@@ -2187,8 +2200,7 @@ void NeighborRowMessage::Decode(int rank) + << std::endl; + #endif + +- if (ent == 2 && fec->GetContType() == FiniteElementCollection::TANGENTIAL +- && !Geometry::IsTensorProduct(geom)) ++ if (process_dof_pairs) + { + // ND face dofs need to be processed together, as the transformation + // is given by a 2x2 matrix, so we manually apply an extra increment +@@ -2209,8 +2221,10 @@ void NeighborRowMessage::Decode(int rank) + // there is no hidden copying that could result in a dangling + // reference. 
+ auto &first_row = rows.back().row;
++
+ // This is the first "fundamental unit" used in the transformation.
+ const auto initial_first_row = first_row;
++
+ // Extract the next dof too, and apply any dof order transformation
+ // expected.
+ const MeshId &next_id = ids[++i];
+@@ -2226,15 +2240,34 @@
+ edof = -1 - edof;
+ s *= -1.0;
+ }
+- rows.push_back(RowInfo(ent, next_id.index, edof, group_ids[gi++]));
++
++ rows.emplace_back(ent, next_id.index, edof, group_ids[gi++]);
+ rows.back().row.read(stream, s);
+ auto &second_row = rows.back().row;
+
+ // This is the second "fundamental unit" used in the transformation.
+ const auto initial_second_row = second_row;
++
++ // Transform the received dofs by the primal transform. This is
++ // because within mfem as a face is visited its orientation is
++ // assigned to match the element that visited it first. Thus on
++ // processor boundaries, the transform will always be identity
++ // going into the element. However, the sending processor also
++ // thought the face orientation was zero, so it has sent the
++ // information in a different orientation. To map onto the local
++ // orientation definition, extract the orientation of the sending
++ // rank (the lower rank face defines the orientation fo), then
++ // apply the transform to the dependencies. The action of this
++ // transform on the dependencies is performed by adding scaled
++ // versions of the original two rows (which by the mfem assumption
++ // of face orientation, represent the identity transform).
++ MFEM_ASSERT(fo != 2 && fo != 4,
++ "This code branch is ambiguous for face orientations 2 and 4."
++ " Please report this mesh for further testing.\n");
+ const double *T =
+ ND_DofTransformation::GetFaceTransform(fo).GetData();
+
++ // Remove the identity matrix from the transformation.
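++ // A short sketch of the update performed below, assuming the usual
++ // column-major storage returned by DenseMatrix::GetData(), so that
++ // T[0], T[2] form the first row of T and T[1], T[3] the second: the
++ // received pair of rows (call them r1 and r2) must be mapped to
++ // T * (r1, r2)^T. Since the rows as sent already carry the identity
++ // contribution, only the difference (T - I) is accumulated, e.g.
++ //   r1 <- r1 + (T[0] - 1) * r1 + T[2] * r2.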
+ first_row.AddRow(initial_first_row, T[0] - 1.0); + first_row.AddRow(initial_second_row, T[2]); + second_row.AddRow(initial_first_row, T[1]); +@@ -2410,7 +2443,7 @@ int ParFiniteElementSpace + + if (master_dofs.Size() == 0) { continue; } + +- const FiniteElement* fe = fec->FiniteElementForGeometry(mf.Geom()); ++ const FiniteElement * const fe = fec->FiniteElementForGeometry(mf.Geom()); + if (fe == nullptr) { continue; } + + switch (mf.Geom()) +@@ -2439,7 +2472,6 @@ int ParFiniteElementSpace + } + } + } +- + deps.Finalize(); + } + +@@ -2568,15 +2600,15 @@ int ParFiniteElementSpace + + // big container for all messages we send (the list is for iterations) + std::list send_msg; +- send_msg.push_back(NeighborRowMessage::Map()); ++ send_msg.emplace_back(); + + // put identity in P and R for true DOFs, set ldof_ltdof + for (int dof = 0, tdof = 0; dof < ndofs; dof++) + { + if (finalized[dof]) + { +- pmatrix[dof].elems.push_back( +- PMatrixElement(my_tdof_offset + vdim_factor*tdof, tdof_stride, 1.)); ++ pmatrix[dof].elems.emplace_back(my_tdof_offset + vdim_factor*tdof, tdof_stride, ++ 1.); + + // prepare messages to neighbors with identity rows + if (dof_group[dof] != 0) +@@ -2620,7 +2652,7 @@ int ParFiniteElementSpace + // prepare a new round of send buffers + if (send_msg.back().size()) + { +- send_msg.push_back(NeighborRowMessage::Map()); ++ send_msg.emplace_back(); + } + + // check for incoming messages, receive PMatrixRows +diff --git a/fem/pfespace.hpp b/fem/pfespace.hpp +index 72029be56..e74f66622 100644 +--- a/fem/pfespace.hpp ++++ b/fem/pfespace.hpp +@@ -124,8 +124,8 @@ private: + void GetGhostVertexDofs(const MeshId &id, Array &dofs) const; + void GetGhostEdgeDofs(const MeshId &edge_id, Array &dofs) const; + void GetGhostFaceDofs(const MeshId &face_id, Array &dofs) const; +- + void GetGhostDofs(int entity, const MeshId &id, Array &dofs) const; ++ + /// Return the dofs associated with the interior of the given mesh entity. + void GetBareDofs(int entity, int index, Array &dofs) const; + +diff --git a/fem/pgridfunc.cpp b/fem/pgridfunc.cpp +index 38b57a3e1..773221029 100644 +--- a/fem/pgridfunc.cpp ++++ b/fem/pgridfunc.cpp +@@ -736,7 +736,8 @@ void ParGridFunction::ProjectBdrCoefficientTangent(VectorCoefficient &vcoeff, + { + MFEM_ASSERT(pfes->GetLocalTDofNumber(i) == -1 || + bool(values_counter[i]) == bool(ess_vdofs_marker[i]), +- "internal error"); ++ "internal error: " << pfes->GetLocalTDofNumber(i) << ' ' << bool( ++ values_counter[i])); + } + #endif + } +diff --git a/fem/pgridfunc.hpp b/fem/pgridfunc.hpp +index bc422a260..041dc1c98 100644 +--- a/fem/pgridfunc.hpp ++++ b/fem/pgridfunc.hpp +@@ -112,12 +112,12 @@ public: + + ParFiniteElementSpace *ParFESpace() const { return pfes; } + +- virtual void Update(); ++ void Update() override; + + /// Associate a new FiniteElementSpace with the ParGridFunction. + /** The ParGridFunction is resized using the SetSize() method. The new space + @a f is expected to be a ParFiniteElementSpace. */ +- virtual void SetSpace(FiniteElementSpace *f); ++ void SetSpace(FiniteElementSpace *f) override; + + /// Associate a new parallel space with the ParGridFunction. + void SetSpace(ParFiniteElementSpace *f); +@@ -130,7 +130,7 @@ public: + ParGridFunction and sets the pointer @a v as external data in the + ParGridFunction. The new space @a f is expected to be a + ParFiniteElementSpace. 
*/ +- virtual void MakeRef(FiniteElementSpace *f, double *v); ++ void MakeRef(FiniteElementSpace *f, double *v) override; + + /** @brief Make the ParGridFunction reference external data on a new + ParFiniteElementSpace. */ +@@ -147,7 +147,7 @@ public: + expected to be a ParFiniteElementSpace. + @note This version of the method will also perform bounds checks when + the build option MFEM_DEBUG is enabled. */ +- virtual void MakeRef(FiniteElementSpace *f, Vector &v, int v_offset); ++ void MakeRef(FiniteElementSpace *f, Vector &v, int v_offset) override; + + /** @brief Make the ParGridFunction reference external data on a new + ParFiniteElementSpace. */ +@@ -166,7 +166,7 @@ public: + void AddDistribute(double a, const Vector &tv) { AddDistribute(a, &tv); } + + /// Set the GridFunction from the given true-dof vector. +- virtual void SetFromTrueDofs(const Vector &tv) { Distribute(tv); } ++ void SetFromTrueDofs(const Vector &tv) override { Distribute(tv); } + + /// Short semantic for Distribute() + ParGridFunction &operator=(const HypreParVector &tv) +@@ -209,26 +209,26 @@ public: + const Vector &FaceNbrData() const { return face_nbr_data; } + + // Redefine to handle the case when i is a face-neighbor element +- virtual double GetValue(int i, const IntegrationPoint &ip, +- int vdim = 1) const; ++ double GetValue(int i, const IntegrationPoint &ip, ++ int vdim = 1) const override; + double GetValue(ElementTransformation &T) + { return GetValue(T, T.GetIntPoint()); } + + // Redefine to handle the case when T describes a face-neighbor element +- virtual double GetValue(ElementTransformation &T, const IntegrationPoint &ip, +- int comp = 0, Vector *tr = NULL) const; ++ double GetValue(ElementTransformation &T, const IntegrationPoint &ip, ++ int comp = 0, Vector *tr = NULL) const override; + +- virtual void GetVectorValue(int i, const IntegrationPoint &ip, +- Vector &val) const; ++ void GetVectorValue(int i, const IntegrationPoint &ip, ++ Vector &val) const override; + + // Redefine to handle the case when T describes a face-neighbor element +- virtual void GetVectorValue(ElementTransformation &T, +- const IntegrationPoint &ip, +- Vector &val, Vector *tr = NULL) const; ++ void GetVectorValue(ElementTransformation &T, ++ const IntegrationPoint &ip, ++ Vector &val, Vector *tr = NULL) const override; + + /** @brief For each vdof, counts how many elements contain the vdof, + as containment is determined by FiniteElementSpace::GetElementVDofs(). */ +- virtual void CountElementsPerVDof(Array &elem_per_vdof) const; ++ void CountElementsPerVDof(Array &elem_per_vdof) const override; + + /// Parallel version of GridFunction::GetDerivative(); see its documentation. + void GetDerivative(int comp, int der_comp, ParGridFunction &der); +@@ -237,112 +237,111 @@ public: + freedom of element @a el. If @a el is greater than or equal to the number + of local elements, it will be interpreted as a shifted index of a face + neighbor element. */ +- virtual void GetElementDofValues(int el, Vector &dof_vals) const; ++ void GetElementDofValues(int el, Vector &dof_vals) const override; + + using GridFunction::ProjectCoefficient; +- virtual void ProjectCoefficient(Coefficient &coeff); ++ void ProjectCoefficient(Coefficient &coeff) override; + + using GridFunction::ProjectDiscCoefficient; + /** @brief Project a discontinuous vector coefficient as a grid function on + a continuous finite element space. The values in shared dofs are + determined from the element with maximal attribute. 
*/ +- virtual void ProjectDiscCoefficient(VectorCoefficient &coeff); ++ void ProjectDiscCoefficient(VectorCoefficient &coeff) override; + +- virtual void ProjectDiscCoefficient(Coefficient &coeff, AvgType type); ++ void ProjectDiscCoefficient(Coefficient &coeff, AvgType type) override; + +- virtual void ProjectDiscCoefficient(VectorCoefficient &vcoeff, AvgType type); ++ void ProjectDiscCoefficient(VectorCoefficient &vcoeff, AvgType type) override; + + using GridFunction::ProjectBdrCoefficient; + + // Only the values in the master are guaranteed to be correct! +- virtual void ProjectBdrCoefficient(VectorCoefficient &vcoeff, +- Array &attr) ++ void ProjectBdrCoefficient(VectorCoefficient &vcoeff, ++ Array &attr) override + { ProjectBdrCoefficient(NULL, &vcoeff, attr); } + + // Only the values in the master are guaranteed to be correct! +- virtual void ProjectBdrCoefficient(Coefficient *coeff[], Array &attr) ++ void ProjectBdrCoefficient(Coefficient *coeff[], Array &attr) override + { ProjectBdrCoefficient(coeff, NULL, attr); } + + // Only the values in the master are guaranteed to be correct! +- virtual void ProjectBdrCoefficientTangent(VectorCoefficient &vcoeff, +- Array &bdr_attr); ++ void ProjectBdrCoefficientTangent(VectorCoefficient &vcoeff, ++ Array &bdr_attr) override; + +- virtual double ComputeL1Error(Coefficient *exsol[], +- const IntegrationRule *irs[] = NULL) const ++ double ComputeL1Error(Coefficient *exsol[], ++ const IntegrationRule *irs[] = NULL) const override + { + return GlobalLpNorm(1.0, GridFunction::ComputeW11Error( + *exsol, NULL, 1, NULL, irs), pfes->GetComm()); + } + +- virtual double ComputeL1Error(Coefficient &exsol, +- const IntegrationRule *irs[] = NULL) const ++ double ComputeL1Error(Coefficient &exsol, ++ const IntegrationRule *irs[] = NULL) const override + { return ComputeLpError(1.0, exsol, NULL, irs); } + +- virtual double ComputeL1Error(VectorCoefficient &exsol, +- const IntegrationRule *irs[] = NULL) const ++ double ComputeL1Error(VectorCoefficient &exsol, ++ const IntegrationRule *irs[] = NULL) const override + { return ComputeLpError(1.0, exsol, NULL, NULL, irs); } + +- virtual double ComputeL2Error(Coefficient *exsol[], +- const IntegrationRule *irs[] = NULL, +- const Array *elems = NULL) const ++ double ComputeL2Error(Coefficient *exsol[], ++ const IntegrationRule *irs[] = NULL, ++ const Array *elems = NULL) const override + { + return GlobalLpNorm(2.0, GridFunction::ComputeL2Error(exsol, irs, elems), + pfes->GetComm()); + } + +- virtual double ComputeL2Error(Coefficient &exsol, +- const IntegrationRule *irs[] = NULL, +- const Array *elems = NULL) const ++ double ComputeL2Error(Coefficient &exsol, ++ const IntegrationRule *irs[] = NULL, ++ const Array *elems = NULL) const override + { + return GlobalLpNorm(2.0, GridFunction::ComputeL2Error(exsol, irs, elems), + pfes->GetComm()); + } + + +- virtual double ComputeL2Error(VectorCoefficient &exsol, +- const IntegrationRule *irs[] = NULL, +- const Array *elems = NULL) const ++ double ComputeL2Error(VectorCoefficient &exsol, ++ const IntegrationRule *irs[] = NULL, ++ const Array *elems = NULL) const override + { + return GlobalLpNorm(2.0, GridFunction::ComputeL2Error(exsol, irs, elems), + pfes->GetComm()); + } + + /// Returns ||grad u_ex - grad u_h||_L2 for H1 or L2 elements +- virtual double ComputeGradError(VectorCoefficient *exgrad, +- const IntegrationRule *irs[] = NULL) const ++ double ComputeGradError(VectorCoefficient *exgrad, ++ const IntegrationRule *irs[] = NULL) const override + { + return 
GlobalLpNorm(2.0, GridFunction::ComputeGradError(exgrad,irs), + pfes->GetComm()); + } + + /// Returns ||curl u_ex - curl u_h||_L2 for ND elements +- virtual double ComputeCurlError(VectorCoefficient *excurl, +- const IntegrationRule *irs[] = NULL) const ++ double ComputeCurlError(VectorCoefficient *excurl, ++ const IntegrationRule *irs[] = NULL) const override + { + return GlobalLpNorm(2.0, GridFunction::ComputeCurlError(excurl,irs), + pfes->GetComm()); + } + + /// Returns ||div u_ex - div u_h||_L2 for RT elements +- virtual double ComputeDivError(Coefficient *exdiv, +- const IntegrationRule *irs[] = NULL) const ++ double ComputeDivError(Coefficient *exdiv, ++ const IntegrationRule *irs[] = NULL) const override + { + return GlobalLpNorm(2.0, GridFunction::ComputeDivError(exdiv,irs), + pfes->GetComm()); + } + + /// Returns the Face Jumps error for L2 elements +- virtual double ComputeDGFaceJumpError(Coefficient *exsol, +- Coefficient *ell_coeff, +- JumpScaling jump_scaling, +- const IntegrationRule *irs[]=NULL) +- const; ++ double ComputeDGFaceJumpError(Coefficient *exsol, ++ Coefficient *ell_coeff, ++ JumpScaling jump_scaling, ++ const IntegrationRule *irs[]=NULL) const override; + + /// Returns either the H1-seminorm or the DG Face Jumps error or both + /// depending on norm_type = 1, 2, 3 +- virtual double ComputeH1Error(Coefficient *exsol, VectorCoefficient *exgrad, +- Coefficient *ell_coef, double Nu, +- int norm_type) const ++ double ComputeH1Error(Coefficient *exsol, VectorCoefficient *exgrad, ++ Coefficient *ell_coef, double Nu, ++ int norm_type) const override + { + return GlobalLpNorm(2.0, + GridFunction::ComputeH1Error(exsol,exgrad,ell_coef, +@@ -352,56 +351,56 @@ public: + + /// Returns the error measured in H1-norm for H1 elements or in "broken" + /// H1-norm for L2 elements +- virtual double ComputeH1Error(Coefficient *exsol, VectorCoefficient *exgrad, +- const IntegrationRule *irs[] = NULL) const ++ double ComputeH1Error(Coefficient *exsol, VectorCoefficient *exgrad, ++ const IntegrationRule *irs[] = NULL) const override + { + return GlobalLpNorm(2.0, GridFunction::ComputeH1Error(exsol,exgrad,irs), + pfes->GetComm()); + } + + /// Returns the error measured H(div)-norm for RT elements +- virtual double ComputeHDivError(VectorCoefficient *exsol, +- Coefficient *exdiv, +- const IntegrationRule *irs[] = NULL) const ++ double ComputeHDivError(VectorCoefficient *exsol, ++ Coefficient *exdiv, ++ const IntegrationRule *irs[] = NULL) const override + { + return GlobalLpNorm(2.0, GridFunction::ComputeHDivError(exsol,exdiv,irs), + pfes->GetComm()); + } + + /// Returns the error measured H(curl)-norm for ND elements +- virtual double ComputeHCurlError(VectorCoefficient *exsol, +- VectorCoefficient *excurl, +- const IntegrationRule *irs[] = NULL) const ++ double ComputeHCurlError(VectorCoefficient *exsol, ++ VectorCoefficient *excurl, ++ const IntegrationRule *irs[] = NULL) const override + { + return GlobalLpNorm(2.0, + GridFunction::ComputeHCurlError(exsol,excurl,irs), + pfes->GetComm()); + } + +- virtual double ComputeMaxError(Coefficient *exsol[], +- const IntegrationRule *irs[] = NULL) const ++ double ComputeMaxError(Coefficient *exsol[], ++ const IntegrationRule *irs[] = NULL) const override + { + return GlobalLpNorm(infinity(), + GridFunction::ComputeMaxError(exsol, irs), + pfes->GetComm()); + } + +- virtual double ComputeMaxError(Coefficient &exsol, +- const IntegrationRule *irs[] = NULL) const ++ double ComputeMaxError(Coefficient &exsol, ++ const IntegrationRule *irs[] = NULL) 
const override + { + return ComputeLpError(infinity(), exsol, NULL, irs); + } + +- virtual double ComputeMaxError(VectorCoefficient &exsol, +- const IntegrationRule *irs[] = NULL) const ++ double ComputeMaxError(VectorCoefficient &exsol, ++ const IntegrationRule *irs[] = NULL) const override + { + return ComputeLpError(infinity(), exsol, NULL, NULL, irs); + } + +- virtual double ComputeLpError(const double p, Coefficient &exsol, +- Coefficient *weight = NULL, +- const IntegrationRule *irs[] = NULL, +- const Array *elems = NULL) const ++ double ComputeLpError(const double p, Coefficient &exsol, ++ Coefficient *weight = NULL, ++ const IntegrationRule *irs[] = NULL, ++ const Array *elems = NULL) const override + { + return GlobalLpNorm(p, GridFunction::ComputeLpError(p, exsol, weight, irs, + elems), pfes->GetComm()); +@@ -410,23 +409,23 @@ public: + /** When given a vector weight, compute the pointwise (scalar) error as the + dot product of the vector error with the vector weight. Otherwise, the + scalar error is the l_2 norm of the vector error. */ +- virtual double ComputeLpError(const double p, VectorCoefficient &exsol, +- Coefficient *weight = NULL, +- VectorCoefficient *v_weight = NULL, +- const IntegrationRule *irs[] = NULL) const ++ double ComputeLpError(const double p, VectorCoefficient &exsol, ++ Coefficient *weight = NULL, ++ VectorCoefficient *v_weight = NULL, ++ const IntegrationRule *irs[] = NULL) const override + { + return GlobalLpNorm(p, GridFunction::ComputeLpError( + p, exsol, weight, v_weight, irs), pfes->GetComm()); + } + +- virtual void ComputeFlux(BilinearFormIntegrator &blfi, +- GridFunction &flux, +- bool wcoef = true, int subdomain = -1); ++ void ComputeFlux(BilinearFormIntegrator &blfi, ++ GridFunction &flux, ++ bool wcoef = true, int subdomain = -1) override; + + /** Save the local portion of the ParGridFunction. This differs from the + serial GridFunction::Save in that it takes into account the signs of + the local dofs. */ +- virtual void Save(std::ostream &out) const; ++ void Save(std::ostream &out) const override; + + /// Save the ParGridFunction to a single file (written using MPI rank 0). The + /// given @a precision will be used for ASCII output. +@@ -435,7 +434,7 @@ public: + /// Save the ParGridFunction to files (one for each MPI rank). The files will + /// be given suffixes according to the MPI rank. The given @a precision will + /// be used for ASCII output. +- virtual void Save(const char *fname, int precision=16) const; ++ void Save(const char *fname, int precision=16) const override; + + /// Returns a GridFunction on MPI rank @a save_rank that does not have any + /// duplication of vertices/nodes at processor boundaries. +@@ -452,15 +451,16 @@ public: + /** Save the local portion of the ParGridFunction. This differs from the + serial GridFunction::Save in that it takes into account the signs of + the local dofs. 
*/ +- virtual void Save( ++ void Save( + adios2stream &out, const std::string &variable_name, +- const adios2stream::data_type type = adios2stream::data_type::point_data) const; ++ const adios2stream::data_type type = adios2stream::data_type::point_data) const ++ override; + #endif + + /// Merge the local grid functions + void SaveAsOne(std::ostream &out = mfem::out) const; + +- virtual ~ParGridFunction() { } ++ virtual ~ParGridFunction() = default; + }; + + +diff --git a/general/communication.hpp b/general/communication.hpp +index 46d4f9f21..a8fb8a1f6 100644 +--- a/general/communication.hpp ++++ b/general/communication.hpp +@@ -561,8 +561,8 @@ struct VarMessage + } + + protected: +- virtual void Encode(int rank) {} +- virtual void Decode(int rank) {} ++ virtual void Encode(int rank) = 0; ++ virtual void Decode(int rank) = 0; + }; + + +diff --git a/general/hash.hpp b/general/hash.hpp +index 288d51288..b517172aa 100644 +--- a/general/hash.hpp ++++ b/general/hash.hpp +@@ -335,6 +335,8 @@ public: + + iterator begin() { return iterator(Base::begin()); } + iterator end() { return iterator(); } ++ const_iterator begin() const { return const_iterator(Base::cbegin()); } ++ const_iterator end() const { return const_iterator(); } + + const_iterator cbegin() const { return const_iterator(Base::cbegin()); } + const_iterator cend() const { return const_iterator(); } +diff --git a/mesh/element.hpp b/mesh/element.hpp +index f1b003cae..ccd72724a 100644 +--- a/mesh/element.hpp ++++ b/mesh/element.hpp +@@ -57,12 +57,15 @@ public: + /// Set element's attribute. + inline void SetAttribute(const int attr) { attribute = attr; } + +- /// Set the indices the element according to the input. +- virtual void SetVertices(const int *ind); +- +- /// Returns element's vertices. ++ /// Get the indices defining the vertices + virtual void GetVertices(Array &v) const = 0; + ++ /// Set the indices defining the vertices ++ virtual void SetVertices(const Array &v) = 0; ++ ++ /// Set the indices the element according to the input. ++ virtual void SetVertices(const int *ind) = 0; ++ + /// @note The returned array should NOT be deleted by the caller. + virtual int *GetVertices() = 0; + +diff --git a/mesh/hexahedron.cpp b/mesh/hexahedron.cpp +index beeab3b6a..e86e209c1 100644 +--- a/mesh/hexahedron.cpp ++++ b/mesh/hexahedron.cpp +@@ -43,10 +43,18 @@ Hexahedron::Hexahedron(int ind1, int ind2, int ind3, int ind4, + void Hexahedron::GetVertices(Array &v) const + { + v.SetSize(8); +- for (int i = 0; i < 8; i++) +- { +- v[i] = indices[i]; +- } ++ std::copy(indices, indices + 8, v.begin()); ++} ++ ++void Hexahedron::SetVertices(const Array &v) ++{ ++ MFEM_ASSERT(v.Size() == 8, "!"); ++ std::copy(v.begin(), v.end(), indices); ++} ++ ++void Hexahedron::SetVertices(const int *ind) ++{ ++ std::copy(ind, ind + 8, indices); + } + + TriLinear3DFiniteElement HexahedronFE; +diff --git a/mesh/hexahedron.hpp b/mesh/hexahedron.hpp +index a8186c0c8..450cac0ce 100644 +--- a/mesh/hexahedron.hpp ++++ b/mesh/hexahedron.hpp +@@ -37,35 +37,42 @@ public: + int ind5, int ind6, int ind7, int ind8, int attr = 1); + + /// Return element's type +- Type GetType() const { return Element::HEXAHEDRON; } ++ Type GetType() const override { return Element::HEXAHEDRON; } + +- /// Returns the indices of the element's vertices. 
+- virtual void GetVertices(Array &v) const; ++ /// Get the indices defining the vertices ++ void GetVertices(Array &v) const override; + +- virtual int *GetVertices() { return indices; } ++ /// Set the indices defining the vertices ++ void SetVertices(const Array &v) override; + +- virtual int GetNVertices() const { return 8; } ++ /// @note The returned array should NOT be deleted by the caller. ++ int * GetVertices () override { return indices; } + +- virtual int GetNEdges() const { return 12; } ++ /// Set the vertices according to the given input. ++ void SetVertices(const int *ind) override; + +- virtual const int *GetEdgeVertices(int ei) const ++ int GetNVertices() const override { return 8; } ++ ++ int GetNEdges() const override { return 12; } ++ ++ const int *GetEdgeVertices(int ei) const override + { return geom_t::Edges[ei]; } + + /// @deprecated Use GetNFaces(void) and GetNFaceVertices(int) instead. +- MFEM_DEPRECATED virtual int GetNFaces(int &nFaceVertices) const ++ MFEM_DEPRECATED int GetNFaces(int &nFaceVertices) const override + { nFaceVertices = 4; return 6; } + +- virtual int GetNFaces() const { return 6; } ++ int GetNFaces() const override { return 6; } + +- virtual int GetNFaceVertices(int) const { return 4; } ++ int GetNFaceVertices(int) const override { return 4; } + +- virtual const int *GetFaceVertices(int fi) const ++ const int *GetFaceVertices(int fi) const override + { return geom_t::FaceVert[fi]; } + +- virtual Element *Duplicate(Mesh *m) const ++ Element *Duplicate(Mesh *m) const override + { return new Hexahedron(indices, attribute); } + +- virtual ~Hexahedron() { } ++ virtual ~Hexahedron() = default; + }; + + extern MFEM_EXPORT class TriLinear3DFiniteElement HexahedronFE; +diff --git a/mesh/mesh.cpp b/mesh/mesh.cpp +index 5f82de812..bf2fce576 100644 +--- a/mesh/mesh.cpp ++++ b/mesh/mesh.cpp +@@ -1415,6 +1415,7 @@ Geometry::Type Mesh::GetFaceGeometry(int Face) const + } + // ghost face + const int nc_face_id = faces_info[Face].NCFace; ++ + MFEM_ASSERT(nc_face_id >= 0, "parent ghost faces are not supported"); + return faces[nc_faces_info[nc_face_id].MasterFace]->GetGeometryType(); + } +@@ -1889,9 +1890,9 @@ int Mesh::AddBdrPoint(int v, int attr) + + void Mesh::GenerateBoundaryElements() + { +- for (int i = 0; i < boundary.Size(); i++) ++ for (auto &b : boundary) + { +- FreeElement(boundary[i]); ++ FreeElement(b); + } + + if (Dim == 3) +@@ -1902,9 +1903,9 @@ void Mesh::GenerateBoundaryElements() + + // count the 'NumOfBdrElements' + NumOfBdrElements = 0; +- for (int i = 0; i < faces_info.Size(); i++) ++ for (const auto &fi : faces_info) + { +- if (faces_info[i].Elem2No < 0) { NumOfBdrElements++; } ++ if (fi.Elem2No < 0) { ++NumOfBdrElements; } + } + + // Add the boundary elements +@@ -4403,7 +4404,7 @@ Mesh::Mesh(Mesh *orig_mesh, int ref_factor, int ref_type) + MakeRefined_(*orig_mesh, ref_factors, ref_type); + } + +-void Mesh::MakeRefined_(Mesh &orig_mesh, const Array ref_factors, ++void Mesh::MakeRefined_(Mesh &orig_mesh, const Array &ref_factors, + int ref_type) + { + SetEmpty(); +@@ -6189,22 +6190,22 @@ int Mesh::CheckBdrElementOrientation(bool fix_it) + { + // swap vertices 0 and 1 so that we don't change the marked edge: + // (0,1,2) -> (1,0,2) +- mfem::Swap(bv[0], bv[1]); ++ mfem::Swap(bv[0], bv[1]); + if (bel_to_edge) + { + int *be = bel_to_edge->GetRow(i); +- mfem::Swap(be[1], be[2]); ++ mfem::Swap(be[1], be[2]); + } + break; + } + case Element::QUADRILATERAL: + { +- mfem::Swap(bv[0], bv[2]); ++ mfem::Swap(bv[0], bv[2]); + if (bel_to_edge) + { + int *be 
= bel_to_edge->GetRow(i); +- mfem::Swap(be[0], be[1]); +- mfem::Swap(be[2], be[3]); ++ mfem::Swap(be[0], be[1]); ++ mfem::Swap(be[2], be[3]); + } + break; + } +@@ -6997,26 +6998,27 @@ void Mesh::AddQuadFaceElement(int lf, int gf, int el, + + void Mesh::GenerateFaces() + { +- int i, nfaces = GetNumFaces(); ++ int nfaces = GetNumFaces(); + +- for (i = 0; i < faces.Size(); i++) ++ for (auto &f : faces) + { +- FreeElement(faces[i]); ++ FreeElement(f); + } + + // (re)generate the interior faces and the info for them + faces.SetSize(nfaces); + faces_info.SetSize(nfaces); +- for (i = 0; i < nfaces; i++) ++ for (int i = 0; i < nfaces; ++i) + { + faces[i] = NULL; + faces_info[i].Elem1No = -1; + faces_info[i].NCFace = -1; + } +- for (i = 0; i < NumOfElements; i++) ++ ++ Array v; ++ for (int i = 0; i < NumOfElements; ++i) + { +- const int *v = elements[i]->GetVertices(); +- const int *ef; ++ elements[i]->GetVertices(v); + if (Dim == 1) + { + AddPointFaceElement(0, v[0], i); +@@ -7024,7 +7026,7 @@ void Mesh::GenerateFaces() + } + else if (Dim == 2) + { +- ef = el_to_edge->GetRow(i); ++ const int * const ef = el_to_edge->GetRow(i); + const int ne = elements[i]->GetNEdges(); + for (int j = 0; j < ne; j++) + { +@@ -7034,7 +7036,7 @@ void Mesh::GenerateFaces() + } + else + { +- ef = el_to_face->GetRow(i); ++ const int * const ef = el_to_face->GetRow(i); + switch (GetElementType(i)) + { + case Element::TETRAHEDRON: +@@ -7100,9 +7102,9 @@ void Mesh::GenerateNCFaceInfo() + { + MFEM_VERIFY(ncmesh, "missing NCMesh."); + +- for (int i = 0; i < faces_info.Size(); i++) ++ for (auto &x : faces_info) + { +- faces_info[i].NCFace = -1; ++ x.NCFace = -1; + } + + const NCMesh::NCList &list = +@@ -7114,9 +7116,8 @@ void Mesh::GenerateNCFaceInfo() + int nfaces = GetNumFaces(); + + // add records for master faces +- for (int i = 0; i < list.masters.Size(); i++) ++ for (const NCMesh::Master &master : list.masters) + { +- const NCMesh::Master &master = list.masters[i]; + if (master.index >= nfaces) { continue; } + + FaceInfo &master_fi = faces_info[master.index]; +@@ -7128,10 +7129,8 @@ void Mesh::GenerateNCFaceInfo() + } + + // add records for slave faces +- for (int i = 0; i < list.slaves.Size(); i++) ++ for (const NCMesh::Slave &slave : list.slaves) + { +- const NCMesh::Slave &slave = list.slaves[i]; +- + if (slave.index < 0 || // degenerate slave face + slave.index >= nfaces || // ghost slave + slave.master >= nfaces) // has ghost master +@@ -7222,7 +7221,7 @@ STable3D *Mesh::GetFacesTable() + + STable3D *Mesh::GetElementToFaceTable(int ret_ftbl) + { +- int i, *v; ++ Array v; + STable3D *faces_tbl; + + if (el_to_face != NULL) +@@ -7231,9 +7230,9 @@ STable3D *Mesh::GetElementToFaceTable(int ret_ftbl) + } + el_to_face = new Table(NumOfElements, 6); // must be 6 for hexahedra + faces_tbl = new STable3D(NumOfVertices); +- for (i = 0; i < NumOfElements; i++) ++ for (int i = 0; i < NumOfElements; i++) + { +- v = elements[i]->GetVertices(); ++ elements[i]->GetVertices(v); + switch (GetElementType(i)) + { + case Element::TETRAHEDRON: +@@ -7297,9 +7296,10 @@ STable3D *Mesh::GetElementToFaceTable(int ret_ftbl) + el_to_face->Finalize(); + NumOfFaces = faces_tbl->NumberOfElements(); + be_to_face.SetSize(NumOfBdrElements); +- for (i = 0; i < NumOfBdrElements; i++) ++ ++ for (int i = 0; i < NumOfBdrElements; i++) + { +- v = boundary[i]->GetVertices(); ++ boundary[i]->GetVertices(v); + switch (GetBdrElementType(i)) + { + case Element::TRIANGLE: +diff --git a/mesh/mesh.hpp b/mesh/mesh.hpp +index a6957720a..81324399b 100644 +--- 
a/mesh/mesh.hpp ++++ b/mesh/mesh.hpp +@@ -592,7 +592,7 @@ protected: + void Make1D(int n, double sx = 1.0); + + /// Internal function used in Mesh::MakeRefined +- void MakeRefined_(Mesh &orig_mesh, const Array ref_factors, ++ void MakeRefined_(Mesh &orig_mesh, const Array &ref_factors, + int ref_type); + + /// Initialize vertices/elements/boundary/tables from a nonconforming mesh. +@@ -1698,7 +1698,7 @@ public: + }; + + /** @brief This structure is used as a human readable output format that +- decipheres the information contained in Mesh::FaceInfo when using the ++ deciphers the information contained in Mesh::FaceInfo when using the + Mesh::GetFaceInformation() method. + + The element indices in this structure don't need further processing, +diff --git a/mesh/ncmesh.cpp b/mesh/ncmesh.cpp +index ecb5fb90b..5c57adc4a 100644 +--- a/mesh/ncmesh.cpp ++++ b/mesh/ncmesh.cpp +@@ -19,6 +19,36 @@ + + #include "ncmesh_tables.hpp" + ++ ++namespace ++{ ++/** ++ * @brief Base case of convenience variadic max function. ++ * ++ * @tparam T Base type ++ * @param arg Recursion base value ++ * @return T value to max over ++ */ ++template ++T max(T&& arg) ++{ ++ return arg; ++} ++/** ++ * @brief Convenience variadic max function. ++ * ++ * @tparam T Base Type ++ * @tparam Ts Parameter pack of other types ++ * @param arg Singular argument ++ * @param args Pack of arguments ++ * @return T maximum value ++ */ ++template ++T max(T arg, Ts... args) ++{ ++ return std::max(std::forward(arg), max(args...)); ++} ++} // namespace + namespace mfem + { + +@@ -2449,18 +2479,21 @@ void NCMesh::GetMeshComponents(Mesh &mesh) const + // left uninitialized here; they will be initialized later by the Mesh from + // Nodes -- here we just make sure mesh.vertices has the correct size. + +- for (int i = 0; i < mesh.NumOfElements; i++) ++ for (auto &elem : mesh.elements) + { +- mesh.FreeElement(mesh.elements[i]); ++ mesh.FreeElement(elem); + } + mesh.elements.SetSize(0); + +- for (int i = 0; i < mesh.NumOfBdrElements; i++) ++ for (auto &elem : mesh.boundary) + { +- mesh.FreeElement(mesh.boundary[i]); ++ mesh.FreeElement(elem); + } + mesh.boundary.SetSize(0); + ++ // Save off boundary face vertices to make boundary elements later. ++ std::map> unique_boundary_faces; ++ + // create an mfem::Element for each leaf Element + for (int i = 0; i < NElements; i++) + { +@@ -2478,65 +2511,83 @@ void NCMesh::GetMeshComponents(Mesh &mesh) const + elem->GetVertices()[j] = nodes[node[j]].vert_index; + } + +- // create boundary elements +- // TODO: use boundary_faces? 
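The anonymous-namespace max helper added to ncmesh.cpp above has lost its template parameter lists in this rendering of the diff. Reconstructed with the angle brackets restored (a sketch under that assumption, not a verbatim quote of the patch), the two overloads read roughly:

   #include <algorithm>

   namespace
   {
   // Base case: a single value is its own maximum.
   template <typename T>
   T max(T &&arg)
   {
      return arg;
   }

   // Recursive case: peel one argument off the pack and recurse over the rest.
   template <typename T, typename... Ts>
   T max(T arg, Ts... args)
   {
      return std::max(arg, max(args...));
   }
   } // namespace

This is what allows the fixed-arity max4/max6/max8 helpers further down to be deleted: a call such as max(elevel[0], elevel[1], elevel[2]) now works for any argument count.
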
+- for (int k = 0; k < gi.nf; k++) ++ // Loop over faces and collect those marked as boundaries ++ for (int k = 0; k < gi.nf; ++k) + { +- const int* fv = gi.faces[k]; + const int nfv = gi.nfv[k]; +- const Face* face = faces.Find(node[fv[0]], node[fv[1]], +- node[fv[2]], node[fv[3]]); +- if (face->Boundary()) ++ const int * const fv = gi.faces[k]; ++ const auto id = faces.FindId(node[fv[0]], node[fv[1]], node[fv[2]], ++ node[fv[3]]); ++ if (id >= 0 && faces[id].Boundary()) + { +- if ((nc_elem.geom == Geometry::CUBE) || +- ((nc_elem.geom == Geometry::PRISM || +- nc_elem.geom == Geometry::PYRAMID) && nfv == 4)) +- { +- auto* quad = (Quadrilateral*) mesh.NewElement(Geometry::SQUARE); +- quad->SetAttribute(face->attribute); +- for (int j = 0; j < 4; j++) +- { +- quad->GetVertices()[j] = nodes[node[fv[j]]].vert_index; +- } +- mesh.boundary.Append(quad); +- } +- else if (nc_elem.geom == Geometry::PRISM || +- nc_elem.geom == Geometry::PYRAMID || +- nc_elem.geom == Geometry::TETRAHEDRON) ++ const auto &face = faces[id]; ++ if (face.elem[0] >= 0 && face.elem[1] >= 0 && ++ nc_elem.rank != std::min(elements[face.elem[0]].rank, ++ elements[face.elem[1]].rank)) + { +- MFEM_ASSERT(nfv == 3, ""); +- auto* tri = (Triangle*) mesh.NewElement(Geometry::TRIANGLE); +- tri->SetAttribute(face->attribute); +- for (int j = 0; j < 3; j++) +- { +- tri->GetVertices()[j] = nodes[node[fv[j]]].vert_index; +- } +- mesh.boundary.Append(tri); ++ // This is a conformal internal face, but this element is not the lowest ++ // ranking attached processor, thus not the owner of the face. ++ // Consequently, we do not add this face to avoid double ++ // counting. ++ continue; + } +- else if (nc_elem.geom == Geometry::SQUARE || +- nc_elem.geom == Geometry::TRIANGLE) ++ ++ // Add in all boundary faces that are actual boundaries or not masters of another face. ++ // The fv[2] in the edge split is on purpose. ++ if ((nfv == 4 && ++ QuadFaceNotMaster(node[fv[0]], node[fv[1]], node[fv[2]], node[fv[3]])) ++ || (nfv == 3 && TriFaceNotMaster(node[fv[0]], node[fv[1]], node[fv[2]])) ++ || (nfv == 2 && ++ EdgeSplitLevel(node[fv[0]], node[fv[2]] /* [2] not an error */) == 0)) + { +- auto* segment = (Segment*) mesh.NewElement(Geometry::SEGMENT); +- segment->SetAttribute(face->attribute); +- for (int j = 0; j < 2; j++) ++ // This face has no split faces below, it is conformal or a ++ // slave. ++ unique_boundary_faces[id].SetSize(nfv); ++ for (int v = 0; v < nfv; ++v) + { +- segment->GetVertices()[j] = nodes[node[fv[2*j]]].vert_index; ++ // Using a map overwrites if a face is visited twice. ++ // The nfv==2 is necessary because faces of 2D are storing the ++ // second index in the 2 slot, not the 1 slot. ++ unique_boundary_faces[id][v] = nodes[node[fv[(nfv==2) ? 
2*v : v]]].vert_index; + } +- mesh.boundary.Append(segment); +- } +- else +- { +- MFEM_ASSERT(nc_elem.geom == Geometry::SEGMENT, ""); +- auto* point = (mfem::Point*) mesh.NewElement(Geometry::POINT); +- point->SetAttribute(face->attribute); +- point->GetVertices()[0] = nodes[node[fv[0]]].vert_index; +- mesh.boundary.Append(point); + } + } + } + } ++ ++ auto geom_from_nfv = [](int nfv) ++ { ++ switch (nfv) ++ { ++ case 1: return Geometry::POINT; ++ case 2: return Geometry::SEGMENT; ++ case 3: return Geometry::TRIANGLE; ++ case 4: return Geometry::SQUARE; ++ } ++ return Geometry::INVALID; ++ }; ++ ++ for (const auto &fv : unique_boundary_faces) ++ { ++ const auto f = fv.first; ++ const auto &v = fv.second; ++ const auto &face = faces.At(f); ++ ++ auto geom = geom_from_nfv(v.Size()); ++ ++ MFEM_ASSERT(geom != Geometry::INVALID, ++ "nfv: " << v.Size() << ++ " does not match a valid face geometry: Quad, Tri, Segment, Point"); ++ ++ // Add a new boundary element, with matching attribute and vertices ++ mesh.boundary.Append(mesh.NewElement(geom)); ++ auto * const be = mesh.boundary.Last(); ++ be->SetAttribute(face.attribute); ++ be->SetVertices(v); ++ } + } + ++ + void NCMesh::OnMeshUpdated(Mesh *mesh) + { + //// PART 1: pull indices of regular edges/faces from the Mesh +@@ -2651,13 +2702,14 @@ void NCMesh::OnMeshUpdated(Mesh *mesh) + for (int j = 0; j < gi.nf; j++) + { + const int *fv = gi.faces[j]; +- Face* face = faces.Find(el.node[fv[0]], el.node[fv[1]], +- el.node[fv[2]], el.node[fv[3]]); +- MFEM_ASSERT(face, "face not found!"); ++ int fid = faces.FindId(el.node[fv[0]], el.node[fv[1]], ++ el.node[fv[2]], el.node[fv[3]]); ++ MFEM_ASSERT(fid >= 0, "face not found!"); ++ auto &face = faces[fid]; + +- if (face->index < 0) ++ if (face.index < 0) + { +- face->index = NFaces + (nghosts++); ++ face.index = NFaces + (nghosts++); + + // store the face geometry + static const Geometry::Type types[5] = +@@ -2665,7 +2717,7 @@ void NCMesh::OnMeshUpdated(Mesh *mesh) + Geometry::INVALID, Geometry::INVALID, + Geometry::SEGMENT, Geometry::TRIANGLE, Geometry::SQUARE + }; +- face_geom[face->index] = types[gi.nfv[j]]; ++ face_geom[face.index] = types[gi.nfv[j]]; + } + } + } +@@ -2741,7 +2793,7 @@ bool NCMesh::TriFaceSplit(int v1, int v2, int v3, int mid[3]) const + + if (mid) { mid[0] = e1, mid[1] = e2, mid[2] = e3; } + +- // NOTE: face (v1, v2, v3) still needs to be checked ++ // This is necessary but not sufficient to determine if a face has been split. + return true; + } + +@@ -3157,6 +3209,7 @@ void NCMesh::BuildFaceList() + int fgeom = (node[3] >= 0) ? Geometry::SQUARE : Geometry::TRIANGLE; + + Face &fa = faces[face]; ++ bool is_master = false; + if (fa.elem[0] >= 0 && fa.elem[1] >= 0) + { + // this is a conforming face, add it to the list +@@ -3183,6 +3236,7 @@ void NCMesh::BuildFaceList() + if (sb < se) + { + // found slaves, so this is a master face; add it to the list ++ is_master = true; + face_list.masters.Append( + Master(fa.index, elem, j, fgeom, sb, se)); + +@@ -3194,7 +3248,8 @@ void NCMesh::BuildFaceList() + } + } + +- if (fa.Boundary()) { boundary_faces.Append(face); } ++ // To support internal boundaries can only insert non-master faces. 
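The rewritten boundary-element construction above works in two phases: it first records each boundary face once in a map keyed by the face id, so a face reached from both neighbouring elements (or revisited as a slave) is stored exactly once, and then appends one boundary element per map entry, picking the geometry from the recorded vertex count via the geom_from_nfv lambda. A stand-alone sketch of the dedup-by-id idea, with hypothetical plain-STL stand-ins for the library's face ids and Array class:

   #include <map>
   #include <utility>
   #include <vector>

   using FaceId = int;
   using VertexList = std::vector<int>;

   // Phase 1: re-recording the same face id simply overwrites the same entry,
   // so nothing is double counted.
   void RecordBoundaryFace(std::map<FaceId, VertexList> &unique_faces,
                           FaceId id, VertexList verts)
   {
      unique_faces[id] = std::move(verts);
   }

   // Phase 2 (in the patch): the entry's vertex count (1/2/3/4) selects the geometry
   // (point/segment/triangle/quad) and one mesh.boundary element is appended per entry.
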
++ if (fa.Boundary() && !is_master) { boundary_faces.Append(face); } + } + } + +@@ -3270,20 +3325,22 @@ void NCMesh::BuildEdgeList() + // tell ParNCMesh about the edge + ElementSharesEdge(elem, j, enode); + +- // (2D only, store boundary faces) +- if (Dim <= 2) +- { +- int face = faces.FindId(node[0], node[0], node[1], node[1]); +- MFEM_ASSERT(face >= 0, "face not found!"); +- if (faces[face].Boundary()) { boundary_faces.Append(face); } +- } +- + // store element/local for later + edge_element[nd.edge_index] = elem; + edge_local[nd.edge_index] = j; + + // skip slave edges here, they will be reached from their masters +- if (GetEdgeMaster(enode) >= 0) { continue; } ++ if (GetEdgeMaster(enode) >= 0) ++ { ++ // (2D only, store internal boundary faces) ++ if (Dim <= 2) ++ { ++ int face = faces.FindId(node[0], node[0], node[1], node[1]); ++ MFEM_ASSERT(face >= 0, "face not found!"); ++ if (faces[face].Boundary()) { boundary_faces.Append(face); } ++ } ++ continue; ++ } + + // have we already processed this edge? skip if yes + if (processed_edges[enode]) { continue; } +@@ -3316,6 +3373,13 @@ void NCMesh::BuildEdgeList() + { + // no slaves, this is a conforming edge + edge_list.conforming.Append(MeshId(nd.edge_index, elem, j)); ++ // (2D only, store boundary faces) ++ if (Dim <= 2) ++ { ++ int face = faces.FindId(node[0], node[0], node[1], node[1]); ++ MFEM_ASSERT(face >= 0, "face not found!"); ++ if (faces[face].Boundary()) { boundary_faces.Append(face); } ++ } + } + } + } +@@ -3477,7 +3541,6 @@ NCMesh::NCList::BuildIndex() const + inv_index.emplace(slaves[i].index, std::make_pair(MeshIdType::SLAVE, i)); + } + } +- + MFEM_ASSERT(inv_index.size() > 0, + "Empty inverse index, member lists must be populated before BuildIndex is called!"); + } +@@ -5195,22 +5258,23 @@ void NCMesh::FindFaceNodes(int face, int node[4]) + } + + void NCMesh::GetBoundaryClosure(const Array &bdr_attr_is_ess, +- Array &bdr_vertices, Array &bdr_edges) ++ Array &bdr_vertices, Array &bdr_edges, ++ Array &bdr_faces) + { + bdr_vertices.SetSize(0); + bdr_edges.SetSize(0); ++ bdr_faces.SetSize(0); + + if (Dim == 3) + { + GetFaceList(); // make sure 'boundary_faces' is up to date + +- for (int i = 0; i < boundary_faces.Size(); i++) ++ for (int f : boundary_faces) + { +- int face = boundary_faces[i]; +- if (bdr_attr_is_ess[faces[face].attribute - 1]) ++ if (bdr_attr_is_ess[faces[f].attribute - 1]) + { + int node[4]; +- FindFaceNodes(face, node); ++ FindFaceNodes(f, node); + int nfv = (node[3] < 0) ? 
3 : 4; + + for (int j = 0; j < nfv; j++) +@@ -5228,6 +5292,17 @@ void NCMesh::GetBoundaryClosure(const Array &bdr_attr_is_ess, + bdr_edges.Append(nodes[enode].edge_index); + } + } ++ ++ // If the face is a slave face, collect any non-ghost master face ++ const Face &face = faces[f]; ++ ++ const auto id_and_type = GetFaceList().GetMeshIdAndType(face.index); ++ if (id_and_type.type == NCList::MeshIdType::SLAVE) ++ { ++ // A slave face must mark any masters ++ const auto &slave_face_id = static_cast(*id_and_type.id); ++ bdr_faces.Append(slave_face_id.master); ++ } + } + } + } +@@ -5235,36 +5310,38 @@ void NCMesh::GetBoundaryClosure(const Array &bdr_attr_is_ess, + { + GetEdgeList(); // make sure 'boundary_faces' is up to date + +- for (int i = 0; i < boundary_faces.Size(); i++) ++ for (int f : boundary_faces) + { +- int face = boundary_faces[i]; +- Face &fc = faces[face]; +- if (bdr_attr_is_ess[fc.attribute - 1]) ++ Face &face = faces[f]; ++ if (bdr_attr_is_ess[face.attribute - 1]) ++ { ++ bdr_vertices.Append(nodes[face.p1].vert_index); ++ bdr_vertices.Append(nodes[face.p3].vert_index); ++ } ++ ++ const auto id_and_type = GetEdgeList().GetMeshIdAndType(face.index); ++ if (id_and_type.type == NCList::MeshIdType::SLAVE) + { +- bdr_vertices.Append(nodes[fc.p1].vert_index); +- bdr_vertices.Append(nodes[fc.p3].vert_index); ++ // A slave face must mark any masters ++ const auto &slave_edge_id = static_cast(*id_and_type.id); ++ bdr_edges.Append(slave_edge_id.master); + } + } + } + +- bdr_vertices.Sort(); +- bdr_vertices.Unique(); +- +- bdr_edges.Sort(); +- bdr_edges.Unique(); +-} ++ // Filter, sort and unique an array, so it contains only local unique values. ++ auto FilterSortUnique = [](Array &v, int N) ++ { ++ // Perform the O(N) filter before the O(NlogN) sort. ++ // begin -> it is only entries < N. 
++ auto it = std::remove_if(v.begin(), v.end(), [N](int i) { return i >= N; }); ++ std::sort(v.begin(), it); ++ v.SetSize(std::distance(v.begin(), std::unique(v.begin(), it))); ++ }; + +-static int max4(int a, int b, int c, int d) +-{ +- return std::max(std::max(a, b), std::max(c, d)); +-} +-static int max6(int a, int b, int c, int d, int e, int f) +-{ +- return std::max(max4(a, b, c, d), std::max(e, f)); +-} +-static int max8(int a, int b, int c, int d, int e, int f, int g, int h) +-{ +- return std::max(max4(a, b, c, d), max4(e, f, g, h)); ++ FilterSortUnique(bdr_vertices, NVertices); ++ FilterSortUnique(bdr_edges, NEdges); ++ FilterSortUnique(bdr_faces, NFaces); + } + + int NCMesh::EdgeSplitLevel(int vn1, int vn2) const +@@ -5280,15 +5357,13 @@ int NCMesh::TriFaceSplitLevel(int vn1, int vn2, int vn3) const + if (TriFaceSplit(vn1, vn2, vn3, mid) && + faces.FindId(vn1, vn2, vn3) < 0) + { +- return 1 + max4(TriFaceSplitLevel(vn1, mid[0], mid[2]), +- TriFaceSplitLevel(mid[0], vn2, mid[1]), +- TriFaceSplitLevel(mid[2], mid[1], vn3), +- TriFaceSplitLevel(mid[0], mid[1], mid[2])); +- } +- else // not split +- { +- return 0; ++ return 1 + max(TriFaceSplitLevel(vn1, mid[0], mid[2]), ++ TriFaceSplitLevel(mid[0], vn2, mid[1]), ++ TriFaceSplitLevel(mid[2], mid[1], vn3), ++ TriFaceSplitLevel(mid[0], mid[1], mid[2])); + } ++ ++ return 0; // not split + } + + void NCMesh::QuadFaceSplitLevel(int vn1, int vn2, int vn3, int vn4, +@@ -5318,6 +5393,13 @@ void NCMesh::QuadFaceSplitLevel(int vn1, int vn2, int vn3, int vn4, + } + } + ++int NCMesh::QuadFaceSplitLevel(int vn1, int vn2, int vn3, int vn4) const ++{ ++ int h_level, v_level; ++ QuadFaceSplitLevel(vn1, vn2, vn3, vn4, h_level, v_level); ++ return h_level + v_level; ++} ++ + void NCMesh::CountSplits(int elem, int splits[3]) const + { + const Element &el = elements[elem]; +@@ -5354,57 +5436,52 @@ void NCMesh::CountSplits(int elem, int splits[3]) const + + if (el.Geom() == Geometry::CUBE) + { +- splits[0] = max8(flevel[0][0], flevel[1][0], flevel[3][0], flevel[5][0], +- elevel[0], elevel[2], elevel[4], elevel[6]); ++ splits[0] = max(flevel[0][0], flevel[1][0], flevel[3][0], flevel[5][0], ++ elevel[0], elevel[2], elevel[4], elevel[6]); + +- splits[1] = max8(flevel[0][1], flevel[2][0], flevel[4][0], flevel[5][1], +- elevel[1], elevel[3], elevel[5], elevel[7]); ++ splits[1] = max(flevel[0][1], flevel[2][0], flevel[4][0], flevel[5][1], ++ elevel[1], elevel[3], elevel[5], elevel[7]); + +- splits[2] = max8(flevel[1][1], flevel[2][1], flevel[3][1], flevel[4][1], +- elevel[8], elevel[9], elevel[10], elevel[11]); ++ splits[2] = max(flevel[1][1], flevel[2][1], flevel[3][1], flevel[4][1], ++ elevel[8], elevel[9], elevel[10], elevel[11]); + } + else if (el.Geom() == Geometry::PRISM) + { +- splits[0] = splits[1] = +- std::max( +- max6(flevel[0][0], flevel[1][0], 0, +- flevel[2][0], flevel[3][0], flevel[4][0]), +- max6(elevel[0], elevel[1], elevel[2], +- elevel[3], elevel[4], elevel[5])); ++ splits[0] = splits[1] = max(flevel[0][0], flevel[1][0], 0, ++ flevel[2][0], flevel[3][0], flevel[4][0], ++ elevel[0], elevel[1], elevel[2], ++ elevel[3], elevel[4], elevel[5]); + +- splits[2] = max6(flevel[2][1], flevel[3][1], flevel[4][1], +- elevel[6], elevel[7], elevel[8]); ++ splits[2] = max(flevel[2][1], flevel[3][1], flevel[4][1], ++ elevel[6], elevel[7], elevel[8]); + } + else if (el.Geom() == Geometry::PYRAMID) + { +- splits[0] = std::max( +- max6(flevel[0][0], flevel[1][0], 0, +- flevel[2][0], flevel[3][0], flevel[4][0]), +- max8(elevel[0], elevel[1], elevel[2], +- 
elevel[3], elevel[4], elevel[5], +- elevel[6], elevel[7])); ++ splits[0] = max(flevel[0][0], flevel[1][0], 0, ++ flevel[2][0], flevel[3][0], flevel[4][0], ++ elevel[0], elevel[1], elevel[2], ++ elevel[3], elevel[4], elevel[5], ++ elevel[6], elevel[7]); + + splits[1] = splits[0]; + splits[2] = splits[0]; + } + else if (el.Geom() == Geometry::TETRAHEDRON) + { +- splits[0] = std::max( +- max4(flevel[0][0], flevel[1][0], flevel[2][0], flevel[3][0]), +- max6(elevel[0], elevel[1], elevel[2], +- elevel[3], elevel[4], elevel[5])); ++ splits[0] = max(flevel[0][0], flevel[1][0], flevel[2][0], flevel[3][0], ++ elevel[0], elevel[1], elevel[2], elevel[3], elevel[4], elevel[5]); + + splits[1] = splits[0]; + splits[2] = splits[0]; + } + else if (el.Geom() == Geometry::SQUARE) + { +- splits[0] = std::max(elevel[0], elevel[2]); +- splits[1] = std::max(elevel[1], elevel[3]); ++ splits[0] = max(elevel[0], elevel[2]); ++ splits[1] = max(elevel[1], elevel[3]); + } + else if (el.Geom() == Geometry::TRIANGLE) + { +- splits[0] = std::max(elevel[0], std::max(elevel[1], elevel[2])); ++ splits[0] = max(elevel[0], elevel[1], elevel[2]); + splits[1] = splits[0]; + } + else +@@ -6377,17 +6454,17 @@ void NCMesh::DebugDump(std::ostream &os) const + + // dump faces + os << faces.Size() << "\n"; +- for (auto face = faces.cbegin(); face != faces.cend(); ++face) ++ for (const auto &face : faces) + { +- int elem = face->elem[0]; +- if (elem < 0) { elem = face->elem[1]; } ++ int elem = face.elem[0]; ++ if (elem < 0) { elem = face.elem[1]; } + MFEM_ASSERT(elem >= 0, ""); + const Element &el = elements[elem]; + + int lf = find_local_face(el.Geom(), +- find_node(el, face->p1), +- find_node(el, face->p2), +- find_node(el, face->p3)); ++ find_node(el, face.p1), ++ find_node(el, face.p2), ++ find_node(el, face.p3)); + + const int* fv = GI[el.Geom()].faces[lf]; + const int nfv = GI[el.Geom()].nfv[lf]; +@@ -6397,7 +6474,7 @@ void NCMesh::DebugDump(std::ostream &os) const + { + os << " " << el.node[fv[i]]; + } +- //os << " # face " << face.index() << ", index " << face->index << "\n"; ++ //os << " # face " << face.index() << ", index " << face.index << "\n"; + os << "\n"; + } + } +diff --git a/mesh/ncmesh.hpp b/mesh/ncmesh.hpp +index 8ac50d342..b004cf43e 100644 +--- a/mesh/ncmesh.hpp ++++ b/mesh/ncmesh.hpp +@@ -293,6 +293,7 @@ public: + mutable std::unordered_map> inv_index; + }; + ++ + /// Return the current list of conforming and nonconforming faces. + const NCList& GetFaceList() + { +@@ -392,11 +393,13 @@ public: + /** Get a list of vertices (2D/3D) and edges (3D) that coincide with boundary + elements with the specified attributes (marked in 'bdr_attr_is_ess'). + In 3D this function also reveals "hidden" boundary edges. In parallel it +- helps identifying boundary vertices/edges affected by non-local boundary +- elements. */ ++ helps identifying boundary vertices/edges/faces affected by non-local boundary ++ elements. Hidden faces can occur for an internal boundary coincident to a processor ++ boundary. ++ */ + virtual void GetBoundaryClosure(const Array &bdr_attr_is_ess, + Array &bdr_vertices, +- Array &bdr_edges); ++ Array &bdr_edges, Array &bdr_faces); + + /// Return element geometry type. @a index is the Mesh element number. + Geometry::Type GetElementGeometry(int index) const +@@ -456,7 +459,6 @@ protected: // non-public interface for the Mesh class + by calling Mesh::SetCurvature or otherwise setting the Nodes. 
*/ + void MakeTopologyOnly() { coordinates.DeleteAll(); } + +- + protected: // implementation + + int Dim, spaceDim; ///< dimensions of the elements and the vertex coordinates +@@ -594,7 +596,6 @@ protected: // implementation + + Table element_vertex; ///< leaf-element to vertex table, see FindSetNeighbors + +- + /// Update the leaf elements indices in leaf_elements + void UpdateLeafElements(); + +@@ -712,10 +713,79 @@ protected: // implementation + + mfem::Element* NewMeshElement(int geom) const; + +- int QuadFaceSplitType(int v1, int v2, int v3, int v4, int mid[5] ++ /** ++ * @brief Given a quad face defined by four vertices, establish which edges ++ * of this face have been split, and if so optionally return the mid points ++ * of those edges. ++ * ++ * @param n1 The first node defining the face ++ * @param n2 The second node defining the face ++ * @param n3 The third node defining the face ++ * @param n4 The fourth node defining the face ++ * @param mid optional return of the edge mid points. ++ * @return int 0 -- no split, 1 -- "vertical" split, 2 -- "horizontal" split ++ */ ++ int QuadFaceSplitType(int n1, int n2, int n3, int n4, int mid[5] + = NULL /*optional output of mid-edge nodes*/) const; + +- bool TriFaceSplit(int v1, int v2, int v3, int mid[3] = NULL) const; ++ /** ++ * @brief Given a tri face defined by three vertices, establish whether the ++ * edges that make up this face have been split, and if so optionally return ++ * the midpoints. ++ * @details This is a necessary condition for this face to have been split, ++ * but is not sufficient. Consider a triangle attached to three refined ++ * triangles, in this scenario all edges can be split but this face not be ++ * split. In this case, it is necessary to check if there is a face made up ++ * of the returned midpoint nodes. ++ * ++ * @param n1 The first node defining the face ++ * @param n2 The second node defining the face ++ * @param n3 The third node defining the face ++ * @param mid optional return of the edge mid points. ++ * @return true Splits for all edges have been found ++ * @return false ++ */ ++ bool TriFaceSplit(int n1, int n2, int n3, int mid[3] = NULL) const; ++ ++ /** ++ * @brief Determine if a Triangle face is not a master ++ * @details This check requires looking for the edges making up the triangle ++ * being split, if nodes exist at their midpoints, and there are vertices at ++ * them, this implies the face COULD be split. To determine if it is, we then ++ * check whether these midpoints have all been connected, this is required to ++ * discriminate between an internal master face surrounded by nonconformal ++ * refinements and a conformal boundary face surrounded by refinements. ++ * ++ * @param n1 The first node defining the face ++ * @param n2 The second node defining the face ++ * @param n3 The third node defining the face ++ * @return true The face is not a master ++ * @return false The face is a master ++ */ ++ inline bool TriFaceNotMaster(int n1, int n2, int n3) const ++ { ++ int mid[3]; ++ return !TriFaceSplit(n1, n2, n3, mid) // The edges aren't split ++ // OR none of the midpoints are connected. 
++ || (nodes.FindId(mid[0], mid[1]) < 0 && ++ nodes.FindId(mid[0], mid[2]) < 0 && ++ nodes.FindId(mid[1], mid[2]) < 0); ++ } ++ ++ /** ++ * @brief Determine if a Quad face is not a master ++ * ++ * @param n1 The first node defining the face ++ * @param n2 The second node defining the face ++ * @param n3 The third node defining the face ++ * @param n4 The fourth node defining the face ++ * @return true The quad face is not a master ++ * @return false The quad face is a master ++ */ ++ inline bool QuadFaceNotMaster(int n1, int n2, int n3, int n4) const ++ { ++ return QuadFaceSplitType(n1, n2, n3, n4) == 0; ++ } + + void ForceRefinement(int vn1, int vn2, int vn3, int vn4); + +@@ -792,7 +862,6 @@ protected: // implementation + virtual void ElementSharesEdge(int elem, int local, int enode) {} // ParNCMesh + virtual void ElementSharesVertex(int elem, int local, int vnode) {} // ParNCMesh + +- + // neighbors / element_vertex table + + /** Return all vertex-, edge- and face-neighbors of a set of elements. +@@ -981,9 +1050,7 @@ protected: // implementation + void InitDerefTransforms(); + void SetDerefMatrixCodes(int parent, Array &fine_coarse); + +- + // vertex temporary data, used by GetMeshComponents +- + struct TmpVertex + { + bool valid, visited; +@@ -1002,10 +1069,56 @@ protected: // implementation + + void FindFaceNodes(int face, int node[4]); + ++ /** ++ * @brief Return the number of splits of this edge that have occurred in the ++ * NCMesh. If zero, this means the segment is not the master of any other segments. ++ * ++ * @param vn1 The first vertex making up the segment ++ * @param vn2 The second vertex making up the segment ++ * @return int The depth of splits of this segment that are present in the mesh. ++ */ + int EdgeSplitLevel(int vn1, int vn2) const; ++ /** ++ * @brief Return the number of splits of this triangle that have occurred in ++ * the NCMesh. If zero, this means the triangle is neither split, nor the ++ * master of a split face. ++ * ++ * @param vn1 The first vertex making up the triangle ++ * @param vn2 The second vertex making up the triangle ++ * @param vn3 The third vertex making up the triangle ++ * @return int The depth of splits of this triangle that are present in the mesh. ++ */ + int TriFaceSplitLevel(int vn1, int vn2, int vn3) const; ++ /** ++ * @brief Computes the number of horizontal and vertical splits of this quad ++ * that have occurred in the NCMesh. If zero, this means the quad is not ++ * the master of any other quad. ++ * ++ * @param vn1 The first vertex making up the quad ++ * @param vn2 The second vertex making up the quad ++ * @param vn3 The third vertex making up the quad ++ * @param vn4 The fourth vertex making up the quad ++ * @param h_level The number of "horizontal" splits of the quad ++ * @param v_level The number of "vertical" splits of the quad ++ */ + void QuadFaceSplitLevel(int vn1, int vn2, int vn3, int vn4, + int& h_level, int& v_level) const; ++ /** ++ * @brief Returns the total number of splits of this quad that have occurred ++ * in the NCMesh. If zero, this means the quad is not ++ * the master of any other quad. ++ * @details This is a convenience wrapper that sums the horizontal and ++ * vertical levels from the full method. ++ * ++ * @param vn1 The first vertex making up the quad ++ * @param vn2 The second vertex making up the quad ++ * @param vn3 The third vertex making up the quad ++ * @param vn4 The fourth vertex making up the quad ++ * @return int The depth of splits of this triangle that are present in the ++ * mesh. 
NB: An isotropic refinement has a level of 2, one horizontal split, ++ * followed by a vertical split. ++ */ ++ int QuadFaceSplitLevel(int vn1, int vn2, int vn3, int vn4) const; + + void CountSplits(int elem, int splits[3]) const; + void GetLimitRefinements(Array &refinements, int max_level); +@@ -1042,7 +1155,6 @@ protected: // implementation + /// Load the deprecated MFEM mesh v1.1 format for backward compatibility. + void LoadLegacyFormat(std::istream &input, int &curved, int &is_nc); + +- + // geometry + + /// This holds in one place the constants about the geometries we support +diff --git a/mesh/pmesh.cpp b/mesh/pmesh.cpp +index 47a091c04..967b448fc 100644 +--- a/mesh/pmesh.cpp ++++ b/mesh/pmesh.cpp +@@ -364,11 +364,8 @@ int ParMesh::BuildLocalVertices(const mfem::Mesh &mesh, + int ParMesh::BuildLocalElements(const Mesh& mesh, const int* partitioning, + const Array& vert_global_local) + { +- int nelems = 0; +- for (int i = 0; i < mesh.GetNE(); i++) +- { +- if (partitioning[i] == MyRank) { nelems++; } +- } ++ const int nelems = std::count_if(partitioning, ++ partitioning + mesh.GetNE(), [this](int i) { return i == MyRank;}); + + elements.SetSize(nelems); + +@@ -387,7 +384,7 @@ int ParMesh::BuildLocalElements(const Mesh& mesh, const int* partitioning, + { + v[j] = vert_global_local[v[j]]; + } +- element_counter++; ++ ++element_counter; + } + } + +@@ -400,7 +397,6 @@ int ParMesh::BuildLocalBoundary(const Mesh& mesh, const int* partitioning, + Table*& edge_element) + { + int nbdry = 0; +- + if (mesh.NURBSext) + { + activeBdrElem.SetSize(mesh.GetNBE()); +@@ -2102,7 +2098,7 @@ void ParMesh::ExchangeFaceNbrData() + + if (Nonconforming()) + { +- // with ParNCMesh we can set up face neighbors mostly without communication ++ // With ParNCMesh we can set up face neighbors mostly without communication. + pncmesh->GetFaceNeighbors(*this); + have_face_nbr_data = true; + +diff --git a/mesh/pncmesh.cpp b/mesh/pncmesh.cpp +index cd6625e9c..169c5c6bb 100644 +--- a/mesh/pncmesh.cpp ++++ b/mesh/pncmesh.cpp +@@ -577,41 +577,81 @@ void ParNCMesh::CalcFaceOrientations() + face_orient.SetSize(NFaces); + face_orient = 0; + +- for (auto face = faces.begin(); face != faces.end(); ++face) ++ for (auto face : faces) + { +- if (face->elem[0] >= 0 && face->elem[1] >= 0 && face->index < NFaces) ++ if (face.elem[0] >= 0 && face.elem[1] >= 0 && face.index < NFaces) + { +- Element *e1 = &elements[face->elem[0]]; +- Element *e2 = &elements[face->elem[1]]; ++ Element *e1 = &elements[face.elem[0]]; ++ Element *e2 = &elements[face.elem[1]]; + + if (e1->rank == e2->rank) { continue; } + if (e1->rank > e2->rank) { std::swap(e1, e2); } + +- face_orient[face->index] = get_face_orientation(*face, *e1, *e2); ++ face_orient[face.index] = get_face_orientation(face, *e1, *e2); + } + } + } + + void ParNCMesh::GetBoundaryClosure(const Array &bdr_attr_is_ess, + Array &bdr_vertices, +- Array &bdr_edges) ++ Array &bdr_edges, Array &bdr_faces) + { +- NCMesh::GetBoundaryClosure(bdr_attr_is_ess, bdr_vertices, bdr_edges); ++ NCMesh::GetBoundaryClosure(bdr_attr_is_ess, bdr_vertices, bdr_edges, bdr_faces); + +- int i, j; +- // filter out ghost vertices +- for (i = j = 0; i < bdr_vertices.Size(); i++) ++ if (Dim == 3) + { +- if (bdr_vertices[i] < NVertices) { bdr_vertices[j++] = bdr_vertices[i]; } ++ // Mark masters of shared slave boundary faces as essential boundary faces. Some ++ // master faces may only have slave children. 
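Both versions of GetBoundaryClosure (the NCMesh one above and the ParNCMesh override below) finish with the same FilterSortUnique step: drop indices at or beyond the local count, then sort and deduplicate what remains. The same idiom on a plain std::vector, as a self-contained sketch (the free-function name is mine, not the library's):

   #include <algorithm>
   #include <vector>

   // Keep only values < n, then sort and deduplicate. The O(N) filter runs before
   // the O(N log N) sort, so ghost entries never take part in the sort.
   void FilterSortUnique(std::vector<int> &v, int n)
   {
      auto last = std::remove_if(v.begin(), v.end(), [n](int i) { return i >= n; });
      std::sort(v.begin(), last);
      v.erase(std::unique(v.begin(), last), v.end());
   }

   // e.g. {7, 2, 9, 2, 5, 7} with n = 8 becomes {2, 5, 7}.
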
++ for (const auto &mf : shared_faces.masters) ++ { ++ if (elements[mf.element].rank != MyRank) { continue; } ++ for (int j = mf.slaves_begin; j < mf.slaves_end; j++) ++ { ++ const auto &sf = GetFaceList().slaves[j]; ++ if (sf.index < 0) ++ { ++ // Edge-face constraint. Skip this edge. ++ continue; ++ } ++ Face *face = GetFace(elements[sf.element], sf.local); ++ if (face && face->Boundary() && bdr_attr_is_ess[face->attribute - 1]) ++ { ++ bdr_faces.Append(mf.index); ++ } ++ } ++ } + } +- bdr_vertices.SetSize(j); +- +- // filter out ghost edges +- for (i = j = 0; i < bdr_edges.Size(); i++) ++ else if (Dim == 2) + { +- if (bdr_edges[i] < NEdges) { bdr_edges[j++] = bdr_edges[i]; } ++ // Mark masters of shared slave boundary edges as essential boundary edges. Some ++ // master edges may only have slave children. ++ for (const auto &me : shared_edges.masters) ++ { ++ if (elements[me.element].rank != MyRank) { continue; } ++ for (int j = me.slaves_begin; j < me.slaves_end; j++) ++ { ++ const auto &se = GetEdgeList().slaves[j]; ++ Face *face = GetFace(elements[se.element], se.local); ++ if (face && face->Boundary() && bdr_attr_is_ess[face->attribute - 1]) ++ { ++ bdr_edges.Append(me.index); ++ } ++ } ++ } + } +- bdr_edges.SetSize(j); ++ ++ // Filter, sort and unique an array, so it contains only local unique values. ++ auto FilterSortUnique = [](Array &v, int N) ++ { ++ // Perform the O(N) filter before the O(NlogN) sort. ++ auto local = std::remove_if(v.begin(), v.end(), [N](int i) { return i >= N; }); ++ std::sort(v.begin(), local); ++ v.SetSize(std::distance(v.begin(), std::unique(v.begin(), local))); ++ }; ++ ++ FilterSortUnique(bdr_vertices, NVertices); ++ FilterSortUnique(bdr_edges, NEdges); ++ FilterSortUnique(bdr_faces, NFaces); + } + + +@@ -698,9 +738,9 @@ static void set_to_array(const std::set &set, Array &array) + { + array.Reserve(set.size()); + array.SetSize(0); +- for (std::set::iterator it = set.begin(); it != set.end(); ++it) ++ for (auto x : set) + { +- array.Append(*it); ++ array.Append(x); + } + } + +@@ -789,8 +829,10 @@ void ParNCMesh::GetConformingSharedStructures(ParMesh &pmesh) + for (int ent = 0; ent < Dim; ent++) + { + GetSharedList(ent); +- MFEM_VERIFY(entity_conf_group[ent].Size(), "internal error"); +- MFEM_VERIFY(entity_elem_local[ent].Size(), "internal error"); ++ MFEM_VERIFY(entity_conf_group[ent].Size() || ++ pmesh.GetNE() == 0, "Non empty partitions must be connected"); ++ MFEM_VERIFY(entity_elem_local[ent].Size() || ++ pmesh.GetNE() == 0, "Non empty partitions must be connected"); + } + } + +@@ -1119,7 +1161,7 @@ void ParNCMesh::GetFaceNeighbors(ParMesh &pmesh) + bool sloc = (sfe.rank == MyRank); + bool mloc = (mfe.rank == MyRank); + if (sloc == mloc // both or neither face is owned by this processor +- || sf.index < 0) // the face is degenerate (i.e. a face-edge constraint) ++ || sf.index < 0) // the face is degenerate (i.e. 
a edge-face constraint) + { + continue; + } +@@ -1307,8 +1349,6 @@ void ParNCMesh::GetFaceNeighbors(ParMesh &pmesh) + MPI_Waitall(int(send_requests.size()), send_requests.data(), status.data()); + } + } +- +- + // NOTE: this function skips ParMesh::send_face_nbr_vertices and + // ParMesh::face_nbr_vertices_offset, these are not used outside of ParMesh + } +@@ -1322,7 +1362,6 @@ void ParNCMesh::ClearAuxPM() + aux_pm_store.DeleteAll(); + } + +- + //// Prune, Refine, Derefine /////////////////////////////////////////////////// + + bool ParNCMesh::PruneTree(int elem) +@@ -1953,10 +1992,9 @@ void ParNCMesh::RedistributeElements(Array &new_ranks, int target_elements, + NeighborElementRankMessage::RecvAll(recv_ghost_ranks, MyComm); + + // read new ranks for the ghost layer from messages received +- NeighborElementRankMessage::Map::iterator it; +- for (it = recv_ghost_ranks.begin(); it != recv_ghost_ranks.end(); ++it) ++ for (auto &kv : recv_ghost_ranks) + { +- NeighborElementRankMessage &msg = it->second; ++ NeighborElementRankMessage &msg = kv.second; + for (int i = 0; i < msg.Size(); i++) + { + int ghost_index = elements[msg.elements[i]].index; +@@ -2483,9 +2521,8 @@ void ParNCMesh::AdjustMeshIds(Array ids[], int rank) + + // find vertices/edges of master faces shared with 'rank', and modify their + // MeshIds so their element/local matches the element of the master face +- for (int i = 0; i < shared_faces.masters.Size(); i++) ++ for (const MeshId &face_id : shared_faces.masters) + { +- const MeshId &face_id = shared_faces.masters[i]; + if (contains_rank[entity_pmat_group[2][face_id.index]]) + { + int v[4], e[4], eo[4], pos, k; +diff --git a/mesh/pncmesh.hpp b/mesh/pncmesh.hpp +index df5fb929e..d33f5cbbe 100644 +--- a/mesh/pncmesh.hpp ++++ b/mesh/pncmesh.hpp +@@ -229,10 +229,11 @@ public: + const Table &deref_table); + + /** Extension of NCMesh::GetBoundaryClosure. Filters out ghost vertices and +- ghost edges from 'bdr_vertices' and 'bdr_edges'. */ ++ ghost edges from 'bdr_vertices' and 'bdr_edges', and uncovers hidden internal ++ boundary faces. */ + void GetBoundaryClosure(const Array &bdr_attr_is_ess, + Array &bdr_vertices, +- Array &bdr_edges) override; ++ Array &bdr_edges, Array &bdr_faces) override; + + /// Save memory by releasing all non-essential and cached data. + void Trim() override; +@@ -258,8 +259,6 @@ protected: // interface for ParMesh + /** Populate face neighbor members of ParMesh from the ghost layer, without + communication. */ + void GetFaceNeighbors(class ParMesh &pmesh); +- +- + protected: // implementation + + MPI_Comm MyComm; +diff --git a/mesh/point.cpp b/mesh/point.cpp +index ecf6a4dd0..473655b11 100644 +--- a/mesh/point.cpp ++++ b/mesh/point.cpp +@@ -21,12 +21,24 @@ Point::Point( const int *ind, int attr ) : Element(Geometry::POINT) + indices[0] = ind[0]; + } + +-void Point::GetVertices( Array &v ) const ++void Point::GetVertices(Array &v) const + { +- v.SetSize( 1 ); ++ v.SetSize(1); + v[0] = indices[0]; + } + ++void Point::SetVertices(const Array &v) ++{ ++ MFEM_ASSERT(v.Size() == 1, "!"); ++ indices[0] = v[0]; ++} ++ ++ ++void Point::SetVertices(const int *ind) ++{ ++ indices[0] = ind[0]; ++} ++ + PointFiniteElement PointFE; + + } +diff --git a/mesh/point.hpp b/mesh/point.hpp +index f154e205e..be00c9c84 100644 +--- a/mesh/point.hpp ++++ b/mesh/point.hpp +@@ -33,33 +33,40 @@ public: + Point( const int *ind, int attr = -1 ); + + /// Return element's type. 
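point.hpp below, and the remaining element headers after it (pyramid, quadrilateral, segment, tetrahedron, triangle, wedge), all receive the same mechanical modernization: overriding member functions drop the redundant virtual keyword and gain override, and empty virtual destructors become = default. A minimal before/after sketch of the pattern, with illustrative names only:

   struct ElementBase
   {
      virtual int GetNVertices() const = 0;
      virtual ~ElementBase() = default;
   };

   struct QuadLike : public ElementBase
   {
      // was:  virtual int GetNVertices() const { return 4; }
      int GetNVertices() const override { return 4; }

      // was:  virtual ~QuadLike() { }
      virtual ~QuadLike() = default;
   };

Marking the overriders with override turns an accidental signature mismatch into a compile error rather than a silently added, unrelated virtual function.
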
+- virtual Type GetType() const { return Element::POINT; } ++ Type GetType() const override { return Element::POINT; } + +- /// Returns the indices of the element's vertices. +- virtual void GetVertices( Array &v ) const; ++ /// Get the indices defining the vertices ++ void GetVertices(Array &v) const override; + +- virtual int * GetVertices () { return indices; } ++ /// Set the indices defining the vertices ++ void SetVertices(const Array &v) override; + +- virtual int GetNVertices() const { return 1; } ++ /// @note The returned array should NOT be deleted by the caller. ++ int * GetVertices () override { return indices; } + +- virtual int GetNEdges() const { return (0); } ++ /// Set the vertices according to the given input. ++ void SetVertices(const int *ind) override; + +- virtual const int *GetEdgeVertices(int ei) const { return NULL; } ++ int GetNVertices() const override { return 1; } ++ ++ int GetNEdges() const override { return (0); } ++ ++ const int *GetEdgeVertices(int ei) const override { return NULL; } + + /// @deprecated Use GetNFaces(void) and GetNFaceVertices(int) instead. +- MFEM_DEPRECATED virtual int GetNFaces(int &nFaceVertices) const ++ MFEM_DEPRECATED int GetNFaces(int &nFaceVertices) const override + { nFaceVertices = 0; return 0; } + +- virtual int GetNFaces() const { return 0; } ++ int GetNFaces() const override { return 0; } + +- virtual int GetNFaceVertices(int) const { return 0; } ++ int GetNFaceVertices(int) const override { return 0; } + +- virtual const int *GetFaceVertices(int fi) const { return NULL; } ++ const int *GetFaceVertices(int fi) const override { return NULL; } + +- virtual Element *Duplicate(Mesh *m) const ++ Element *Duplicate(Mesh *m) const override + { return new Point (indices, attribute); } + +- virtual ~Point() { } ++ virtual ~Point() = default; + }; + + class PointFiniteElement; +diff --git a/mesh/pyramid.cpp b/mesh/pyramid.cpp +index d67841564..f64f2afe9 100644 +--- a/mesh/pyramid.cpp ++++ b/mesh/pyramid.cpp +@@ -48,10 +48,13 @@ void Pyramid::SetVertices(const int *ind) + void Pyramid::GetVertices(Array &v) const + { + v.SetSize(5); +- for (int i = 0; i < 5; i++) +- { +- v[i] = indices[i]; +- } ++ std::copy(indices, indices + 5, v.begin()); ++} ++ ++void Pyramid::SetVertices(const Array &v) ++{ ++ MFEM_ASSERT(v.Size() == 5, "!"); ++ std::copy(v.begin(), v.end(), indices); + } + + int Pyramid::GetNFaces(int &nFaceVertices) const +diff --git a/mesh/pyramid.hpp b/mesh/pyramid.hpp +index 8e171a31d..adcc540ed 100644 +--- a/mesh/pyramid.hpp ++++ b/mesh/pyramid.hpp +@@ -37,38 +37,42 @@ public: + int attr = 1); + + /// Return element's type. +- virtual Type GetType() const { return Element::PYRAMID; } ++ Type GetType() const override { return Element::PYRAMID; } + +- /// Set the vertices according to the given input. +- virtual void SetVertices(const int *ind); ++ /// Get the indices defining the vertices ++ void GetVertices(Array &v) const override; ++ ++ /// Set the indices defining the vertices ++ void SetVertices(const Array &v) override; + +- /// Returns the indices of the element's vertices. +- virtual void GetVertices(Array &v) const; ++ /// @note The returned array should NOT be deleted by the caller. ++ int * GetVertices () override { return indices; } + +- virtual int *GetVertices() { return indices; } ++ /// Set the vertices according to the given input. 
++ void SetVertices(const int *ind) override; + +- virtual int GetNVertices() const { return 5; } ++ int GetNVertices() const override { return 5; } + +- virtual int GetNEdges() const { return 8; } ++ int GetNEdges() const override { return 8; } + +- virtual const int *GetEdgeVertices(int ei) const ++ const int *GetEdgeVertices(int ei) const override + { return geom_t::Edges[ei]; } + + /// @deprecated Use GetNFaces(void) and GetNFaceVertices(int) instead. +- MFEM_DEPRECATED virtual int GetNFaces(int &nFaceVertices) const; ++ MFEM_DEPRECATED int GetNFaces(int &nFaceVertices) const override; + +- virtual int GetNFaces() const { return 5; } ++ int GetNFaces() const override { return 5; } + +- virtual int GetNFaceVertices(int fi) const ++ int GetNFaceVertices(int fi) const override + { return ( ( fi < 1 ) ? 4 : 3); } + +- virtual const int *GetFaceVertices(int fi) const ++ const int *GetFaceVertices(int fi) const override + { return geom_t::FaceVert[fi]; } + +- virtual Element *Duplicate(Mesh *m) const ++ Element *Duplicate(Mesh *m) const override + { return new Pyramid(indices, attribute); } + +- virtual ~Pyramid() { } ++ virtual ~Pyramid() = default; + }; + + extern class LinearPyramidFiniteElement PyramidFE; +diff --git a/mesh/quadrilateral.cpp b/mesh/quadrilateral.cpp +index 1a69cf179..29fa3bbe1 100644 +--- a/mesh/quadrilateral.cpp ++++ b/mesh/quadrilateral.cpp +@@ -37,19 +37,20 @@ Quadrilateral::Quadrilateral( int ind1, int ind2, int ind3, int ind4, + + void Quadrilateral::SetVertices(const int *ind) + { +- for (int i=0; i<4; i++) +- { +- indices[i] = ind[i]; +- } ++ std::copy(ind, ind + 4, indices); + } + +-void Quadrilateral::GetVertices( Array &v ) const ++void Quadrilateral::GetVertices(Array &v) const + { +- v.SetSize( 4 ); +- for (int i=0; i<4; i++) +- { +- v[i] = indices[i]; +- } ++ v.SetSize(4); ++ std::copy(indices, indices + 4, v.begin()); ++} ++ ++ ++void Quadrilateral::SetVertices(const Array &v) ++{ ++ MFEM_ASSERT(v.Size() == 4, "!"); ++ std::copy(v.begin(), v.end(), indices); + } + + BiLinear2DFiniteElement QuadrilateralFE; +diff --git a/mesh/quadrilateral.hpp b/mesh/quadrilateral.hpp +index 9f6b9a442..70fcbfdcc 100644 +--- a/mesh/quadrilateral.hpp ++++ b/mesh/quadrilateral.hpp +@@ -36,37 +36,41 @@ public: + Quadrilateral(int ind1, int ind2, int ind3, int ind4, int attr = 1); + + /// Return element's type +- Type GetType() const { return Element::QUADRILATERAL; } ++ Type GetType() const override { return Element::QUADRILATERAL; } + +- /// Set the vertices according to the given input. +- virtual void SetVertices(const int *ind); ++ /// Get the indices defining the vertices ++ void GetVertices(Array &v) const override; ++ ++ /// Set the indices defining the vertices ++ void SetVertices(const Array &v) override; + +- /// Returns the indices of the element's vertices. +- virtual void GetVertices(Array &v) const; ++ /// @note The returned array should NOT be deleted by the caller. ++ int * GetVertices () override { return indices; } + +- virtual int *GetVertices() { return indices; } ++ /// Set the vertices according to the given input. 
++ void SetVertices(const int *ind) override; + +- virtual int GetNVertices() const { return 4; } ++ int GetNVertices() const override { return 4; } + +- virtual int GetNEdges() const { return (4); } ++ int GetNEdges() const override { return (4); } + +- virtual const int *GetEdgeVertices(int ei) const ++ const int *GetEdgeVertices(int ei) const override + { return geom_t::Edges[ei]; } + + /// @deprecated Use GetNFaces(void) and GetNFaceVertices(int) instead. +- MFEM_DEPRECATED virtual int GetNFaces(int &nFaceVertices) const ++ MFEM_DEPRECATED int GetNFaces(int &nFaceVertices) const override + { nFaceVertices = 0; return 0; } + +- virtual int GetNFaces() const { return 0; } ++ int GetNFaces() const override { return 0; } + +- virtual int GetNFaceVertices(int) const { return 0; } ++ int GetNFaceVertices(int) const override { return 0; } + +- virtual const int *GetFaceVertices(int fi) const { return NULL; } ++ const int *GetFaceVertices(int fi) const override { return NULL; } + +- virtual Element *Duplicate(Mesh *m) const ++ Element *Duplicate(Mesh *m) const override + { return new Quadrilateral(indices, attribute); } + +- virtual ~Quadrilateral() { } ++ virtual ~Quadrilateral() = default; + }; + + extern MFEM_EXPORT class BiLinear2DFiniteElement QuadrilateralFE; +diff --git a/mesh/segment.cpp b/mesh/segment.cpp +index 717245907..910614770 100644 +--- a/mesh/segment.cpp ++++ b/mesh/segment.cpp +@@ -37,13 +37,16 @@ void Segment::SetVertices(const int *ind) + indices[1] = ind[1]; + } + +-void Segment::GetVertices( Array &v ) const ++void Segment::GetVertices(Array &v) const + { +- v.SetSize( 2 ); +- for (int i=0; i<2; i++) +- { +- v[i] = indices[i]; +- } ++ v.SetSize(2); ++ std::copy(indices, indices + 2, v.begin()); ++} ++ ++void Segment::SetVertices(const Array &v) ++{ ++ MFEM_ASSERT(v.Size() == 2, "!"); ++ std::copy(v.begin(), v.end(), indices); + } + + Linear1DFiniteElement SegmentFE; +diff --git a/mesh/segment.hpp b/mesh/segment.hpp +index 6ca918758..aafc4909f 100644 +--- a/mesh/segment.hpp ++++ b/mesh/segment.hpp +@@ -35,37 +35,41 @@ public: + /// Constructs triangle by specifying the indices and the attribute. + Segment(int ind1, int ind2, int attr = 1); + +- /// Set the indices the element according to the input. +- virtual void SetVertices(const int *ind); +- + /// Return element's type. +- virtual Type GetType() const { return Element::SEGMENT; } ++ Type GetType() const override { return Element::SEGMENT; } ++ ++ /// Get the indices defining the vertices ++ void GetVertices(Array &v) const override; ++ ++ /// Set the indices defining the vertices ++ void SetVertices(const Array &v) override; + +- /// Returns the indices of the element's vertices. +- virtual void GetVertices(Array &v) const; ++ /// @note The returned array should NOT be deleted by the caller. ++ int * GetVertices () override { return indices; } + +- virtual int *GetVertices() { return indices; } ++ /// Set the vertices according to the given input. ++ void SetVertices(const int *ind) override; + +- virtual int GetNVertices() const { return 2; } ++ int GetNVertices() const override { return 2; } + +- virtual int GetNEdges() const { return (0); } ++ int GetNEdges() const override { return 0; } + +- virtual const int *GetEdgeVertices(int ei) const { return NULL; } ++ const int *GetEdgeVertices(int ei) const override { return NULL; } + + /// @deprecated Use GetNFaces(void) and GetNFaceVertices(int) instead. 
+- MFEM_DEPRECATED virtual int GetNFaces(int &nFaceVertices) const ++ MFEM_DEPRECATED int GetNFaces(int &nFaceVertices) const override + { nFaceVertices = 0; return 0; } + +- virtual int GetNFaces() const { return 0; } ++ int GetNFaces() const override { return 0; } + +- virtual int GetNFaceVertices(int) const { return 0; } ++ int GetNFaceVertices(int) const override { return 0; } + +- virtual const int *GetFaceVertices(int fi) const { return NULL; } ++ const int *GetFaceVertices(int fi) const override { return NULL; } + +- virtual Element *Duplicate(Mesh *m) const ++ Element *Duplicate(Mesh *m) const override + { return new Segment(indices, attribute); } + +- virtual ~Segment() { } ++ virtual ~Segment() = default; + }; + + class Linear1DFiniteElement; +diff --git a/mesh/tetrahedron.cpp b/mesh/tetrahedron.cpp +index c1b0ae6d8..133a69b41 100644 +--- a/mesh/tetrahedron.cpp ++++ b/mesh/tetrahedron.cpp +@@ -341,10 +341,13 @@ void Tetrahedron::GetPointMatrix(unsigned transform, DenseMatrix &pm) + void Tetrahedron::GetVertices(Array &v) const + { + v.SetSize(4); +- for (int i = 0; i < 4; i++) +- { +- v[i] = indices[i]; +- } ++ std::copy(indices, indices + 4, v.begin()); ++} ++ ++void Tetrahedron::SetVertices(const Array &v) ++{ ++ MFEM_ASSERT(v.Size() == 4, "!"); ++ std::copy(v.begin(), v.end(), indices); + } + + Element *Tetrahedron::Duplicate(Mesh *m) const +diff --git a/mesh/tetrahedron.hpp b/mesh/tetrahedron.hpp +index ef8f36eb8..157c75895 100644 +--- a/mesh/tetrahedron.hpp ++++ b/mesh/tetrahedron.hpp +@@ -56,7 +56,7 @@ public: + int ref_flag = 0); + + /// Return element's type. +- virtual Type GetType() const { return Element::TETRAHEDRON; } ++ Type GetType() const override { return Element::TETRAHEDRON; } + + void ParseRefinementFlag(int refinement_edges[2], int &type, + int &flag) const; +@@ -69,10 +69,7 @@ public: + void SetRefinementFlag(int rf) { refinement_flag = rf; } + + /// Return 1 if the element needs refinement in order to get conforming mesh. +- virtual int NeedRefinement(HashTable &v_to_v) const; +- +- /// Set the vertices according to the given input. +- virtual void SetVertices(const int *ind); ++ int NeedRefinement(HashTable &v_to_v) const override; + + /** Reorder the vertices so that the longest edge is from vertex 0 + to vertex 1. If called it should be once from the mesh constructor, +@@ -81,42 +78,49 @@ public: + void MarkEdge(const DSTable &v_to_v, const Array &length, + const Array &length2); + +- virtual void ResetTransform(int tr) { transform = tr; } +- virtual unsigned GetTransform() const { return transform; } ++ void ResetTransform(int tr) override { transform = tr; } ++ unsigned GetTransform() const override { return transform; } + + /// Add 'tr' to the current chain of coarse-fine transformations. +- virtual void PushTransform(int tr) ++ void PushTransform(int tr) override + { transform = (transform << 3) | (tr + 1); } + + /// Calculate point matrix corresponding to a chain of transformations. + static void GetPointMatrix(unsigned transform, DenseMatrix &pm); + +- /// Returns the indices of the element's vertices. +- virtual void GetVertices(Array &v) const; ++ /// Get the indices defining the vertices ++ void GetVertices(Array &v) const override; ++ ++ /// Set the indices defining the vertices ++ void SetVertices(const Array &v) override; + +- virtual int *GetVertices() { return indices; } ++ /// @note The returned array should NOT be deleted by the caller. 
++ int * GetVertices () override { return indices; } ++ ++ /// Set the vertices according to the given input. ++ void SetVertices(const int *ind) override; + +- virtual int GetNVertices() const { return 4; } ++ int GetNVertices() const override { return 4; } + +- virtual int GetNEdges() const { return (6); } ++ int GetNEdges() const override { return (6); } + +- virtual const int *GetEdgeVertices(int ei) const ++ const int *GetEdgeVertices(int ei) const override + { return geom_t::Edges[ei]; } + + /// @deprecated Use GetNFaces(void) and GetNFaceVertices(int) instead. +- MFEM_DEPRECATED virtual int GetNFaces(int &nFaceVertices) const ++ MFEM_DEPRECATED int GetNFaces(int &nFaceVertices) const override + { nFaceVertices = 3; return 4; } + +- virtual int GetNFaces() const { return 4; } ++ int GetNFaces() const override { return 4; } + +- virtual int GetNFaceVertices(int) const { return 3; } ++ int GetNFaceVertices(int) const override { return 3; } + +- virtual const int *GetFaceVertices(int fi) const ++ const int *GetFaceVertices(int fi) const override + { return geom_t::FaceVert[fi]; } + +- virtual Element *Duplicate(Mesh *m) const; ++ Element *Duplicate(Mesh *m) const override; + +- virtual ~Tetrahedron() { } ++ virtual ~Tetrahedron() = default; + }; + + // Defined in fe.cpp to ensure construction before 'mfem::Geometries'. +diff --git a/mesh/triangle.cpp b/mesh/triangle.cpp +index abd2b4379..80d11f4b6 100644 +--- a/mesh/triangle.cpp ++++ b/mesh/triangle.cpp +@@ -155,10 +155,13 @@ void Triangle::GetPointMatrix(unsigned transform, DenseMatrix &pm) + void Triangle::GetVertices(Array &v) const + { + v.SetSize(3); +- for (int i = 0; i < 3; i++) +- { +- v[i] = indices[i]; +- } ++ std::copy(indices, indices + 3, v.begin()); ++} ++ ++void Triangle::SetVertices(const Array &v) ++{ ++ MFEM_ASSERT(v.Size() == 3, "!"); ++ std::copy(v.begin(), v.end(), indices); + } + + // @cond DOXYGEN_SKIP +diff --git a/mesh/triangle.hpp b/mesh/triangle.hpp +index 49fb4fe99..480c04118 100644 +--- a/mesh/triangle.hpp ++++ b/mesh/triangle.hpp +@@ -39,13 +39,10 @@ public: + Triangle(int ind1, int ind2, int ind3, int attr = 1); + + /// Return element's type. +- virtual Type GetType() const { return Element::TRIANGLE; } ++ Type GetType() const override { return Element::TRIANGLE; } + + /// Return 1 if the element needs refinement in order to get conforming mesh. +- virtual int NeedRefinement(HashTable &v_to_v) const; +- +- /// Set the vertices according to the given input. +- virtual void SetVertices(const int *ind); ++ int NeedRefinement(HashTable &v_to_v) const override; + + /** Reorder the vertices so that the longest edge is from vertex 0 + to vertex 1. If called it should be once from the mesh constructor, +@@ -59,43 +56,51 @@ public: + static void MarkEdge(int *indices, const DSTable &v_to_v, + const Array &length, const Array &length2); + +- virtual void ResetTransform(int tr) { transform = tr; } +- virtual unsigned GetTransform() const { return transform; } ++ void ResetTransform(int tr) override { transform = tr; } ++ unsigned GetTransform() const override { return transform; } + + /// Add 'tr' to the current chain of coarse-fine transformations. +- virtual void PushTransform(int tr) ++ void PushTransform(int tr) override + { transform = (transform << 3) | (tr + 1); } + + /// Calculate point matrix corresponding to a chain of transformations. + static void GetPointMatrix(unsigned transform, DenseMatrix &pm); + +- /// Returns the indices of the element's vertices. 
+- virtual void GetVertices(Array &v) const; ++ /// Get the indices defining the vertices ++ void GetVertices(Array &v) const override; ++ ++ /// Set the indices defining the vertices ++ void SetVertices(const Array &v) override; ++ ++ /// @note The returned array should NOT be deleted by the caller. ++ int * GetVertices () override { return indices; } ++ ++ /// Set the vertices according to the given input. ++ void SetVertices(const int *ind) override; + +- virtual int *GetVertices() { return indices; } + +- virtual int GetNVertices() const { return 3; } ++ int GetNVertices() const override { return 3; } + +- virtual int GetNEdges() const { return (3); } ++ int GetNEdges() const override { return (3); } + +- virtual const int *GetEdgeVertices(int ei) const ++ const int *GetEdgeVertices(int ei) const override + { return geom_t::Edges[ei]; } + + /// @deprecated Use GetNFaces(void) and GetNFaceVertices(int) instead. +- MFEM_DEPRECATED virtual int GetNFaces(int &nFaceVertices) const ++ MFEM_DEPRECATED int GetNFaces(int &nFaceVertices) const override + { nFaceVertices = 0; return 0; } + +- virtual int GetNFaces() const { return 0; } ++ int GetNFaces() const override { return 0; } + +- virtual int GetNFaceVertices(int) const { return 0; } ++ int GetNFaceVertices(int) const override { return 0; } + +- virtual const int *GetFaceVertices(int fi) const ++ const int *GetFaceVertices(int fi) const override + { MFEM_ABORT("not implemented"); return NULL; } + +- virtual Element *Duplicate(Mesh *m) const ++ Element *Duplicate(Mesh *m) const override + { return new Triangle(indices, attribute); } + +- virtual ~Triangle() { } ++ virtual ~Triangle() = default; + }; + + // Defined in fe.cpp to ensure construction before 'mfem::Geometries'. +diff --git a/mesh/wedge.cpp b/mesh/wedge.cpp +index 898da7653..b1aea933d 100644 +--- a/mesh/wedge.cpp ++++ b/mesh/wedge.cpp +@@ -50,10 +50,13 @@ void Wedge::SetVertices(const int *ind) + void Wedge::GetVertices(Array &v) const + { + v.SetSize(6); +- for (int i = 0; i < 6; i++) +- { +- v[i] = indices[i]; +- } ++ std::copy(indices, indices + 6, v.begin()); ++} ++ ++void Wedge::SetVertices(const Array &v) ++{ ++ MFEM_ASSERT(v.Size() == 6, "!"); ++ std::copy(v.begin(), v.end(), indices); + } + + int Wedge::GetNFaces(int &nFaceVertices) const +diff --git a/mesh/wedge.hpp b/mesh/wedge.hpp +index fb8583f8e..2eae6d104 100644 +--- a/mesh/wedge.hpp ++++ b/mesh/wedge.hpp +@@ -37,38 +37,42 @@ public: + int attr = 1); + + /// Return element's type. +- virtual Type GetType() const { return Element::WEDGE; } ++ Type GetType() const override { return Element::WEDGE; } + +- /// Set the vertices according to the given input. +- virtual void SetVertices(const int *ind); ++ /// Get the indices defining the vertices ++ void GetVertices(Array &v) const override; ++ ++ /// Set the indices defining the vertices ++ void SetVertices(const Array &v) override; + +- /// Returns the indices of the element's vertices. +- virtual void GetVertices(Array &v) const; ++ /// @note The returned array should NOT be deleted by the caller. ++ int * GetVertices () override { return indices; } + +- virtual int *GetVertices() { return indices; } ++ /// Set the vertices according to the given input. 
++ void SetVertices(const int *ind) override; + +- virtual int GetNVertices() const { return 6; } ++ int GetNVertices() const override { return 6; } + +- virtual int GetNEdges() const { return 9; } ++ int GetNEdges() const override { return 9; } + +- virtual const int *GetEdgeVertices(int ei) const ++ const int *GetEdgeVertices(int ei) const override + { return geom_t::Edges[ei]; } + + /// @deprecated Use GetNFaces(void) and GetNFaceVertices(int) instead. +- MFEM_DEPRECATED virtual int GetNFaces(int &nFaceVertices) const; ++ MFEM_DEPRECATED int GetNFaces(int &nFaceVertices) const override; + +- virtual int GetNFaces() const { return 5; } ++ int GetNFaces() const override { return 5; } + +- virtual int GetNFaceVertices(int fi) const ++ int GetNFaceVertices(int fi) const override + { return (fi < 2) ? 3 : 4; } + +- virtual const int *GetFaceVertices(int fi) const ++ const int *GetFaceVertices(int fi) const override + { return geom_t::FaceVert[fi]; } + +- virtual Element *Duplicate(Mesh *m) const ++ Element *Duplicate(Mesh *m) const override + { return new Wedge(indices, attribute); } + +- virtual ~Wedge() { } ++ virtual ~Wedge() = default; + }; + + extern MFEM_EXPORT class LinearWedgeFiniteElement WedgeFE; +diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt +index dc0e9fea8..678abb706 100644 +--- a/tests/unit/CMakeLists.txt ++++ b/tests/unit/CMakeLists.txt +@@ -43,6 +43,7 @@ set(UNIT_TESTS_SRCS + linalg/test_operator.cpp + linalg/test_vector.cpp + mesh/test_face_orientations.cpp ++ mesh/mesh_test_utils.cpp + mesh/test_fms.cpp + mesh/test_mesh.cpp + mesh/test_ncmesh.cpp +diff --git a/tests/unit/mesh/mesh_test_utils.cpp b/tests/unit/mesh/mesh_test_utils.cpp +new file mode 100644 +index 000000000..65fb2e01d +--- /dev/null ++++ b/tests/unit/mesh/mesh_test_utils.cpp +@@ -0,0 +1,207 @@ ++// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced ++// at the Lawrence Livermore National Laboratory. All Rights reserved. See files ++// LICENSE and NOTICE for details. LLNL-CODE-806117. ++// ++// This file is part of the MFEM library. For more information and source code ++// availability visit https://mfem.org. ++// ++// MFEM is free software; you can redistribute it and/or modify it under the ++// terms of the BSD-3 license. We welcome feedback and contributions, see file ++// CONTRIBUTING.md for details. 
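++
++// Shared helpers for the NCMesh unit tests: CheckPoisson performs an H1 Poisson
++// solve with homogeneous essential boundary conditions, CheckParMeshNBE compares
++// parallel and serial boundary element counts, and CheckFaceInternal identifies
++// faces lying between differing domain attributes.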
++
++#include "mesh_test_utils.hpp"
++
++namespace mfem
++{
++
++int CheckPoisson(Mesh &mesh, int order, int disabled_boundary_attribute)
++{
++   constexpr int dim = 3;
++
++   H1_FECollection fec(order, dim);
++   FiniteElementSpace fes(&mesh, &fec);
++
++   GridFunction sol(&fes);
++
++   ConstantCoefficient one(1.0);
++   BilinearForm a(&fes);
++   a.AddDomainIntegrator(new DiffusionIntegrator(one));
++   a.Assemble();
++
++   LinearForm b(&fes);
++   b.AddDomainIntegrator(new DomainLFIntegrator(one));
++   b.Assemble();
++
++   // Add in essential boundary conditions
++   Array<int> ess_tdof_list;
++   REQUIRE(mesh.bdr_attributes.Max() > 0);
++
++   // Mark all boundaries essential
++   Array<int> bdr_attr_is_ess(mesh.bdr_attributes.Max());
++   bdr_attr_is_ess = 1;
++   if (disabled_boundary_attribute >= 0)
++   {
++      bdr_attr_is_ess[mesh.bdr_attributes.Find(disabled_boundary_attribute)] = 0;
++   }
++
++   fes.GetEssentialTrueDofs(bdr_attr_is_ess, ess_tdof_list);
++   REQUIRE(ess_tdof_list.Size() > 0);
++
++   ConstantCoefficient zero(0.0);
++   sol.ProjectCoefficient(zero);
++   Vector B, X;
++   OperatorPtr A;
++   a.FormLinearSystem(ess_tdof_list, sol, b, A, X, B);
++
++   // Solve the system
++   CG(*A, B, X, 2, 1000, 1e-20, 0.0);
++
++   // Recover the solution
++   a.RecoverFEMSolution(X, b, sol);
++
++   // Check that X solves the system A X = B.
++   A->AddMult(X, B, -1.0);
++   auto residual_norm = B.Norml2();
++   bool satisfy_system = residual_norm < 1e-10;
++   CAPTURE(residual_norm);
++   CHECK(satisfy_system);
++
++   bool satisfy_bc = true;
++   Vector tvec;
++   sol.GetTrueDofs(tvec);
++   for (auto dof : ess_tdof_list)
++   {
++      if (tvec[dof] != 0.0)
++      {
++         satisfy_bc = false;
++         break;
++      }
++   }
++   CHECK(satisfy_bc);
++   return ess_tdof_list.Size();
++};
++
++#ifdef MFEM_USE_MPI
++
++void CheckPoisson(ParMesh &pmesh, int order,
++                  int disabled_boundary_attribute)
++{
++   constexpr int dim = 3;
++
++   H1_FECollection fec(order, dim);
++   ParFiniteElementSpace pfes(&pmesh, &fec);
++
++   ParGridFunction sol(&pfes);
++
++   ConstantCoefficient one(1.0);
++   ParBilinearForm a(&pfes);
++   a.AddDomainIntegrator(new DiffusionIntegrator(one));
++   a.Assemble();
++   ParLinearForm b(&pfes);
++   b.AddDomainIntegrator(new DomainLFIntegrator(one));
++   b.Assemble();
++
++   // Add in essential boundary conditions
++   Array<int> ess_tdof_list;
++   REQUIRE(pmesh.bdr_attributes.Max() > 0);
++
++   Array<int> bdr_attr_is_ess(pmesh.bdr_attributes.Max());
++   bdr_attr_is_ess = 1;
++   if (disabled_boundary_attribute >= 0)
++   {
++      CAPTURE(disabled_boundary_attribute);
++      bdr_attr_is_ess[pmesh.bdr_attributes.Find(disabled_boundary_attribute)] = 0;
++   }
++
++   pfes.GetEssentialTrueDofs(bdr_attr_is_ess, ess_tdof_list);
++   int num_ess_dof = ess_tdof_list.Size();
++   MPI_Allreduce(MPI_IN_PLACE, &num_ess_dof, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
++   REQUIRE(num_ess_dof > 0);
++
++
++   ConstantCoefficient zero(0.0);
++   sol.ProjectCoefficient(zero);
++   Vector B, X;
++   OperatorPtr A;
++   const bool copy_interior = true; // interior(sol) --> interior(X)
++   a.FormLinearSystem(ess_tdof_list, sol, b, A, X, B, copy_interior);
++
++   // Solve the system
++   CGSolver cg(MPI_COMM_WORLD);
++   // HypreBoomerAMG preconditioner;
++   cg.SetMaxIter(2000);
++   cg.SetRelTol(1e-12);
++   cg.SetPrintLevel(0);
++   cg.SetOperator(*A);
++   // cg.SetPreconditioner(preconditioner);
++   cg.Mult(B, X);
++   // Recover the solution
++   a.RecoverFEMSolution(X, b, sol);
++
++   // Check that X solves the system A X = B.
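++   // AddMult with coefficient -1.0 accumulates B <- B - A*X, so B now holds the residual.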
++   A->AddMult(X, B, -1.0);
++   auto residual_norm = B.Norml2();
++   bool satisfy_system = residual_norm < 1e-10;
++   CAPTURE(residual_norm);
++   CHECK(satisfy_system);
++
++   // Initialize the bdr_dof to be checked
++   Vector tvec;
++   sol.GetTrueDofs(tvec);
++   bool satisfy_bc = true;
++   for (auto dof : ess_tdof_list)
++   {
++      if (tvec[dof] != 0.0)
++      {
++         satisfy_bc = false;
++         break;
++      }
++   }
++   CHECK(satisfy_bc);
++};
++
++std::unique_ptr<ParMesh> CheckParMeshNBE(Mesh &smesh,
++                                         const std::unique_ptr<int[]> &partition)
++{
++   auto pmesh = std::unique_ptr<ParMesh>(new ParMesh(MPI_COMM_WORLD, smesh,
++                                                     partition.get()));
++
++   int nbe = pmesh->GetNBE();
++   MPI_Allreduce(MPI_IN_PLACE, &nbe, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
++
++   CHECK(nbe == smesh.GetNBE());
++   return pmesh;
++};
++
++bool CheckFaceInternal(ParMesh& pmesh, int f,
++                       const std::map<int, int> &local_to_shared)
++{
++   int e1, e2;
++   pmesh.GetFaceElements(f, &e1, &e2);
++   int inf1, inf2, ncface;
++   pmesh.GetFaceInfos(f, &inf1, &inf2, &ncface);
++
++   if (e2 < 0 && inf2 >=0)
++   {
++      // Shared face on processor boundary -> Need to discover the neighbor
++      // attributes
++      auto FET = pmesh.GetSharedFaceTransformations(local_to_shared.at(f));
++
++      if (FET->Elem1->Attribute != FET->Elem2->Attribute && f < pmesh.GetNumFaces())
++      {
++         // shared face on domain attribute boundary, which this rank owns
++         return true;
++      }
++   }
++
++   if (e2 >= 0 && pmesh.GetAttribute(e1) != pmesh.GetAttribute(e2))
++   {
++      // local face on domain attribute boundary
++      return true;
++   }
++   return false;
++};
++
++#endif
++
++} // namespace mfem
+diff --git a/tests/unit/mesh/mesh_test_utils.hpp b/tests/unit/mesh/mesh_test_utils.hpp
+new file mode 100644
+index 000000000..e4088a788
+--- /dev/null
++++ b/tests/unit/mesh/mesh_test_utils.hpp
+@@ -0,0 +1,78 @@
++// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
++// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
++// LICENSE and NOTICE for details. LLNL-CODE-806117.
++//
++// This file is part of the MFEM library. For more information and source code
++// availability visit https://mfem.org.
++//
++// MFEM is free software; you can redistribute it and/or modify it under the
++// terms of the BSD-3 license. We welcome feedback and contributions, see file
++// CONTRIBUTING.md for details.
++
++#ifndef MFEM_MESH_TEST_UTILS
++#define MFEM_MESH_TEST_UTILS
++
++
++#include "mfem.hpp"
++#include "unit_tests.hpp"
++
++namespace mfem
++{
++
++/**
++ * @brief Helper function for performing an H1 Poisson solve on a serial mesh, with
++ * homogeneous essential boundary conditions. Optionally can disable a boundary.
++ *
++ * @param mesh The SERIAL mesh to perform the Poisson solve on
++ * @param order The polynomial order of the basis
++ * @param disabled_boundary_attribute Optional boundary attribute to NOT apply
++ * homogeneous Dirichlet boundary condition on. Default of -1 means no boundary
++ * is disabled.
++ * @return int The number of DOF that are fixed by the essential boundary condition.
++ */
++int CheckPoisson(Mesh &mesh, int order, int disabled_boundary_attribute = -1);
++
++#ifdef MFEM_USE_MPI
++
++/**
++ * @brief Helper function for performing an H1 Poisson solve on a parallel mesh, with
++ * homogeneous essential boundary conditions. Optionally can disable a boundary.
++ *
++ * @param mesh The PARALLEL mesh to perform the Poisson solve on
++ * @param order The polynomial order of the basis
++ * @param disabled_boundary_attribute Optional boundary attribute to NOT apply
++ * homogeneous Dirichlet boundary condition on. Default of -1 means no boundary
++ * is disabled.
++ * @return int The number of DOF that are fixed by the essential boundary condition.
++ */
++void CheckPoisson(ParMesh &pmesh, int order,
++                  int disabled_boundary_attribute = -1);
++
++/**
++ * @brief Check that a ParMesh generates the same number of boundary elements as
++ * the serial mesh.
++ *
++ * @param smesh Serial mesh to be built from and compared against
++ * @param partition Optional partition
++ * @return std::unique_ptr<ParMesh> Pointer to the mesh in question.
++ */
++std::unique_ptr<ParMesh> CheckParMeshNBE(Mesh &smesh,
++                                         const std::unique_ptr<int[]> &partition = nullptr);
++
++/**
++ * @brief Helper function to track if a face index is internal
++ *
++ * @param pmesh The mesh containing the face
++ * @param f The face index
++ * @param local_to_shared A map from local faces to shared faces
++ * @return true the face is between domain attributes (and owned by this rank)
++ * @return false the face is not between domain attributes or not owned by this rank
++ */
++bool CheckFaceInternal(ParMesh& pmesh, int f,
++                       const std::map<int, int> &local_to_shared);
++
++#endif
++
++} // namespace mfem
++
++#endif // MFEM_MESH_TEST_UTILS
+\ No newline at end of file
+diff --git a/tests/unit/mesh/test_ncmesh.cpp b/tests/unit/mesh/test_ncmesh.cpp
+index c1bb54e13..c38ab18b2 100644
+--- a/tests/unit/mesh/test_ncmesh.cpp
++++ b/tests/unit/mesh/test_ncmesh.cpp
+@@ -10,6 +10,7 @@
+ // CONTRIBUTING.md for details.
+ 
+ #include "mfem.hpp"
++#include "mesh_test_utils.hpp"
+ #include "unit_tests.hpp"
+ 
+ #include 
+@@ -18,6 +19,34 @@ namespace mfem
+ 
+ constexpr double EPS = 1e-10;
+ 
++// Helper to count H1 essential dofs for a given order with a given attribute
++template <typename FECollection, bool TDOF = true>
++int CountEssentialDof(Mesh &mesh, int order, int attribute)
++{
++   constexpr int dim = 3;
++   FECollection fec(order, dim);
++   FiniteElementSpace fes(&mesh, &fec);
++
++   Array<int> bdr_attr_is_ess(mesh.bdr_attributes.Max());
++   bdr_attr_is_ess = 0;
++   bdr_attr_is_ess[mesh.bdr_attributes.Find(attribute)] = 1;
++
++   if (TDOF)
++   {
++      Array<int> ess_tdof_list;
++      fes.GetEssentialTrueDofs(bdr_attr_is_ess, ess_tdof_list);
++      return ess_tdof_list.Size();
++   }
++   else
++   {
++      // VDOF
++      Array<int> ess_vdof_marker, vdof_list;
++      fes.GetEssentialVDofs(bdr_attr_is_ess, ess_vdof_marker);
++      fes.MarkerToList(ess_vdof_marker, vdof_list);
++      return vdof_list.Size();
++   }
++};
++
+ // Test case: Verify that a conforming mesh yields the same norm for the
+ // assembled diagonal with PA when using the standard (conforming)
+ // Mesh vs. the corresponding (non-conforming) NCMesh.
+@@ -114,7 +143,6 @@ TEST_CASE("NCMesh PA diagonal", "[NCMesh]") + + } // test case + +- + TEST_CASE("NCMesh 3D Refined Volume", "[NCMesh]") + { + auto mesh_fname = GENERATE("../../data/ref-tetrahedron.mesh", +@@ -146,7 +174,6 @@ TEST_CASE("NCMesh 3D Refined Volume", "[NCMesh]") + REQUIRE(summed_volume == MFEM_Approx(original_volume)); + } // test case + +- + TEST_CASE("NCMesh 3D Derefined Volume", "[NCMesh]") + { + auto mesh_fname = GENERATE("../../data/ref-tetrahedron.mesh", +@@ -176,6 +203,134 @@ TEST_CASE("NCMesh 3D Derefined Volume", "[NCMesh]") + REQUIRE(derefined_volume == MFEM_Approx(original_volume)); + } // test case + ++// Helper to create a mesh of a tet with four face neighbor tets and internal boundary between ++Mesh StarMesh() ++{ ++ const int nnode = 4 + 4; ++ const int nelem = 5; ++ ++ Mesh mesh(3, nnode, nelem); ++ ++ // central tet ++ mesh.AddVertex(0.0, 0.0, 0.0); ++ mesh.AddVertex(1.0, 0.0, 0.0); ++ mesh.AddVertex(0.0, 1.0, 0.0); ++ mesh.AddVertex(0.0, 0.0, 1.0); ++ ++ mesh.AddVertex( 1.0, 1.0, 1.0); // opposite 0 ++ mesh.AddVertex(-1.0, 0.0, 0.0); // opposite 1 ++ mesh.AddVertex( 0.0, -1.0, 0.0); // opposite 2 ++ mesh.AddVertex( 0.0, 0.0, -1.0); // opposite 3 ++ ++ mesh.AddTet(0, 1, 2, 3, 1); // central ++ mesh.AddTet(4, 1, 2, 3, 2); // opposite 0 ++ mesh.AddTet(0, 5, 2, 3, 3); // opposite 1 ++ mesh.AddTet(0, 1, 6, 3, 4); // opposite 2 ++ mesh.AddTet(0, 1, 2, 7, 5); // opposite 3 ++ ++ mesh.FinalizeTopology(); ++ mesh.Finalize(true, true); ++ ++ // Introduce internal boundary elements ++ const int new_attribute = mesh.bdr_attributes.Max() + 1; ++ Array original_boundary_vertices; ++ for (int f = 0; f < mesh.GetNumFaces(); ++f) ++ { ++ int e1, e2; ++ mesh.GetFaceElements(f, &e1, &e2); ++ if (e1 >= 0 && e2 >= 0 && mesh.GetAttribute(e1) != mesh.GetAttribute(e2)) ++ { ++ // This is the internal face between attributes. ++ auto *new_elem = mesh.GetFace(f)->Duplicate(&mesh); ++ new_elem->SetAttribute(new_attribute); ++ new_elem->GetVertices(original_boundary_vertices); ++ mesh.AddBdrElement(new_elem); ++ } ++ } ++ mesh.SetAttributes(); ++ mesh.FinalizeTopology(); ++ mesh.Finalize(true, true); ++ ++ return mesh; ++} ++ ++Mesh DividingPlaneMesh(bool tet_mesh = true, bool split = true) ++{ ++ auto mesh = Mesh("../../data/ref-cube.mesh"); ++ { ++ Array refs; ++ refs.Append(Refinement(0, Refinement::X)); ++ mesh.GeneralRefinement(refs); ++ } ++ delete mesh.ncmesh; ++ mesh.ncmesh = nullptr; ++ mesh.FinalizeTopology(); ++ mesh.Finalize(true, true); ++ ++ mesh.SetAttribute(0, 1); ++ mesh.SetAttribute(1, split ? 2 : 1); ++ ++ // Introduce internal boundary elements ++ const int new_attribute = mesh.bdr_attributes.Max() + 1; ++ for (int f = 0; f < mesh.GetNumFaces(); ++f) ++ { ++ int e1, e2; ++ mesh.GetFaceElements(f, &e1, &e2); ++ if (e1 >= 0 && e2 >= 0 && mesh.GetAttribute(e1) != mesh.GetAttribute(e2)) ++ { ++ // This is the internal face between attributes. ++ auto *new_elem = mesh.GetFace(f)->Duplicate(&mesh); ++ new_elem->SetAttribute(new_attribute); ++ mesh.AddBdrElement(new_elem); ++ } ++ } ++ if (tet_mesh) ++ { ++ mesh = Mesh::MakeSimplicial(mesh); ++ } ++ mesh.FinalizeTopology(); ++ mesh.Finalize(true, true); ++ return mesh; ++} ++ ++// Define a pair of tet with a shared triangle in the y-z plane. 
++// Vary the vertex ordering to achieve the 3 possible odd orientations ++Mesh OrientedTriFaceMesh(int orientation, bool add_extbdr = false) ++{ ++ REQUIRE((orientation == 1 || orientation == 3 || orientation == 5)); ++ ++ Mesh mesh(3, 4, 2); ++ mesh.AddVertex(-1.0, 0.0, 0.0); ++ mesh.AddVertex(0.0, 0.0, 0.0); ++ mesh.AddVertex(0.0, 1.0, 0.0); ++ mesh.AddVertex(0.0, 0.0, 1.0); ++ ++ // opposing vertex ++ mesh.AddVertex(1.0, 0.0, 0.0); ++ ++ mesh.AddTet(0, 1, 2, 3, 1); ++ ++ switch (orientation) ++ { ++ case 1: ++ mesh.AddTet(4,2,1,3,2); break; ++ case 3: ++ mesh.AddTet(4,3,2,1,2); break; ++ case 5: ++ mesh.AddTet(4,1,3,2,2); break; ++ } ++ ++ mesh.FinalizeTopology(add_extbdr); ++ mesh.SetAttributes(); ++ ++ auto *bdr = new Triangle(1,2,3, ++ mesh.bdr_attributes.Size() == 0 ? 1 : mesh.bdr_attributes.Max() + 1); ++ mesh.AddBdrElement(bdr); ++ ++ mesh.FinalizeTopology(false); ++ mesh.Finalize(); ++ return mesh; ++}; + + #ifdef MFEM_USE_MPI + +@@ -297,7 +452,6 @@ TEST_CASE("pNCMesh PA diagonal", "[Parallel], [NCMesh]") + } + } // test case + +- + // Given a parallel and a serial mesh, perform an L2 projection and check the + // solutions match exactly. + std::array CheckL2Projection(ParMesh& pmesh, Mesh& smesh, int order, +@@ -618,7 +772,6 @@ TEST_CASE("EdgeFaceConstraint", "[Parallel], [NCMesh]") + } + } + ParMesh pmesh(MPI_COMM_WORLD, smesh, partition.get()); +- + { + constexpr int dim = 3; + constexpr int order = 1; +@@ -630,7 +783,6 @@ TEST_CASE("EdgeFaceConstraint", "[Parallel], [NCMesh]") + const auto parallel_ntdof = pfes.GlobalTrueVSize(); + CHECK(serial_ntdof == parallel_ntdof); + } +- + for (int order = 1; order <= 4; order++) + { + CAPTURE(order); +@@ -1026,6 +1178,2607 @@ TEST_CASE("GetVectorValueInFaceNeighborElement", "[Parallel], [NCMesh]") + } + } + +-#endif // MFEM_USE_MPI ++TEST_CASE("TetCornerRefines", "[Parallel], [NCMesh]") ++{ ++ auto p = GENERATE(1,2,3); ++ CAPTURE(p); ++ ++ auto smesh = Mesh("../../data/ref-tetrahedron.mesh"); ++ ++ REQUIRE(smesh.GetNBE() == 4); ++ ++ Array refs; ++ refs.Append(Refinement(0, Refinement::X)); ++ smesh.GeneralRefinement(refs); ++ ++ // Now have a pair of elements, make the second element a different ++ // attribute. ++ smesh.SetAttribute(0, 1); ++ smesh.SetAttribute(1, 2); ++ ++ REQUIRE(smesh.GetNBE() == 2 * 3); ++ ++ smesh.FinalizeTopology(); ++ smesh.Finalize(); ++ ++ // Introduce an internal boundary element ++ const int new_attribute = smesh.bdr_attributes.Max() + 1; ++ for (int f = 0; f < smesh.GetNumFaces(); ++f) ++ { ++ int e1, e2; ++ smesh.GetFaceElements(f, &e1, &e2); ++ if (e1 >= 0 && e2 >= 0 && smesh.GetAttribute(e1) != smesh.GetAttribute(e2)) ++ { ++ // This is the internal face between attributes. 
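++         // Duplicate the face element and register the copy as a boundary
++         // element carrying the new internal attribute.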
++ auto *new_elem = smesh.GetFace(f)->Duplicate(&smesh); ++ new_elem->SetAttribute(new_attribute); ++ smesh.AddBdrElement(new_elem); ++ break; ++ } ++ } ++ smesh.FinalizeTopology(); // Finalize to build relevant tables ++ smesh.Finalize(); ++ ++ // Exactly one boundary element must be added ++ REQUIRE(smesh.GetNBE() == 2 * 3 + 1); ++ ++ smesh.EnsureNCMesh(true); ++ ++ auto pmesh = CheckParMeshNBE(smesh); ++ ++ pmesh->FinalizeTopology(); ++ pmesh->Finalize(); ++ pmesh->ExchangeFaceNbrData(); ++ ++ REQUIRE(pmesh->Nonconforming()); ++ ++ std::map local_to_shared; ++ for (int i = 0; i < pmesh->GetNSharedFaces(); ++i) ++ { ++ local_to_shared[pmesh->GetSharedFace(i)] = i; ++ } ++ ++ // Count the number of internal faces via the boundary elements ++ int num_internal = 0; ++ for (int n = 0; n < pmesh->GetNBE(); ++n) ++ { ++ int f, o; ++ pmesh->GetBdrElementFace(n, &f, &o); ++ if (CheckFaceInternal(*pmesh, f, local_to_shared)) ++ { ++ ++num_internal; ++ } ++ } ++ ++ MPI_Allreduce(MPI_IN_PLACE, &num_internal, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); ++ CHECK(num_internal == 1); ++ ++ CheckPoisson(*pmesh, p, pmesh->bdr_attributes.Max()); ++ CheckPoisson(*pmesh, p); ++} ++ ++// Count the number of essential dofs on a ParMesh. ++template ++int CountEssentialDof(ParMesh &mesh, int order, int attribute) ++{ ++ constexpr int dim = 3; ++ FECollection fec(order, dim); ++ ParFiniteElementSpace pfes(&mesh, &fec); ++ ++ Array bdr_attr_is_ess(mesh.bdr_attributes.Max()); ++ bdr_attr_is_ess = 0; ++ bdr_attr_is_ess[mesh.bdr_attributes.Find(attribute)] = 1; ++ ++ Array ess_tdof_list; ++ pfes.GetEssentialTrueDofs(bdr_attr_is_ess, ess_tdof_list); ++ if (TDOF) ++ { ++ pfes.GetEssentialTrueDofs(bdr_attr_is_ess, ess_tdof_list); ++ return ess_tdof_list.Size(); ++ } ++ else ++ { ++ // VDOF ++ Array ess_vdof_marker, vdof_list; ++ pfes.GetEssentialVDofs(bdr_attr_is_ess, ess_vdof_marker); ++ pfes.MarkerToList(ess_vdof_marker, vdof_list); ++ return vdof_list.Size(); ++ } ++}; ++ ++template ++int ParCountEssentialDof(ParMesh &mesh, int order, int attribute) ++{ ++ auto num_essential_dof = CountEssentialDof(mesh, order, ++ attribute); ++ MPI_Allreduce(MPI_IN_PLACE, &num_essential_dof, 1, MPI_INT, MPI_SUM, ++ MPI_COMM_WORLD); ++ return num_essential_dof; ++}; ++ ++TEST_CASE("InteriorBoundaryReferenceTets", "[Parallel], [NCMesh]") ++{ ++ constexpr auto seed = 314159; ++ srand(seed); ++ auto p = 1;//GENERATE(1,2,3); ++ CAPTURE(p); ++ ++ auto smesh = Mesh("../../data/ref-tetrahedron.mesh"); ++ ++ REQUIRE(smesh.GetNBE() == 4); ++ ++ Array refs; ++ refs.Append(Refinement(0, Refinement::X)); ++ smesh.GeneralRefinement(refs); ++ ++ // Now have a pair of elements, make the second element a different ++ // attribute. ++ smesh.SetAttribute(0, 1); ++ smesh.SetAttribute(1, 2); ++ ++ REQUIRE(smesh.GetNBE() == 2 * 3); ++ ++ smesh.FinalizeTopology(); ++ smesh.Finalize(); ++ ++ // Introduce an internal boundary element ++ const int new_attribute = smesh.bdr_attributes.Max() + 1; ++ for (int f = 0; f < smesh.GetNumFaces(); ++f) ++ { ++ int e1, e2; ++ smesh.GetFaceElements(f, &e1, &e2); ++ if (e1 >= 0 && e2 >= 0 && smesh.GetAttribute(e1) != smesh.GetAttribute(e2)) ++ { ++ // This is the internal face between attributes. 
++ auto *new_elem = smesh.GetFace(f)->Duplicate(&smesh); ++ new_elem->SetAttribute(new_attribute); ++ smesh.AddBdrElement(new_elem); ++ break; ++ } ++ } ++ smesh.FinalizeTopology(); // Finalize to build relevant tables ++ smesh.Finalize(); ++ ++ // Exactly one boundary element must be added ++ REQUIRE(smesh.GetNBE() == 2 * 3 + 1); ++ ++ smesh.EnsureNCMesh(true); ++ ++ auto pmesh = CheckParMeshNBE(smesh); ++ ++ pmesh->FinalizeTopology(); ++ pmesh->Finalize(); ++ pmesh->ExchangeFaceNbrData(); ++ ++ REQUIRE(pmesh->Nonconforming()); ++ ++ std::map local_to_shared; ++ for (int i = 0; i < pmesh->GetNSharedFaces(); ++i) ++ { ++ local_to_shared[pmesh->GetSharedFace(i)] = i; ++ } ++ ++ // Count the number of internal faces via the boundary elements ++ int num_internal = 0; ++ for (int n = 0; n < pmesh->GetNBE(); ++n) ++ { ++ int f, o; ++ pmesh->GetBdrElementFace(n, &f, &o); ++ if (CheckFaceInternal(*pmesh, f, local_to_shared)) ++ { ++ ++num_internal; ++ } ++ } ++ ++ MPI_Allreduce(MPI_IN_PLACE, &num_internal, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); ++ CHECK(num_internal == 1); ++ ++ CheckPoisson(*pmesh, p, pmesh->bdr_attributes.Max()); ++ CheckPoisson(*pmesh, p); ++ ++ int num_initial_ess_tdof = CountEssentialDof(*pmesh, p, ++ smesh.bdr_attributes.Max()); ++ if (Mpi::Root()) ++ { ++ REQUIRE(num_initial_ess_tdof > 0); ++ } ++ // Level of refinement difference across the processor boundary from root zero to the ++ // others ++ auto ref_level = 1;//GENERATE(1,2,3); ++ auto refined_attribute = 2;//GENERATE(1,2); ++ CAPTURE(ref_level); ++ CAPTURE(refined_attribute); ++ ++ Mesh modified_smesh(smesh); ++ for (int r = 0; r < ref_level; r++) ++ { ++ Array el_to_refine; ++ for (int n = 0; n < modified_smesh.GetNE(); n++) ++ { ++ if (modified_smesh.GetAttribute(n) == refined_attribute) ++ { ++ el_to_refine.Append(n); ++ } ++ } ++ modified_smesh.GeneralRefinement(el_to_refine); ++ } ++ ++ // There should now be some internal boundary elements, where there was one ++ // before. ++ CHECK(modified_smesh.GetNBE() == 3 /* external boundaries of unrefined */ ++ + std::pow(4, ref_level) /* internal boundaries */ ++ + (3 * std::pow(4, ref_level)) /* external boundaries of refined */); ++ ++ // Force the partition to have the edge case of a parent and child being ++ // divided across the processor boundary. This necessitates the ++ // GhostBoundaryElement treatment. ++ auto partition = std::unique_ptr(new int[modified_smesh.GetNE()]); ++ for (int i = 0; i < modified_smesh.GetNE(); i++) ++ { ++ // Randomly assign to any processor but zero. ++ partition[i] = Mpi::WorldSize() > 1 ? 1 + rand() % (Mpi::WorldSize() - 1) : 0; ++ } ++ if (Mpi::WorldSize() > 0) ++ { ++ // Make sure rank 0 has the non-refined attribute. This ensures it will have ++ // a parent face with only ghost children. ++ const int unrefined_attribute = refined_attribute == 1 ? 2 : 1; ++ Array root_element; ++ for (int n = 0; n < modified_smesh.GetNE(); n++) ++ { ++ if (modified_smesh.GetAttribute(n) == unrefined_attribute) ++ { ++ root_element.Append(n); ++ } ++ } ++ REQUIRE(root_element.Size() == 1); ++ partition[root_element[0]] = 0; ++ } ++ ++ pmesh = CheckParMeshNBE(modified_smesh, partition); ++ pmesh->Finalize(); ++ pmesh->FinalizeTopology(); ++ pmesh->ExchangeFaceNbrData(); ++ ++ // return; ++ auto check_faces = [&]() ++ { ++ // repopulate the local to shared map. 
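++      // (the shared-face numbering changes after repartitioning or Rebalance)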
++ local_to_shared.clear(); ++ for (int i = 0; i < pmesh->GetNSharedFaces(); ++i) ++ { ++ local_to_shared[pmesh->GetSharedFace(i)] = i; ++ } ++ ++ // Count the number of internal faces via the boundary elements ++ num_internal = 0; ++ for (int n = 0; n < pmesh->GetNBE(); ++n) ++ { ++ int f, o; ++ pmesh->GetBdrElementFace(n, &f, &o); ++ if (CheckFaceInternal(*pmesh, f, local_to_shared)) ++ { ++ ++num_internal; ++ } ++ } ++ MPI_Allreduce(MPI_IN_PLACE, &num_internal, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); ++ CHECK(num_internal == std::pow(4, ref_level)); ++ CheckPoisson(*pmesh, p, smesh.bdr_attributes.Max()); ++ CheckPoisson(*pmesh, p); ++ }; ++ ++ ++ check_faces(); ++ pmesh->Rebalance(); ++ pmesh->ExchangeFaceNbrData(); ++ check_faces(); ++} ++ ++TEST_CASE("InteriorBoundaryInlineTetRefines", "[Parallel], [NCMesh]") ++{ ++ int p = GENERATE(1,2); ++ CAPTURE(p); ++ ++ auto smesh = Mesh("../../data/inline-tet.mesh"); ++ smesh.FinalizeTopology(); ++ smesh.Finalize(); ++ ++ // Mark even and odd elements with different attributes ++ auto num_attributes = 3; ++ for (int i = 0; i < smesh.GetNE(); ++i) ++ { ++ smesh.SetAttribute(i, (i % num_attributes) + 1); ++ } ++ ++ smesh.SetAttributes(); ++ int initial_nbe = smesh.GetNBE(); ++ ++ // Introduce internal boundary elements ++ const int new_attribute = smesh.bdr_attributes.Max() + 1; ++ for (int f = 0; f < smesh.GetNumFaces(); ++f) ++ { ++ int e1, e2; ++ smesh.GetFaceElements(f, &e1, &e2); ++ if (e1 >= 0 && e2 >= 0 && smesh.GetAttribute(e1) != smesh.GetAttribute(e2)) ++ { ++ // This is the internal face between attributes. ++ auto *new_elem = smesh.GetFace(f)->Duplicate(&smesh); ++ new_elem->SetAttribute(new_attribute); ++ smesh.AddBdrElement(new_elem); ++ } ++ } ++ ++ smesh.FinalizeTopology(); // Finalize to build relevant tables ++ smesh.Finalize(); ++ ++ smesh.EnsureNCMesh(true); ++ ++ // Boundary elements must've been added to make the test valid ++ int num_internal_serial = smesh.GetNBE() - initial_nbe; ++ REQUIRE(num_internal_serial > 0); ++ ++ auto partition = std::unique_ptr(new int[smesh.GetNE()]); ++ ++ for (int i = 0; i < smesh.GetNE(); ++i) ++ { ++ partition[i] = i % Mpi::WorldSize(); // checkerboard partition ++ } ++ ++ auto pmesh = CheckParMeshNBE(smesh, partition); ++ ++ pmesh->FinalizeTopology(); ++ pmesh->Finalize(); ++ pmesh->ExchangeFaceNbrData(); ++ ++ std::map local_to_shared; ++ for (int i = 0; i < pmesh->GetNSharedFaces(); ++i) ++ { ++ local_to_shared[pmesh->GetSharedFace(i)] = i; ++ } ++ ++ // Count the number of internal faces via the boundary elements ++ int num_internal = 0; ++ for (int n = 0; n < pmesh->GetNBE(); ++n) ++ { ++ int f, o; ++ pmesh->GetBdrElementFace(n, &f, &o); ++ if (CheckFaceInternal(*pmesh, f, local_to_shared)) ++ { ++ ++num_internal; ++ } ++ } ++ ++ MPI_Allreduce(MPI_IN_PLACE, &num_internal, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); ++ ++ CHECK(num_internal == num_internal_serial); ++ ++ CheckPoisson(*pmesh, p, new_attribute); ++ CheckPoisson(*pmesh, p); ++ ++ // Mark all elements of a given attribute for refinement to a given depth. 
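++   // GENERATE makes Catch2 run this test once for every listed value.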
++ auto ref_level = GENERATE(1,2); ++ auto marked_attribute = GENERATE(1,2,3); ++ REQUIRE(marked_attribute <= num_attributes); ++ CAPTURE(ref_level); ++ CAPTURE(marked_attribute); ++ for (int r = 0; r < ref_level; r++) ++ { ++ Array elem_to_refine; ++ for (int i = 0; i < smesh.GetNE(); ++i) ++ { ++ if (smesh.GetAttribute(i) == marked_attribute) ++ { ++ elem_to_refine.Append(i); ++ } ++ } ++ smesh.GeneralRefinement(elem_to_refine); ++ } ++ ++ pmesh = CheckParMeshNBE(smesh); ++ pmesh->FinalizeTopology(); ++ pmesh->Finalize(); ++ pmesh->ExchangeFaceNbrData(); ++ ++ // Count the number of internal boundary elements ++ num_internal_serial = 0; ++ for (int n = 0; n < smesh.GetNBE(); ++n) ++ { ++ int f, o; ++ smesh.GetBdrElementFace(n, &f, &o); ++ int e1, e2; ++ smesh.GetFaceElements(f, &e1, &e2); ++ if (e1 >= 0 && e2 >= 0 && smesh.GetAttribute(e1) != smesh.GetAttribute(e2)) ++ { ++ ++num_internal_serial; ++ } ++ } ++ ++ auto check_faces = [&]() ++ { ++ // repopulate the local to shared map. ++ local_to_shared.clear(); ++ for (int i = 0; i < pmesh->GetNSharedFaces(); ++i) ++ { ++ local_to_shared[pmesh->GetSharedFace(i)] = i; ++ } ++ ++ num_internal = 0; ++ for (int n = 0; n < pmesh->GetNBE(); ++n) ++ { ++ int f, o; ++ pmesh->GetBdrElementFace(n, &f, &o); ++ if (CheckFaceInternal(*pmesh, f, local_to_shared)) ++ { ++ ++num_internal; ++ } ++ } ++ MPI_Allreduce(MPI_IN_PLACE, &num_internal, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); ++ ++ CHECK(num_internal == num_internal_serial); ++ ++ CheckPoisson(*pmesh, p, pmesh->bdr_attributes.Max()); ++ CheckPoisson(*pmesh, p); ++ }; ++ ++ check_faces(); ++ pmesh->Rebalance(); ++ pmesh->ExchangeFaceNbrData(); ++ check_faces(); ++} ++ ++TEST_CASE("InteriorBoundaryReferenceCubes", "[Parallel], [NCMesh]") ++{ ++ auto p = GENERATE(1,2,3); ++ CAPTURE(p); ++ ++ auto smesh = Mesh("../../data/ref-cube.mesh"); ++ smesh.EnsureNCMesh(); ++ ++ REQUIRE(smesh.GetNBE() == 6); ++ ++ Array refs; ++ refs.Append(Refinement(0, Refinement::X)); ++ smesh.GeneralRefinement(refs); ++ ++ // Now have a pair of elements, make the second element a different ++ // attribute. ++ smesh.SetAttribute(0, 1); ++ smesh.SetAttribute(1, 2); ++ ++ REQUIRE(smesh.GetNBE() == 2 * 5); ++ ++ // Throw away the NCMesh, will restart NC later. ++ delete smesh.ncmesh; ++ smesh.ncmesh = nullptr; ++ ++ smesh.FinalizeTopology(); ++ smesh.Finalize(); ++ ++ // Introduce an internal boundary element ++ const int new_attribute = smesh.bdr_attributes.Max() + 1; ++ for (int f = 0; f < smesh.GetNumFaces(); ++f) ++ { ++ int e1, e2; ++ smesh.GetFaceElements(f, &e1, &e2); ++ if (e1 >= 0 && e2 >= 0 && smesh.GetAttribute(e1) != smesh.GetAttribute(e2)) ++ { ++ // This is the internal face between attributes. 
++ auto *new_elem = smesh.GetFace(f)->Duplicate(&smesh); ++ new_elem->SetAttribute(new_attribute); ++ smesh.AddBdrElement(new_elem); ++ break; ++ } ++ } ++ smesh.FinalizeTopology(); // Finalize to build relevant tables ++ smesh.Finalize(); ++ ++ // Exactly one boundary element must be added ++ REQUIRE(smesh.GetNBE() == 2 * 5 + 1); ++ ++ auto pmesh = CheckParMeshNBE(smesh); ++ ++ pmesh->FinalizeTopology(); ++ pmesh->Finalize(); ++ pmesh->ExchangeFaceNbrData(); ++ ++ REQUIRE(pmesh->Conforming()); ++ ++ std::map local_to_shared; ++ for (int i = 0; i < pmesh->GetNSharedFaces(); ++i) ++ { ++ local_to_shared[pmesh->GetSharedFace(i)] = i; ++ } ++ ++ // Count the number of internal faces via the boundary elements ++ int num_internal = 0; ++ for (int n = 0; n < pmesh->GetNBE(); ++n) ++ { ++ int f, o; ++ pmesh->GetBdrElementFace(n, &f, &o); ++ if (CheckFaceInternal(*pmesh, f, local_to_shared)) ++ { ++ ++num_internal; ++ } ++ } ++ ++ MPI_Allreduce(MPI_IN_PLACE, &num_internal, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); ++ CHECK(num_internal == 1); ++ ++ CheckPoisson(*pmesh, p, pmesh->bdr_attributes.Max()); ++ CheckPoisson(*pmesh, p); ++ ++ for (int refined_elem : {0, 1}) ++ { ++ // Now NC refine one of the attached elements, this should result in 4 ++ // internal boundary elements. ++ Array el_to_refine; ++ el_to_refine.Append(refined_elem); ++ ++ Mesh modified_smesh(smesh); ++ modified_smesh.GeneralRefinement(el_to_refine); ++ ++ // There should now be four internal boundary elements, where there was one ++ // before. ++ CHECK(modified_smesh.GetNBE() == 5 /* external boundaries of unrefined */ ++ + 4 /* internal boundaries */ ++ + (5 * 4) /* external boundaries of refined */); ++ ++ // Force the partition to have the edge case of a parent and child being ++ // divided across the processor boundary. This necessitates the ++ // GhostBoundaryElement treatment. ++ auto partition = std::unique_ptr(new int[modified_smesh.GetNE()]); ++ srand(314159); ++ for (int i = 0; i < modified_smesh.GetNE(); ++i) ++ { ++ // Randomly assign to any processor but zero. ++ partition[i] = Mpi::WorldSize() > 1 ? 1 + rand() % (Mpi::WorldSize() - 1) : 0; ++ } ++ if (Mpi::WorldSize() > 0) ++ { ++ // Make sure on rank 1 there is a parent face with only ghost child ++ // faces. This can cause issues with higher order dofs being uncontrolled. ++ partition[refined_elem == 0 ? modified_smesh.GetNE() - 1 : 0] = 0; ++ } ++ ++ pmesh = CheckParMeshNBE(modified_smesh, partition); ++ pmesh->Finalize(); ++ pmesh->FinalizeTopology(); ++ pmesh->ExchangeFaceNbrData(); ++ ++ auto check_faces = [&]() ++ { ++ // repopulate the local to shared map. 
++ local_to_shared.clear(); ++ for (int i = 0; i < pmesh->GetNSharedFaces(); ++i) ++ { ++ local_to_shared[pmesh->GetSharedFace(i)] = i; ++ } ++ ++ // Count the number of internal faces via the boundary elements ++ num_internal = 0; ++ for (int n = 0; n < pmesh->GetNBE(); ++n) ++ { ++ int f, o; ++ pmesh->GetBdrElementFace(n, &f, &o); ++ if (CheckFaceInternal(*pmesh, f, local_to_shared)) ++ { ++ ++num_internal; ++ } ++ } ++ MPI_Allreduce(MPI_IN_PLACE, &num_internal, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); ++ CHECK(num_internal == 4); ++ ++ CAPTURE(refined_elem); ++ CheckPoisson(*pmesh, p, smesh.bdr_attributes.Max()); ++ CheckPoisson(*pmesh, p); ++ }; ++ ++ check_faces(); ++ pmesh->Rebalance(); ++ pmesh->ExchangeFaceNbrData(); ++ check_faces(); ++ } ++} ++ ++TEST_CASE("InteriorBoundaryInlineHexRefines", "[Parallel], [NCMesh]") ++{ ++ int p = GENERATE(1,2,3); ++ CAPTURE(p); ++ ++ auto smesh = Mesh("../../data/inline-hex.mesh"); ++ smesh.FinalizeTopology(); ++ smesh.Finalize(); ++ ++ // Mark even and odd elements with different attributes ++ for (int i = 0; i < smesh.GetNE(); ++i) ++ { ++ smesh.SetAttribute(i, (i % 2) + 1); ++ } ++ ++ smesh.SetAttributes(); ++ int initial_nbe = smesh.GetNBE(); ++ ++ // Introduce internal boundary elements ++ const int new_attribute = smesh.bdr_attributes.Max() + 1; ++ for (int f = 0; f < smesh.GetNumFaces(); ++f) ++ { ++ int e1, e2; ++ smesh.GetFaceElements(f, &e1, &e2); ++ if (e1 >= 0 && e2 >= 0 && smesh.GetAttribute(e1) != smesh.GetAttribute(e2)) ++ { ++ // This is the internal face between attributes. ++ auto *new_elem = smesh.GetFace(f)->Duplicate(&smesh); ++ new_elem->SetAttribute(new_attribute); ++ smesh.AddBdrElement(new_elem); ++ } ++ } ++ ++ smesh.FinalizeTopology(); // Finalize to build relevant tables ++ smesh.Finalize(); ++ ++ // Boundary elements must've been added to make the test valid ++ int num_internal_serial = smesh.GetNBE() - initial_nbe; ++ REQUIRE(num_internal_serial > 0); ++ ++ auto partition = std::unique_ptr(new int[smesh.GetNE()]); ++ ++ for (int i = 0; i < smesh.GetNE(); ++i) ++ { ++ partition[i] = i % Mpi::WorldSize(); // checkerboard partition ++ } ++ ++ auto pmesh = CheckParMeshNBE(smesh, partition); ++ ++ pmesh->FinalizeTopology(); ++ pmesh->Finalize(); ++ pmesh->ExchangeFaceNbrData(); ++ ++ std::map local_to_shared; ++ for (int i = 0; i < pmesh->GetNSharedFaces(); ++i) ++ { ++ local_to_shared[pmesh->GetSharedFace(i)] = i; ++ } ++ ++ // Count the number of internal faces via the boundary elements ++ int num_internal = 0; ++ for (int n = 0; n < pmesh->GetNBE(); ++n) ++ { ++ int f, o; ++ pmesh->GetBdrElementFace(n, &f, &o); ++ if (CheckFaceInternal(*pmesh, f, local_to_shared)) ++ { ++ ++num_internal; ++ } ++ } ++ ++ MPI_Allreduce(MPI_IN_PLACE, &num_internal, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); ++ ++ CHECK(num_internal == num_internal_serial); ++ ++ CheckPoisson(*pmesh, p, new_attribute); ++ CheckPoisson(*pmesh, p); ++ ++ // Mark every third element for refinement ++ Array elem_to_refine; ++ const int factor = 3; ++ for (int i = 0; i < smesh.GetNE()/factor; ++i) ++ { ++ elem_to_refine.Append(factor * i); ++ } ++ smesh.GeneralRefinement(elem_to_refine); ++ ++ pmesh = CheckParMeshNBE(smesh); ++ pmesh->FinalizeTopology(); ++ pmesh->Finalize(); ++ pmesh->ExchangeFaceNbrData(); ++ ++ // repopulate the local to shared map. 
++ local_to_shared.clear(); ++ for (int i = 0; i < pmesh->GetNSharedFaces(); ++i) ++ { ++ local_to_shared[pmesh->GetSharedFace(i)] = i; ++ } ++ ++ // Count the number of internal boundary elements ++ num_internal_serial = 0; ++ for (int n = 0; n < smesh.GetNBE(); ++n) ++ { ++ int f, o; ++ smesh.GetBdrElementFace(n, &f, &o); ++ int e1, e2; ++ smesh.GetFaceElements(f, &e1, &e2); ++ if (e1 >= 0 && e2 >= 0 && smesh.GetAttribute(e1) != smesh.GetAttribute(e2)) ++ { ++ ++num_internal_serial; ++ } ++ } ++ ++ num_internal = 0; ++ for (int n = 0; n < pmesh->GetNBE(); ++n) ++ { ++ int f, o; ++ pmesh->GetBdrElementFace(n, &f, &o); ++ if (CheckFaceInternal(*pmesh, f, local_to_shared)) ++ { ++ ++num_internal; ++ } ++ } ++ MPI_Allreduce(MPI_IN_PLACE, &num_internal, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); ++ ++ CHECK(num_internal == num_internal_serial); ++ ++ CheckPoisson(*pmesh, p, pmesh->bdr_attributes.Max()); ++ CheckPoisson(*pmesh, p); ++} ++ ++TEST_CASE("ParMeshInternalBoundaryStarMesh", "[Parallel], [NCMesh]") ++{ ++ auto smesh = StarMesh(); ++ smesh.EnsureNCMesh(true); ++ ++ if (Mpi::WorldSize() < 5) { return;} ++ ++ auto partition = std::unique_ptr(new int[5]); ++ for (int i = 0; i < 5; i++) ++ { ++ partition[i] = i; ++ } ++ auto pmesh = CheckParMeshNBE(smesh, partition); ++ pmesh->FinalizeTopology(); ++ pmesh->Finalize(); ++ pmesh->ExchangeFaceNbrData(); ++ ++ REQUIRE(pmesh->Nonconforming()); ++ ++ std::map local_to_shared; ++ for (int i = 0; i < pmesh->GetNSharedFaces(); ++i) ++ { ++ local_to_shared[pmesh->GetSharedFace(i)] = i; ++ } ++ ++ // Count the number of internal faces via the boundary elements ++ int num_internal = 0; ++ for (int n = 0; n < pmesh->GetNBE(); ++n) ++ { ++ int f, o; ++ pmesh->GetBdrElementFace(n, &f, &o); ++ if (CheckFaceInternal(*pmesh, f, local_to_shared)) ++ { ++ ++num_internal; ++ } ++ } ++ ++ const int rank = Mpi::WorldRank(); ++ SECTION("Unrefined") ++ { ++ MPI_Allreduce(MPI_IN_PLACE, &num_internal, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); ++ CHECK(num_internal == 4); ++ ++ CHECK(CountEssentialDof(*pmesh, 1, ++ smesh.bdr_attributes.Max()) == (rank == 0 ? 4 : 0)); ++ CHECK(CountEssentialDof(*pmesh, 2, ++ smesh.bdr_attributes.Max()) == (rank == 0 ? 4 + 6 : 0)); ++ CHECK(CountEssentialDof(*pmesh, 3, ++ smesh.bdr_attributes.Max()) == (rank == 0 ? 4 + 6*2 + 4*1 : 0)); ++ CHECK(CountEssentialDof(*pmesh, 4, ++ smesh.bdr_attributes.Max()) == (rank == 0 ? 4 + 6*3 + 4*3 : 0)); ++ ++ CHECK(CountEssentialDof(*pmesh, 1, ++ smesh.bdr_attributes.Max()) == (rank == 0 ? 6 : 0)); ++ CHECK(CountEssentialDof(*pmesh, 2, ++ smesh.bdr_attributes.Max()) == (rank == 0 ? 20 : 0)); ++ CHECK(CountEssentialDof(*pmesh, 3, ++ smesh.bdr_attributes.Max()) == (rank == 0 ? 42 : 0)); ++ CHECK(CountEssentialDof(*pmesh, 4, ++ smesh.bdr_attributes.Max()) == (rank == 0 ? 72 : 0)); ++ CHECK(pmesh->GetNBE() == (rank == 0 ? 4 : (rank < 5 ? 3 : 0))); ++ } ++ ++ SECTION("Refinement") ++ { ++ // Refining an element attached to the core should not change the number of essential ++ // DOF, or the owner of them. 
++ ++ const int refined_attribute = GENERATE(1,2,3,4,5); // equal to rank of owner + 1 ++ int ref_level = GENERATE(0, 1, 2, 3); ++ for (int r = 0; r < ref_level; r++) ++ { ++ Array el_to_refine; ++ for (int n = 0; n < pmesh->GetNE(); n++) ++ { ++ if (pmesh->GetAttribute(n) == refined_attribute) ++ { ++ el_to_refine.Append(n); ++ } ++ } ++ pmesh->GeneralRefinement(el_to_refine); ++ } ++ pmesh->ExchangeFaceNbrData(); ++ ++ CAPTURE(rank); ++ CAPTURE(refined_attribute); ++ CAPTURE(ref_level); ++ CHECK(pmesh->GetNE() == (rank == refined_attribute - 1 ? std::pow(8, ++ ref_level) : 1)); ++ CHECK(pmesh->GetNBE() == (rank == refined_attribute - 1 ++ ? std::pow(4, ref_level + 1) ++ : (ref_level == 0 && rank == 0 ? 4 : 3))); ++ ++ // Refining on only one side of the boundary face should not change the number of ++ // essential true dofs, which should match the number within the original face. ++ CHECK(ParCountEssentialDof(*pmesh, 1, ++ smesh.bdr_attributes.Max()) == 4); ++ CHECK(ParCountEssentialDof(*pmesh, 2, ++ smesh.bdr_attributes.Max()) == 4 + 6); ++ CHECK(ParCountEssentialDof(*pmesh, 3, ++ smesh.bdr_attributes.Max()) == 4 + 6*2 + 4*1); ++ CHECK(ParCountEssentialDof(*pmesh, 4, ++ smesh.bdr_attributes.Max()) == 4 + 6*3 + 4*3); ++ ++ CHECK(ParCountEssentialDof(*pmesh, 1, ++ smesh.bdr_attributes.Max()) == (rank == 0 ? 6 : 0)); ++ CHECK(ParCountEssentialDof(*pmesh, 2, ++ smesh.bdr_attributes.Max()) == (rank == 0 ? 20 : 0)); ++ CHECK(ParCountEssentialDof(*pmesh, 3, ++ smesh.bdr_attributes.Max()) == (rank == 0 ? 42 : 0)); ++ CHECK(ParCountEssentialDof(*pmesh, 4, ++ smesh.bdr_attributes.Max()) == (rank == 0 ? 72 : 0)); ++ } ++} ++ ++TEST_CASE("ParDividingPlaneMesh", "[Parallel], [NCMesh]") ++{ ++ auto RefineAttribute = [](Mesh& mesh, int attr, int ref_level) ++ { ++ for (int r = 0; r < ref_level; r++) ++ { ++ Array el_to_refine; ++ for (int n = 0; n < mesh.GetNE(); n++) ++ { ++ if (mesh.GetAttribute(n) == attr) ++ { ++ el_to_refine.Append(n); ++ } ++ } ++ mesh.GeneralRefinement(el_to_refine); ++ } ++ }; ++ ++ SECTION("Hex") ++ { ++ auto mesh = DividingPlaneMesh(false); ++ mesh.EnsureNCMesh(true); ++ ++ CHECK(mesh.GetNBE() == 2 * 5 + 1); ++ CHECK(mesh.GetNE() == 2); ++ ++ SECTION("H1Hex") ++ { ++ mesh.UniformRefinement(); ++ CHECK(CountEssentialDof(mesh, 1, ++ mesh.bdr_attributes.Max()) == 3*3); ++ CHECK(CountEssentialDof(mesh, 2, ++ mesh.bdr_attributes.Max()) == 5*5); ++ CHECK(CountEssentialDof(mesh, 3, ++ mesh.bdr_attributes.Max()) == 7*7); ++ CHECK(CountEssentialDof(mesh, 4, ++ mesh.bdr_attributes.Max()) == 9*9); ++ ++ auto attr = GENERATE(1,2); ++ auto ref_level = GENERATE(1,2); ++ RefineAttribute(mesh, attr, ref_level); ++ ++ CHECK(CountEssentialDof(mesh, 1, ++ mesh.bdr_attributes.Max()) == 3*3); ++ CHECK(CountEssentialDof(mesh, 2, ++ mesh.bdr_attributes.Max()) == 5*5); ++ CHECK(CountEssentialDof(mesh, 3, ++ mesh.bdr_attributes.Max()) == 7*7); ++ CHECK(CountEssentialDof(mesh, 4, ++ mesh.bdr_attributes.Max()) == 9*9); ++ } ++ } ++ ++ SECTION("Tet") ++ { ++ auto mesh = DividingPlaneMesh(true, true); ++ mesh.EnsureNCMesh(true); ++ auto pmesh = CheckParMeshNBE(mesh); ++ pmesh->FinalizeTopology(); ++ pmesh->Finalize(); ++ pmesh->ExchangeFaceNbrData(); ++ ++ CHECK(pmesh->bdr_attributes.Max() == mesh.bdr_attributes.Max()); ++ ++ auto attr = GENERATE(1,2); ++ auto ref_level = GENERATE(1,2); ++ CAPTURE(attr); ++ CAPTURE(ref_level); ++ ++ const int initial_num_vert = 4; ++ const int initial_num_edge = 5; ++ const int initial_num_face = 2; ++ SECTION("H1Tet") ++ { ++ CHECK(ParCountEssentialDof(*pmesh, 1, ++ 
mesh.bdr_attributes.Max()) == initial_num_vert); ++ CHECK(ParCountEssentialDof(*pmesh, 2, ++ mesh.bdr_attributes.Max()) == initial_num_vert + initial_num_edge); ++ CHECK(ParCountEssentialDof(*pmesh, 3, ++ mesh.bdr_attributes.Max()) == initial_num_vert + 2*initial_num_edge + ++ initial_num_face); ++ CHECK(ParCountEssentialDof(*pmesh, 4, ++ mesh.bdr_attributes.Max()) == initial_num_vert + 3*initial_num_edge + ++ 3*initial_num_face); ++ ++ RefineAttribute(*pmesh, attr, ref_level); ++ ++ CHECK(ParCountEssentialDof(*pmesh, 1, ++ mesh.bdr_attributes.Max()) == initial_num_vert); ++ CHECK(ParCountEssentialDof(*pmesh, 2, ++ mesh.bdr_attributes.Max()) == initial_num_vert + initial_num_edge); ++ CHECK(ParCountEssentialDof(*pmesh, 3, ++ mesh.bdr_attributes.Max()) == initial_num_vert + 2*initial_num_edge + ++ initial_num_face); ++ CHECK(ParCountEssentialDof(*pmesh, 4, ++ mesh.bdr_attributes.Max()) == initial_num_vert + 3*initial_num_edge + ++ 3*initial_num_face); ++ } ++ ++ SECTION("NDTet") ++ { ++ CHECK(ParCountEssentialDof(*pmesh, 1, ++ mesh.bdr_attributes.Max()) == 5); ++ CHECK(ParCountEssentialDof(*pmesh, 2, ++ mesh.bdr_attributes.Max()) == 14); ++ CHECK(ParCountEssentialDof(*pmesh, 3, ++ mesh.bdr_attributes.Max()) == 27); ++ CHECK(ParCountEssentialDof(*pmesh, 4, ++ mesh.bdr_attributes.Max()) == 44); ++ ++ RefineAttribute(*pmesh, attr, ref_level); ++ CHECK(ParCountEssentialDof(*pmesh, 1, ++ mesh.bdr_attributes.Max()) == 5); ++ CHECK(ParCountEssentialDof(*pmesh, 2, ++ mesh.bdr_attributes.Max()) == 14); ++ CHECK(ParCountEssentialDof(*pmesh, 3, ++ mesh.bdr_attributes.Max()) == 27); ++ CHECK(ParCountEssentialDof(*pmesh, 4, ++ mesh.bdr_attributes.Max()) == 44); ++ } ++ } ++} ++ ++TEST_CASE("ParTetFaceFlips", "[Parallel], [NCMesh]") ++{ ++ /* ++ 1. Define an ND space, and project a smooth non polynomial function onto the space. ++ 2. Compute y-z components in the face, and check that they are equal when evaluated ++ from either side of the face. Tangential continuity of the ND space should ensure ++ they are identical, if orientations are correctly accounted for. ++ 3. Mark the mesh as NC, build a new FESpace, and repeat. There should be no change as ++ the faces are "conformal" though they are within the NC structure. ++ 3. Partition the mesh, create a ParFESpace and repeat the above. There should be no ++ difference in conformal parallel. ++ 4. Construct the ParMesh from the NCMesh and repeat. As above, there should be no ++ change. ++ 5. Perform NC refinement on one side of the internal face, the number of conformal dof ++ in the face will not change, so there should also be no difference. This will be ++ complicated by ensuring the slave evaluations are at the same points. ++ */ ++ ++ auto orientation = GENERATE(1,3,5); ++ auto smesh = OrientedTriFaceMesh(orientation); ++ smesh.EnsureNodes(); ++ ++ CHECK(smesh.GetNBE() == 1); ++ ++ // A smooth function in each vector component ++ constexpr int order = 3, dim = 3, quadrature_order = 4; ++ constexpr double kappa = 2 * M_PI; ++ auto E_exact = [=](const Vector &x, Vector &E) ++ { ++ E(0) = cos(kappa * x(1)); ++ E(1) = cos(kappa * x(2)); ++ E(2) = cos(kappa * x(0)); ++ }; ++ VectorFunctionCoefficient E_coeff(dim, E_exact); ++ ++ // Helper for evaluating the ND grid function on either side of the first conformal shared face. ++ // Specific to the pair of tet mesh described above, but can be generalized. 
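++   // Tangential continuity of the H(curl) space means the y and z components must
++   // agree from both sides of the shared face, whose normal points in x.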
++ auto CheckParallelNDConformal = [&](ParMesh &mesh) ++ { ++ ND_FECollection fe_collection(order, dim); ++ ParFiniteElementSpace fe_space(&mesh, &fe_collection); ++ ParGridFunction E(&fe_space); ++ ++ E.ProjectCoefficient(E_coeff); ++ E.ExchangeFaceNbrData(); ++ ++ auto *P = fe_space.GetProlongationMatrix(); ++ if (P != nullptr) ++ { ++ // Projection does not respect the non-conformal constraints. ++ // Extract the true (conformal) and prolongate to get the NC respecting projection. ++ auto E_true = E.GetTrueVector(); ++ P->Mult(E_true, E); ++ E.ExchangeFaceNbrData(); ++ } ++ ParGridFunction * const coords = dynamic_cast ++ (mesh.GetNodes()); ++ ++ const auto &ir = IntRules.Get(Geometry::Type::TRIANGLE, quadrature_order); ++ IntegrationRule left_eir(ir.GetNPoints()), ++ right_eir(ir.GetNPoints()); // element integration rules ++ ++ for (int n = 0; n < mesh.GetNBE(); n++) ++ { ++ auto f = mesh.GetBdrElementFaceIndex(n); ++ ++ auto finfo = mesh.GetFaceInformation(f); ++ auto &face_element_transform = finfo.IsShared() ++ ? *mesh.GetSharedFaceTransformationsByLocalIndex(f, true) ++ : *mesh.GetFaceElementTransformations(f); ++ ++ face_element_transform.Loc1.Transform(ir, left_eir); ++ face_element_transform.Loc2.Transform(ir, right_eir); ++ ++ constexpr double tol = 1e-14; ++ REQUIRE(left_eir.GetNPoints() == ir.GetNPoints()); ++ REQUIRE(right_eir.GetNPoints() == ir.GetNPoints()); ++ Vector left_val, right_val; ++ for (int i = 0; i < ir.GetNPoints(); i++) ++ { ++ face_element_transform.Elem1->SetIntPoint(&left_eir[i]); ++ coords->GetVectorValue(*face_element_transform.Elem1, left_eir[i], left_val); ++ coords->GetVectorValue(*face_element_transform.Elem1, left_eir[i], right_val); ++ REQUIRE(std::abs(left_val(0) - right_val(0)) < tol); ++ REQUIRE(std::abs(left_val(1) - right_val(1)) < tol); ++ REQUIRE(std::abs(left_val(2) - right_val(2)) < tol); ++ E.GetVectorValue(*face_element_transform.Elem1, left_eir[i], left_val); ++ ++ face_element_transform.Elem2->SetIntPoint(&right_eir[i]); ++ E.GetVectorValue(*face_element_transform.Elem2, right_eir[i], right_val); ++ ++ // Check that the second and third rows agree. ++ // The y and z should agree as the normal is in the x direction ++ CHECK(std::abs(left_val(1) - right_val(1)) < tol); ++ CHECK(std::abs(left_val(2) - right_val(2)) < tol); ++ } ++ } ++ ++ return fe_space.GlobalTrueVSize(); ++ }; ++ ++ SECTION("Conformal") ++ { ++ auto partition_flag = GENERATE(false, true); ++ CAPTURE(partition_flag); ++ auto partition = std::unique_ptr(new int[2]); ++ if (Mpi::WorldSize() > 1) ++ { ++ partition[0] = partition_flag ? 0 : 1; partition[1] = partition_flag ? 1 : 0; ++ } ++ else ++ { ++ partition[0] = 0; partition[1] = 0; ++ } ++ auto pmesh = CheckParMeshNBE(smesh); ++ pmesh->Finalize(); ++ pmesh->ExchangeFaceNbrData(); ++ ++ CheckParallelNDConformal(*pmesh); ++ } ++ ++ SECTION("ConformalSerialUniformRefined") ++ { ++ smesh.UniformRefinement(); ++ auto pmesh = CheckParMeshNBE(smesh); ++ pmesh->Finalize(); ++ pmesh->ExchangeFaceNbrData(); ++ ++ CheckParallelNDConformal(*pmesh); ++ } ++ ++ SECTION("ConformalParallelUniformRefined") ++ { ++ auto partition_flag = GENERATE(false, true); ++ CAPTURE(partition_flag); ++ auto partition = std::unique_ptr(new int[2]); ++ if (Mpi::WorldSize() > 1) ++ { ++ partition[0] = partition_flag ? 0 : 1; partition[1] = partition_flag ? 
1 : 0; ++ } ++ else ++ { ++ partition[0] = 0; partition[1] = 0; ++ } ++ auto pmesh = CheckParMeshNBE(smesh); ++ pmesh->UniformRefinement(); ++ pmesh->Finalize(); ++ pmesh->ExchangeFaceNbrData(); ++ CheckParallelNDConformal(*pmesh); ++ } ++ ++ SECTION("Nonconformal") ++ { ++ auto partition_flag = GENERATE(false, true); ++ CAPTURE(partition_flag); ++ auto partition = std::unique_ptr(new int[2]); ++ if (Mpi::WorldSize() > 1) ++ { ++ partition[0] = partition_flag ? 0 : 1; partition[1] = partition_flag ? 1 : 0; ++ } ++ else ++ { ++ partition[0] = 0; partition[1] = 0; ++ } ++ smesh.EnsureNCMesh(true); ++ auto pmesh = CheckParMeshNBE(smesh); ++ pmesh->Finalize(); ++ pmesh->ExchangeFaceNbrData(); ++ ++ CheckParallelNDConformal(*pmesh); ++ } ++ ++ SECTION("NonconformalSerialUniformRefined") ++ { ++ smesh.UniformRefinement(); ++ smesh.EnsureNCMesh(true); ++ auto pmesh = CheckParMeshNBE(smesh); ++ pmesh->Finalize(); ++ pmesh->ExchangeFaceNbrData(); ++ ++ CheckParallelNDConformal(*pmesh); ++ } ++ ++ SECTION("NonconformalSerialRefined") ++ { ++ smesh.EnsureNCMesh(true); ++ int ref_level = GENERATE(1, 2); ++ for (int r = 0; r < ref_level; r++) ++ { ++ Array el_to_refine; ++ for (int n = 0; n < smesh.GetNE(); n++) ++ { ++ if (smesh.GetAttribute(n) == 2) ++ { ++ el_to_refine.Append(n); ++ } ++ } ++ smesh.GeneralRefinement(el_to_refine); ++ } ++ auto pmesh = CheckParMeshNBE(smesh); ++ pmesh->Finalize(); ++ pmesh->ExchangeFaceNbrData(); ++ ++ CheckParallelNDConformal(*pmesh); ++ } ++ ++ SECTION("NonconformalParallelUniformRefined") ++ { ++ auto partition_flag = GENERATE(false, true); ++ CAPTURE(partition_flag); ++ auto partition = std::unique_ptr(new int[2]); ++ if (Mpi::WorldSize() > 1) ++ { ++ partition[0] = partition_flag ? 0 : 1; partition[1] = partition_flag ? 1 : 0; ++ } ++ else ++ { ++ partition[0] = 0; partition[1] = 0; ++ } ++ smesh.EnsureNCMesh(true); ++ auto pmesh = CheckParMeshNBE(smesh); ++ pmesh->UniformRefinement(); ++ pmesh->Finalize(); ++ pmesh->ExchangeFaceNbrData(); ++ ++ CheckParallelNDConformal(*pmesh); ++ } ++ ++ SECTION("NonconformalParallelRefined") ++ { ++ auto partition_flag = GENERATE(false, true); ++ CAPTURE(partition_flag); ++ auto partition = std::unique_ptr(new int[2]); ++ if (Mpi::WorldSize() > 1) ++ { ++ partition[0] = partition_flag ? 0 : 1; partition[1] = partition_flag ? 1 : 0; ++ } ++ else ++ { ++ partition[0] = 0; partition[1] = 0; ++ } ++ smesh.EnsureNCMesh(true); ++ auto pmesh = CheckParMeshNBE(smesh); ++ int ref_level = GENERATE(1, 2); ++ for (int r = 0; r < ref_level; r++) ++ { ++ Array el_to_refine; ++ for (int n = 0; n < pmesh->GetNE(); n++) ++ { ++ if (pmesh->GetAttribute(n) == 2) ++ { ++ el_to_refine.Append(n); ++ } ++ } ++ pmesh->GeneralRefinement(el_to_refine); ++ } ++ pmesh->Finalize(); ++ pmesh->ExchangeFaceNbrData(); ++ ++ CheckParallelNDConformal(*pmesh); ++ } ++ ++ SECTION("NonconformalLevelTwoRefined") ++ { ++ smesh.EnsureNCMesh(true); ++ smesh.UniformRefinement(); ++ Array el_to_refine(1); ++ for (int n = 0; n < smesh.GetNE(); n++) ++ { ++ if (smesh.GetAttribute(n) == 2) ++ { ++ CAPTURE(n); ++ Mesh smesh2(smesh); ++ el_to_refine[0] = n; ++ smesh2.GeneralRefinement(el_to_refine); ++ for (int m = 0; m < smesh2.GetNE(); m++) ++ { ++ if (smesh2.GetAttribute(m) == 2) ++ { ++ CAPTURE(m); ++ Mesh smesh3(smesh2); ++ el_to_refine[0] = m; ++ smesh3.GeneralRefinement(el_to_refine); ++ CheckParallelNDConformal(*CheckParMeshNBE(smesh3)); ++ } ++ } ++ } ++ } ++ } ++ ++} ++ ++// Helper to check the identity PR = I on a ParFiniteElementSpace. 
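++// R restricts vdofs to true dofs and P prolongates true dofs back, so R*P must be
++// the identity on the true dofs, even on a nonconforming parallel space.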
++void CheckRPIdentity(const ParFiniteElementSpace& pfespace) ++{ ++ const SparseMatrix *R = pfespace.GetRestrictionMatrix(); ++ HypreParMatrix *P = pfespace.Dof_TrueDof_Matrix(); ++ ++ REQUIRE(R != nullptr); ++ REQUIRE(P != nullptr); ++ ++ HypreParMatrix *hR = new HypreParMatrix( ++ pfespace.GetComm(), pfespace.GlobalTrueVSize(), ++ pfespace.GlobalVSize(), pfespace.GetTrueDofOffsets(), ++ pfespace.GetDofOffsets(), ++ const_cast(R)); // Non owning so cast is ok ++ ++ REQUIRE(hR->Height() == P->Width()); ++ REQUIRE(hR->Width() == P->Height()); ++ ++ REQUIRE(hR != nullptr); ++ HypreParMatrix *I = ParMult(hR, P); ++ ++ // Square matrix so the "diag" is the only bit we need. ++ SparseMatrix diag; ++ I->GetDiag(diag); ++ for (int i = 0; i < diag.Height(); i++) ++ for (int j = 0; j < diag.Width(); j++) ++ { ++ // cast to const to force a zero return rather than an abort. ++ CHECK(const_cast(diag)(i, j) == (i == j ? 1.0 : 0.0)); ++ } ++ ++ delete hR; ++ delete I; ++} ++ ++TEST_CASE("Parallel RP=I", "[Parallel], [NCMesh]") ++{ ++ const int order = GENERATE(1, 2, 3); ++ CAPTURE(order); ++ const int dim = 3; ++ ++ SECTION("Hex") ++ { ++ // Split the hex into a pair, then isotropically refine one of them. ++ Mesh smesh("../../data/ref-cube.mesh"); ++ Array refinements(1); ++ refinements[0].index = 0; ++ refinements[0].ref_type = Refinement::X; ++ smesh.GeneralRefinement(refinements); ++ refinements[0].ref_type = Refinement::XYZ; ++ smesh.GeneralRefinement(refinements); ++ ParMesh mesh(MPI_COMM_WORLD, smesh); ++ SECTION("ND") ++ { ++ ND_FECollection fec(order, dim); ++ ParFiniteElementSpace fespace(&mesh, &fec); ++ CheckRPIdentity(fespace); ++ } ++ SECTION("H1") ++ { ++ H1_FECollection fec(order, dim); ++ ParFiniteElementSpace fespace(&mesh, &fec); ++ CheckRPIdentity(fespace); ++ } ++ } ++ ++ SECTION("Tet") ++ { ++ // Split the hex into a pair, then isotropically refine one of them. ++ Mesh smesh("../../data/ref-tetrahedron.mesh"); ++ Array refinements(1); ++ refinements[0].index = 0; ++ refinements[0].ref_type = Refinement::X; ++ smesh.GeneralRefinement(refinements); ++ bool use_nc = GENERATE(false, true); ++ smesh.EnsureNCMesh(use_nc); ++ refinements[0].ref_type = Refinement::XYZ; ++ smesh.GeneralRefinement(refinements); ++ smesh.EnsureNCMesh(true); // Always checking NC ++ ParMesh mesh(MPI_COMM_WORLD, smesh); ++ SECTION("ND") ++ { ++ ND_FECollection fec(order, dim); ++ ParFiniteElementSpace fespace(&mesh, &fec); ++ CheckRPIdentity(fespace); ++ } ++ SECTION("H1") ++ { ++ H1_FECollection fec(order, dim); ++ ParFiniteElementSpace fespace(&mesh, &fec); ++ CheckRPIdentity(fespace); ++ } ++ } ++} ++ ++#endif // MFEM_USE_MPI ++ ++TEST_CASE("ReferenceCubeInternalBoundaries", "[NCMesh]") ++{ ++ auto p = GENERATE(1,2,3); ++ CAPTURE(p); ++ ++ auto smesh = Mesh("../../data/ref-cube.mesh"); ++ ++ CheckPoisson(smesh, p); ++ ++ smesh.EnsureNCMesh(); ++ Array refs; ++ refs.Append(Refinement(0, Refinement::X)); ++ smesh.GeneralRefinement(refs); ++ ++ // Now have a pair of elements, make the second element a different ++ // attribute. ++ smesh.SetAttribute(1, 2); ++ ++ REQUIRE(smesh.GetNBE() == 2 * 5); ++ ++ delete smesh.ncmesh; ++ smesh.ncmesh = nullptr; ++ ++ // Introduce an internal boundary element ++ for (int f = 0; f < smesh.GetNumFaces(); ++f) ++ { ++ int e1, e2; ++ smesh.GetFaceElements(f, &e1, &e2); ++ if (e1 >= 0 && e2 >= 0 && smesh.GetAttribute(e1) != smesh.GetAttribute(e2)) ++ { ++ // This is the internal face between attributes. 
++ auto *new_elem = smesh.GetFace(f)->Duplicate(&smesh); ++ new_elem->SetAttribute(7); ++ smesh.AddBdrElement(new_elem); ++ } ++ } ++ ++ smesh.FinalizeTopology(); ++ smesh.Finalize(); ++ ++ // Exactly one boundary element must be added ++ CHECK(smesh.GetNBE() == 2 * 5 + 1); ++ ++ smesh.EnsureNCMesh(); ++ CHECK(smesh.GetNBE() == 2 * 5 + 1); ++ ++ int without_internal, with_internal; ++ with_internal = CheckPoisson(smesh, p); // Include the internal boundary ++ without_internal = CheckPoisson(smesh, p, ++ smesh.bdr_attributes.Max()); // Exclude the internal boundary ++ ++ switch (p) ++ { ++ case 1: ++ CHECK(with_internal == without_internal); break; ++ case 2: ++ CHECK(with_internal == without_internal + 1); break; ++ case 3: ++ CHECK(with_internal == without_internal + 4); break; ++ } ++ ++ auto ref_type = char(GENERATE(//Refinement::Y, Refinement::Z, Refinement::YZ, ++ Refinement::XYZ)); ++ ++ for (auto ref : {0,1}) ++ { ++ refs[0].index = ref; ++ ++ auto ssmesh = Mesh(smesh); ++ ++ CAPTURE(ref_type); ++ ++ // Now NC refine one of the attached elements, this should result in 2 ++ // internal boundary elements. ++ refs[0].ref_type = ref_type; ++ ++ ssmesh.GeneralRefinement(refs); ++ ++ // There should now be four internal boundary elements, where there was one ++ // before. ++ if (ref_type == 2 /* Y */ || ref_type == 4 /* Z */) ++ { ++ CHECK(ssmesh.GetNBE() == 5 /* external boundaries of unrefined element */ ++ + 2 /* internal boundaries */ ++ + (2 * 4) /* external boundaries of refined elements */); ++ } ++ else if (ref_type == 6) ++ { ++ CHECK(ssmesh.GetNBE() == 5 /* external boundaries of unrefined element */ ++ + 4 /* internal boundaries */ ++ + (4 * 3) /* external boundaries of refined elements */); ++ } ++ else if (ref_type == 7) ++ { ++ CHECK(ssmesh.GetNBE() == 5 /* external boundaries of unrefined element */ ++ + 4 /* internal boundaries */ ++ + (4 * 3 + 4 * 2) /* external boundaries of refined elements */); ++ } ++ else ++ { ++ MFEM_ABORT("!"); ++ } ++ ++ // Count the number of internal boundary elements ++ int num_internal = 0; ++ for (int n = 0; n < ssmesh.GetNBE(); ++n) ++ { ++ int f, o; ++ ssmesh.GetBdrElementFace(n, &f, &o); ++ int e1, e2; ++ ssmesh.GetFaceElements(f, &e1, &e2); ++ if (e1 >= 0 && e2 >= 0 && ssmesh.GetAttribute(e1) != ssmesh.GetAttribute(e2)) ++ { ++ ++num_internal; ++ } ++ } ++ CHECK(num_internal == (ref_type <= 4 ? 2 : 4)); ++ ++ ssmesh.FinalizeTopology(); ++ ssmesh.Finalize(); ++ ++ without_internal = CheckPoisson(ssmesh, p, ++ ssmesh.bdr_attributes.Max()); // Exclude the internal boundary ++ with_internal = CheckPoisson(ssmesh, p); // Include the internal boundary ++ ++ // All slaves dofs that are introduced on the face are constrained by ++ // the master dofs, thus the additional constraints on the internal ++ // boundary are purely on the master face, which matches the initial ++ // unrefined case. ++ switch (p) ++ { ++ case 1: ++ CHECK(with_internal == without_internal); break; ++ case 2: ++ CHECK(with_internal == without_internal + 1); break; ++ case 3: ++ CHECK(with_internal == without_internal + 4); break; ++ } ++ } ++} ++ ++TEST_CASE("RefinedCubesInternalBoundaries", "[NCMesh]") ++{ ++ auto p = GENERATE(1,2,3); ++ CAPTURE(p); ++ ++ auto smesh = Mesh("../../data/ref-cube.mesh"); ++ smesh.EnsureNCMesh(); ++ Array refs; ++ refs.Append(Refinement(0, Refinement::X)); ++ smesh.GeneralRefinement(refs); ++ ++ // Now have a pair of elements, make the second element a different ++ // attribute. 
++ smesh.SetAttribute(1, 2); ++ ++ REQUIRE(smesh.GetNBE() == 2 * 5); ++ ++ delete smesh.ncmesh; ++ smesh.ncmesh = nullptr; ++ ++ smesh.UniformRefinement(); ++ ++ // Introduce four internal boundary elements ++ for (int f = 0; f < smesh.GetNumFaces(); ++f) ++ { ++ int e1, e2; ++ smesh.GetFaceElements(f, &e1, &e2); ++ if (e1 >= 0 && e2 >= 0 && smesh.GetAttribute(e1) != smesh.GetAttribute(e2)) ++ { ++ // This is the internal face between attributes. ++ auto *new_elem = smesh.GetFace(f)->Duplicate(&smesh); ++ new_elem->SetAttribute(7); ++ smesh.AddBdrElement(new_elem); ++ } ++ } ++ ++ smesh.FinalizeTopology(); ++ smesh.Finalize(); ++ ++ // Exactly four boundary elements must be added ++ CHECK(smesh.GetNBE() == 2 * 5 * 4 + 4); ++ ++ smesh.EnsureNCMesh(); ++ CHECK(smesh.GetNBE() == 2 * 5 * 4 + 4); ++ ++ int without_internal = CheckPoisson(smesh, p, ++ 7); // Exclude the internal boundary ++ int with_internal = CheckPoisson(smesh, p); // Include the internal boundary ++ ++ switch (p) ++ { ++ case 1: ++ CHECK(with_internal == without_internal + 1); break; ++ case 2: ++ CHECK(with_internal == without_internal + 3 * 3); break; ++ case 3: ++ CHECK(with_internal == without_internal + 5 * 5); break; ++ } ++ ++ // Mark all elements on one side of the attribute boundary to refine ++ refs.DeleteAll(); ++ for (int n = 0; n < smesh.GetNE(); ++n) ++ { ++ if (smesh.GetAttribute(n) == 2) ++ { ++ refs.Append(Refinement{n, Refinement::XYZ}); ++ } ++ } ++ ++ smesh.GeneralRefinement(refs); ++ ++ smesh.FinalizeTopology(); ++ smesh.Finalize(); ++ ++ // There should now be 16 internal boundary elements, where there were 4 before ++ ++ CHECK(smesh.GetNBE() == 5 * 4 /* external boundaries of unrefined domain */ ++ + 4 * 4 /* internal boundaries */ ++ + 5 * 16 /* external boundaries of refined elements */); ++ ++ ++ // Count the number of internal boundary elements ++ int num_internal = 0; ++ for (int n = 0; n < smesh.GetNBE(); ++n) ++ { ++ int f, o; ++ smesh.GetBdrElementFace(n, &f, &o); ++ int e1, e2; ++ smesh.GetFaceElements(f, &e1, &e2); ++ if (e1 >= 0 && e2 >= 0 && smesh.GetAttribute(e1) != smesh.GetAttribute(e2)) ++ { ++ ++num_internal; ++ } ++ } ++ CHECK(num_internal == 16); ++ ++ ++ without_internal = CheckPoisson(smesh, p, ++ smesh.bdr_attributes.Max()); // Exclude the internal boundary ++ with_internal = CheckPoisson(smesh, p); // Include the internal boundary ++ ++ switch (p) ++ { ++ case 1: ++ CHECK(with_internal == without_internal + 1); break; ++ case 2: ++ CHECK(with_internal == without_internal + 3 * 3); break; ++ case 3: ++ CHECK(with_internal == without_internal + 5 * 5); break; ++ } ++} ++ ++TEST_CASE("ReferenceTetInternalBoundaries", "[NCMesh]") ++{ ++ auto p = GENERATE(1,2,3); ++ CAPTURE(p); ++ ++ auto smesh = Mesh("../../data/ref-tetrahedron.mesh"); ++ Array refs; ++ refs.Append(Refinement(0, Refinement::X)); ++ smesh.GeneralRefinement(refs); ++ ++ // Now have a pair of elements, make the second element a different ++ // attribute. ++ smesh.SetAttribute(1, 2); ++ ++ REQUIRE(smesh.GetNE() == 2); ++ REQUIRE(smesh.GetNBE() == 2 * 3); ++ ++ // Introduce an internal boundary element ++ for (int f = 0; f < smesh.GetNumFaces(); ++f) ++ { ++ int e1, e2; ++ smesh.GetFaceElements(f, &e1, &e2); ++ if (e1 >= 0 && e2 >= 0 && smesh.GetAttribute(e1) != smesh.GetAttribute(e2)) ++ { ++ // This is the internal face between attributes. 
++ auto *new_elem = smesh.GetFace(f)->Duplicate(&smesh); ++ new_elem->SetAttribute(5); ++ smesh.AddBdrElement(new_elem); ++ } ++ } ++ ++ // Exactly one boundary element must be added ++ CHECK(smesh.GetNBE() == 2 * 3 + 1); ++ ++ smesh.EnsureNCMesh(true); ++ ++ // Still exactly one boundary element must be added ++ CHECK(smesh.GetNBE() == 2 * 3 + 1); ++ ++ smesh.FinalizeTopology(); ++ smesh.Finalize(); ++ ++ auto without_internal = CheckPoisson(smesh, p, ++ 5); // Exclude the internal boundary ++ auto with_internal = CheckPoisson(smesh, p); // Include the internal boundary ++ ++ switch (p) ++ { ++ case 1: ++ CHECK(with_internal == without_internal); break; ++ case 2: ++ CHECK(with_internal == without_internal); break; ++ case 3: ++ CHECK(with_internal == without_internal + 1); break; ++ } ++ ++ // Now NC refine one of the attached elements, this should result in 2 ++ // internal boundary elements. ++ for (int ref : {0, 1}) ++ { ++ refs[0].index = ref; ++ refs[0].ref_type = Refinement::XYZ; ++ auto ssmesh = Mesh(smesh); ++ ssmesh.GeneralRefinement(refs); ++ ++ // There should now be four internal boundary elements, where there was one ++ // before. ++ CHECK(ssmesh.GetNBE() == 3 /* external boundaries of unrefined element */ ++ + 4 /* internal boundaries */ ++ + (3 * 4) /* external boundaries of refined element */); ++ ++ // Count the number of internal boundary elements ++ int num_internal = 0; ++ for (int n = 0; n < ssmesh.GetNBE(); ++n) ++ { ++ int f, o; ++ ssmesh.GetBdrElementFace(n, &f, &o); ++ int e1, e2; ++ ssmesh.GetFaceElements(f, &e1, &e2); ++ if (e1 >= 0 && e2 >= 0 && ssmesh.GetAttribute(e1) != ssmesh.GetAttribute(e2)) ++ { ++ ++num_internal; ++ } ++ } ++ CHECK(num_internal == 4); ++ ++ without_internal = CheckPoisson(ssmesh, p, 5); // Exclude the internal boundary ++ with_internal = CheckPoisson(ssmesh, p); // Include the internal boundary ++ ++ switch (p) ++ { ++ case 1: ++ CHECK(with_internal == without_internal); break; ++ case 2: ++ CHECK(with_internal == without_internal); break; ++ case 3: ++ CHECK(with_internal == without_internal + 1); break; ++ } ++ } ++} ++ ++TEST_CASE("RefinedTetsInternalBoundaries", "[NCMesh]") ++{ ++ auto p = GENERATE(1,2,3); ++ CAPTURE(p); ++ ++ auto smesh = Mesh("../../data/ref-tetrahedron.mesh"); ++ Array refs; ++ refs.Append(Refinement(0, Refinement::X)); ++ smesh.GeneralRefinement(refs); ++ ++ // Now have a pair of elements, make the second element a different ++ // attribute. ++ smesh.SetAttribute(1, 2); ++ ++ REQUIRE(smesh.GetNE() == 2); ++ REQUIRE(smesh.GetNBE() == 2 * 3); ++ ++ smesh.UniformRefinement(); ++ ++ CHECK(smesh.GetNBE() == 2 * 3 * 4); ++ ++ // Introduce internal boundary elements ++ for (int f = 0; f < smesh.GetNumFaces(); ++f) ++ { ++ int e1, e2; ++ smesh.GetFaceElements(f, &e1, &e2); ++ if (e1 >= 0 && e2 >= 0 && smesh.GetAttribute(e1) != smesh.GetAttribute(e2)) ++ { ++ // This is the internal face between attributes. 
++ auto *new_elem = smesh.GetFace(f)->Duplicate(&smesh); ++ new_elem->SetAttribute(5); ++ smesh.AddBdrElement(new_elem); ++ } ++ } ++ ++ // Exactly four boundary elements must be added ++ CHECK(smesh.GetNBE() == 2 * 3 * 4 + 4); ++ ++ smesh.EnsureNCMesh(true); ++ ++ // Still exactly one boundary element must be added ++ CHECK(smesh.GetNBE() == 2 * 3 * 4 + 4); ++ ++ smesh.FinalizeTopology(); ++ smesh.Finalize(); ++ ++ auto without_internal = CheckPoisson(smesh, p, ++ 5); // Exclude the internal boundary ++ auto with_internal = CheckPoisson(smesh, p); // Include the internal boundary ++ ++ switch (p) ++ { ++ case 1: ++ CHECK(with_internal == without_internal); break; ++ case 2: ++ CHECK(with_internal == without_internal + 3); break; ++ case 3: ++ CHECK(with_internal == without_internal + 10); break; ++ } ++ ++ // Now NC refine all elements with the 2 attribute. ++ ++ // Mark all elements on one side of the attribute boundary to refine ++ refs.DeleteAll(); ++ for (int n = 0; n < smesh.GetNE(); ++n) ++ { ++ if (smesh.GetAttribute(n) == 2) ++ { ++ refs.Append(Refinement{n, Refinement::XYZ}); ++ } ++ } ++ ++ smesh.GeneralRefinement(refs); ++ ++ // There should now be four internal boundary elements, where there was one ++ // before. ++ CHECK(smesh.GetNBE() == 3 * 4 /* external boundaries of unrefined elements */ ++ + 4 * 4 /* internal boundaries */ ++ + (3 * 4 * 4) /* external boundaries of refined elements */); ++ ++ // Count the number of internal boundary elements ++ int num_internal = 0; ++ for (int n = 0; n < smesh.GetNBE(); ++n) ++ { ++ int f, o; ++ smesh.GetBdrElementFace(n, &f, &o); ++ int e1, e2; ++ smesh.GetFaceElements(f, &e1, &e2); ++ if (e1 >= 0 && e2 >= 0 && smesh.GetAttribute(e1) != smesh.GetAttribute(e2)) ++ { ++ ++num_internal; ++ } ++ } ++ CHECK(num_internal == 4 * 4); ++ ++ without_internal = CheckPoisson(smesh, p, 5); // Exclude the internal boundary ++ with_internal = CheckPoisson(smesh, p); // Include the internal boundary ++ ++ switch (p) ++ { ++ case 1: ++ CHECK(with_internal == without_internal); break; ++ case 2: ++ CHECK(with_internal == without_internal + 3); break; ++ case 3: ++ CHECK(with_internal == without_internal + 10); break; ++ } ++} ++ ++TEST_CASE("PoissonOnReferenceCubeNC", "[NCMesh]") ++{ ++ auto smesh = Mesh("../../data/ref-cube.mesh"); ++ smesh.EnsureNCMesh(); ++ Array refs(1); ++ refs[0].index = 0; ++ refs[0].ref_type = Refinement::X; ++ smesh.GeneralRefinement(refs); ++ ++ // Now have two elements. ++ smesh.FinalizeTopology(); ++ smesh.Finalize(); ++ ++ auto p = GENERATE(1, 2, 3); ++ CAPTURE(p); ++ ++ // Check that Poisson can be solved on the domain ++ CheckPoisson(smesh, p); ++ ++ auto ref_type = char(GENERATE(Refinement::X, Refinement::Y, Refinement::Z, ++ Refinement::XY, Refinement::XZ, Refinement::YZ, ++ Refinement::XYZ)); ++ CAPTURE(ref_type); ++ for (auto refined_elem : {0}) // The left or the right element ++ { ++ refs[0].index = refined_elem; ++ auto ssmesh = Mesh(smesh); ++ ++ // Now NC refine one of the attached elements ++ refs[0].ref_type = ref_type; ++ ++ ssmesh.GeneralRefinement(refs); ++ ssmesh.FinalizeTopology(); ++ ssmesh.Finalize(); ++ ++ CAPTURE(refined_elem); ++ CheckPoisson(ssmesh, p); ++ } ++} ++ ++TEST_CASE("PoissonOnReferenceTetNC", "[NCMesh]") ++{ ++ auto smesh = Mesh("../../data/ref-tetrahedron.mesh"); ++ ++ auto p = GENERATE(1, 2, 3); ++ CAPTURE(p); ++ ++ CheckPoisson(smesh, p); ++ ++ Array refs(1); ++ refs[0].index = 0; ++ refs[0].ref_type = Refinement::X; ++ ++ smesh.GeneralRefinement(refs); ++ ++ // Now have two elements. 
++ smesh.FinalizeTopology(); ++ smesh.Finalize(); ++ ++ // Check that Poisson can be solved on the pair of tets ++ CheckPoisson(smesh, p); ++ ++ auto nc = GENERATE(false, true); ++ CAPTURE(nc); ++ ++ smesh.EnsureNCMesh(GENERATE(false, true)); ++ ++ for (auto refined_elem : {0, 1}) ++ { ++ auto ssmesh = Mesh(smesh); ++ ++ refs[0].index = refined_elem; ++ refs[0].ref_type = Refinement::XYZ; ++ ++ ssmesh.GeneralRefinement(refs); ++ ssmesh.FinalizeTopology(); ++ ssmesh.Finalize(); ++ ++ CAPTURE(refined_elem); ++ CheckPoisson(ssmesh, p); ++ } ++} ++ ++TEST_CASE("TetBoundaryRefinement", "[NCMesh]") ++{ ++ auto smesh = Mesh("../../data/ref-tetrahedron.mesh"); ++ ++ smesh.FinalizeTopology(); ++ smesh.Finalize(true); ++ smesh.UniformRefinement(); ++ ++ smesh.EnsureNCMesh(true); ++ ++ CHECK(smesh.GetNBE() == 4 * 4); ++ ++ // Loop over elements and mark for refinement if any vertices match the ++ // original ++ auto refine_corners = [&]() ++ { ++ Array vertices, elements; ++ // reference vertices of (0,0,0), (1,0,0), (0,1,0), (0,0,1) are [0,3] ++ auto original_vert = [](int i) { return i >= 0 && i <= 3; }; ++ for (int n = 0; n < smesh.GetNE(); ++n) ++ { ++ smesh.GetElementVertices(n, vertices); ++ if (std::any_of(vertices.begin(), vertices.end(), original_vert)) ++ { ++ elements.Append(n); ++ } ++ } ++ ++ smesh.GeneralRefinement(elements); ++ smesh.FinalizeTopology(); ++ smesh.Finalize(); ++ }; ++ ++ constexpr int max_ref_levels = 4; ++ for (int r = 0; r < max_ref_levels; r++) ++ { ++ refine_corners(); ++ CHECK(smesh.GetNBE() == 4 * (4 + 3 * 3 * (r + 1))); ++ } ++} ++ ++TEST_CASE("TetInternalBoundaryRefinement", "[NCMesh]") ++{ ++ auto smesh = Mesh("../../data/ref-tetrahedron.mesh"); ++ ++ REQUIRE(smesh.GetNBE() == 4); ++ ++ { ++ Array refs; ++ refs.Append(Refinement(0, Refinement::X)); ++ smesh.GeneralRefinement(refs); ++ } ++ ++ // Now have a pair of elements, make the second element a different ++ // attribute. ++ smesh.SetAttribute(0, 1); ++ smesh.SetAttribute(1, 2); ++ ++ // Introduce an internal boundary element ++ const int new_attribute = smesh.bdr_attributes.Max() + 1; ++ Array original_boundary_vertices; ++ for (int f = 0; f < smesh.GetNumFaces(); ++f) ++ { ++ int e1, e2; ++ smesh.GetFaceElements(f, &e1, &e2); ++ if (e1 >= 0 && e2 >= 0 && smesh.GetAttribute(e1) != smesh.GetAttribute(e2)) ++ { ++ // This is the internal face between attributes. 
++ auto *new_elem = smesh.GetFace(f)->Duplicate(&smesh); ++ new_elem->SetAttribute(new_attribute); ++ new_elem->GetVertices(original_boundary_vertices); ++ smesh.AddBdrElement(new_elem); ++ break; ++ } ++ } ++ ++ smesh.FinalizeTopology(); ++ smesh.Finalize(true); ++ smesh.UniformRefinement(); ++ smesh.EnsureNCMesh(true); ++ ++ CHECK(smesh.GetNBE() == (2*3 + 1) * 4); ++ ++ CHECK(CountEssentialDof(smesh, 1, ++ smesh.bdr_attributes.Max()) == 6); ++ CHECK(CountEssentialDof(smesh, 2, ++ smesh.bdr_attributes.Max()) == 6 + 3 * 3); ++ CHECK(CountEssentialDof(smesh, 3, ++ smesh.bdr_attributes.Max()) == 10 + 3 * 6); ++ ++ int refined_attribute = GENERATE(1,2); ++ int ref_level = GENERATE(1,2,3); ++ for (int r = 0; r < ref_level; r++) ++ { ++ Array el_to_refine; ++ for (int n = 0; n < smesh.GetNE(); n++) ++ { ++ if (smesh.GetAttribute(n) == refined_attribute) ++ { ++ el_to_refine.Append(n); ++ } ++ } ++ smesh.GeneralRefinement(el_to_refine); ++ } ++ ++ // Refining on only one side of the boundary face should not change the number of ++ // essential true dofs ++ CHECK(CountEssentialDof(smesh, 1, ++ smesh.bdr_attributes.Max()) == 6); ++ CHECK(CountEssentialDof(smesh, 2, ++ smesh.bdr_attributes.Max()) == 6 + 3 * 3); ++ CHECK(CountEssentialDof(smesh, 3, ++ smesh.bdr_attributes.Max()) == 10 + 3 * 6); ++ ++ // The number of boundary faces should have increased. ++ CHECK(smesh.GetNBE() == 3 * 4 + (3 + 1) * std::pow(4, 1+ref_level)); ++} ++ ++TEST_CASE("TetInternalBoundaryStarMesh", "[NCMesh]") ++{ ++ auto smesh = StarMesh(); ++ smesh.EnsureNCMesh(true); ++ ++ ++ SECTION("Unrefined") ++ { ++ CHECK(smesh.GetNBE() == 4 * 3 + 4); ++ CHECK(CountEssentialDof(smesh, 1, ++ smesh.bdr_attributes.Max()) == 4); ++ CHECK(CountEssentialDof(smesh, 2, ++ smesh.bdr_attributes.Max()) == 4 + 6); ++ CHECK(CountEssentialDof(smesh, 3, ++ smesh.bdr_attributes.Max()) == 4 + 6*2 + 4*1); ++ CHECK(CountEssentialDof(smesh, 4, ++ smesh.bdr_attributes.Max()) == 4 + 6*3 + 4*3); ++ ++ CHECK(CountEssentialDof(smesh, 1, ++ smesh.bdr_attributes.Max()) == 6); ++ CHECK(CountEssentialDof(smesh, 2, ++ smesh.bdr_attributes.Max()) == 20); ++ CHECK(CountEssentialDof(smesh, 3, ++ smesh.bdr_attributes.Max()) == 42); ++ CHECK(CountEssentialDof(smesh, 4, ++ smesh.bdr_attributes.Max()) == 72); ++ } ++ ++ SECTION("Refined") ++ { ++ int refined_attribute = GENERATE(1,2,3,4,5); ++ int ref_level = GENERATE(1,2,3); ++ for (int r = 0; r < ref_level; r++) ++ { ++ Array el_to_refine; ++ for (int n = 0; n < smesh.GetNE(); n++) ++ { ++ if (smesh.GetAttribute(n) == refined_attribute) ++ { ++ el_to_refine.Append(n); ++ } ++ } ++ smesh.GeneralRefinement(el_to_refine); ++ } ++ ++ // Refining on only one side of the boundary face should not change the number of ++ // essential true dofs ++ CHECK(CountEssentialDof(smesh, 1, ++ smesh.bdr_attributes.Max()) == 4); ++ CHECK(CountEssentialDof(smesh, 2, ++ smesh.bdr_attributes.Max()) == 4 + 6); ++ CHECK(CountEssentialDof(smesh, 3, ++ smesh.bdr_attributes.Max()) == 4 + 6*2 + 4*1); ++ CHECK(CountEssentialDof(smesh, 4, ++ smesh.bdr_attributes.Max()) == 4 + 6*3 + 4*3); ++ ++ CHECK(CountEssentialDof(smesh, 1, ++ smesh.bdr_attributes.Max()) == 6); ++ CHECK(CountEssentialDof(smesh, 2, ++ smesh.bdr_attributes.Max()) == 6 * 2 + 4 * 2); // 2 per edge, 2 per face ++ CHECK(CountEssentialDof(smesh, 3, ++ smesh.bdr_attributes.Max()) == 42); ++ CHECK(CountEssentialDof(smesh, 4, ++ smesh.bdr_attributes.Max()) == 72); ++ ++ // The number of boundary faces should have increased. 
++ CHECK(smesh.GetNBE() == 3 * 4 + 4 * std::pow(4,ref_level)); ++ } ++} ++ ++TEST_CASE("DividingPlaneMesh", "[NCMesh]") ++{ ++ auto RefineAttribute = [](Mesh& mesh, int attr, int ref_level) ++ { ++ for (int r = 0; r < ref_level; r++) ++ { ++ Array el_to_refine; ++ for (int n = 0; n < mesh.GetNE(); n++) ++ { ++ if (mesh.GetAttribute(n) == attr) ++ { ++ el_to_refine.Append(n); ++ } ++ } ++ mesh.GeneralRefinement(el_to_refine); ++ } ++ }; ++ ++ SECTION("Hex") ++ { ++ auto mesh = DividingPlaneMesh(false); ++ mesh.EnsureNCMesh(true); ++ ++ CHECK(mesh.GetNBE() == 2 * 5 + 1); ++ CHECK(mesh.GetNE() == 2); ++ ++ auto attr = GENERATE(1,2); ++ auto ref_level = GENERATE(1,2); ++ ++ const int num_vert = ref_level == 1 ? 5*5 : 9*9; ++ const int num_edge = ref_level == 1 ? 2*4*5 : 2*8*9; ++ const int num_face = ref_level == 1 ? 4*4 : 8*8; ++ ++ SECTION("H1Hex") ++ { ++ mesh.UniformRefinement(); ++ CHECK(CountEssentialDof(mesh, 1, ++ mesh.bdr_attributes.Max()) == 3*3); ++ CHECK(CountEssentialDof(mesh, 2, ++ mesh.bdr_attributes.Max()) == 5*5); ++ CHECK(CountEssentialDof(mesh, 3, ++ mesh.bdr_attributes.Max()) == 7*7); ++ CHECK(CountEssentialDof(mesh, 1, ++ mesh.bdr_attributes.Max()) == 3*3); ++ CHECK(CountEssentialDof(mesh, 2, ++ mesh.bdr_attributes.Max()) == 5*5); ++ CHECK(CountEssentialDof(mesh, 3, ++ mesh.bdr_attributes.Max()) == 7*7); ++ ++ RefineAttribute(mesh, attr, ref_level); ++ ++ CHECK(CountEssentialDof(mesh, 1, ++ mesh.bdr_attributes.Max()) == 3*3); ++ CHECK(CountEssentialDof(mesh, 2, ++ mesh.bdr_attributes.Max()) == 5*5); ++ CHECK(CountEssentialDof(mesh, 3, ++ mesh.bdr_attributes.Max()) == 7*7); ++ ++ // Add the slave face dofs, then subtract off the vertex dofs which are double ++ // counted due to being shared. ++ CHECK(CountEssentialDof(mesh, 1, ++ mesh.bdr_attributes.Max()) == 3*3 + num_vert - 3*3); ++ CHECK(CountEssentialDof(mesh, 2, ++ mesh.bdr_attributes.Max()) == 5*5 + num_vert + num_edge + num_face - 3*3); ++ CHECK(CountEssentialDof(mesh, 3, ++ mesh.bdr_attributes.Max()) == 7*7 + num_vert + 2*num_edge + 4*num_face - 3*3); ++ ++ } ++ ++ SECTION("NDHex") ++ { ++ CHECK(CountEssentialDof(mesh, 1, ++ mesh.bdr_attributes.Max()) == 4); ++ CHECK(CountEssentialDof(mesh, 2, ++ mesh.bdr_attributes.Max()) == 4*2 + 2*2); ++ CHECK(CountEssentialDof(mesh, 3, ++ mesh.bdr_attributes.Max()) == 4*3 + 2*2*3); ++ CHECK(CountEssentialDof(mesh, 1, ++ mesh.bdr_attributes.Max()) == 4); ++ CHECK(CountEssentialDof(mesh, 2, ++ mesh.bdr_attributes.Max()) == 4*2 + 2*2); ++ CHECK(CountEssentialDof(mesh, 3, ++ mesh.bdr_attributes.Max()) == 4*3 + 2*2*3); ++ ++ mesh.UniformRefinement(); ++ const int initial_num_edge = 12; ++ const int initial_num_face = 4; ++ CHECK(CountEssentialDof(mesh, 1, ++ mesh.bdr_attributes.Max()) == initial_num_edge); ++ CHECK(CountEssentialDof(mesh, 2, ++ mesh.bdr_attributes.Max()) == initial_num_edge*2 + initial_num_face*2*2); ++ CHECK(CountEssentialDof(mesh, 3, ++ mesh.bdr_attributes.Max()) == initial_num_edge*3 + initial_num_face*2*2*3); ++ CHECK(CountEssentialDof(mesh, 1, ++ mesh.bdr_attributes.Max()) == initial_num_edge); ++ CHECK(CountEssentialDof(mesh, 2, ++ mesh.bdr_attributes.Max()) == initial_num_edge*2 + initial_num_face*2*2); ++ CHECK(CountEssentialDof(mesh, 3, ++ mesh.bdr_attributes.Max()) == initial_num_edge*3 + initial_num_face*2*2*3); ++ ++ RefineAttribute(mesh, attr, ref_level); ++ CHECK(CountEssentialDof(mesh, 1, ++ mesh.bdr_attributes.Max()) == initial_num_edge); ++ CHECK(CountEssentialDof(mesh, 2, ++ mesh.bdr_attributes.Max()) == initial_num_edge*2 + initial_num_face*2*2); 
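   // Editorial aside (not part of the original patch): the expected counts in
   // these ND checks follow the Nedelec dof pattern on a quadrilateral surface
   // mesh: p dofs per edge and 2*p*(p-1) interior dofs per quad face (likewise,
   // the 3*3 / 5*5 / 7*7 values in the H1Hex section above are the (2*p + 1)^2
   // dofs of the once-refined 2x2 internal face). A hypothetical helper making
   // the ND arithmetic explicit:
   auto nd_surface_dofs = [](int p, int n_edge, int n_face)
   { return n_edge * p + n_face * 2 * p * (p - 1); };
   // e.g. the order-2 expectation just above expands to 12*2 + 4*4 = 40.
   CHECK(nd_surface_dofs(2, initial_num_edge, initial_num_face) == 40);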
++ CHECK(CountEssentialDof(mesh, 3, ++ mesh.bdr_attributes.Max()) == initial_num_edge*3 + initial_num_face*2*2*3); ++ CHECK(CountEssentialDof(mesh, 1, ++ mesh.bdr_attributes.Max()) == (num_edge+initial_num_edge)); ++ CHECK(CountEssentialDof(mesh, 2, ++ mesh.bdr_attributes.Max()) == (num_edge+initial_num_edge)*2 + ++ (num_face+initial_num_face)*2*2); ++ CHECK(CountEssentialDof(mesh, 3, ++ mesh.bdr_attributes.Max()) == (num_edge+initial_num_edge)*3 + ++ (num_face+initial_num_face)*2*2*3); ++ } ++ } ++ ++ SECTION("Tet") ++ { ++ auto mesh = DividingPlaneMesh(true); ++ mesh.EnsureNCMesh(true); ++ ++ CHECK(mesh.GetNBE() == 2 * 5 * 2 + 2); ++ CHECK(mesh.GetNE() == 2 * 6); ++ ++ auto attr = GENERATE(1,2); ++ auto ref_level = GENERATE(1,2); ++ CAPTURE(attr); ++ CAPTURE(ref_level); ++ ++ const int initial_num_vert = 4; ++ const int initial_num_edge = 5; ++ const int initial_num_face = 2; ++ ++ const int num_vert = ref_level == 1 ? 9 : 25; ++ const int num_edge = ref_level == 1 ? 16 : 56; ++ const int num_face = ref_level == 1 ? 8 : 32; ++ ++ SECTION("H1Tet") ++ { ++ CHECK(CountEssentialDof(mesh, 1, ++ mesh.bdr_attributes.Max()) == initial_num_vert); ++ CHECK(CountEssentialDof(mesh, 2, ++ mesh.bdr_attributes.Max()) == initial_num_vert + initial_num_edge); ++ CHECK(CountEssentialDof(mesh, 3, ++ mesh.bdr_attributes.Max()) == initial_num_vert + 2*initial_num_edge + ++ initial_num_face); ++ CHECK(CountEssentialDof(mesh, 4, ++ mesh.bdr_attributes.Max()) == initial_num_vert + 3*initial_num_edge + ++ 3*initial_num_face); ++ CHECK(CountEssentialDof(mesh, 1, ++ mesh.bdr_attributes.Max()) == initial_num_vert); ++ CHECK(CountEssentialDof(mesh, 2, ++ mesh.bdr_attributes.Max()) == initial_num_vert + initial_num_edge); ++ CHECK(CountEssentialDof(mesh, 3, ++ mesh.bdr_attributes.Max()) == initial_num_vert + 2*initial_num_edge + ++ initial_num_face); ++ CHECK(CountEssentialDof(mesh, 4, ++ mesh.bdr_attributes.Max()) == initial_num_vert + 3*initial_num_edge + ++ 3*initial_num_face); ++ ++ RefineAttribute(mesh, attr, ref_level); ++ ++ CHECK(CountEssentialDof(mesh, 1, ++ mesh.bdr_attributes.Max()) == initial_num_vert); ++ CHECK(CountEssentialDof(mesh, 2, ++ mesh.bdr_attributes.Max()) == initial_num_vert + initial_num_edge); ++ CHECK(CountEssentialDof(mesh, 3, ++ mesh.bdr_attributes.Max()) == initial_num_vert + 2*initial_num_edge + ++ initial_num_face); ++ CHECK(CountEssentialDof(mesh, 4, ++ mesh.bdr_attributes.Max()) == initial_num_vert + 3*initial_num_edge + ++ 3*initial_num_face); ++ CHECK(CountEssentialDof(mesh, 1, ++ mesh.bdr_attributes.Max()) == num_vert); ++ CHECK(CountEssentialDof(mesh, 2, ++ mesh.bdr_attributes.Max()) == num_vert + num_edge + initial_num_edge); ++ CHECK(CountEssentialDof(mesh, 3, ++ mesh.bdr_attributes.Max()) == num_vert + 2*num_edge + num_face + ++ 2*initial_num_edge + initial_num_face); ++ CHECK(CountEssentialDof(mesh, 4, ++ mesh.bdr_attributes.Max()) == num_vert + 3*num_edge + 3*num_face + ++ 3*initial_num_edge + 3*initial_num_face); ++ } ++ ++ SECTION("NDTet") ++ { ++ CHECK(CountEssentialDof(mesh, 1, ++ mesh.bdr_attributes.Max()) == 5); ++ CHECK(CountEssentialDof(mesh, 2, ++ mesh.bdr_attributes.Max()) == 14); ++ CHECK(CountEssentialDof(mesh, 3, ++ mesh.bdr_attributes.Max()) == 27); ++ CHECK(CountEssentialDof(mesh, 4, ++ mesh.bdr_attributes.Max()) == 44); ++ CHECK(CountEssentialDof(mesh, 1, ++ mesh.bdr_attributes.Max()) == 5); ++ CHECK(CountEssentialDof(mesh, 2, ++ mesh.bdr_attributes.Max()) == 14); ++ CHECK(CountEssentialDof(mesh, 3, ++ mesh.bdr_attributes.Max()) == 27); ++ 
CHECK(CountEssentialDof(mesh, 4, ++ mesh.bdr_attributes.Max()) == 44); ++ ++ RefineAttribute(mesh, attr, ref_level); ++ ++ CHECK(CountEssentialDof(mesh, 1, ++ mesh.bdr_attributes.Max()) == 5); ++ CHECK(CountEssentialDof(mesh, 2, ++ mesh.bdr_attributes.Max()) == 14); ++ CHECK(CountEssentialDof(mesh, 3, ++ mesh.bdr_attributes.Max()) == 27); ++ CHECK(CountEssentialDof(mesh, 4, ++ mesh.bdr_attributes.Max()) == 44); ++ ++ CHECK(CountEssentialDof(mesh, 1, ++ mesh.bdr_attributes.Max()) == 5 + num_edge); ++ CHECK(CountEssentialDof(mesh, 2, ++ mesh.bdr_attributes.Max()) == 14 + 2 * num_edge + 2*num_face); ++ CHECK(CountEssentialDof(mesh, 3, ++ mesh.bdr_attributes.Max()) == 27 + 3 * num_edge + 2*3*num_face); ++ } ++ } ++} ++ ++ ++TEST_CASE("TetFaceFlips", "[NCMesh]") ++{ ++ auto orientation = GENERATE(1,3,5); ++ CAPTURE(orientation); ++ auto smesh = OrientedTriFaceMesh(orientation, true); ++ ++ // A smooth function in each vector component ++ constexpr int order = 3, dim = 3, quadrature_order = 4; ++ constexpr double kappa = 2 * M_PI; ++ auto E_exact = [=](const Vector &x, Vector &E) ++ { ++ E(0) = cos(kappa * x(1)); ++ E(1) = cos(kappa * x(2)); ++ E(2) = cos(kappa * x(0)); ++ }; ++ VectorFunctionCoefficient E_coeff(dim, E_exact); ++ ++ auto CheckSerialNDConformal = [&](Mesh &mesh, int num_essential_tdof, ++ int num_essential_vdof) ++ { ++ ND_FECollection fe_collection(order, dim); ++ FiniteElementSpace fe_space(&mesh, &fe_collection); ++ GridFunction E(&fe_space); ++ ++ E.ProjectCoefficient(E_coeff); ++ ++ auto *P = fe_space.GetProlongationMatrix(); ++ if (P != nullptr) ++ { ++ // Projection does not respect the non-conformal constraints. ++ // Extract the true (conformal) and prolongate to get the NC respecting projection. ++ auto E_true = E.GetTrueVector(); ++ P->Mult(E_true, E); ++ } ++ mesh.EnsureNodes(); ++ GridFunction * const coords = mesh.GetNodes(); ++ ++ const auto &ir = IntRules.Get(Geometry::Type::TRIANGLE, quadrature_order); ++ IntegrationRule left_eir(ir.GetNPoints()), ++ right_eir(ir.GetNPoints()); // element integration rules ++ ++ Array bdr_attr_is_ess = mesh.bdr_attributes, tdof_list; ++ bdr_attr_is_ess = 0; ++ bdr_attr_is_ess.Last() = 1; ++ fe_space.GetEssentialTrueDofs(bdr_attr_is_ess, tdof_list); ++ ++ Array ess_vdof_marker, vdof_list; ++ fe_space.GetEssentialVDofs(bdr_attr_is_ess, ess_vdof_marker); ++ fe_space.MarkerToList(ess_vdof_marker, vdof_list); ++ ++ CHECK(num_essential_tdof == tdof_list.Size()); ++ if (num_essential_vdof != -1) ++ { ++ CHECK(num_essential_vdof == vdof_list.Size()); ++ } ++ ++ for (int n = 0; n < mesh.GetNBE(); n++) ++ { ++ // NOTE: only works for internal boundaries ++ if (bdr_attr_is_ess[mesh.GetBdrAttribute(n) - 1]) ++ { ++ auto f = mesh.GetBdrElementFaceIndex(n); ++ auto &face_element_transform = *mesh.GetFaceElementTransformations(f); ++ ++ if (face_element_transform.Elem2 == nullptr) ++ { ++ // not internal, nothing to check. 
++ continue; ++ } ++ ++ face_element_transform.Loc1.Transform(ir, left_eir); ++ face_element_transform.Loc2.Transform(ir, right_eir); ++ ++ constexpr double tol = 1e-14; ++ REQUIRE(left_eir.GetNPoints() == ir.GetNPoints()); ++ REQUIRE(right_eir.GetNPoints() == ir.GetNPoints()); ++ Vector left_val, right_val; ++ for (int i = 0; i < ir.GetNPoints(); i++) ++ { ++ face_element_transform.Elem1->SetIntPoint(&left_eir[i]); ++ coords->GetVectorValue(*face_element_transform.Elem1, left_eir[i], left_val); ++ coords->GetVectorValue(*face_element_transform.Elem1, left_eir[i], right_val); ++ REQUIRE(std::abs(left_val(0) - right_val(0)) < tol); ++ REQUIRE(std::abs(left_val(1) - right_val(1)) < tol); ++ REQUIRE(std::abs(left_val(2) - right_val(2)) < tol); ++ E.GetVectorValue(*face_element_transform.Elem1, left_eir[i], left_val); ++ ++ face_element_transform.Elem2->SetIntPoint(&right_eir[i]); ++ E.GetVectorValue(*face_element_transform.Elem2, right_eir[i], right_val); ++ ++ // Check that the second and third rows agree. ++ // The y and z should agree as the normal is in the x direction ++ CHECK(std::abs(left_val(1) - right_val(1)) < tol); ++ CHECK(std::abs(left_val(2) - right_val(2)) < tol); ++ } ++ } ++ } ++ }; ++ ++ SECTION("Conformal") ++ { ++ const int ntdof = 3*3 + 3*2; ++ const int nvdof = ntdof; ++ CheckSerialNDConformal(smesh, ntdof, nvdof); ++ } ++ ++ SECTION("Nonconformal") ++ { ++ smesh.EnsureNCMesh(true); ++ const int ntdof = 3*3 + 3*2; ++ const int nvdof = ntdof; ++ CheckSerialNDConformal(smesh, ntdof, nvdof); ++ } ++ ++ SECTION("ConformalUniformRefined") ++ { ++ smesh.UniformRefinement(); ++ const int ntdof = 9*3 + 4*3*2; ++ const int nvdof = ntdof; ++ CheckSerialNDConformal(smesh, ntdof, nvdof); ++ } ++ ++ SECTION("NonconformalUniformRefined") ++ { ++ smesh.EnsureNCMesh(true); ++ smesh.UniformRefinement(); ++ const int ntdof = 9*3 + 4*3*2; ++ const int nvdof = ntdof; ++ CheckSerialNDConformal(smesh, ntdof, nvdof); ++ } ++ ++ SECTION("NonconformalRefined") ++ { ++ smesh.EnsureNCMesh(true); ++ int ref_level = GENERATE(1, 2); ++ CAPTURE(ref_level); ++ for (int r = 0; r < ref_level; r++) ++ { ++ Array el_to_refine; ++ for (int n = 0; n < smesh.GetNE(); n++) ++ { ++ if (smesh.GetAttribute(n) == 2) ++ { ++ el_to_refine.Append(n); ++ } ++ } ++ smesh.GeneralRefinement(el_to_refine); ++ } ++ const int ntdof = 3*3 + 3*2; ++ const int nvdof = ntdof + (ref_level == 1 ? 
9*3 + 4*3*2 : 30*3 + 16*3*2); ++ CheckSerialNDConformal(smesh, ntdof, nvdof); ++ } ++ ++ SECTION("NonconformalLevelTwoRefined") ++ { ++ smesh.EnsureNCMesh(true); ++ Array el_to_refine; ++ ++ smesh.UniformRefinement(); ++ ++ const int ntdof = 9*3 + 4*3*2; ++ el_to_refine.SetSize(1); ++ ++ auto n = GENERATE(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15); ++ auto m = GENERATE(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22); ++ ++ if (n < smesh.GetNE() && smesh.GetAttribute(n) == 2) ++ { ++ el_to_refine[0] = n; ++ CAPTURE(n); ++ smesh.GeneralRefinement(el_to_refine); ++ CheckSerialNDConformal(smesh, ntdof, -1); ++ ++ if (smesh.GetAttribute(m) == 2) ++ { ++ el_to_refine[0] = m; ++ CAPTURE(m); ++ smesh.GeneralRefinement(el_to_refine); ++ CheckSerialNDConformal(smesh, ntdof, -1); ++ } ++ } ++ ++ } ++} ++ ++TEST_CASE("RP=I", "[NCMesh]") ++{ ++ auto CheckFESpace = [](const FiniteElementSpace& fespace) ++ { ++ auto * const R = fespace.GetConformingRestriction(); ++ auto * const P = fespace.GetConformingProlongation(); ++ ++ REQUIRE(R != nullptr); ++ REQUIRE(P != nullptr); ++ ++ // Vector notation ++ Vector e_i(R->Height()), e_j(P->Width()); ++ Vector Rrow(R->Width()), Pcol(P->Height()); ++ for (int i = 0; i < R->Height(); i++) ++ { ++ e_i = 0.0; ++ e_i(i) = 1.0; ++ R->MultTranspose(e_i, Rrow); ++ for (int j = 0; j < P->Width(); j++) ++ { ++ e_j = 0.0; ++ e_j(j) = 1.0; ++ P->Mult(e_j, Pcol); ++ ++ CHECK(Rrow * Pcol == (i == j ? 1.0 : 0.0)); ++ } ++ } ++ ++ // Index notation ++ CHECK(R->Height() == P->Width()); ++ CHECK(R->Width() == P->Height()); ++ for (int i = 0; i < R->Height(); i++) ++ for (int j = 0; j < P->Width(); j++) ++ { ++ double dot = 0.0; ++ for (int k = 0; k < R->Width(); k++) ++ { ++ dot += (*R)(i,k)*(*P)(k,j); ++ } ++ CHECK(dot == (i == j ? 1.0 : 0.0)); ++ } ++ }; ++ ++ SECTION("Hex") ++ { ++ const int dim = 3; ++ const int order = GENERATE(1, 2); ++ // Split the hex into a pair, then isotropically refine one of them. ++ Mesh mesh("../../data/ref-cube.mesh"); ++ Array refinements(1); ++ refinements[0].index = 0; ++ refinements[0].ref_type = Refinement::X; ++ mesh.GeneralRefinement(refinements); ++ refinements[0].ref_type = Refinement::XYZ; ++ mesh.GeneralRefinement(refinements); ++ SECTION("ND") ++ { ++ ND_FECollection fec(order, dim); ++ FiniteElementSpace fespace(&mesh, &fec); ++ CheckFESpace(fespace); ++ } ++ SECTION("H1") ++ { ++ H1_FECollection fec(order, dim); ++ FiniteElementSpace fespace(&mesh, &fec); ++ CheckFESpace(fespace); ++ } ++ } ++ ++ SECTION("Tet") ++ { ++ const int dim = 3; ++ const int order = GENERATE(1, 2); ++ // Split the hex into a pair, then isotropically refine one of them. 
++ Mesh mesh("../../data/ref-tetrahedron.mesh"); ++ Array refinements(1); ++ refinements[0].index = 0; ++ refinements[0].ref_type = Refinement::X; ++ mesh.GeneralRefinement(refinements); ++ mesh.EnsureNCMesh(true); ++ refinements[0].ref_type = Refinement::XYZ; ++ mesh.GeneralRefinement(refinements); ++ SECTION("ND") ++ { ++ ND_FECollection fec(order, dim); ++ FiniteElementSpace fespace(&mesh, &fec); ++ CheckFESpace(fespace); ++ } ++ SECTION("H1") ++ { ++ H1_FECollection fec(order, dim); ++ FiniteElementSpace fespace(&mesh, &fec); ++ CheckFESpace(fespace); ++ } ++ } ++} + + } // namespace mfem diff --git a/extern/patch/mfem/patch_par_tet_mesh_fix.diff b/extern/patch/mfem/patch_par_tet_mesh_fix.diff index 3610a9c051..958537ea0b 100644 --- a/extern/patch/mfem/patch_par_tet_mesh_fix.diff +++ b/extern/patch/mfem/patch_par_tet_mesh_fix.diff @@ -1,829 +1,829 @@ -diff --git a/mesh/element.hpp b/mesh/element.hpp -index c37205eb1..f1b003cae 100644 ---- a/mesh/element.hpp -+++ b/mesh/element.hpp -@@ -84,9 +84,6 @@ public: - - virtual const int *GetFaceVertices(int fi) const = 0; - -- /// Mark the longest edge by assuming/changing the order of the vertices. -- virtual void MarkEdge(const DSTable &v_to_v, const int *length) {} -- - /// Return 1 if the element needs refinement in order to get conforming mesh. - virtual int NeedRefinement(HashTable &v_to_v) const { return 0; } - -diff --git a/mesh/mesh.cpp b/mesh/mesh.cpp -index f76104da3..5e96c3f39 100644 ---- a/mesh/mesh.cpp -+++ b/mesh/mesh.cpp -@@ -489,6 +489,7 @@ void Mesh::GetBdrElementTransformation(int i, IsoparametricTransformation* ElTr) - Nodes->FESpace()->GetTraceElement(elem_id, face_geom); - MFEM_VERIFY(dynamic_cast(face_el), - "Mesh requires nodal Finite Element."); -+ - IntegrationRule eir(face_el->GetDof()); - FaceElemTr.Loc1.Transf.ElementNo = elem_id; - FaceElemTr.Loc1.Transf.mesh = this; -@@ -1941,7 +1942,7 @@ void Mesh::FinalizeTriMesh(int generate_edges, int refine, bool fix_orientation) - - if (refine) - { -- MarkTriMeshForRefinement(); -+ MarkForRefinement(); - } - - if (generate_edges) -@@ -2395,82 +2396,110 @@ void Mesh::ReorderElements(const Array &ordering, bool reorder_vertices) - } - - -+void Mesh::GetEdgeLengths(const DSTable &v_to_v, Array &lengths) const -+{ -+ auto GetLength = [this](int i, int j) -+ { -+ double length = 0.; -+ if (Nodes == NULL) -+ { -+ const double *vi = vertices[i](); -+ const double *vj = vertices[j](); -+ for (int k = 0; k < spaceDim; k++) -+ { -+ length += (vi[k]-vj[k])*(vi[k]-vj[k]); -+ } -+ } -+ else -+ { -+ Array ivdofs, jvdofs; -+ Nodes->FESpace()->GetVertexVDofs(i, ivdofs); -+ Nodes->FESpace()->GetVertexVDofs(j, jvdofs); -+ for (int k = 0; k < ivdofs.Size(); k++) -+ { -+ length += ((*Nodes)(ivdofs[k])-(*Nodes)(jvdofs[k]))* -+ ((*Nodes)(ivdofs[k])-(*Nodes)(jvdofs[k])); -+ } -+ } -+ return length; -+ }; -+ -+ lengths.SetSize(NumOfEdges); -+ for (int i = 0; i < NumOfVertices; i++) -+ { -+ for (DSTable::RowIterator it(v_to_v, i); !it; ++it) -+ { -+ int j = it.Index(); -+ lengths[j] = GetLength(i, it.Column()); -+ } -+ } -+}; -+ - void Mesh::MarkForRefinement() - { - if (meshgen & 1) - { -+ DSTable v_to_v(NumOfVertices); -+ GetVertexToVertexTable(v_to_v); -+ NumOfEdges = v_to_v.NumberOfEntries(); - if (Dim == 2) - { -- MarkTriMeshForRefinement(); -+ MarkTriMeshForRefinement(v_to_v); - } - else if (Dim == 3) - { -- DSTable v_to_v(NumOfVertices); -- GetVertexToVertexTable(v_to_v); - MarkTetMeshForRefinement(v_to_v); - } - } - } - --void Mesh::MarkTriMeshForRefinement() -+void 
Mesh::MarkTriMeshForRefinement(const DSTable &v_to_v) - { - // Mark the longest triangle edge by rotating the indices so that - // vertex 0 - vertex 1 is the longest edge in the triangle. -- DenseMatrix pmat; -- for (int i = 0; i < NumOfElements; i++) -- { -- if (elements[i]->GetType() == Element::TRIANGLE) -- { -- GetPointMatrix(i, pmat); -- static_cast(elements[i])->MarkEdge(pmat); -- } -- } --} -+ Array lengths; -+ GetEdgeLengths(v_to_v, lengths); - --void Mesh::GetEdgeOrdering(const DSTable &v_to_v, Array &order) --{ -- NumOfEdges = v_to_v.NumberOfEntries(); -- order.SetSize(NumOfEdges); -- Array > length_idx(NumOfEdges); -+ Array idx(NumOfEdges); -+ for (int i = 0; i < NumOfEdges; i++) { idx[i] = i; } - -- for (int i = 0; i < NumOfVertices; i++) -+ for (int i = 0; i < NumOfElements; i++) - { -- for (DSTable::RowIterator it(v_to_v, i); !it; ++it) -+ if (elements[i]->GetType() == Element::TRIANGLE) - { -- int j = it.Index(); -- length_idx[j].one = GetLength(i, it.Column()); -- length_idx[j].two = j; -+ MFEM_ASSERT(dynamic_cast(elements[i]), -+ "Unexpected non-Triangle element type"); -+ static_cast(elements[i])->MarkEdge(v_to_v, lengths, idx); - } - } -- -- // Sort by increasing edge-length. -- length_idx.Sort(); -- -- for (int i = 0; i < NumOfEdges; i++) -- { -- order[length_idx[i].two] = i; -- } - } - - void Mesh::MarkTetMeshForRefinement(const DSTable &v_to_v) - { - // Mark the longest tetrahedral edge by rotating the indices so that - // vertex 0 - vertex 1 is the longest edge in the element. -- Array order; -- GetEdgeOrdering(v_to_v, order); -+ Array lengths; -+ GetEdgeLengths(v_to_v, lengths); -+ -+ Array idx(NumOfEdges); -+ for (int i = 0; i < NumOfEdges; i++) { idx[i] = i; } - - for (int i = 0; i < NumOfElements; i++) - { - if (elements[i]->GetType() == Element::TETRAHEDRON) - { -- elements[i]->MarkEdge(v_to_v, order); -+ MFEM_ASSERT(dynamic_cast(elements[i]), -+ "Unexpected non-Tetrahedron element type"); -+ static_cast(elements[i])->MarkEdge(v_to_v, lengths, idx); - } - } - for (int i = 0; i < NumOfBdrElements; i++) - { - if (boundary[i]->GetType() == Element::TRIANGLE) - { -- boundary[i]->MarkEdge(v_to_v, order); -+ MFEM_ASSERT(dynamic_cast(boundary[i]), -+ "Unexpected non-Triangle element type"); -+ static_cast(boundary[i])->MarkEdge(v_to_v, lengths, idx); - } - } - } -@@ -2820,9 +2849,7 @@ void Mesh::FinalizeTetMesh(int generate_edges, int refine, bool fix_orientation) - - if (refine) - { -- DSTable v_to_v(NumOfVertices); -- GetVertexToVertexTable(v_to_v); -- MarkTetMeshForRefinement(v_to_v); -+ MarkForRefinement(); - } - - GetElementToFaceTable(); -@@ -3089,8 +3116,7 @@ void Mesh::Finalize(bool refine, bool fix_orientation) - // only perform it when Dim == spaceDim. - if (Dim >= 2 && Dim == spaceDim) - { -- const int num_faces = GetNumFaces(); -- for (int i = 0; i < num_faces; i++) -+ for (int i = 0; i < GetNumFaces(); i++) - { - MFEM_VERIFY(faces_info[i].Elem2No < 0 || - faces_info[i].Elem2Inf%2 != 0, "Invalid mesh topology." -@@ -3527,8 +3553,6 @@ void Mesh::Make2D(int nx, int ny, Element::Type type, - boundary[2*nx+j] = new Segment((j+1)*m, j*m, 4); - boundary[2*nx+ny+j] = new Segment(j*m+nx, (j+1)*m+nx, 2); - } -- -- // MarkTriMeshForRefinement(); // done in Finalize(...) 
- } - else - { -@@ -5699,37 +5723,21 @@ static const char *fixed_or_not[] = { "fixed", "NOT FIXED" }; - - int Mesh::CheckElementOrientation(bool fix_it) - { -- int i, j, k, wo = 0, fo = 0; -- double *v[4]; -+ int wo = 0, fo = 0; - - if (Dim == 2 && spaceDim == 2) - { - DenseMatrix J(2, 2); - -- for (i = 0; i < NumOfElements; i++) -+ for (int i = 0; i < NumOfElements; i++) - { -- int *vi = elements[i]->GetVertices(); -- if (Nodes == NULL) -- { -- for (j = 0; j < 3; j++) -- { -- v[j] = vertices[vi[j]](); -- } -- for (j = 0; j < 2; j++) -- for (k = 0; k < 2; k++) -- { -- J(j, k) = v[j+1][k] - v[0][k]; -- } -- } -- else -- { -- // only check the Jacobian at the center of the element -- GetElementJacobian(i, J); -- } -+ // only check the Jacobian at the center of the element -+ GetElementJacobian(i, J); - if (J.Det() < 0.0) - { - if (fix_it) - { -+ int *vi = elements[i]->GetVertices(); - switch (GetElementType(i)) - { - case Element::TRIANGLE: -@@ -5749,88 +5757,41 @@ int Mesh::CheckElementOrientation(bool fix_it) - } - } - } -- -- if (Dim == 3) -+ else if (Dim == 3) - { - DenseMatrix J(3, 3); - -- for (i = 0; i < NumOfElements; i++) -+ for (int i = 0; i < NumOfElements; i++) - { -- int *vi = elements[i]->GetVertices(); -- switch (GetElementType(i)) -+ // only check the Jacobian at the center of the element -+ GetElementJacobian(i, J); -+ if (J.Det() < 0.0) - { -- case Element::TETRAHEDRON: -- if (Nodes == NULL) -- { -- for (j = 0; j < 4; j++) -- { -- v[j] = vertices[vi[j]](); -- } -- for (j = 0; j < 3; j++) -- for (k = 0; k < 3; k++) -- { -- J(j, k) = v[j+1][k] - v[0][k]; -- } -- } -- else -- { -- // only check the Jacobian at the center of the element -- GetElementJacobian(i, J); -- } -- if (J.Det() < 0.0) -+ if (fix_it) -+ { -+ int *vi = elements[i]->GetVertices(); -+ switch (GetElementType(i)) - { -- wo++; -- if (fix_it) -- { -+ case Element::TETRAHEDRON: - mfem::Swap(vi[0], vi[1]); - fo++; -- } -- } -- break; -- -- case Element::WEDGE: -- // only check the Jacobian at the center of the element -- GetElementJacobian(i, J); -- if (J.Det() < 0.0) -- { -- wo++; -- if (fix_it) -- { -+ break; -+ case Element::WEDGE: - // how? -- } -- } -- break; -- -- case Element::PYRAMID: -- // only check the Jacobian at the center of the element -- GetElementJacobian(i, J); -- if (J.Det() < 0.0) -- { -- wo++; -- if (fix_it) -- { -+ break; -+ case Element::PYRAMID: - // how? -- } -- } -- break; -- -- case Element::HEXAHEDRON: -- // only check the Jacobian at the center of the element -- GetElementJacobian(i, J); -- if (J.Det() < 0.0) -- { -- wo++; -- if (fix_it) -- { -+ break; -+ case Element::HEXAHEDRON: - // how? 
-- } -+ break; -+ default: -+ MFEM_ABORT("Invalid 3D element type \"" -+ << GetElementType(i) << "\""); -+ break; - } -- break; -- -- default: -- MFEM_ABORT("Invalid 3D element type \"" -- << GetElementType(i) << "\""); -- break; -+ } -+ wo++; - } - } - } -@@ -6756,24 +6717,12 @@ void Mesh::GetBdrPointMatrix(int i,DenseMatrix &pointmat) const - - pointmat.SetSize(spaceDim, nv); - for (k = 0; k < spaceDim; k++) -+ { - for (j = 0; j < nv; j++) - { - pointmat(k, j) = vertices[v[j]](k); - } --} -- --double Mesh::GetLength(int i, int j) const --{ -- const double *vi = vertices[i](); -- const double *vj = vertices[j](); -- double length = 0.; -- -- for (int k = 0; k < spaceDim; k++) -- { -- length += (vi[k]-vj[k])*(vi[k]-vj[k]); - } -- -- return sqrt(length); - } - - // static method -diff --git a/mesh/mesh.hpp b/mesh/mesh.hpp -index 8be58b232..b9c5538c3 100644 ---- a/mesh/mesh.hpp -+++ b/mesh/mesh.hpp -@@ -358,12 +358,9 @@ protected: - /** Also, initializes #mesh_geoms. */ - void SetMeshGen(); - -- /// Return the length of the segment from node i to node j. -- double GetLength(int i, int j) const; -- -+ void GetEdgeLengths(const DSTable &v_to_v, Array &lengths) const; - void MarkForRefinement(); -- void MarkTriMeshForRefinement(); -- void GetEdgeOrdering(const DSTable &v_to_v, Array &order); -+ void MarkTriMeshForRefinement(const DSTable &v_to_v); - virtual void MarkTetMeshForRefinement(const DSTable &v_to_v); - - // Methods used to prepare and apply permutation of the mesh nodes assuming -diff --git a/mesh/pmesh.cpp b/mesh/pmesh.cpp -index 28b8a1bf8..26e2f4655 100644 ---- a/mesh/pmesh.cpp -+++ b/mesh/pmesh.cpp -@@ -20,6 +20,7 @@ - #include "../general/text.hpp" - #include "../general/globals.hpp" - -+#include - #include - #include - -@@ -762,8 +763,10 @@ void ParMesh::BuildSharedFaceElems(int ntri_faces, int nquad_faces, - sface_lface[stria_counter] = lface; - if (meshgen == 1) // Tet-only mesh - { -- Tetrahedron *tet = dynamic_cast -- (elements[faces_info[lface].Elem1No]); -+ Element *elem = elements[faces_info[lface].Elem1No]; -+ MFEM_ASSERT(dynamic_cast(elem), -+ "Unexpected non-Tetrahedron element") -+ auto *tet = static_cast(elem); - // mark the shared face for refinement by reorienting - // it according to the refinement flag in the tetrahedron - // to which this shared face belongs to. 
-@@ -1739,97 +1742,59 @@ void ParMesh::GetSharedTriCommunicator(int ordering, - - void ParMesh::MarkTetMeshForRefinement(const DSTable &v_to_v) - { -- Array order; -- GetEdgeOrdering(v_to_v, order); // local edge ordering -+ Array lengths; -+ GetEdgeLengths(v_to_v, lengths); - -- // create a GroupCommunicator on the shared edges -+ // create a GroupCommunicator over shared edges - GroupCommunicator sedge_comm(gtopo); - GetSharedEdgeCommunicator(0, sedge_comm); - -- Array sedge_ord(shared_edges.Size()); -- Array > sedge_ord_map(shared_edges.Size()); -- for (int k = 0; k < shared_edges.Size(); k++) -+ // communicate the local index of each shared edge from the group master to -+ // other ranks in the group -+ Array sedge_master_rank(shared_edges.Size()); -+ Array sedge_master_index(shared_edges.Size()); -+ for (int i = 0; i < group_sedge.Size(); i++) -+ { -+ int rank = gtopo.GetGroupMasterRank(i+1); -+ for (int j = 0; j < group_sedge.RowSize(i); j++) -+ { -+ sedge_master_rank[group_sedge.GetRow(i)[j]] = rank; -+ } -+ } -+ for (int i = 0; i < shared_edges.Size(); i++) - { -- // sedge_ledge may be undefined -- use shared_edges and v_to_v instead -- const int sedge = group_sedge.GetJ()[k]; -+ // sedge_ledge may be undefined so use shared_edges and v_to_v instead -+ const int sedge = group_sedge.GetJ()[i]; - const int *v = shared_edges[sedge]->GetVertices(); -- sedge_ord[k] = order[v_to_v(v[0], v[1])]; -+ sedge_master_index[i] = v_to_v(v[0], v[1]); - } -+ sedge_comm.Bcast(sedge_master_index); - -- sedge_comm.Bcast(sedge_ord, 1); -- -- for (int k = 0, gr = 1; gr < GetNGroups(); gr++) -+ // the pairs (master rank, master local index) define a globally consistent -+ // edge ordering -+ Array glob_edge_order(NumOfEdges); -+ for (int i = 0; i < NumOfEdges; i++) - { -- const int n = group_sedge.RowSize(gr-1); -- if (n == 0) { continue; } -- sedge_ord_map.SetSize(n); -- for (int j = 0; j < n; j++) -- { -- sedge_ord_map[j].one = sedge_ord[k+j]; -- sedge_ord_map[j].two = j; -- } -- SortPairs(sedge_ord_map, n); -- for (int j = 0; j < n; j++) -- { -- const int sedge_from = group_sedge.GetJ()[k+j]; -- const int *v = shared_edges[sedge_from]->GetVertices(); -- sedge_ord[k+j] = order[v_to_v(v[0], v[1])]; -- } -- std::sort(&sedge_ord[k], &sedge_ord[k] + n); -- for (int j = 0; j < n; j++) -- { -- const int sedge_to = group_sedge.GetJ()[k+sedge_ord_map[j].two]; -- const int *v = shared_edges[sedge_to]->GetVertices(); -- order[v_to_v(v[0], v[1])] = sedge_ord[k+j]; -- } -- k += n; -+ glob_edge_order[i] = (std::int64_t(MyRank) << 32) + i; - } -- --#ifdef MFEM_DEBUG -+ for (int i = 0; i < shared_edges.Size(); i++) - { -- Array > ilen_len(order.Size()); -- -- for (int i = 0; i < NumOfVertices; i++) -- { -- for (DSTable::RowIterator it(v_to_v, i); !it; ++it) -- { -- int j = it.Index(); -- ilen_len[j].one = order[j]; -- ilen_len[j].two = GetLength(i, it.Column()); -- } -- } -- -- SortPairs(ilen_len, order.Size()); -- -- double d_max = 0.; -- for (int i = 1; i < order.Size(); i++) -- { -- d_max = std::max(d_max, ilen_len[i-1].two-ilen_len[i].two); -- } -- --#if 0 -- // Debug message from every MPI rank. -- mfem::out << "proc. " << MyRank << '/' << NRanks << ": d_max = " << d_max -- << endl; --#else -- // Debug message just from rank 0. 
-- double glob_d_max; -- MPI_Reduce(&d_max, &glob_d_max, 1, MPI_DOUBLE, MPI_MAX, 0, MyComm); -- if (MyRank == 0) -- { -- mfem::out << "glob_d_max = " << glob_d_max << endl; -- } --#endif -+ const int sedge = group_sedge.GetJ()[i]; -+ const int *v = shared_edges[sedge]->GetVertices(); -+ glob_edge_order[v_to_v(v[0], v[1])] = -+ (std::int64_t(sedge_master_rank[i]) << 32) + sedge_master_index[i]; - } --#endif - -- // use 'order' to mark the tets, the boundary triangles, and the shared -+ // use the lengths to mark the tets, the boundary triangles, and the shared - // triangle faces - for (int i = 0; i < NumOfElements; i++) - { - if (elements[i]->GetType() == Element::TETRAHEDRON) - { -- elements[i]->MarkEdge(v_to_v, order); -+ MFEM_ASSERT(dynamic_cast(elements[i]), -+ "Unexpected non-Tetrahedron element type"); -+ static_cast(elements[i])->MarkEdge(v_to_v, lengths, -+ glob_edge_order); - } - } - -@@ -1837,13 +1802,16 @@ void ParMesh::MarkTetMeshForRefinement(const DSTable &v_to_v) - { - if (boundary[i]->GetType() == Element::TRIANGLE) - { -- boundary[i]->MarkEdge(v_to_v, order); -+ MFEM_ASSERT(dynamic_cast(boundary[i]), -+ "Unexpected non-Triangle element type"); -+ static_cast(boundary[i])->MarkEdge(v_to_v, lengths, -+ glob_edge_order); - } - } - - for (int i = 0; i < shared_trias.Size(); i++) - { -- Triangle::MarkEdge(shared_trias[i].v, v_to_v, order); -+ Triangle::MarkEdge(shared_trias[i].v, v_to_v, lengths, glob_edge_order); - } - } - -diff --git a/mesh/tetrahedron.cpp b/mesh/tetrahedron.cpp -index c7ebc064b..0815fc0a7 100644 ---- a/mesh/tetrahedron.cpp -+++ b/mesh/tetrahedron.cpp -@@ -13,6 +13,8 @@ - - #include "mesh_headers.hpp" - -+#include -+ - namespace mfem - { - -@@ -184,19 +186,30 @@ void Tetrahedron::SetVertices(const int *ind) - } - } - --void Tetrahedron::MarkEdge(const DSTable &v_to_v, const int *length) -+template -+void Tetrahedron::MarkEdge(const DSTable &v_to_v, const Array &length, -+ const Array &length2) - { -- int ind[4], i, j, l, L, type; -- -- // determine the longest edge -- L = length[v_to_v(indices[0], indices[1])]; j = 0; -- if ((l = length[v_to_v(indices[1], indices[2])]) > L) { L = l; j = 1; } -- if ((l = length[v_to_v(indices[2], indices[0])]) > L) { L = l; j = 2; } -- if ((l = length[v_to_v(indices[0], indices[3])]) > L) { L = l; j = 3; } -- if ((l = length[v_to_v(indices[1], indices[3])]) > L) { L = l; j = 4; } -- if ((l = length[v_to_v(indices[2], indices[3])]) > L) { j = 5; } -+ int e, j, ind[4], type; -+ T1 l, L; -+ T2 l2, L2; -+ auto Compare = [&length, &length2, &l, &l2, &L, &L2](int e) -+ { -+ constexpr T1 rtol = 1.0e-6; -+ l = length[e]; -+ l2 = length2[e]; -+ MFEM_ASSERT(l2 != L2, "Tie-breaking lengths should be unique for MarkEdge"); -+ return (l > L * (1.0 + rtol) || (l > L * (1.0 - rtol) && l2 > L2)); -+ }; -+ -+ e = v_to_v(indices[0], indices[1]); L = length[e]; L2 = length2[e]; j = 0; -+ if (Compare(v_to_v(indices[1], indices[2]))) { L = l; L2 = l2; j = 1; } -+ if (Compare(v_to_v(indices[2], indices[0]))) { L = l; L2 = l2; j = 2; } -+ if (Compare(v_to_v(indices[0], indices[3]))) { L = l; L2 = l2; j = 3; } -+ if (Compare(v_to_v(indices[1], indices[3]))) { L = l; L2 = l2; j = 4; } -+ if (Compare(v_to_v(indices[2], indices[3]))) { j = 5; } - -- for (i = 0; i < 4; i++) -+ for (int i = 0; i < 4; i++) - { - ind[i] = indices[i]; - } -@@ -228,13 +241,14 @@ void Tetrahedron::MarkEdge(const DSTable &v_to_v, const int *length) - // Determine the two longest edges for the other two faces and - // store them in ind[0] and ind[1] - ind[0] = 2; ind[1] = 1; -- 
L = length[v_to_v(indices[0], indices[2])]; -- if ((l = length[v_to_v(indices[0], indices[3])]) > L) { L = l; ind[0] = 3; } -- if ((l = length[v_to_v(indices[2], indices[3])]) > L) { ind[0] = 5; } - -- L = length[v_to_v(indices[1], indices[2])]; -- if ((l = length[v_to_v(indices[1], indices[3])]) > L) { L = l; ind[1] = 4; } -- if ((l = length[v_to_v(indices[2], indices[3])]) > L) { ind[1] = 5; } -+ e = v_to_v(indices[0], indices[2]); L = length[e]; L2 = length2[e]; -+ if (Compare(v_to_v(indices[0], indices[3]))) { L = l; L2 = l2; ind[0] = 3; } -+ if (Compare(v_to_v(indices[2], indices[3]))) { L = l; L2 = l2; ind[0] = 5; } -+ -+ e = v_to_v(indices[1], indices[2]); L = length[e]; L2 = length2[e]; -+ if (Compare(v_to_v(indices[1], indices[3]))) { L = l; L2 = l2; ind[1] = 4; } -+ if (Compare(v_to_v(indices[2], indices[3]))) { L = l; L2 = l2; ind[1] = 5; } - - j = 0; - switch (ind[0]) -@@ -345,4 +359,13 @@ Element *Tetrahedron::Duplicate(Mesh *m) const - return tet; - } - -+// @cond DOXYGEN_SKIP -+ -+template void Tetrahedron::MarkEdge(const DSTable &, const Array &, -+ const Array &); -+template void Tetrahedron::MarkEdge(const DSTable &, const Array &, -+ const Array &); -+ -+// @endcond -+ - } -diff --git a/mesh/tetrahedron.hpp b/mesh/tetrahedron.hpp -index c434ae903..ad018a037 100644 ---- a/mesh/tetrahedron.hpp -+++ b/mesh/tetrahedron.hpp -@@ -76,7 +76,9 @@ public: - /** Reorder the vertices so that the longest edge is from vertex 0 - to vertex 1. If called it should be once from the mesh constructor, - because the order may be used later for setting the edges. **/ -- virtual void MarkEdge(const DSTable &v_to_v, const int *length); -+ template -+ void MarkEdge(const DSTable &v_to_v, const Array &length, -+ const Array &length2); - - virtual void ResetTransform(int tr) { transform = tr; } - virtual unsigned GetTransform() const { return transform; } -diff --git a/mesh/triangle.cpp b/mesh/triangle.cpp -index 5ce32cb31..abd2b4379 100644 ---- a/mesh/triangle.cpp -+++ b/mesh/triangle.cpp -@@ -11,6 +11,8 @@ - - #include "mesh_headers.hpp" - -+#include -+ - namespace mfem - { - -@@ -50,63 +52,28 @@ void Triangle::SetVertices(const int *ind) - } - } - --void Triangle::MarkEdge(DenseMatrix &pmat) -+// static method -+template -+void Triangle::MarkEdge(int indices[3], const DSTable &v_to_v, -+ const Array &length, const Array &length2) - { -- double d[3]; -- int shift, v; -- -- d[0] = ( (pmat(0,1)-pmat(0,0))*(pmat(0,1)-pmat(0,0)) + -- (pmat(1,1)-pmat(1,0))*(pmat(1,1)-pmat(1,0)) ); -- d[1] = ( (pmat(0,2)-pmat(0,1))*(pmat(0,2)-pmat(0,1)) + -- (pmat(1,2)-pmat(1,1))*(pmat(1,2)-pmat(1,1)) ); -- d[2] = ( (pmat(0,2)-pmat(0,0))*(pmat(0,2)-pmat(0,0)) + -- (pmat(1,2)-pmat(1,0))*(pmat(1,2)-pmat(1,0)) ); -- -- // if pmat has 3 rows, then use extra term in each sum -- if (pmat.Height()==3) -+ int e, j, ind[3]; -+ T1 l, L; -+ T2 l2, L2; -+ auto Compare = [&length, &length2, &l, &l2, &L, &L2](int e) - { -- d[0] += (pmat(2,1)-pmat(2,0))*(pmat(2,1)-pmat(2,0)); -- d[1] += (pmat(2,2)-pmat(2,1))*(pmat(2,2)-pmat(2,1)); -- d[2] += (pmat(2,2)-pmat(2,0))*(pmat(2,2)-pmat(2,0)); -- } -+ constexpr T1 rtol = 1.0e-6; -+ l = length[e]; -+ l2 = length2[e]; -+ MFEM_ASSERT(l2 != L2, "Tie-breaking lengths should be unique for MarkEdge"); -+ return (l > L * (1.0 + rtol) || (l > L * (1.0 - rtol) && l2 > L2)); -+ }; - -- if (d[0] >= d[1]) -- { -- if (d[0] >= d[2]) { shift = 0; } -- else { shift = 2; } -- } -- else if (d[1] >= d[2]) { shift = 1; } -- else { shift = 2; } -+ e = v_to_v(indices[0], indices[1]); L = length[e]; L2 = 
length2[e]; j = 0; -+ if (Compare(v_to_v(indices[1], indices[2]))) { L = l; L2 = l2; j = 1; } -+ if (Compare(v_to_v(indices[2], indices[0]))) { j = 2; } - -- switch (shift) -- { -- case 0: -- break; -- case 1: -- v = indices[0]; -- indices[0] = indices[1]; -- indices[1] = indices[2]; -- indices[2] = v; -- break; -- case 2: -- v = indices[0]; -- indices[0] = indices[2]; -- indices[2] = indices[1]; -- indices[1] = v; -- break; -- } --} -- --// Static method --void Triangle::MarkEdge(int *indices, const DSTable &v_to_v, const int *length) --{ -- int l, L, j, ind[3], i; -- -- L = length[ v_to_v(indices[0], indices[1]) ]; j = 0; -- if ( (l = length[ v_to_v(indices[1], indices[2]) ]) > L ) { L = l; j = 1; } -- if ( (l = length[ v_to_v(indices[2], indices[0]) ]) > L ) { j = 2; } -- -- for (i = 0; i < 3; i++) -+ for (int i = 0; i < 3; i++) - { - ind[i] = indices[i]; - } -@@ -194,4 +161,13 @@ void Triangle::GetVertices(Array &v) const - } - } - -+// @cond DOXYGEN_SKIP -+ -+template void Triangle::MarkEdge(int *, const DSTable &, const Array &, -+ const Array &); -+template void Triangle::MarkEdge(int *, const DSTable &, const Array &, -+ const Array &); -+ -+// @endcond -+ - } // namespace mfem -diff --git a/mesh/triangle.hpp b/mesh/triangle.hpp -index 363bd4503..49fb4fe99 100644 ---- a/mesh/triangle.hpp -+++ b/mesh/triangle.hpp -@@ -50,13 +50,14 @@ public: - /** Reorder the vertices so that the longest edge is from vertex 0 - to vertex 1. If called it should be once from the mesh constructor, - because the order may be used later for setting the edges. **/ -- void MarkEdge(DenseMatrix & pmat); -- -- static void MarkEdge(int *indices, const DSTable &v_to_v, const int *length); -- -- /// Mark the longest edge by assuming/changing the order of the vertices. -- virtual void MarkEdge(const DSTable &v_to_v, const int *length) -- { MarkEdge(indices, v_to_v, length); } -+ template -+ void MarkEdge(const DSTable &v_to_v, const Array &length, -+ const Array &length2) -+ { MarkEdge(indices, v_to_v, length, length2); } -+ -+ template -+ static void MarkEdge(int *indices, const DSTable &v_to_v, -+ const Array &length, const Array &length2); - - virtual void ResetTransform(int tr) { transform = tr; } - virtual unsigned GetTransform() const { return transform; } +diff --git a/mesh/element.hpp b/mesh/element.hpp +index c37205eb1..f1b003cae 100644 +--- a/mesh/element.hpp ++++ b/mesh/element.hpp +@@ -84,9 +84,6 @@ public: + + virtual const int *GetFaceVertices(int fi) const = 0; + +- /// Mark the longest edge by assuming/changing the order of the vertices. +- virtual void MarkEdge(const DSTable &v_to_v, const int *length) {} +- + /// Return 1 if the element needs refinement in order to get conforming mesh. 
+ virtual int NeedRefinement(HashTable &v_to_v) const { return 0; } + +diff --git a/mesh/mesh.cpp b/mesh/mesh.cpp +index f76104da3..5e96c3f39 100644 +--- a/mesh/mesh.cpp ++++ b/mesh/mesh.cpp +@@ -489,6 +489,7 @@ void Mesh::GetBdrElementTransformation(int i, IsoparametricTransformation* ElTr) + Nodes->FESpace()->GetTraceElement(elem_id, face_geom); + MFEM_VERIFY(dynamic_cast(face_el), + "Mesh requires nodal Finite Element."); ++ + IntegrationRule eir(face_el->GetDof()); + FaceElemTr.Loc1.Transf.ElementNo = elem_id; + FaceElemTr.Loc1.Transf.mesh = this; +@@ -1941,7 +1942,7 @@ void Mesh::FinalizeTriMesh(int generate_edges, int refine, bool fix_orientation) + + if (refine) + { +- MarkTriMeshForRefinement(); ++ MarkForRefinement(); + } + + if (generate_edges) +@@ -2395,82 +2396,110 @@ void Mesh::ReorderElements(const Array &ordering, bool reorder_vertices) + } + + ++void Mesh::GetEdgeLengths(const DSTable &v_to_v, Array &lengths) const ++{ ++ auto GetLength = [this](int i, int j) ++ { ++ double length = 0.; ++ if (Nodes == NULL) ++ { ++ const double *vi = vertices[i](); ++ const double *vj = vertices[j](); ++ for (int k = 0; k < spaceDim; k++) ++ { ++ length += (vi[k]-vj[k])*(vi[k]-vj[k]); ++ } ++ } ++ else ++ { ++ Array ivdofs, jvdofs; ++ Nodes->FESpace()->GetVertexVDofs(i, ivdofs); ++ Nodes->FESpace()->GetVertexVDofs(j, jvdofs); ++ for (int k = 0; k < ivdofs.Size(); k++) ++ { ++ length += ((*Nodes)(ivdofs[k])-(*Nodes)(jvdofs[k]))* ++ ((*Nodes)(ivdofs[k])-(*Nodes)(jvdofs[k])); ++ } ++ } ++ return length; ++ }; ++ ++ lengths.SetSize(NumOfEdges); ++ for (int i = 0; i < NumOfVertices; i++) ++ { ++ for (DSTable::RowIterator it(v_to_v, i); !it; ++it) ++ { ++ int j = it.Index(); ++ lengths[j] = GetLength(i, it.Column()); ++ } ++ } ++}; ++ + void Mesh::MarkForRefinement() + { + if (meshgen & 1) + { ++ DSTable v_to_v(NumOfVertices); ++ GetVertexToVertexTable(v_to_v); ++ NumOfEdges = v_to_v.NumberOfEntries(); + if (Dim == 2) + { +- MarkTriMeshForRefinement(); ++ MarkTriMeshForRefinement(v_to_v); + } + else if (Dim == 3) + { +- DSTable v_to_v(NumOfVertices); +- GetVertexToVertexTable(v_to_v); + MarkTetMeshForRefinement(v_to_v); + } + } + } + +-void Mesh::MarkTriMeshForRefinement() ++void Mesh::MarkTriMeshForRefinement(const DSTable &v_to_v) + { + // Mark the longest triangle edge by rotating the indices so that + // vertex 0 - vertex 1 is the longest edge in the triangle. +- DenseMatrix pmat; +- for (int i = 0; i < NumOfElements; i++) +- { +- if (elements[i]->GetType() == Element::TRIANGLE) +- { +- GetPointMatrix(i, pmat); +- static_cast(elements[i])->MarkEdge(pmat); +- } +- } +-} ++ Array lengths; ++ GetEdgeLengths(v_to_v, lengths); + +-void Mesh::GetEdgeOrdering(const DSTable &v_to_v, Array &order) +-{ +- NumOfEdges = v_to_v.NumberOfEntries(); +- order.SetSize(NumOfEdges); +- Array > length_idx(NumOfEdges); ++ Array idx(NumOfEdges); ++ for (int i = 0; i < NumOfEdges; i++) { idx[i] = i; } + +- for (int i = 0; i < NumOfVertices; i++) ++ for (int i = 0; i < NumOfElements; i++) + { +- for (DSTable::RowIterator it(v_to_v, i); !it; ++it) ++ if (elements[i]->GetType() == Element::TRIANGLE) + { +- int j = it.Index(); +- length_idx[j].one = GetLength(i, it.Column()); +- length_idx[j].two = j; ++ MFEM_ASSERT(dynamic_cast(elements[i]), ++ "Unexpected non-Triangle element type"); ++ static_cast(elements[i])->MarkEdge(v_to_v, lengths, idx); + } + } +- +- // Sort by increasing edge-length. 
+- length_idx.Sort(); +- +- for (int i = 0; i < NumOfEdges; i++) +- { +- order[length_idx[i].two] = i; +- } + } + + void Mesh::MarkTetMeshForRefinement(const DSTable &v_to_v) + { + // Mark the longest tetrahedral edge by rotating the indices so that + // vertex 0 - vertex 1 is the longest edge in the element. +- Array order; +- GetEdgeOrdering(v_to_v, order); ++ Array lengths; ++ GetEdgeLengths(v_to_v, lengths); ++ ++ Array idx(NumOfEdges); ++ for (int i = 0; i < NumOfEdges; i++) { idx[i] = i; } + + for (int i = 0; i < NumOfElements; i++) + { + if (elements[i]->GetType() == Element::TETRAHEDRON) + { +- elements[i]->MarkEdge(v_to_v, order); ++ MFEM_ASSERT(dynamic_cast(elements[i]), ++ "Unexpected non-Tetrahedron element type"); ++ static_cast(elements[i])->MarkEdge(v_to_v, lengths, idx); + } + } + for (int i = 0; i < NumOfBdrElements; i++) + { + if (boundary[i]->GetType() == Element::TRIANGLE) + { +- boundary[i]->MarkEdge(v_to_v, order); ++ MFEM_ASSERT(dynamic_cast(boundary[i]), ++ "Unexpected non-Triangle element type"); ++ static_cast(boundary[i])->MarkEdge(v_to_v, lengths, idx); + } + } + } +@@ -2820,9 +2849,7 @@ void Mesh::FinalizeTetMesh(int generate_edges, int refine, bool fix_orientation) + + if (refine) + { +- DSTable v_to_v(NumOfVertices); +- GetVertexToVertexTable(v_to_v); +- MarkTetMeshForRefinement(v_to_v); ++ MarkForRefinement(); + } + + GetElementToFaceTable(); +@@ -3089,8 +3116,7 @@ void Mesh::Finalize(bool refine, bool fix_orientation) + // only perform it when Dim == spaceDim. + if (Dim >= 2 && Dim == spaceDim) + { +- const int num_faces = GetNumFaces(); +- for (int i = 0; i < num_faces; i++) ++ for (int i = 0; i < GetNumFaces(); i++) + { + MFEM_VERIFY(faces_info[i].Elem2No < 0 || + faces_info[i].Elem2Inf%2 != 0, "Invalid mesh topology." +@@ -3527,8 +3553,6 @@ void Mesh::Make2D(int nx, int ny, Element::Type type, + boundary[2*nx+j] = new Segment((j+1)*m, j*m, 4); + boundary[2*nx+ny+j] = new Segment(j*m+nx, (j+1)*m+nx, 2); + } +- +- // MarkTriMeshForRefinement(); // done in Finalize(...) 
+ } + else + { +@@ -5699,37 +5723,21 @@ static const char *fixed_or_not[] = { "fixed", "NOT FIXED" }; + + int Mesh::CheckElementOrientation(bool fix_it) + { +- int i, j, k, wo = 0, fo = 0; +- double *v[4]; ++ int wo = 0, fo = 0; + + if (Dim == 2 && spaceDim == 2) + { + DenseMatrix J(2, 2); + +- for (i = 0; i < NumOfElements; i++) ++ for (int i = 0; i < NumOfElements; i++) + { +- int *vi = elements[i]->GetVertices(); +- if (Nodes == NULL) +- { +- for (j = 0; j < 3; j++) +- { +- v[j] = vertices[vi[j]](); +- } +- for (j = 0; j < 2; j++) +- for (k = 0; k < 2; k++) +- { +- J(j, k) = v[j+1][k] - v[0][k]; +- } +- } +- else +- { +- // only check the Jacobian at the center of the element +- GetElementJacobian(i, J); +- } ++ // only check the Jacobian at the center of the element ++ GetElementJacobian(i, J); + if (J.Det() < 0.0) + { + if (fix_it) + { ++ int *vi = elements[i]->GetVertices(); + switch (GetElementType(i)) + { + case Element::TRIANGLE: +@@ -5749,88 +5757,41 @@ int Mesh::CheckElementOrientation(bool fix_it) + } + } + } +- +- if (Dim == 3) ++ else if (Dim == 3) + { + DenseMatrix J(3, 3); + +- for (i = 0; i < NumOfElements; i++) ++ for (int i = 0; i < NumOfElements; i++) + { +- int *vi = elements[i]->GetVertices(); +- switch (GetElementType(i)) ++ // only check the Jacobian at the center of the element ++ GetElementJacobian(i, J); ++ if (J.Det() < 0.0) + { +- case Element::TETRAHEDRON: +- if (Nodes == NULL) +- { +- for (j = 0; j < 4; j++) +- { +- v[j] = vertices[vi[j]](); +- } +- for (j = 0; j < 3; j++) +- for (k = 0; k < 3; k++) +- { +- J(j, k) = v[j+1][k] - v[0][k]; +- } +- } +- else +- { +- // only check the Jacobian at the center of the element +- GetElementJacobian(i, J); +- } +- if (J.Det() < 0.0) ++ if (fix_it) ++ { ++ int *vi = elements[i]->GetVertices(); ++ switch (GetElementType(i)) + { +- wo++; +- if (fix_it) +- { ++ case Element::TETRAHEDRON: + mfem::Swap(vi[0], vi[1]); + fo++; +- } +- } +- break; +- +- case Element::WEDGE: +- // only check the Jacobian at the center of the element +- GetElementJacobian(i, J); +- if (J.Det() < 0.0) +- { +- wo++; +- if (fix_it) +- { ++ break; ++ case Element::WEDGE: + // how? +- } +- } +- break; +- +- case Element::PYRAMID: +- // only check the Jacobian at the center of the element +- GetElementJacobian(i, J); +- if (J.Det() < 0.0) +- { +- wo++; +- if (fix_it) +- { ++ break; ++ case Element::PYRAMID: + // how? +- } +- } +- break; +- +- case Element::HEXAHEDRON: +- // only check the Jacobian at the center of the element +- GetElementJacobian(i, J); +- if (J.Det() < 0.0) +- { +- wo++; +- if (fix_it) +- { ++ break; ++ case Element::HEXAHEDRON: + // how? 
+- } ++ break; ++ default: ++ MFEM_ABORT("Invalid 3D element type \"" ++ << GetElementType(i) << "\""); ++ break; + } +- break; +- +- default: +- MFEM_ABORT("Invalid 3D element type \"" +- << GetElementType(i) << "\""); +- break; ++ } ++ wo++; + } + } + } +@@ -6756,24 +6717,12 @@ void Mesh::GetBdrPointMatrix(int i,DenseMatrix &pointmat) const + + pointmat.SetSize(spaceDim, nv); + for (k = 0; k < spaceDim; k++) ++ { + for (j = 0; j < nv; j++) + { + pointmat(k, j) = vertices[v[j]](k); + } +-} +- +-double Mesh::GetLength(int i, int j) const +-{ +- const double *vi = vertices[i](); +- const double *vj = vertices[j](); +- double length = 0.; +- +- for (int k = 0; k < spaceDim; k++) +- { +- length += (vi[k]-vj[k])*(vi[k]-vj[k]); + } +- +- return sqrt(length); + } + + // static method +diff --git a/mesh/mesh.hpp b/mesh/mesh.hpp +index 8be58b232..b9c5538c3 100644 +--- a/mesh/mesh.hpp ++++ b/mesh/mesh.hpp +@@ -358,12 +358,9 @@ protected: + /** Also, initializes #mesh_geoms. */ + void SetMeshGen(); + +- /// Return the length of the segment from node i to node j. +- double GetLength(int i, int j) const; +- ++ void GetEdgeLengths(const DSTable &v_to_v, Array &lengths) const; + void MarkForRefinement(); +- void MarkTriMeshForRefinement(); +- void GetEdgeOrdering(const DSTable &v_to_v, Array &order); ++ void MarkTriMeshForRefinement(const DSTable &v_to_v); + virtual void MarkTetMeshForRefinement(const DSTable &v_to_v); + + // Methods used to prepare and apply permutation of the mesh nodes assuming +diff --git a/mesh/pmesh.cpp b/mesh/pmesh.cpp +index 28b8a1bf8..26e2f4655 100644 +--- a/mesh/pmesh.cpp ++++ b/mesh/pmesh.cpp +@@ -20,6 +20,7 @@ + #include "../general/text.hpp" + #include "../general/globals.hpp" + ++#include + #include + #include + +@@ -762,8 +763,10 @@ void ParMesh::BuildSharedFaceElems(int ntri_faces, int nquad_faces, + sface_lface[stria_counter] = lface; + if (meshgen == 1) // Tet-only mesh + { +- Tetrahedron *tet = dynamic_cast +- (elements[faces_info[lface].Elem1No]); ++ Element *elem = elements[faces_info[lface].Elem1No]; ++ MFEM_ASSERT(dynamic_cast(elem), ++ "Unexpected non-Tetrahedron element") ++ auto *tet = static_cast(elem); + // mark the shared face for refinement by reorienting + // it according to the refinement flag in the tetrahedron + // to which this shared face belongs to. 
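For reference, the comparison that consumes the two arrays passed to MarkEdge above (squared edge lengths as the primary key, plus a tie-breaking key: local edge indices here, a globally consistent 64-bit key in the parallel marking below) is defined in the Triangle and Tetrahedron changes further down. A minimal standalone sketch of that comparison, with plain double/long long values standing in for the template types and a hypothetical CompareEdge helper in place of the patch's local lambda:

#include <cassert>

// True when the candidate edge (l, l2) should replace the current longest edge
// (L, L2): clearly longer, or equal within a relative tolerance but with a
// larger tie-breaking key.
bool CompareEdge(double l, long long l2, double L, long long L2)
{
   constexpr double rtol = 1.0e-6;
   assert(l2 != L2 && "tie-breaking keys should be unique");
   return l > L * (1.0 + rtol) || (l > L * (1.0 - rtol) && l2 > L2);
}

int main()
{
   // Two edges of identical length: the larger secondary key wins, so every
   // rank inspecting the same element marks the same edge.
   assert(CompareEdge(1.0, 7, 1.0, 3));
   // A clearly shorter edge never wins, regardless of its key.
   assert(!CompareEdge(0.5, 9, 1.0, 3));
   return 0;
}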
+@@ -1739,97 +1742,59 @@ void ParMesh::GetSharedTriCommunicator(int ordering, + + void ParMesh::MarkTetMeshForRefinement(const DSTable &v_to_v) + { +- Array order; +- GetEdgeOrdering(v_to_v, order); // local edge ordering ++ Array lengths; ++ GetEdgeLengths(v_to_v, lengths); + +- // create a GroupCommunicator on the shared edges ++ // create a GroupCommunicator over shared edges + GroupCommunicator sedge_comm(gtopo); + GetSharedEdgeCommunicator(0, sedge_comm); + +- Array sedge_ord(shared_edges.Size()); +- Array > sedge_ord_map(shared_edges.Size()); +- for (int k = 0; k < shared_edges.Size(); k++) ++ // communicate the local index of each shared edge from the group master to ++ // other ranks in the group ++ Array sedge_master_rank(shared_edges.Size()); ++ Array sedge_master_index(shared_edges.Size()); ++ for (int i = 0; i < group_sedge.Size(); i++) ++ { ++ int rank = gtopo.GetGroupMasterRank(i+1); ++ for (int j = 0; j < group_sedge.RowSize(i); j++) ++ { ++ sedge_master_rank[group_sedge.GetRow(i)[j]] = rank; ++ } ++ } ++ for (int i = 0; i < shared_edges.Size(); i++) + { +- // sedge_ledge may be undefined -- use shared_edges and v_to_v instead +- const int sedge = group_sedge.GetJ()[k]; ++ // sedge_ledge may be undefined so use shared_edges and v_to_v instead ++ const int sedge = group_sedge.GetJ()[i]; + const int *v = shared_edges[sedge]->GetVertices(); +- sedge_ord[k] = order[v_to_v(v[0], v[1])]; ++ sedge_master_index[i] = v_to_v(v[0], v[1]); + } ++ sedge_comm.Bcast(sedge_master_index); + +- sedge_comm.Bcast(sedge_ord, 1); +- +- for (int k = 0, gr = 1; gr < GetNGroups(); gr++) ++ // the pairs (master rank, master local index) define a globally consistent ++ // edge ordering ++ Array glob_edge_order(NumOfEdges); ++ for (int i = 0; i < NumOfEdges; i++) + { +- const int n = group_sedge.RowSize(gr-1); +- if (n == 0) { continue; } +- sedge_ord_map.SetSize(n); +- for (int j = 0; j < n; j++) +- { +- sedge_ord_map[j].one = sedge_ord[k+j]; +- sedge_ord_map[j].two = j; +- } +- SortPairs(sedge_ord_map, n); +- for (int j = 0; j < n; j++) +- { +- const int sedge_from = group_sedge.GetJ()[k+j]; +- const int *v = shared_edges[sedge_from]->GetVertices(); +- sedge_ord[k+j] = order[v_to_v(v[0], v[1])]; +- } +- std::sort(&sedge_ord[k], &sedge_ord[k] + n); +- for (int j = 0; j < n; j++) +- { +- const int sedge_to = group_sedge.GetJ()[k+sedge_ord_map[j].two]; +- const int *v = shared_edges[sedge_to]->GetVertices(); +- order[v_to_v(v[0], v[1])] = sedge_ord[k+j]; +- } +- k += n; ++ glob_edge_order[i] = (std::int64_t(MyRank) << 32) + i; + } +- +-#ifdef MFEM_DEBUG ++ for (int i = 0; i < shared_edges.Size(); i++) + { +- Array > ilen_len(order.Size()); +- +- for (int i = 0; i < NumOfVertices; i++) +- { +- for (DSTable::RowIterator it(v_to_v, i); !it; ++it) +- { +- int j = it.Index(); +- ilen_len[j].one = order[j]; +- ilen_len[j].two = GetLength(i, it.Column()); +- } +- } +- +- SortPairs(ilen_len, order.Size()); +- +- double d_max = 0.; +- for (int i = 1; i < order.Size(); i++) +- { +- d_max = std::max(d_max, ilen_len[i-1].two-ilen_len[i].two); +- } +- +-#if 0 +- // Debug message from every MPI rank. +- mfem::out << "proc. " << MyRank << '/' << NRanks << ": d_max = " << d_max +- << endl; +-#else +- // Debug message just from rank 0. 
+- double glob_d_max; +- MPI_Reduce(&d_max, &glob_d_max, 1, MPI_DOUBLE, MPI_MAX, 0, MyComm); +- if (MyRank == 0) +- { +- mfem::out << "glob_d_max = " << glob_d_max << endl; +- } +-#endif ++ const int sedge = group_sedge.GetJ()[i]; ++ const int *v = shared_edges[sedge]->GetVertices(); ++ glob_edge_order[v_to_v(v[0], v[1])] = ++ (std::int64_t(sedge_master_rank[i]) << 32) + sedge_master_index[i]; + } +-#endif + +- // use 'order' to mark the tets, the boundary triangles, and the shared ++ // use the lengths to mark the tets, the boundary triangles, and the shared + // triangle faces + for (int i = 0; i < NumOfElements; i++) + { + if (elements[i]->GetType() == Element::TETRAHEDRON) + { +- elements[i]->MarkEdge(v_to_v, order); ++ MFEM_ASSERT(dynamic_cast(elements[i]), ++ "Unexpected non-Tetrahedron element type"); ++ static_cast(elements[i])->MarkEdge(v_to_v, lengths, ++ glob_edge_order); + } + } + +@@ -1837,13 +1802,16 @@ void ParMesh::MarkTetMeshForRefinement(const DSTable &v_to_v) + { + if (boundary[i]->GetType() == Element::TRIANGLE) + { +- boundary[i]->MarkEdge(v_to_v, order); ++ MFEM_ASSERT(dynamic_cast(boundary[i]), ++ "Unexpected non-Triangle element type"); ++ static_cast(boundary[i])->MarkEdge(v_to_v, lengths, ++ glob_edge_order); + } + } + + for (int i = 0; i < shared_trias.Size(); i++) + { +- Triangle::MarkEdge(shared_trias[i].v, v_to_v, order); ++ Triangle::MarkEdge(shared_trias[i].v, v_to_v, lengths, glob_edge_order); + } + } + +diff --git a/mesh/tetrahedron.cpp b/mesh/tetrahedron.cpp +index c7ebc064b..0815fc0a7 100644 +--- a/mesh/tetrahedron.cpp ++++ b/mesh/tetrahedron.cpp +@@ -13,6 +13,8 @@ + + #include "mesh_headers.hpp" + ++#include ++ + namespace mfem + { + +@@ -184,19 +186,30 @@ void Tetrahedron::SetVertices(const int *ind) + } + } + +-void Tetrahedron::MarkEdge(const DSTable &v_to_v, const int *length) ++template ++void Tetrahedron::MarkEdge(const DSTable &v_to_v, const Array &length, ++ const Array &length2) + { +- int ind[4], i, j, l, L, type; +- +- // determine the longest edge +- L = length[v_to_v(indices[0], indices[1])]; j = 0; +- if ((l = length[v_to_v(indices[1], indices[2])]) > L) { L = l; j = 1; } +- if ((l = length[v_to_v(indices[2], indices[0])]) > L) { L = l; j = 2; } +- if ((l = length[v_to_v(indices[0], indices[3])]) > L) { L = l; j = 3; } +- if ((l = length[v_to_v(indices[1], indices[3])]) > L) { L = l; j = 4; } +- if ((l = length[v_to_v(indices[2], indices[3])]) > L) { j = 5; } ++ int e, j, ind[4], type; ++ T1 l, L; ++ T2 l2, L2; ++ auto Compare = [&length, &length2, &l, &l2, &L, &L2](int e) ++ { ++ constexpr T1 rtol = 1.0e-6; ++ l = length[e]; ++ l2 = length2[e]; ++ MFEM_ASSERT(l2 != L2, "Tie-breaking lengths should be unique for MarkEdge"); ++ return (l > L * (1.0 + rtol) || (l > L * (1.0 - rtol) && l2 > L2)); ++ }; ++ ++ e = v_to_v(indices[0], indices[1]); L = length[e]; L2 = length2[e]; j = 0; ++ if (Compare(v_to_v(indices[1], indices[2]))) { L = l; L2 = l2; j = 1; } ++ if (Compare(v_to_v(indices[2], indices[0]))) { L = l; L2 = l2; j = 2; } ++ if (Compare(v_to_v(indices[0], indices[3]))) { L = l; L2 = l2; j = 3; } ++ if (Compare(v_to_v(indices[1], indices[3]))) { L = l; L2 = l2; j = 4; } ++ if (Compare(v_to_v(indices[2], indices[3]))) { j = 5; } + +- for (i = 0; i < 4; i++) ++ for (int i = 0; i < 4; i++) + { + ind[i] = indices[i]; + } +@@ -228,13 +241,14 @@ void Tetrahedron::MarkEdge(const DSTable &v_to_v, const int *length) + // Determine the two longest edges for the other two faces and + // store them in ind[0] and ind[1] + ind[0] = 2; ind[1] = 1; +- 
L = length[v_to_v(indices[0], indices[2])]; +- if ((l = length[v_to_v(indices[0], indices[3])]) > L) { L = l; ind[0] = 3; } +- if ((l = length[v_to_v(indices[2], indices[3])]) > L) { ind[0] = 5; } + +- L = length[v_to_v(indices[1], indices[2])]; +- if ((l = length[v_to_v(indices[1], indices[3])]) > L) { L = l; ind[1] = 4; } +- if ((l = length[v_to_v(indices[2], indices[3])]) > L) { ind[1] = 5; } ++ e = v_to_v(indices[0], indices[2]); L = length[e]; L2 = length2[e]; ++ if (Compare(v_to_v(indices[0], indices[3]))) { L = l; L2 = l2; ind[0] = 3; } ++ if (Compare(v_to_v(indices[2], indices[3]))) { L = l; L2 = l2; ind[0] = 5; } ++ ++ e = v_to_v(indices[1], indices[2]); L = length[e]; L2 = length2[e]; ++ if (Compare(v_to_v(indices[1], indices[3]))) { L = l; L2 = l2; ind[1] = 4; } ++ if (Compare(v_to_v(indices[2], indices[3]))) { L = l; L2 = l2; ind[1] = 5; } + + j = 0; + switch (ind[0]) +@@ -345,4 +359,13 @@ Element *Tetrahedron::Duplicate(Mesh *m) const + return tet; + } + ++// @cond DOXYGEN_SKIP ++ ++template void Tetrahedron::MarkEdge(const DSTable &, const Array &, ++ const Array &); ++template void Tetrahedron::MarkEdge(const DSTable &, const Array &, ++ const Array &); ++ ++// @endcond ++ + } +diff --git a/mesh/tetrahedron.hpp b/mesh/tetrahedron.hpp +index c434ae903..ad018a037 100644 +--- a/mesh/tetrahedron.hpp ++++ b/mesh/tetrahedron.hpp +@@ -76,7 +76,9 @@ public: + /** Reorder the vertices so that the longest edge is from vertex 0 + to vertex 1. If called it should be once from the mesh constructor, + because the order may be used later for setting the edges. **/ +- virtual void MarkEdge(const DSTable &v_to_v, const int *length); ++ template ++ void MarkEdge(const DSTable &v_to_v, const Array &length, ++ const Array &length2); + + virtual void ResetTransform(int tr) { transform = tr; } + virtual unsigned GetTransform() const { return transform; } +diff --git a/mesh/triangle.cpp b/mesh/triangle.cpp +index 5ce32cb31..abd2b4379 100644 +--- a/mesh/triangle.cpp ++++ b/mesh/triangle.cpp +@@ -11,6 +11,8 @@ + + #include "mesh_headers.hpp" + ++#include ++ + namespace mfem + { + +@@ -50,63 +52,28 @@ void Triangle::SetVertices(const int *ind) + } + } + +-void Triangle::MarkEdge(DenseMatrix &pmat) ++// static method ++template ++void Triangle::MarkEdge(int indices[3], const DSTable &v_to_v, ++ const Array &length, const Array &length2) + { +- double d[3]; +- int shift, v; +- +- d[0] = ( (pmat(0,1)-pmat(0,0))*(pmat(0,1)-pmat(0,0)) + +- (pmat(1,1)-pmat(1,0))*(pmat(1,1)-pmat(1,0)) ); +- d[1] = ( (pmat(0,2)-pmat(0,1))*(pmat(0,2)-pmat(0,1)) + +- (pmat(1,2)-pmat(1,1))*(pmat(1,2)-pmat(1,1)) ); +- d[2] = ( (pmat(0,2)-pmat(0,0))*(pmat(0,2)-pmat(0,0)) + +- (pmat(1,2)-pmat(1,0))*(pmat(1,2)-pmat(1,0)) ); +- +- // if pmat has 3 rows, then use extra term in each sum +- if (pmat.Height()==3) ++ int e, j, ind[3]; ++ T1 l, L; ++ T2 l2, L2; ++ auto Compare = [&length, &length2, &l, &l2, &L, &L2](int e) + { +- d[0] += (pmat(2,1)-pmat(2,0))*(pmat(2,1)-pmat(2,0)); +- d[1] += (pmat(2,2)-pmat(2,1))*(pmat(2,2)-pmat(2,1)); +- d[2] += (pmat(2,2)-pmat(2,0))*(pmat(2,2)-pmat(2,0)); +- } ++ constexpr T1 rtol = 1.0e-6; ++ l = length[e]; ++ l2 = length2[e]; ++ MFEM_ASSERT(l2 != L2, "Tie-breaking lengths should be unique for MarkEdge"); ++ return (l > L * (1.0 + rtol) || (l > L * (1.0 - rtol) && l2 > L2)); ++ }; + +- if (d[0] >= d[1]) +- { +- if (d[0] >= d[2]) { shift = 0; } +- else { shift = 2; } +- } +- else if (d[1] >= d[2]) { shift = 1; } +- else { shift = 2; } ++ e = v_to_v(indices[0], indices[1]); L = length[e]; L2 = 
length2[e]; j = 0; ++ if (Compare(v_to_v(indices[1], indices[2]))) { L = l; L2 = l2; j = 1; } ++ if (Compare(v_to_v(indices[2], indices[0]))) { j = 2; } + +- switch (shift) +- { +- case 0: +- break; +- case 1: +- v = indices[0]; +- indices[0] = indices[1]; +- indices[1] = indices[2]; +- indices[2] = v; +- break; +- case 2: +- v = indices[0]; +- indices[0] = indices[2]; +- indices[2] = indices[1]; +- indices[1] = v; +- break; +- } +-} +- +-// Static method +-void Triangle::MarkEdge(int *indices, const DSTable &v_to_v, const int *length) +-{ +- int l, L, j, ind[3], i; +- +- L = length[ v_to_v(indices[0], indices[1]) ]; j = 0; +- if ( (l = length[ v_to_v(indices[1], indices[2]) ]) > L ) { L = l; j = 1; } +- if ( (l = length[ v_to_v(indices[2], indices[0]) ]) > L ) { j = 2; } +- +- for (i = 0; i < 3; i++) ++ for (int i = 0; i < 3; i++) + { + ind[i] = indices[i]; + } +@@ -194,4 +161,13 @@ void Triangle::GetVertices(Array &v) const + } + } + ++// @cond DOXYGEN_SKIP ++ ++template void Triangle::MarkEdge(int *, const DSTable &, const Array &, ++ const Array &); ++template void Triangle::MarkEdge(int *, const DSTable &, const Array &, ++ const Array &); ++ ++// @endcond ++ + } // namespace mfem +diff --git a/mesh/triangle.hpp b/mesh/triangle.hpp +index 363bd4503..49fb4fe99 100644 +--- a/mesh/triangle.hpp ++++ b/mesh/triangle.hpp +@@ -50,13 +50,14 @@ public: + /** Reorder the vertices so that the longest edge is from vertex 0 + to vertex 1. If called it should be once from the mesh constructor, + because the order may be used later for setting the edges. **/ +- void MarkEdge(DenseMatrix & pmat); +- +- static void MarkEdge(int *indices, const DSTable &v_to_v, const int *length); +- +- /// Mark the longest edge by assuming/changing the order of the vertices. +- virtual void MarkEdge(const DSTable &v_to_v, const int *length) +- { MarkEdge(indices, v_to_v, length); } ++ template ++ void MarkEdge(const DSTable &v_to_v, const Array &length, ++ const Array &length2) ++ { MarkEdge(indices, v_to_v, length, length2); } ++ ++ template ++ static void MarkEdge(int *indices, const DSTable &v_to_v, ++ const Array &length, const Array &length2); + + virtual void ResetTransform(int tr) { transform = tr; } + virtual unsigned GetTransform() const { return transform; } diff --git a/extern/patch/mfem/patch_pfespace_constructor_fix.diff b/extern/patch/mfem/patch_pfespace_constructor_fix.diff index 5d1d0de008..c0faa5858e 100644 --- a/extern/patch/mfem/patch_pfespace_constructor_fix.diff +++ b/extern/patch/mfem/patch_pfespace_constructor_fix.diff @@ -1,297 +1,297 @@ -diff --git a/fem/pfespace.cpp b/fem/pfespace.cpp -index b0fdfe2c0..81b5bd5b2 100644 ---- a/fem/pfespace.cpp -+++ b/fem/pfespace.cpp -@@ -43,7 +43,7 @@ ParFiniteElementSpace::ParFiniteElementSpace( - } - - ParFiniteElementSpace::ParFiniteElementSpace( -- ParMesh *pm, const FiniteElementSpace *global_fes, const int *partitioning, -+ ParMesh *pm, const FiniteElementSpace *global_fes, - const FiniteElementCollection *f) - : FiniteElementSpace(pm, MakeLocalNURBSext(global_fes->GetNURBSext(), - pm->NURBSext), -diff --git a/fem/pfespace.hpp b/fem/pfespace.hpp -index 44d512f73..7c7b49b7e 100644 ---- a/fem/pfespace.hpp -+++ b/fem/pfespace.hpp -@@ -243,14 +243,11 @@ public: - /** @brief Construct the *local* ParFiniteElementSpace corresponding to the - global FE space, @a global_fes. */ - /** The parameter @a pm is the *local* ParMesh obtained by decomposing the -- global Mesh used by @a global_fes. 
The array @a partitioning represents -- the parallel decomposition - it maps global element ids to MPI ranks. -- If the FiniteElementCollection, @a f, is NULL (default), the FE -- collection used by @a global_fes will be reused. If @a f is not NULL, it -- must be the same as, or a copy of, the FE collection used by -- @a global_fes. */ -+ global Mesh used by @a global_fes. If the FiniteElementCollection, @a f, -+ is NULL (default), the FE collection used by @a global_fes will be -+ reused. If @a f is not NULL, it must be the same as, or a copy of, the FE -+ collection used by @a global_fes. */ - ParFiniteElementSpace(ParMesh *pm, const FiniteElementSpace *global_fes, -- const int *partitioning, - const FiniteElementCollection *f = NULL); - - ParFiniteElementSpace(ParMesh *pm, const FiniteElementCollection *f, -diff --git a/fem/pgridfunc.cpp b/fem/pgridfunc.cpp -index 3631579ea..38b57a3e1 100644 ---- a/fem/pgridfunc.cpp -+++ b/fem/pgridfunc.cpp -@@ -41,7 +41,7 @@ ParGridFunction::ParGridFunction(ParMesh *pmesh, const GridFunction *gf, - // duplicate the FiniteElementCollection from 'gf' - fec = FiniteElementCollection::New(glob_fes->FEColl()->Name()); - // create a local ParFiniteElementSpace from the global one: -- fes = pfes = new ParFiniteElementSpace(pmesh, glob_fes, partitioning, fec); -+ fes = pfes = new ParFiniteElementSpace(pmesh, glob_fes, fec); - SetSize(pfes->GetVSize()); - - if (partitioning) -diff --git a/mesh/mesh.cpp b/mesh/mesh.cpp -index a1403e5d5..f76104da3 100644 ---- a/mesh/mesh.cpp -+++ b/mesh/mesh.cpp -@@ -2427,7 +2427,7 @@ void Mesh::MarkTriMeshForRefinement() - } - } - --void Mesh::GetEdgeOrdering(DSTable &v_to_v, Array &order) -+void Mesh::GetEdgeOrdering(const DSTable &v_to_v, Array &order) - { - NumOfEdges = v_to_v.NumberOfEntries(); - order.SetSize(NumOfEdges); -@@ -2452,7 +2452,7 @@ void Mesh::GetEdgeOrdering(DSTable &v_to_v, Array &order) - } - } - --void Mesh::MarkTetMeshForRefinement(DSTable &v_to_v) -+void Mesh::MarkTetMeshForRefinement(const DSTable &v_to_v) - { - // Mark the longest tetrahedral edge by rotating the indices so that - // vertex 0 - vertex 1 is the longest edge in the element. -diff --git a/mesh/mesh.hpp b/mesh/mesh.hpp -index 208501345..8be58b232 100644 ---- a/mesh/mesh.hpp -+++ b/mesh/mesh.hpp -@@ -31,8 +31,6 @@ - namespace mfem - { - --// Data type mesh -- - class GeometricFactors; - class FaceGeometricFactors; - class KnotVector; -@@ -49,15 +47,15 @@ class ParMesh; - class ParNCMesh; - #endif - -+/// Mesh data type - class Mesh - { -+ friend class NCMesh; -+ friend class NURBSExtension; - #ifdef MFEM_USE_MPI - friend class ParMesh; - friend class ParNCMesh; - #endif -- friend class NCMesh; -- friend class NURBSExtension; -- - #ifdef MFEM_USE_ADIOS2 - friend class adios2stream; - #endif -@@ -365,8 +363,8 @@ protected: - - void MarkForRefinement(); - void MarkTriMeshForRefinement(); -- void GetEdgeOrdering(DSTable &v_to_v, Array &order); -- virtual void MarkTetMeshForRefinement(DSTable &v_to_v); -+ void GetEdgeOrdering(const DSTable &v_to_v, Array &order); -+ virtual void MarkTetMeshForRefinement(const DSTable &v_to_v); - - // Methods used to prepare and apply permutation of the mesh nodes assuming - // that the mesh elements may be rotated (e.g. 
to mark triangle or tet edges -diff --git a/mesh/pmesh.cpp b/mesh/pmesh.cpp -index 6490793f3..28b8a1bf8 100644 ---- a/mesh/pmesh.cpp -+++ b/mesh/pmesh.cpp -@@ -930,7 +930,8 @@ void ParMesh::FinalizeParTopo() - } - } - --ParMesh::ParMesh(MPI_Comm comm, istream &input, bool refine) -+ParMesh::ParMesh(MPI_Comm comm, istream &input, int generate_edges, -+ int refine, bool fix_orientation) - : glob_elem_offset(-1) - , glob_offset_sequence(-1) - , gtopo(comm) -@@ -942,9 +943,7 @@ ParMesh::ParMesh(MPI_Comm comm, istream &input, bool refine) - have_face_nbr_data = false; - pncmesh = NULL; - -- const int gen_edges = 1; -- -- Load(input, gen_edges, refine, true); -+ Load(input, generate_edges, refine, fix_orientation); - } - - void ParMesh::Load(istream &input, int generate_edges, int refine, -@@ -1738,7 +1737,7 @@ void ParMesh::GetSharedTriCommunicator(int ordering, - stria_comm.Finalize(); - } - --void ParMesh::MarkTetMeshForRefinement(DSTable &v_to_v) -+void ParMesh::MarkTetMeshForRefinement(const DSTable &v_to_v) - { - Array order; - GetEdgeOrdering(v_to_v, order); // local edge ordering -@@ -2063,6 +2062,7 @@ void ParMesh::DeleteFaceNbrData() - - void ParMesh::SetCurvature(int order, bool discont, int space_dim, int ordering) - { -+ DeleteFaceNbrData(); - space_dim = (space_dim == -1) ? spaceDim : space_dim; - FiniteElementCollection* nfec; - if (discont) -@@ -2083,6 +2083,7 @@ void ParMesh::SetCurvature(int order, bool discont, int space_dim, int ordering) - - void ParMesh::SetNodalFESpace(FiniteElementSpace *nfes) - { -+ DeleteFaceNbrData(); - ParFiniteElementSpace *npfes = dynamic_cast(nfes); - if (npfes) - { -@@ -2096,6 +2097,7 @@ void ParMesh::SetNodalFESpace(FiniteElementSpace *nfes) - - void ParMesh::SetNodalFESpace(ParFiniteElementSpace *npfes) - { -+ DeleteFaceNbrData(); - ParGridFunction *nodes = new ParGridFunction(npfes); - SetNodalGridFunction(nodes, true); - } -@@ -2104,19 +2106,17 @@ void ParMesh::EnsureParNodes() - { - if (Nodes && dynamic_cast(Nodes->FESpace()) == NULL) - { -+ DeleteFaceNbrData(); - ParFiniteElementSpace *pfes = - new ParFiniteElementSpace(*Nodes->FESpace(), *this); - ParGridFunction *new_nodes = new ParGridFunction(pfes); -- - *new_nodes = *Nodes; -- - if (Nodes->OwnFEC()) - { - new_nodes->MakeOwner(Nodes->OwnFEC()); - Nodes->MakeOwner(NULL); // takes away ownership of 'fec' and 'fes' - delete Nodes->FESpace(); - } -- - delete Nodes; - Nodes = new_nodes; - } -@@ -3212,17 +3212,15 @@ void ParMesh::ReorientTetMesh() - // other ranks in the group - Array svert_master_rank(svert_lvert.Size()); - Array svert_master_index(svert_lvert); -+ for (int i = 0; i < group_svert.Size(); i++) - { -- for (int i = 0; i < group_svert.Size(); i++) -+ int rank = gtopo.GetGroupMasterRank(i+1); -+ for (int j = 0; j < group_svert.RowSize(i); j++) - { -- int rank = gtopo.GetGroupMasterRank(i+1); -- for (int j = 0; j < group_svert.RowSize(i); j++) -- { -- svert_master_rank[group_svert.GetRow(i)[j]] = rank; -- } -+ svert_master_rank[group_svert.GetRow(i)[j]] = rank; - } -- svert_comm.Bcast(svert_master_index); - } -+ svert_comm.Bcast(svert_master_index); - - // the pairs (master rank, master local index) define a globally consistent - // vertex ordering -diff --git a/mesh/pmesh.hpp b/mesh/pmesh.hpp -index e8e0955c8..06f09dc0c 100644 ---- a/mesh/pmesh.hpp -+++ b/mesh/pmesh.hpp -@@ -24,6 +24,7 @@ - - namespace mfem - { -+ - #ifdef MFEM_USE_PUMI - class ParPumiMesh; - #endif -@@ -31,9 +32,16 @@ class ParPumiMesh; - /// Class for parallel meshes - class ParMesh : public Mesh - { 
--protected: -+ friend class ParNCMesh; - friend class ParSubMesh; -+#ifdef MFEM_USE_PUMI -+ friend class ParPumiMesh; -+#endif -+#ifdef MFEM_USE_ADIOS2 -+ friend class adios2stream; -+#endif - -+protected: - MPI_Comm MyComm; - int NRanks, MyRank; - -@@ -105,7 +113,7 @@ protected: - - // Mark all tets to ensure consistency across MPI tasks; also mark the - // shared and boundary triangle faces using the consistently marked tets. -- void MarkTetMeshForRefinement(DSTable &v_to_v) override; -+ void MarkTetMeshForRefinement(const DSTable &v_to_v) override; - - /// Return a number(0-1) identifying how the given edge has been split - int GetEdgeSplittings(Element *edge, const DSTable &v_to_v, int *middle); -@@ -337,7 +345,8 @@ public: - - /// Read a parallel mesh, each MPI rank from its own file/stream. - /** The @a refine parameter is passed to the method Mesh::Finalize(). */ -- ParMesh(MPI_Comm comm, std::istream &input, bool refine = true); -+ ParMesh(MPI_Comm comm, std::istream &input, int generate_edges = 0, -+ int refine = 1, bool fix_orientation = true); - - /// Deprecated: see @a ParMesh::MakeRefined - MFEM_DEPRECATED -@@ -694,14 +703,6 @@ public: - void PrintSharedEntities(const std::string &fname_prefix) const; - - virtual ~ParMesh(); -- -- friend class ParNCMesh; --#ifdef MFEM_USE_PUMI -- friend class ParPumiMesh; --#endif --#ifdef MFEM_USE_ADIOS2 -- friend class adios2stream; --#endif - }; - - } -diff --git a/tests/unit/mesh/test_ncmesh.cpp b/tests/unit/mesh/test_ncmesh.cpp -index 63e10f743..c1bb54e13 100644 ---- a/tests/unit/mesh/test_ncmesh.cpp -+++ b/tests/unit/mesh/test_ncmesh.cpp -@@ -937,11 +937,9 @@ void TestVectorValueInVolume(Mesh &smesh, int nc_level, int skip, bool use_ND) - // along the processor boundary. - - // Create a grid function of the mesh coordinates -- pmesh.ExchangeFaceNbrData(); - pmesh.EnsureNodes(); -- REQUIRE(pmesh.OwnsNodes()); -+ pmesh.ExchangeFaceNbrData(); - GridFunction * const coords = pmesh.GetNodes(); -- dynamic_cast(pmesh.GetNodes())->ExchangeFaceNbrData(); - - // Project the linear function onto the mesh. Quadratic ND tetrahedral - // elements are the first to require face orientations. +diff --git a/fem/pfespace.cpp b/fem/pfespace.cpp +index b0fdfe2c0..81b5bd5b2 100644 +--- a/fem/pfespace.cpp ++++ b/fem/pfespace.cpp +@@ -43,7 +43,7 @@ ParFiniteElementSpace::ParFiniteElementSpace( + } + + ParFiniteElementSpace::ParFiniteElementSpace( +- ParMesh *pm, const FiniteElementSpace *global_fes, const int *partitioning, ++ ParMesh *pm, const FiniteElementSpace *global_fes, + const FiniteElementCollection *f) + : FiniteElementSpace(pm, MakeLocalNURBSext(global_fes->GetNURBSext(), + pm->NURBSext), +diff --git a/fem/pfespace.hpp b/fem/pfespace.hpp +index 44d512f73..7c7b49b7e 100644 +--- a/fem/pfespace.hpp ++++ b/fem/pfespace.hpp +@@ -243,14 +243,11 @@ public: + /** @brief Construct the *local* ParFiniteElementSpace corresponding to the + global FE space, @a global_fes. */ + /** The parameter @a pm is the *local* ParMesh obtained by decomposing the +- global Mesh used by @a global_fes. The array @a partitioning represents +- the parallel decomposition - it maps global element ids to MPI ranks. +- If the FiniteElementCollection, @a f, is NULL (default), the FE +- collection used by @a global_fes will be reused. If @a f is not NULL, it +- must be the same as, or a copy of, the FE collection used by +- @a global_fes. */ ++ global Mesh used by @a global_fes. 
If the FiniteElementCollection, @a f, ++ is NULL (default), the FE collection used by @a global_fes will be ++ reused. If @a f is not NULL, it must be the same as, or a copy of, the FE ++ collection used by @a global_fes. */ + ParFiniteElementSpace(ParMesh *pm, const FiniteElementSpace *global_fes, +- const int *partitioning, + const FiniteElementCollection *f = NULL); + + ParFiniteElementSpace(ParMesh *pm, const FiniteElementCollection *f, +diff --git a/fem/pgridfunc.cpp b/fem/pgridfunc.cpp +index 3631579ea..38b57a3e1 100644 +--- a/fem/pgridfunc.cpp ++++ b/fem/pgridfunc.cpp +@@ -41,7 +41,7 @@ ParGridFunction::ParGridFunction(ParMesh *pmesh, const GridFunction *gf, + // duplicate the FiniteElementCollection from 'gf' + fec = FiniteElementCollection::New(glob_fes->FEColl()->Name()); + // create a local ParFiniteElementSpace from the global one: +- fes = pfes = new ParFiniteElementSpace(pmesh, glob_fes, partitioning, fec); ++ fes = pfes = new ParFiniteElementSpace(pmesh, glob_fes, fec); + SetSize(pfes->GetVSize()); + + if (partitioning) +diff --git a/mesh/mesh.cpp b/mesh/mesh.cpp +index a1403e5d5..f76104da3 100644 +--- a/mesh/mesh.cpp ++++ b/mesh/mesh.cpp +@@ -2427,7 +2427,7 @@ void Mesh::MarkTriMeshForRefinement() + } + } + +-void Mesh::GetEdgeOrdering(DSTable &v_to_v, Array &order) ++void Mesh::GetEdgeOrdering(const DSTable &v_to_v, Array &order) + { + NumOfEdges = v_to_v.NumberOfEntries(); + order.SetSize(NumOfEdges); +@@ -2452,7 +2452,7 @@ void Mesh::GetEdgeOrdering(DSTable &v_to_v, Array &order) + } + } + +-void Mesh::MarkTetMeshForRefinement(DSTable &v_to_v) ++void Mesh::MarkTetMeshForRefinement(const DSTable &v_to_v) + { + // Mark the longest tetrahedral edge by rotating the indices so that + // vertex 0 - vertex 1 is the longest edge in the element. +diff --git a/mesh/mesh.hpp b/mesh/mesh.hpp +index 208501345..8be58b232 100644 +--- a/mesh/mesh.hpp ++++ b/mesh/mesh.hpp +@@ -31,8 +31,6 @@ + namespace mfem + { + +-// Data type mesh +- + class GeometricFactors; + class FaceGeometricFactors; + class KnotVector; +@@ -49,15 +47,15 @@ class ParMesh; + class ParNCMesh; + #endif + ++/// Mesh data type + class Mesh + { ++ friend class NCMesh; ++ friend class NURBSExtension; + #ifdef MFEM_USE_MPI + friend class ParMesh; + friend class ParNCMesh; + #endif +- friend class NCMesh; +- friend class NURBSExtension; +- + #ifdef MFEM_USE_ADIOS2 + friend class adios2stream; + #endif +@@ -365,8 +363,8 @@ protected: + + void MarkForRefinement(); + void MarkTriMeshForRefinement(); +- void GetEdgeOrdering(DSTable &v_to_v, Array &order); +- virtual void MarkTetMeshForRefinement(DSTable &v_to_v); ++ void GetEdgeOrdering(const DSTable &v_to_v, Array &order); ++ virtual void MarkTetMeshForRefinement(const DSTable &v_to_v); + + // Methods used to prepare and apply permutation of the mesh nodes assuming + // that the mesh elements may be rotated (e.g. 
to mark triangle or tet edges +diff --git a/mesh/pmesh.cpp b/mesh/pmesh.cpp +index 6490793f3..28b8a1bf8 100644 +--- a/mesh/pmesh.cpp ++++ b/mesh/pmesh.cpp +@@ -930,7 +930,8 @@ void ParMesh::FinalizeParTopo() + } + } + +-ParMesh::ParMesh(MPI_Comm comm, istream &input, bool refine) ++ParMesh::ParMesh(MPI_Comm comm, istream &input, int generate_edges, ++ int refine, bool fix_orientation) + : glob_elem_offset(-1) + , glob_offset_sequence(-1) + , gtopo(comm) +@@ -942,9 +943,7 @@ ParMesh::ParMesh(MPI_Comm comm, istream &input, bool refine) + have_face_nbr_data = false; + pncmesh = NULL; + +- const int gen_edges = 1; +- +- Load(input, gen_edges, refine, true); ++ Load(input, generate_edges, refine, fix_orientation); + } + + void ParMesh::Load(istream &input, int generate_edges, int refine, +@@ -1738,7 +1737,7 @@ void ParMesh::GetSharedTriCommunicator(int ordering, + stria_comm.Finalize(); + } + +-void ParMesh::MarkTetMeshForRefinement(DSTable &v_to_v) ++void ParMesh::MarkTetMeshForRefinement(const DSTable &v_to_v) + { + Array order; + GetEdgeOrdering(v_to_v, order); // local edge ordering +@@ -2063,6 +2062,7 @@ void ParMesh::DeleteFaceNbrData() + + void ParMesh::SetCurvature(int order, bool discont, int space_dim, int ordering) + { ++ DeleteFaceNbrData(); + space_dim = (space_dim == -1) ? spaceDim : space_dim; + FiniteElementCollection* nfec; + if (discont) +@@ -2083,6 +2083,7 @@ void ParMesh::SetCurvature(int order, bool discont, int space_dim, int ordering) + + void ParMesh::SetNodalFESpace(FiniteElementSpace *nfes) + { ++ DeleteFaceNbrData(); + ParFiniteElementSpace *npfes = dynamic_cast(nfes); + if (npfes) + { +@@ -2096,6 +2097,7 @@ void ParMesh::SetNodalFESpace(FiniteElementSpace *nfes) + + void ParMesh::SetNodalFESpace(ParFiniteElementSpace *npfes) + { ++ DeleteFaceNbrData(); + ParGridFunction *nodes = new ParGridFunction(npfes); + SetNodalGridFunction(nodes, true); + } +@@ -2104,19 +2106,17 @@ void ParMesh::EnsureParNodes() + { + if (Nodes && dynamic_cast(Nodes->FESpace()) == NULL) + { ++ DeleteFaceNbrData(); + ParFiniteElementSpace *pfes = + new ParFiniteElementSpace(*Nodes->FESpace(), *this); + ParGridFunction *new_nodes = new ParGridFunction(pfes); +- + *new_nodes = *Nodes; +- + if (Nodes->OwnFEC()) + { + new_nodes->MakeOwner(Nodes->OwnFEC()); + Nodes->MakeOwner(NULL); // takes away ownership of 'fec' and 'fes' + delete Nodes->FESpace(); + } +- + delete Nodes; + Nodes = new_nodes; + } +@@ -3212,17 +3212,15 @@ void ParMesh::ReorientTetMesh() + // other ranks in the group + Array svert_master_rank(svert_lvert.Size()); + Array svert_master_index(svert_lvert); ++ for (int i = 0; i < group_svert.Size(); i++) + { +- for (int i = 0; i < group_svert.Size(); i++) ++ int rank = gtopo.GetGroupMasterRank(i+1); ++ for (int j = 0; j < group_svert.RowSize(i); j++) + { +- int rank = gtopo.GetGroupMasterRank(i+1); +- for (int j = 0; j < group_svert.RowSize(i); j++) +- { +- svert_master_rank[group_svert.GetRow(i)[j]] = rank; +- } ++ svert_master_rank[group_svert.GetRow(i)[j]] = rank; + } +- svert_comm.Bcast(svert_master_index); + } ++ svert_comm.Bcast(svert_master_index); + + // the pairs (master rank, master local index) define a globally consistent + // vertex ordering +diff --git a/mesh/pmesh.hpp b/mesh/pmesh.hpp +index e8e0955c8..06f09dc0c 100644 +--- a/mesh/pmesh.hpp ++++ b/mesh/pmesh.hpp +@@ -24,6 +24,7 @@ + + namespace mfem + { ++ + #ifdef MFEM_USE_PUMI + class ParPumiMesh; + #endif +@@ -31,9 +32,16 @@ class ParPumiMesh; + /// Class for parallel meshes + class ParMesh : public Mesh + { 
+-protected: ++ friend class ParNCMesh; + friend class ParSubMesh; ++#ifdef MFEM_USE_PUMI ++ friend class ParPumiMesh; ++#endif ++#ifdef MFEM_USE_ADIOS2 ++ friend class adios2stream; ++#endif + ++protected: + MPI_Comm MyComm; + int NRanks, MyRank; + +@@ -105,7 +113,7 @@ protected: + + // Mark all tets to ensure consistency across MPI tasks; also mark the + // shared and boundary triangle faces using the consistently marked tets. +- void MarkTetMeshForRefinement(DSTable &v_to_v) override; ++ void MarkTetMeshForRefinement(const DSTable &v_to_v) override; + + /// Return a number(0-1) identifying how the given edge has been split + int GetEdgeSplittings(Element *edge, const DSTable &v_to_v, int *middle); +@@ -337,7 +345,8 @@ public: + + /// Read a parallel mesh, each MPI rank from its own file/stream. + /** The @a refine parameter is passed to the method Mesh::Finalize(). */ +- ParMesh(MPI_Comm comm, std::istream &input, bool refine = true); ++ ParMesh(MPI_Comm comm, std::istream &input, int generate_edges = 0, ++ int refine = 1, bool fix_orientation = true); + + /// Deprecated: see @a ParMesh::MakeRefined + MFEM_DEPRECATED +@@ -694,14 +703,6 @@ public: + void PrintSharedEntities(const std::string &fname_prefix) const; + + virtual ~ParMesh(); +- +- friend class ParNCMesh; +-#ifdef MFEM_USE_PUMI +- friend class ParPumiMesh; +-#endif +-#ifdef MFEM_USE_ADIOS2 +- friend class adios2stream; +-#endif + }; + + } +diff --git a/tests/unit/mesh/test_ncmesh.cpp b/tests/unit/mesh/test_ncmesh.cpp +index 63e10f743..c1bb54e13 100644 +--- a/tests/unit/mesh/test_ncmesh.cpp ++++ b/tests/unit/mesh/test_ncmesh.cpp +@@ -937,11 +937,9 @@ void TestVectorValueInVolume(Mesh &smesh, int nc_level, int skip, bool use_ND) + // along the processor boundary. + + // Create a grid function of the mesh coordinates +- pmesh.ExchangeFaceNbrData(); + pmesh.EnsureNodes(); +- REQUIRE(pmesh.OwnsNodes()); ++ pmesh.ExchangeFaceNbrData(); + GridFunction * const coords = pmesh.GetNodes(); +- dynamic_cast(pmesh.GetNodes())->ExchangeFaceNbrData(); + + // Project the linear function onto the mesh. Quadratic ND tetrahedral + // elements are the first to require face orientations. 
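The patch above removes the partitioning argument from the constructor that builds the local ParFiniteElementSpace from a global serial space. A short usage sketch of the updated signature (hypothetical MakeLocalSpace helper; assumes MFEM built with MPI and that pmesh is the local ParMesh obtained by decomposing the global mesh used by global_fes):

#include "mfem.hpp"

// Build the local parallel space corresponding to global_fes. With the patch
// above, no partitioning array is passed, and leaving the last argument NULL
// reuses the FiniteElementCollection of global_fes.
mfem::ParFiniteElementSpace *MakeLocalSpace(mfem::ParMesh &pmesh,
                                            const mfem::FiniteElementSpace &global_fes)
{
   return new mfem::ParFiniteElementSpace(&pmesh, &global_fes);
}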
diff --git a/extern/patch/mfem/patch_stateless_doftrans_threadsafe.diff b/extern/patch/mfem/patch_stateless_doftrans_threadsafe.diff index 5eedc63805..01e4db11b8 100644 --- a/extern/patch/mfem/patch_stateless_doftrans_threadsafe.diff +++ b/extern/patch/mfem/patch_stateless_doftrans_threadsafe.diff @@ -1,2616 +1,2616 @@ -diff --git a/fem/doftrans.cpp b/fem/doftrans.cpp -index 93d5588de..0b4dbcef7 100644 ---- a/fem/doftrans.cpp -+++ b/fem/doftrans.cpp -@@ -14,62 +14,18 @@ - namespace mfem - { - --void TransformPrimal(const DofTransformation *ran_dof_trans, -- const DofTransformation *dom_dof_trans, -- DenseMatrix &elmat) --{ -- if (ran_dof_trans && dom_dof_trans) -- { -- ran_dof_trans->TransformPrimalCols(elmat); -- dom_dof_trans->TransformDualRows(elmat); -- } -- else if (ran_dof_trans) -- { -- ran_dof_trans->TransformPrimalCols(elmat); -- } -- else if (dom_dof_trans) -- { -- dom_dof_trans->TransformDualRows(elmat); -- } -- else -- { -- // If both transformations are NULL this function should not be called -- } --} -- --void TransformDual(const DofTransformation *ran_dof_trans, -- const DofTransformation *dom_dof_trans, -- DenseMatrix &elmat) --{ -- if (ran_dof_trans && dom_dof_trans) -- { -- ran_dof_trans->TransformDualCols(elmat); -- dom_dof_trans->TransformDualRows(elmat); -- } -- else if (ran_dof_trans) -- { -- ran_dof_trans->TransformDualCols(elmat); -- } -- else if (dom_dof_trans) -- { -- dom_dof_trans->TransformDualRows(elmat); -- } -- else -- { -- // If both transformations are NULL this function should not be called -- } --} -- --void StatelessVDofTransformation::TransformPrimal(const Array & face_ori, -- double *v) const -+void DofTransformation::TransformPrimal(double *v) const - { -- int size = sdoftrans_->Size(); -+ MFEM_ASSERT(dof_trans_, -+ "DofTransformation has no local transformation, call " -+ "SetDofTransformation first!"); -+ int size = dof_trans_->Size(); - -- if ((Ordering::Type)ordering_ == Ordering::byNODES || vdim_ == 1) -+ if (vdim_ == 1 || (Ordering::Type)ordering_ == Ordering::byNODES) - { - for (int i=0; iTransformPrimal(face_ori, &v[i*size]); -+ dof_trans_->TransformPrimal(Fo_, &v[i*size]); - } - } - else -@@ -81,7 +37,7 @@ void StatelessVDofTransformation::TransformPrimal(const Array & face_ori, - { - vec(j) = v[j*vdim_+i]; - } -- sdoftrans_->TransformPrimal(face_ori, vec); -+ dof_trans_->TransformPrimal(Fo_, vec); - for (int j=0; j & face_ori, - } - } - --void StatelessVDofTransformation::InvTransformPrimal( -- const Array & face_ori, -- double *v) const -+void DofTransformation::InvTransformPrimal(double *v) const - { -- int size = sdoftrans_->Height(); -+ MFEM_ASSERT(dof_trans_, -+ "DofTransformation has no local transformation, call " -+ "SetDofTransformation first!"); -+ int size = dof_trans_->Height(); - -- if ((Ordering::Type)ordering_ == Ordering::byNODES) -+ if (vdim_ == 1 || (Ordering::Type)ordering_ == Ordering::byNODES) - { - for (int i=0; iInvTransformPrimal(face_ori, &v[i*size]); -+ dof_trans_->InvTransformPrimal(Fo_, &v[i*size]); - } - } - else -@@ -112,7 +69,7 @@ void StatelessVDofTransformation::InvTransformPrimal( - { - vec(j) = v[j*vdim_+i]; - } -- sdoftrans_->InvTransformPrimal(face_ori, vec); -+ dof_trans_->InvTransformPrimal(Fo_, vec); - for (int j=0; j & face_ori, -- double *v) const -+void DofTransformation::TransformDual(double *v) const - { -- int size = sdoftrans_->Size(); -+ MFEM_ASSERT(dof_trans_, -+ "DofTransformation has no local transformation, call " -+ "SetDofTransformation first!"); -+ int size = dof_trans_->Size(); - -- if 
((Ordering::Type)ordering_ == Ordering::byNODES) -+ if (vdim_ == 1 || (Ordering::Type)ordering_ == Ordering::byNODES) - { - for (int i=0; iTransformDual(face_ori, &v[i*size]); -+ dof_trans_->TransformDual(Fo_, &v[i*size]); - } - } - else -@@ -142,7 +101,7 @@ void StatelessVDofTransformation::TransformDual(const Array & face_ori, - { - vec(j) = v[j*vdim_+i]; - } -- sdoftrans_->TransformDual(face_ori, vec); -+ dof_trans_->TransformDual(Fo_, vec); - for (int j=0; j & face_ori, - } - } - --void StatelessVDofTransformation::InvTransformDual(const Array & face_ori, -- double *v) const -+void DofTransformation::InvTransformDual(double *v) const - { -- int size = sdoftrans_->Size(); -+ MFEM_ASSERT(dof_trans_, -+ "DofTransformation has no local transformation, call " -+ "SetDofTransformation first!"); -+ int size = dof_trans_->Size(); - -- if ((Ordering::Type)ordering_ == Ordering::byNODES) -+ if (vdim_ == 1 || (Ordering::Type)ordering_ == Ordering::byNODES) - { - for (int i=0; iInvTransformDual(face_ori, &v[i*size]); -+ dof_trans_->InvTransformDual(Fo_, &v[i*size]); - } - } - else -@@ -172,7 +133,7 @@ void StatelessVDofTransformation::InvTransformDual(const Array & face_ori, - { - vec(j) = v[j*vdim_+i]; - } -- sdoftrans_->InvTransformDual(face_ori, vec); -+ dof_trans_->InvTransformDual(Fo_, vec); - for (int j=0; j & face_ori, - } - } - -+void TransformPrimal(const DofTransformation *ran_dof_trans, -+ const DofTransformation *dom_dof_trans, -+ DenseMatrix &elmat) -+{ -+ // No action if both transformations are NULL -+ if (ran_dof_trans) -+ { -+ ran_dof_trans->TransformPrimalCols(elmat); -+ } -+ if (dom_dof_trans) -+ { -+ dom_dof_trans->TransformDualRows(elmat); -+ } -+} -+ -+void TransformDual(const DofTransformation *ran_dof_trans, -+ const DofTransformation *dom_dof_trans, -+ DenseMatrix &elmat) -+{ -+ // No action if both transformations are NULL -+ if (ran_dof_trans) -+ { -+ ran_dof_trans->TransformDualCols(elmat); -+ } -+ if (dom_dof_trans) -+ { -+ dom_dof_trans->TransformDualRows(elmat); -+ } -+} -+ - // ordering (i0j0, i1j0, i0j1, i1j1), each row is a column major matrix --const double ND_StatelessDofTransformation::T_data[24] = -+const double ND_DofTransformation::T_data[24] = - { - 1.0, 0.0, 0.0, 1.0, - -1.0, -1.0, 0.0, 1.0, -@@ -192,11 +183,11 @@ const double ND_StatelessDofTransformation::T_data[24] = - 0.0, 1.0, 1.0, 0.0 - }; - --const DenseTensor ND_StatelessDofTransformation --::T(const_cast(ND_StatelessDofTransformation::T_data), 2, 2, 6); -+const DenseTensor ND_DofTransformation -+::T(const_cast(ND_DofTransformation::T_data), 2, 2, 6); - - // ordering (i0j0, i1j0, i0j1, i1j1), each row is a column major matrix --const double ND_StatelessDofTransformation::TInv_data[24] = -+const double ND_DofTransformation::TInv_data[24] = - { - 1.0, 0.0, 0.0, 1.0, - -1.0, -1.0, 0.0, 1.0, -@@ -206,12 +197,11 @@ const double ND_StatelessDofTransformation::TInv_data[24] = - 0.0, 1.0, 1.0, 0.0 - }; - --const DenseTensor ND_StatelessDofTransformation --::TInv(const_cast(TInv_data), 2, 2, 6); -+const DenseTensor ND_DofTransformation -+::TInv(const_cast(TInv_data), 2, 2, 6); - --ND_StatelessDofTransformation::ND_StatelessDofTransformation(int size, int p, -- int num_edges, -- int num_tri_faces) -+ND_DofTransformation::ND_DofTransformation(int size, int p, int num_edges, -+ int num_tri_faces) - : StatelessDofTransformation(size) - , order(p) - , nedofs(p) -@@ -221,18 +211,19 @@ ND_StatelessDofTransformation::ND_StatelessDofTransformation(int size, int p, - { - } - --void 
ND_StatelessDofTransformation::TransformPrimal(const Array & Fo, -- double *v) const -+void ND_DofTransformation::TransformPrimal(const Array & Fo, -+ double *v) const - { - // Return immediately when no face DoFs are present -- if (nfdofs < 2) { return; } -+ if (IsIdentity()) { return; } - - MFEM_VERIFY(Fo.Size() >= nfaces, - "Face orientation array is shorter than the number of faces in " -- "ND_StatelessDofTransformation"); -+ "ND_DofTransformation"); - - double data[2]; - Vector v2(data, 2); -+ DenseMatrix T2; - - // Transform face DoFs - for (int f=0; f & Fo, - for (int i=0; i(T.GetData(Fo[f])), 2, 2); -+ T2.Mult(v2, &v[nedges*nedofs + f*nfdofs + 2*i]); - } - } - } - --void ND_StatelessDofTransformation::InvTransformPrimal(const Array & Fo, -- double *v) const -+void ND_DofTransformation::InvTransformPrimal(const Array & Fo, -+ double *v) const - { - // Return immediately when no face DoFs are present -- if (nfdofs < 2) { return; } -+ if (IsIdentity()) { return; } - - MFEM_VERIFY(Fo.Size() >= nfaces, - "Face orientation array is shorter than the number of faces in " -- "ND_StatelessDofTransformation"); -+ "ND_DofTransformation"); - - double data[2]; - Vector v2(data, 2); -+ DenseMatrix T2Inv; - - // Transform face DoFs - for (int f=0; f & Fo, - for (int i=0; i(TInv.GetData(Fo[f])), 2, 2); -+ T2Inv.Mult(v2, &v[nedges*nedofs + f*nfdofs + 2*i]); - } - } - } - --void ND_StatelessDofTransformation::TransformDual(const Array & Fo, -- double *v) const -+void ND_DofTransformation::TransformDual(const Array & Fo, double *v) const - { - // Return immediately when no face DoFs are present -- if (nfdofs < 2) { return; } -+ if (IsIdentity()) { return; } - - MFEM_VERIFY(Fo.Size() >= nfaces, - "Face orientation array is shorter than the number of faces in " -- "ND_StatelessDofTransformation"); -+ "ND_DofTransformation"); - - double data[2]; - Vector v2(data, 2); -+ DenseMatrix T2Inv; - - // Transform face DoFs - for (int f=0; f & Fo, - for (int i=0; i(TInv.GetData(Fo[f])), 2, 2); -+ T2Inv.MultTranspose(v2, &v[nedges*nedofs + f*nfdofs + 2*i]); - } - } - } - --void ND_StatelessDofTransformation::InvTransformDual(const Array & Fo, -- double *v) const -+void ND_DofTransformation::InvTransformDual(const Array & Fo, -+ double *v) const - { - // Return immediately when no face DoFs are present -- if (nfdofs < 2) { return; } -+ if (IsIdentity()) { return; } - - MFEM_VERIFY(Fo.Size() >= nfaces, - "Face orientation array is shorter than the number of faces in " -- "ND_StatelessDofTransformation"); -+ "ND_DofTransformation"); - - double data[2]; - Vector v2(data, 2); -+ DenseMatrix T2; - - // Transform face DoFs - for (int f=0; f & Fo, - for (int i=0; i(T.GetData(Fo[f])), 2, 2); -+ T2.MultTranspose(v2, &v[nedges*nedofs + f*nfdofs + 2*i]); - } - } - } -diff --git a/fem/doftrans.hpp b/fem/doftrans.hpp -index 5111bbb3d..81956bdbf 100644 ---- a/fem/doftrans.hpp -+++ b/fem/doftrans.hpp -@@ -80,6 +80,9 @@ public: - inline int Width() const { return size_; } - inline int NumCols() const { return size_; } - -+ /// If the DofTransformation performs no transformation -+ virtual bool IsIdentity() const = 0; -+ - /** Transform local DoFs to align with the global DoFs. 
For example, this - transformation can be used to map the local vector computed by - FiniteElement::Project() to the transformed vector stored within a -@@ -115,6 +118,8 @@ public: - inline void InvTransformDual(const Array & face_orientation, - Vector &v) const - { InvTransformDual(face_orientation, v.GetData()); } -+ -+ virtual ~StatelessDofTransformation() = default; - }; - - /** The DofTransformation class is an extension of the -@@ -133,35 +138,76 @@ public: - transferring finite element degrees of freedom between different meshes. - For examples of its use see the TransferMap used by the SubMesh class. - */ --class DofTransformation : virtual public StatelessDofTransformation -+class DofTransformation - { - protected: -- Array Fo; -- -- DofTransformation(int size) -- : StatelessDofTransformation(size) {} -+ Array Fo_; -+ const StatelessDofTransformation * dof_trans_; -+ int vdim_; -+ int ordering_; - - public: -+ /** @brief Default constructor which requires that SetDofTransformation be -+ called before use. */ -+ DofTransformation(int vdim = 1, int ordering = 0) -+ : dof_trans_(NULL) -+ , vdim_(vdim) -+ , ordering_(ordering) -+ {} -+ -+ /// Constructor with a known StatelessDofTransformation -+ DofTransformation(const StatelessDofTransformation & dof_trans, -+ int vdim = 1, int ordering = 0) -+ : dof_trans_(&dof_trans) -+ , vdim_(vdim) -+ , ordering_(ordering) -+ {} - - /** @brief Configure the transformation using face orientations for the - current element. */ - /// The face_orientation array can be obtained from Mesh::GetElementFaces. -- inline void SetFaceOrientations(const Array & face_orientation) -- { Fo = face_orientation; } -+ inline void SetFaceOrientations(const Array & Fo) -+ { Fo_ = Fo; } - -- inline const Array & GetFaceOrientations() const { return Fo; } -+ /// Return the face orientations for the current element -+ inline const Array & GetFaceOrientations() const { return Fo_; } -+ -+ /// Set or change the nested StatelessDofTransformation object -+ inline void SetDofTransformation(const StatelessDofTransformation & dof_trans) -+ { -+ dof_trans_ = &dof_trans; -+ } -+ inline void SetDofTransformation(const StatelessDofTransformation * dof_trans) -+ { -+ dof_trans_ = dof_trans; -+ } -+ -+ /// Return the nested StatelessDofTransformation object -+ inline const StatelessDofTransformation * GetDofTransformation() const -+ { return dof_trans_; } -+ -+ /// Set or change the vdim and ordering parameter -+ inline void SetVDim(int vdim = 1, int ordering = 0) -+ { -+ vdim_ = vdim; -+ ordering_ = ordering; -+ } -+ -+ /// Return the current vdim value -+ inline int GetVDim() const { return vdim_; } - -- using StatelessDofTransformation::TransformPrimal; -- using StatelessDofTransformation::InvTransformPrimal; -- using StatelessDofTransformation::TransformDual; -- using StatelessDofTransformation::InvTransformDual; -+ inline int Size() const { return dof_trans_->Size(); } -+ inline int Height() const { return dof_trans_->Height(); } -+ inline int NumRows() const { return dof_trans_->NumRows(); } -+ inline int Width() const { return dof_trans_->Width(); } -+ inline int NumCols() const { return dof_trans_->NumCols(); } -+ inline bool IsIdentity() const { return dof_trans_->IsIdentity(); } - - /** Transform local DoFs to align with the global DoFs. For example, this - transformation can be used to map the local vector computed by - FiniteElement::Project() to the transformed vector stored within a - GridFunction object. 
*/ -- inline void TransformPrimal(double *v) const -- { TransformPrimal(Fo, v); } -+ void TransformPrimal(double *v) const; - inline void TransformPrimal(Vector &v) const - { TransformPrimal(v.GetData()); } - -@@ -179,21 +225,18 @@ public: - transform the vector obtained using GridFunction::GetSubVector before it - can be used to compute a local interpolation. - */ -- inline void InvTransformPrimal(double *v) const -- { InvTransformPrimal(Fo, v); } -+ void InvTransformPrimal(double *v) const; - inline void InvTransformPrimal(Vector &v) const - { InvTransformPrimal(v.GetData()); } - - /** Transform dual DoFs as computed by a LinearFormIntegrator before summing - into a LinearForm object. */ -- inline void TransformDual(double *v) const -- { TransformDual(Fo, v); } -+ void TransformDual(double *v) const; - inline void TransformDual(Vector &v) const - { TransformDual(v.GetData()); } - - /** Inverse Transform dual DoFs */ -- inline void InvTransformDual(double *v) const -- { InvTransformDual(Fo, v); } -+ void InvTransformDual(double *v) const; - inline void InvTransformDual(Vector &v) const - { InvTransformDual(v.GetData()); } - -@@ -225,8 +268,6 @@ public: - TransformDual(V.GetColumn(c)); - } - } -- -- virtual ~DofTransformation() = default; - }; - - /** Transform a matrix of DoFs entries from different finite element spaces as -@@ -245,145 +286,6 @@ void TransformDual(const DofTransformation *ran_dof_trans, - const DofTransformation *dom_dof_trans, - DenseMatrix &elmat); - --/** The StatelessVDofTransformation class implements a nested transformation -- where an arbitrary StatelessDofTransformation is replicated with a -- vdim >= 1. --*/ --class StatelessVDofTransformation : virtual public StatelessDofTransformation --{ --protected: -- int vdim_; -- int ordering_; -- StatelessDofTransformation * sdoftrans_; -- --public: -- /** @brief Default constructor which requires that SetDofTransformation be -- called before use. */ -- StatelessVDofTransformation(int vdim = 1, int ordering = 0) -- : StatelessDofTransformation(0) -- , vdim_(vdim) -- , ordering_(ordering) -- , sdoftrans_(NULL) -- {} -- -- /// Constructor with a known StatelessDofTransformation -- StatelessVDofTransformation(StatelessDofTransformation & doftrans, -- int vdim = 1, -- int ordering = 0) -- : StatelessDofTransformation(vdim * doftrans.Size()) -- , vdim_(vdim) -- , ordering_(ordering) -- , sdoftrans_(&doftrans) -- {} -- -- /// Set or change the vdim parameter -- inline void SetVDim(int vdim) -- { -- vdim_ = vdim; -- if (sdoftrans_) -- { -- size_ = vdim_ * sdoftrans_->Size(); -- } -- } -- -- /// Return the current vdim value -- inline int GetVDim() const { return vdim_; } -- -- /// Set or change the nested StatelessDofTransformation object -- inline void SetDofTransformation(StatelessDofTransformation & doftrans) -- { -- size_ = vdim_ * doftrans.Size(); -- sdoftrans_ = &doftrans; -- } -- -- /// Return the nested StatelessDofTransformation object -- inline StatelessDofTransformation * GetDofTransformation() const -- { return sdoftrans_; } -- -- using StatelessDofTransformation::TransformPrimal; -- using StatelessDofTransformation::InvTransformPrimal; -- using StatelessDofTransformation::TransformDual; -- using StatelessDofTransformation::InvTransformDual; -- -- /** Specializations of these base class methods which account for the vdim -- and ordering of the full set of DoFs. 
-- */ -- void TransformPrimal(const Array & face_ori, double *v) const; -- void InvTransformPrimal(const Array & face_ori, double *v) const; -- void TransformDual(const Array & face_ori, double *v) const; -- void InvTransformDual(const Array & face_ori, double *v) const; --}; -- --/** The VDofTransformation class implements a nested transformation where an -- arbitrary DofTransformation is replicated with a vdim >= 1. --*/ --class VDofTransformation : public StatelessVDofTransformation, -- public DofTransformation --{ --protected: -- DofTransformation * doftrans_; -- --public: -- /** @brief Default constructor which requires that SetDofTransformation be -- called before use. */ -- VDofTransformation(int vdim = 1, int ordering = 0) -- : StatelessDofTransformation(0) -- , StatelessVDofTransformation(vdim, ordering) -- , DofTransformation(0) -- , doftrans_(NULL) -- {} -- -- /// Constructor with a known DofTransformation -- /// @note The face orientations in @a doftrans will be copied into the -- /// new VDofTransformation object. -- VDofTransformation(DofTransformation & doftrans, int vdim = 1, -- int ordering = 0) -- : StatelessDofTransformation(vdim * doftrans.Size()) -- , StatelessVDofTransformation(doftrans, vdim, ordering) -- , DofTransformation(vdim * doftrans.Size()) -- , doftrans_(&doftrans) -- { -- DofTransformation::SetFaceOrientations(doftrans.GetFaceOrientations()); -- } -- -- using StatelessVDofTransformation::SetDofTransformation; -- -- /// Set or change the nested DofTransformation object -- /// @note The face orientations in @a doftrans will be copied into the -- /// VDofTransformation object. -- void SetDofTransformation(DofTransformation & doftrans) -- { -- doftrans_ = &doftrans; -- StatelessVDofTransformation::SetDofTransformation(doftrans); -- DofTransformation::SetFaceOrientations(doftrans.GetFaceOrientations()); -- } -- -- /// Return the nested DofTransformation object -- inline DofTransformation * GetDofTransformation() const { return doftrans_; } -- -- /// Set new face orientations in both the VDofTransformation and the -- /// DofTransformation contained within (if there is one). -- inline void SetFaceOrientations(const Array & face_orientation) -- { -- DofTransformation::SetFaceOrientations(face_orientation); -- if (doftrans_) { doftrans_->SetFaceOrientations(face_orientation); } -- } -- -- using DofTransformation::TransformPrimal; -- using DofTransformation::InvTransformPrimal; -- using DofTransformation::TransformDual; -- using DofTransformation::InvTransformDual; -- -- inline void TransformPrimal(double *v) const -- { TransformPrimal(Fo, v); } -- inline void InvTransformPrimal(double *v) const -- { InvTransformPrimal(Fo, v); } -- inline void TransformDual(double *v) const -- { TransformDual(Fo, v); } -- inline void InvTransformDual(double *v) const -- { InvTransformDual(Fo, v); } --}; -- - /** Abstract base class for high-order Nedelec spaces on elements with - triangular faces. - -@@ -396,7 +298,7 @@ public: - be accessed as DenseMatrices using the GetFaceTransform() and - GetFaceInverseTransform() methods. 
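As a concrete illustration of the 2x2 face operators referred to here, the static accessors can be applied directly to one pair of interior face DoFs; this mirrors the per-face loops in TransformPrimal()/InvTransformPrimal() above. A sketch only, with an arbitrary orientation value:

#include "mfem.hpp"
using namespace mfem;

int main()
{
   const int ori = 3;                                       // arbitrary face orientation
   const DenseMatrix &T2    = ND_DofTransformation::GetFaceTransform(ori);
   const DenseMatrix &T2inv = ND_DofTransformation::GetFaceInverseTransform(ori);

   double data[2] = {1.0, 2.0};                             // one pair of face DoFs
   Vector v(data, 2), w(2), u(2);
   T2.Mult(v, w);                                           // transformed pair
   T2inv.Mult(w, u);                                        // u recovers v up to round-off
   return 0;
}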
- */ --class ND_StatelessDofTransformation : virtual public StatelessDofTransformation -+class ND_DofTransformation : public StatelessDofTransformation - { - private: - static const double T_data[24]; -@@ -410,8 +312,7 @@ protected: - const int nedges; // number of edges per element - const int nfaces; // number of triangular faces per element - -- ND_StatelessDofTransformation(int size, int order, -- int num_edges, int num_tri_faces); -+ ND_DofTransformation(int size, int order, int num_edges, int num_tri_faces); - - public: - // Return the 2x2 transformation operator for the given face orientation -@@ -421,116 +322,41 @@ public: - static const DenseMatrix & GetFaceInverseTransform(int ori) - { return TInv(ori); } - -- void TransformPrimal(const Array & face_orientation, -- double *v) const; -- -- void InvTransformPrimal(const Array & face_orientation, -- double *v) const; -+ bool IsIdentity() const override { return nfdofs < 2; } - -- void TransformDual(const Array & face_orientation, -- double *v) const; -- -- void InvTransformDual(const Array & face_orientation, -- double *v) const; -+ void TransformPrimal(const Array & Fo, double *v) const override; -+ void InvTransformPrimal(const Array & Fo, double *v) const override; -+ void TransformDual(const Array & Fo, double *v) const override; -+ void InvTransformDual(const Array & Fo, double *v) const override; - }; - - /// Stateless DoF transformation implementation for the Nedelec basis on - /// triangles --class ND_TriStatelessDofTransformation : public ND_StatelessDofTransformation --{ --public: -- ND_TriStatelessDofTransformation(int order) -- : StatelessDofTransformation(order*(order + 2)) -- , ND_StatelessDofTransformation(order*(order + 2), order, 3, 1) -- {} --}; -- --/// DoF transformation implementation for the Nedelec basis on triangles --class ND_TriDofTransformation : public DofTransformation, -- public ND_TriStatelessDofTransformation -+class ND_TriDofTransformation : public ND_DofTransformation - { - public: - ND_TriDofTransformation(int order) -- : StatelessDofTransformation(order*(order + 2)) -- , DofTransformation(order*(order + 2)) -- , ND_TriStatelessDofTransformation(order) -- {} -- -- using DofTransformation::TransformPrimal; -- using DofTransformation::InvTransformPrimal; -- using DofTransformation::TransformDual; -- using DofTransformation::InvTransformDual; -- -- using ND_TriStatelessDofTransformation::TransformPrimal; -- using ND_TriStatelessDofTransformation::InvTransformPrimal; -- using ND_TriStatelessDofTransformation::TransformDual; -- using ND_TriStatelessDofTransformation::InvTransformDual; --}; -- --/// DoF transformation implementation for the Nedelec basis on tetrahedra --class ND_TetStatelessDofTransformation : public ND_StatelessDofTransformation --{ --public: -- ND_TetStatelessDofTransformation(int order) -- : StatelessDofTransformation(order*(order + 2)*(order + 3)/2) -- , ND_StatelessDofTransformation(order*(order + 2)*(order + 3)/2, order, -- 6, 4) -+ : ND_DofTransformation(order*(order + 2), order, 3, 1) - {} - }; - - /// DoF transformation implementation for the Nedelec basis on tetrahedra --class ND_TetDofTransformation : public DofTransformation, -- public ND_TetStatelessDofTransformation -+class ND_TetDofTransformation : public ND_DofTransformation - { - public: - ND_TetDofTransformation(int order) -- : StatelessDofTransformation(order*(order + 2)*(order + 3)/2) -- , DofTransformation(order*(order + 2)*(order + 3)/2) -- , ND_TetStatelessDofTransformation(order) -- {} -- -- using 
DofTransformation::TransformPrimal; -- using DofTransformation::InvTransformPrimal; -- using DofTransformation::TransformDual; -- using DofTransformation::InvTransformDual; -- -- using ND_TetStatelessDofTransformation::TransformPrimal; -- using ND_TetStatelessDofTransformation::InvTransformPrimal; -- using ND_TetStatelessDofTransformation::TransformDual; -- using ND_TetStatelessDofTransformation::InvTransformDual; --}; -- --/// DoF transformation implementation for the Nedelec basis on wedge elements --class ND_WedgeStatelessDofTransformation : public ND_StatelessDofTransformation --{ --public: -- ND_WedgeStatelessDofTransformation(int order) -- : StatelessDofTransformation(3 * order * ((order + 1) * (order + 2))/2) -- , ND_StatelessDofTransformation(3 * order * ((order + 1) * (order + 2))/2, -- order, 9, 2) -+ : ND_DofTransformation(order*(order + 2)*(order + 3)/2, order, 6, 4) - {} - }; - - /// DoF transformation implementation for the Nedelec basis on wedge elements --class ND_WedgeDofTransformation : public DofTransformation, -- public ND_WedgeStatelessDofTransformation -+class ND_WedgeDofTransformation : public ND_DofTransformation - { - public: - ND_WedgeDofTransformation(int order) -- : StatelessDofTransformation(3 * order * ((order + 1) * (order + 2))/2) -- , DofTransformation(3 * order * ((order + 1) * (order + 2))/2) -- , ND_WedgeStatelessDofTransformation(order) -+ : ND_DofTransformation(3 * order * ((order + 1) * (order + 2))/2, -+ order, 9, 2) - {} -- -- using DofTransformation::TransformPrimal; -- using DofTransformation::InvTransformPrimal; -- using DofTransformation::TransformDual; -- using DofTransformation::InvTransformDual; -- -- using ND_WedgeStatelessDofTransformation::TransformPrimal; -- using ND_WedgeStatelessDofTransformation::InvTransformPrimal; -- using ND_WedgeStatelessDofTransformation::TransformDual; -- using ND_WedgeStatelessDofTransformation::InvTransformDual; - }; - - } // namespace mfem -diff --git a/fem/fe/fe_base.hpp b/fem/fe/fe_base.hpp -index f9e31b457..55bfabaac 100644 ---- a/fem/fe/fe_base.hpp -+++ b/fem/fe/fe_base.hpp -@@ -596,7 +596,7 @@ public: - /** @brief Return a DoF transformation object for this particular type of - basis. 
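A brief sketch of the calling pattern this const-qualified accessor implies (assuming this patch): the element now exposes only an immutable, shared transformation, and mutable state such as face orientations lives in a caller-owned DofTransformation.

#include "mfem.hpp"
using namespace mfem;

int main()
{
   ND_TetrahedronElement fe(3);                              // example Nedelec element
   const StatelessDofTransformation *sdt = fe.GetDofTransformation();

   DofTransformation doftrans;                               // caller-owned wrapper
   if (sdt) { doftrans.SetDofTransformation(*sdt); }
   return 0;
}

In particular, the returned pointer can no longer be used to mutate any per-element state; orientations are set on the wrapper instead.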
- */ -- virtual StatelessDofTransformation * GetDofTransformation() const -+ virtual const StatelessDofTransformation *GetDofTransformation() const - { return NULL; } - - /// Deconstruct the FiniteElement -diff --git a/fem/fe/fe_nd.hpp b/fem/fe/fe_nd.hpp -index 231c050a7..c01129aed 100644 ---- a/fem/fe/fe_nd.hpp -+++ b/fem/fe/fe_nd.hpp -@@ -179,7 +179,7 @@ class ND_TetrahedronElement : public VectorFiniteElement - Array dof2tk; - DenseMatrixInverse Ti; - -- mutable ND_TetStatelessDofTransformation doftrans; -+ ND_TetDofTransformation doftrans; - - public: - /// Construct the ND_TetrahedronElement of order @a p -@@ -201,7 +201,7 @@ public: - ElementTransformation &Trans, - DenseMatrix &I) const - { LocalInterpolation_ND(CheckVectorFE(fe), tk, dof2tk, Trans, I); } -- virtual StatelessDofTransformation * GetDofTransformation() const -+ virtual const StatelessDofTransformation *GetDofTransformation() const - { return &doftrans; } - using FiniteElement::Project; - virtual void Project(VectorCoefficient &vc, -@@ -242,7 +242,7 @@ class ND_TriangleElement : public VectorFiniteElement - Array dof2tk; - DenseMatrixInverse Ti; - -- mutable ND_TriStatelessDofTransformation doftrans; -+ ND_TriDofTransformation doftrans; - - public: - /// Construct the ND_TriangleElement of order @a p -@@ -264,7 +264,7 @@ public: - ElementTransformation &Trans, - DenseMatrix &I) const - { LocalInterpolation_ND(CheckVectorFE(fe), tk, dof2tk, Trans, I); } -- virtual StatelessDofTransformation * GetDofTransformation() const -+ virtual const StatelessDofTransformation *GetDofTransformation() const - { return &doftrans; } - using FiniteElement::Project; - virtual void Project(VectorCoefficient &vc, -@@ -346,7 +346,7 @@ private: - #endif - Array dof2tk, t_dof, s_dof; - -- mutable ND_WedgeStatelessDofTransformation doftrans; -+ ND_WedgeDofTransformation doftrans; - - H1_TriangleElement H1TriangleFE; - ND_TriangleElement NDTriangleFE; -@@ -379,7 +379,7 @@ public: - DenseMatrix &I) const - { LocalInterpolation_ND(CheckVectorFE(fe), tk, dof2tk, Trans, I); } - -- virtual StatelessDofTransformation * GetDofTransformation() const -+ virtual const StatelessDofTransformation *GetDofTransformation() const - { return &doftrans; } - - using FiniteElement::Project; -diff --git a/fem/fe_coll.cpp b/fem/fe_coll.cpp -index 2b5ed6f46..6556da637 100644 ---- a/fem/fe_coll.cpp -+++ b/fem/fe_coll.cpp -@@ -2896,7 +2896,7 @@ ND_FECollection::FiniteElementForGeometry(Geometry::Type GeomType) const - } - } - --StatelessDofTransformation * -+const StatelessDofTransformation * - ND_FECollection::DofTransformationForGeometry(Geometry::Type GeomType) const - { - if (!Geometry::IsTensorProduct(GeomType) && this->GetOrder() > 1) -diff --git a/fem/fe_coll.hpp b/fem/fe_coll.hpp -index 96c7921c4..5d7a79dc3 100644 ---- a/fem/fe_coll.hpp -+++ b/fem/fe_coll.hpp -@@ -63,7 +63,7 @@ public: - /** @brief Returns a DoF transformation object compatible with this basis - and geometry type. 
- */ -- virtual StatelessDofTransformation * -+ virtual const StatelessDofTransformation * - DofTransformationForGeometry(Geometry::Type GeomType) const - { return NULL; } - -@@ -483,7 +483,7 @@ public: - int DofForGeometry(Geometry::Type GeomType) const override - { return ND_dof[GeomType]; } - -- StatelessDofTransformation * -+ const StatelessDofTransformation * - DofTransformationForGeometry(Geometry::Type GeomType) const override; - - const int *DofOrderForOrientation(Geometry::Type GeomType, -diff --git a/fem/fespace.cpp b/fem/fespace.cpp -index ecff4e476..660fec17a 100644 ---- a/fem/fespace.cpp -+++ b/fem/fespace.cpp -@@ -63,7 +63,6 @@ FiniteElementSpace::FiniteElementSpace() - elem_dof(NULL), elem_fos(NULL), bdr_elem_dof(NULL), bdr_elem_fos(NULL), - face_dof(NULL), - NURBSext(NULL), own_ext(false), -- DoFTrans(0), VDoFTrans(vdim, ordering), - cP_is_set(false), - Th(Operator::ANY_TYPE), - sequence(0), mesh_sequence(0), orders_changed(false), relaxed_hp(false) -@@ -72,7 +71,6 @@ FiniteElementSpace::FiniteElementSpace() - FiniteElementSpace::FiniteElementSpace(const FiniteElementSpace &orig, - Mesh *mesh_, - const FiniteElementCollection *fec_) -- : VDoFTrans(orig.vdim, orig.ordering) - { - mesh_ = mesh_ ? mesh_ : orig.mesh; - fec_ = fec_ ? fec_ : orig.fec; -@@ -212,7 +210,7 @@ void FiniteElementSpace::GetVDofs(int vd, Array& dofs, int ndofs_) const - } - } - --void FiniteElementSpace::DofsToVDofs (Array &dofs, int ndofs_) const -+void FiniteElementSpace::DofsToVDofs(Array &dofs, int ndofs_) const - { - if (vdim == 1) { return; } - if (ndofs_ < 0) { ndofs_ = this->ndofs; } -@@ -264,7 +262,7 @@ int FiniteElementSpace::DofToVDof(int dof, int vd, int ndofs_) const - } - - // static function --void FiniteElementSpace::AdjustVDofs (Array &vdofs) -+void FiniteElementSpace::AdjustVDofs(Array &vdofs) - { - int n = vdofs.Size(), *vdof = vdofs; - for (int i = 0; i < n; i++) -@@ -277,36 +275,36 @@ void FiniteElementSpace::AdjustVDofs (Array &vdofs) - } - } - -+void FiniteElementSpace::GetElementVDofs(int i, Array &vdofs, -+ DofTransformation &doftrans) const -+{ -+ GetElementDofs(i, vdofs, doftrans); -+ DofsToVDofs(vdofs); -+ doftrans.SetVDim(vdim, ordering); -+} -+ - DofTransformation * - FiniteElementSpace::GetElementVDofs(int i, Array &vdofs) const - { -- DofTransformation * doftrans = GetElementDofs(i, vdofs); -+ DoFTrans.SetDofTransformation(NULL); -+ GetElementVDofs(i, vdofs, DoFTrans); -+ return DoFTrans.GetDofTransformation() ? &DoFTrans : NULL; -+} -+ -+void FiniteElementSpace::GetBdrElementVDofs(int i, Array &vdofs, -+ DofTransformation &doftrans) const -+{ -+ GetBdrElementDofs(i, vdofs, doftrans); - DofsToVDofs(vdofs); -- if (vdim == 1 || doftrans == NULL) -- { -- return doftrans; -- } -- else -- { -- VDoFTrans.SetDofTransformation(*doftrans); -- return &VDoFTrans; -- } -+ doftrans.SetVDim(vdim, ordering); - } - - DofTransformation * - FiniteElementSpace::GetBdrElementVDofs(int i, Array &vdofs) const - { -- DofTransformation * doftrans = GetBdrElementDofs(i, vdofs); -- DofsToVDofs(vdofs); -- if (vdim == 1 || doftrans == NULL) -- { -- return doftrans; -- } -- else -- { -- VDoFTrans.SetDofTransformation(*doftrans); -- return &VDoFTrans; -- } -+ DoFTrans.SetDofTransformation(NULL); -+ GetBdrElementVDofs(i, vdofs, DoFTrans); -+ return DoFTrans.GetDofTransformation() ? 
&DoFTrans : NULL; - } - - void FiniteElementSpace::GetPatchVDofs(int i, Array &vdofs) const -@@ -777,9 +775,9 @@ FiniteElementSpace::H2L_GlobalRestrictionMatrix (FiniteElementSpace *lfes) - return R; - } - --void FiniteElementSpace --::AddDependencies(SparseMatrix& deps, Array& master_dofs, -- Array& slave_dofs, DenseMatrix& I, int skipfirst) -+void FiniteElementSpace::AddDependencies( -+ SparseMatrix& deps, Array& master_dofs, Array& slave_dofs, -+ DenseMatrix& I, int skipfirst) - { - for (int i = skipfirst; i < slave_dofs.Size(); i++) - { -@@ -802,11 +800,9 @@ void FiniteElementSpace - } - } - --void FiniteElementSpace --::AddEdgeFaceDependencies(SparseMatrix &deps, Array &master_dofs, -- const FiniteElement *master_fe, -- Array &slave_dofs, int slave_face, -- const DenseMatrix *pm) const -+void FiniteElementSpace::AddEdgeFaceDependencies( -+ SparseMatrix &deps, Array &master_dofs, const FiniteElement *master_fe, -+ Array &slave_dofs, int slave_face, const DenseMatrix *pm) const - { - // In variable-order spaces in 3D, we need to only constrain interior face - // DOFs (this is done one level up), since edge dependencies can be more -@@ -1533,12 +1529,12 @@ SparseMatrix* FiniteElementSpace::RefinementMatrix(int old_ndofs, - localP); - } - --FiniteElementSpace::RefinementOperator::RefinementOperator --(const FiniteElementSpace* fespace, Table* old_elem_dof, Table* old_elem_fos, -- int old_ndofs) -- : fespace(fespace) -- , old_elem_dof(old_elem_dof) -- , old_elem_fos(old_elem_fos) -+FiniteElementSpace::RefinementOperator::RefinementOperator( -+ const FiniteElementSpace* fespace, Table* old_elem_dof, Table* old_elem_fos, -+ int old_ndofs) -+ : fespace(fespace), -+ old_elem_dof(old_elem_dof), -+ old_elem_fos(old_elem_fos) - { - MFEM_VERIFY(fespace->GetNE() >= old_elem_dof->Size(), - "Previous mesh is not coarser."); -@@ -1553,7 +1549,7 @@ FiniteElementSpace::RefinementOperator::RefinementOperator - fespace->GetLocalRefinementMatrices(elem_geoms[i], localP[elem_geoms[i]]); - } - -- ConstructDoFTrans(); -+ ConstructDoFTransArray(); - } - - FiniteElementSpace::RefinementOperator::RefinementOperator( -@@ -1578,59 +1574,58 @@ FiniteElementSpace::RefinementOperator::RefinementOperator( - old_elem_fos = new Table(*coarse_fes->GetElementToFaceOrientationTable()); - } - -- ConstructDoFTrans(); -+ ConstructDoFTransArray(); - } - - FiniteElementSpace::RefinementOperator::~RefinementOperator() - { - delete old_elem_dof; - delete old_elem_fos; -- for (int i=0; iFEColl(); - if (dynamic_cast(fec_ref)) - { -- const FiniteElement * nd_tri = -+ const FiniteElement *nd_tri = - fec_ref->FiniteElementForGeometry(Geometry::TRIANGLE); - if (nd_tri) - { -- old_DoFTrans[Geometry::TRIANGLE] = -+ old_DoFTransArray[Geometry::TRIANGLE] = - new ND_TriDofTransformation(nd_tri->GetOrder()); - } - -- const FiniteElement * nd_tet = -+ const FiniteElement *nd_tet = - fec_ref->FiniteElementForGeometry(Geometry::TETRAHEDRON); - if (nd_tet) - { -- old_DoFTrans[Geometry::TETRAHEDRON] = -+ old_DoFTransArray[Geometry::TETRAHEDRON] = - new ND_TetDofTransformation(nd_tet->GetOrder()); - } - -- const FiniteElement * nd_pri = -+ const FiniteElement *nd_pri = - fec_ref->FiniteElementForGeometry(Geometry::PRISM); - if (nd_pri) - { -- old_DoFTrans[Geometry::PRISM] = -+ old_DoFTransArray[Geometry::PRISM] = - new ND_WedgeDofTransformation(nd_pri->GetOrder()); - } - } - } - --void FiniteElementSpace::RefinementOperator --::Mult(const Vector &x, Vector &y) const -+void FiniteElementSpace::RefinementOperator::Mult(const Vector &x, -+ Vector &y) 
const - { - Mesh* mesh_ref = fespace->GetMesh(); - const CoarseFineTransformations &trans_ref = -@@ -1662,6 +1657,7 @@ void FiniteElementSpace::RefinementOperator - fespace->DofsToVDofs(vd, vdofs); - old_dofs.Copy(old_vdofs); - fespace->DofsToVDofs(vd, old_vdofs, old_ndofs); -+ - x.GetSubVector(old_vdofs, subX); - lP.Mult(subX, subY); - y.SetSubVector(vdofs, subY); -@@ -1670,40 +1666,30 @@ void FiniteElementSpace::RefinementOperator - else - { - old_elem_fos->GetRow(emb.parent, old_Fo); -- old_DoFTrans[geom]->SetFaceOrientations(old_Fo); -- -- DofTransformation *new_doftrans = NULL; -- VDofTransformation *vdoftrans = -- dynamic_cast(doftrans); -- if (vdoftrans) -- { -- new_doftrans = doftrans; -- doftrans = vdoftrans->GetDofTransformation(); -- } -+ old_DoFTrans.SetDofTransformation(*old_DoFTransArray[geom]); -+ old_DoFTrans.SetFaceOrientations(old_Fo); - -+ doftrans->SetVDim(); - for (int vd = 0; vd < rvdim; vd++) - { - dofs.Copy(vdofs); - fespace->DofsToVDofs(vd, vdofs); - old_dofs.Copy(old_vdofs); - fespace->DofsToVDofs(vd, old_vdofs, old_ndofs); -+ - x.GetSubVector(old_vdofs, subX); -- old_DoFTrans[geom]->InvTransformPrimal(subX); -+ old_DoFTrans.InvTransformPrimal(subX); - lP.Mult(subX, subY); - doftrans->TransformPrimal(subY); - y.SetSubVector(vdofs, subY); - } -- -- if (vdoftrans) -- { -- doftrans = new_doftrans; -- } -+ doftrans->SetVDim(rvdim, fespace->GetOrdering()); - } - } - } - --void FiniteElementSpace::RefinementOperator --::MultTranspose(const Vector &x, Vector &y) const -+void FiniteElementSpace::RefinementOperator::MultTranspose(const Vector &x, -+ Vector &y) const - { - y = 0.0; - -@@ -1727,7 +1713,7 @@ void FiniteElementSpace::RefinementOperator - const Geometry::Type geom = mesh_ref->GetElementBaseGeometry(k); - const DenseMatrix &lP = localP[geom](emb.matrix); - -- DofTransformation * doftrans = fespace->GetElementDofs(k, f_dofs); -+ DofTransformation *doftrans = fespace->GetElementDofs(k, f_dofs); - old_elem_dof->GetRow(emb.parent, c_dofs); - - if (!doftrans) -@@ -1742,7 +1728,6 @@ void FiniteElementSpace::RefinementOperator - fespace->DofsToVDofs(vd, c_vdofs, old_ndofs); - - x.GetSubVector(f_vdofs, subX); -- - for (int p = 0; p < f_dofs.Size(); ++p) - { - if (processed[DecodeDof(f_dofs[p])]) -@@ -1750,7 +1735,6 @@ void FiniteElementSpace::RefinementOperator - subX[p] = 0.0; - } - } -- - lP.MultTranspose(subX, subY); - y.AddElementVector(c_vdofs, subY); - } -@@ -1760,17 +1744,10 @@ void FiniteElementSpace::RefinementOperator - subYt.SetSize(lP.Width()); - - old_elem_fos->GetRow(emb.parent, old_Fo); -- old_DoFTrans[geom]->SetFaceOrientations(old_Fo); -- -- DofTransformation *new_doftrans = NULL; -- VDofTransformation *vdoftrans = -- dynamic_cast(doftrans); -- if (vdoftrans) -- { -- new_doftrans = doftrans; -- doftrans = vdoftrans->GetDofTransformation(); -- } -+ old_DoFTrans.SetDofTransformation(*old_DoFTransArray[geom]); -+ old_DoFTrans.SetFaceOrientations(old_Fo); - -+ doftrans->SetVDim(); - for (int vd = 0; vd < rvdim; vd++) - { - f_dofs.Copy(f_vdofs); -@@ -1787,16 +1764,11 @@ void FiniteElementSpace::RefinementOperator - subX[p] = 0.0; - } - } -- - lP.MultTranspose(subX, subYt); -- old_DoFTrans[geom]->TransformDual(subYt); -+ old_DoFTrans.TransformDual(subYt); - y.AddElementVector(c_vdofs, subYt); - } -- -- if (vdoftrans) -- { -- doftrans = new_doftrans; -- } -+ doftrans->SetVDim(rvdim, fespace->GetOrdering()); - } - - for (int p = 0; p < f_dofs.Size(); ++p) -@@ -2024,8 +1996,8 @@ FiniteElementSpace::DerefinementOperator::~DerefinementOperator() - delete 
coarse_elem_dof; - } - --void FiniteElementSpace::DerefinementOperator --::Mult(const Vector &x, Vector &y) const -+void FiniteElementSpace::DerefinementOperator::Mult(const Vector &x, -+ Vector &y) const - { - Array c_vdofs, f_vdofs; - Vector loc_x, loc_y; -@@ -2227,7 +2199,7 @@ void FiniteElementSpace::Constructor(Mesh *mesh_, NURBSExtension *NURBSext_, - R_transpose.reset(); - cP_is_set = false; - -- ConstructDoFTrans(); -+ ConstructDoFTransArray(); - } - else - { -@@ -2239,40 +2211,39 @@ void FiniteElementSpace::Constructor(Mesh *mesh_, NURBSExtension *NURBSext_, - BuildElementToDofTable(); - } - --void FiniteElementSpace::ConstructDoFTrans() -+void FiniteElementSpace::ConstructDoFTransArray() - { -- DestroyDoFTrans(); -+ DestroyDoFTransArray(); - -- VDoFTrans.SetVDim(vdim); -- DoFTrans.SetSize(Geometry::NUM_GEOMETRIES); -- for (int i=0; iDimension() < 3) { return; } - if (dynamic_cast(fec)) - { -- const FiniteElement * nd_tri = -+ const FiniteElement *nd_tri = - fec->FiniteElementForGeometry(Geometry::TRIANGLE); - if (nd_tri) - { -- DoFTrans[Geometry::TRIANGLE] = -+ DoFTransArray[Geometry::TRIANGLE] = - new ND_TriDofTransformation(nd_tri->GetOrder()); - } - -- const FiniteElement * nd_tet = -+ const FiniteElement *nd_tet = - fec->FiniteElementForGeometry(Geometry::TETRAHEDRON); - if (nd_tet) - { -- DoFTrans[Geometry::TETRAHEDRON] = -+ DoFTransArray[Geometry::TETRAHEDRON] = - new ND_TetDofTransformation(nd_tet->GetOrder()); - } - -- const FiniteElement * nd_pri = -+ const FiniteElement *nd_pri = - fec->FiniteElementForGeometry(Geometry::PRISM); - if (nd_pri) - { -- DoFTrans[Geometry::PRISM] = -+ DoFTransArray[Geometry::PRISM] = - new ND_WedgeDofTransformation(nd_pri->GetOrder()); - } - } -@@ -2476,7 +2447,7 @@ void FiniteElementSpace::Construct() - - ndofs = nvdofs + nedofs + nfdofs + nbdofs; - -- ConstructDoFTrans(); -+ ConstructDoFTransArray(); - - // record the current mesh sequence number to detect refinement etc. 
- mesh_sequence = mesh->GetSequence(); -@@ -2501,9 +2472,8 @@ int FiniteElementSpace::MinOrder(VarOrderBits bits) - return 0; - } - --void FiniteElementSpace --::CalcEdgeFaceVarOrders(Array &edge_orders, -- Array &face_orders) const -+void FiniteElementSpace::CalcEdgeFaceVarOrders( -+ Array &edge_orders, Array &face_orders) const - { - MFEM_ASSERT(IsVariableOrder(), ""); - MFEM_ASSERT(Nonconforming(), ""); -@@ -2727,8 +2697,8 @@ int FiniteElementSpace::GetNVariants(int entity, int index) const - static const char* msg_orders_changed = - "Element orders changed, you need to Update() the space first."; - --DofTransformation * --FiniteElementSpace::GetElementDofs(int elem, Array &dofs) const -+void FiniteElementSpace::GetElementDofs(int elem, Array &dofs, -+ DofTransformation &doftrans) const - { - MFEM_VERIFY(!orders_changed, msg_orders_changed); - -@@ -2736,13 +2706,16 @@ FiniteElementSpace::GetElementDofs(int elem, Array &dofs) const - { - elem_dof->GetRow(elem, dofs); - -- if (DoFTrans[mesh->GetElementBaseGeometry(elem)]) -+ if (DoFTransArray[mesh->GetElementBaseGeometry(elem)]) - { - Array Fo; - elem_fos -> GetRow (elem, Fo); -- DoFTrans[mesh->GetElementBaseGeometry(elem)]->SetFaceOrientations(Fo); -+ doftrans.SetDofTransformation( -+ *DoFTransArray[mesh->GetElementBaseGeometry(elem)]); -+ doftrans.SetFaceOrientations(Fo); -+ doftrans.SetVDim(); - } -- return DoFTrans[mesh->GetElementBaseGeometry(elem)]; -+ return; - } - - Array V, E, Eo, F, Fo; // TODO: LocalArray -@@ -2766,10 +2739,12 @@ FiniteElementSpace::GetElementDofs(int elem, Array &dofs) const - { - nfd += fec->GetNumDof(mesh->GetFaceGeometry(F[i]), order); - } -- if (DoFTrans[mesh->GetElementBaseGeometry(elem)]) -+ if (DoFTransArray[mesh->GetElementBaseGeometry(elem)]) - { -- DoFTrans[mesh->GetElementBaseGeometry(elem)] -- -> SetFaceOrientations(Fo); -+ doftrans.SetDofTransformation( -+ *DoFTransArray[mesh->GetElementBaseGeometry(elem)]); -+ doftrans.SetFaceOrientations(Fo); -+ doftrans.SetVDim(); - } - } - -@@ -2828,54 +2803,18 @@ FiniteElementSpace::GetElementDofs(int elem, Array &dofs) const - dofs.Append(bbase + j); - } - } -- return DoFTrans[mesh->GetElementBaseGeometry(elem)]; - } - --void FiniteElementSpace::GetPatchDofs(int patch, Array &dofs) const -+DofTransformation *FiniteElementSpace::GetElementDofs(int elem, -+ Array &dofs) const - { -- MFEM_ASSERT(NURBSext, -- "FiniteElementSpace::GetPatchDofs needs a NURBSExtension"); -- NURBSext->GetPatchDofs(patch, dofs); -+ DoFTrans.SetDofTransformation(NULL); -+ GetElementDofs(elem, dofs, DoFTrans); -+ return DoFTrans.GetDofTransformation() ? 
&DoFTrans : NULL; - } - --const FiniteElement *FiniteElementSpace::GetFE(int i) const --{ -- if (i < 0 || i >= mesh->GetNE()) -- { -- if (mesh->GetNE() == 0) -- { -- MFEM_ABORT("Empty MPI partitions are not permitted!"); -- } -- MFEM_ABORT("Invalid element id:" << i << "; minimum allowed:" << 0 << -- ", maximum allowed:" << mesh->GetNE()-1); -- } -- -- const FiniteElement *FE = -- fec->GetFE(mesh->GetElementGeometry(i), GetElementOrderImpl(i)); -- -- if (NURBSext) -- { -- NURBSext->LoadFE(i, FE); -- } -- else -- { --#ifdef MFEM_DEBUG -- // consistency check: fec->GetOrder() and FE->GetOrder() should return -- // the same value (for standard, constant-order spaces) -- if (!IsVariableOrder() && FE->GetDim() > 0) -- { -- MFEM_ASSERT(FE->GetOrder() == fec->GetOrder(), -- "internal error: " << -- FE->GetOrder() << " != " << fec->GetOrder()); -- } --#endif -- } -- -- return FE; --} -- --DofTransformation * --FiniteElementSpace::GetBdrElementDofs(int bel, Array &dofs) const -+void FiniteElementSpace::GetBdrElementDofs(int bel, Array &dofs, -+ DofTransformation &doftrans) const - { - MFEM_VERIFY(!orders_changed, msg_orders_changed); - -@@ -2883,17 +2822,19 @@ FiniteElementSpace::GetBdrElementDofs(int bel, Array &dofs) const - { - bdr_elem_dof->GetRow(bel, dofs); - -- if (DoFTrans[mesh->GetBdrElementBaseGeometry(bel)]) -+ if (DoFTransArray[mesh->GetBdrElementBaseGeometry(bel)]) - { - Array Fo; - bdr_elem_fos -> GetRow (bel, Fo); -- DoFTrans[mesh->GetBdrElementBaseGeometry(bel)]-> -- SetFaceOrientations(Fo); -+ doftrans.SetDofTransformation( -+ *DoFTransArray[mesh->GetBdrElementBaseGeometry(bel)]); -+ doftrans.SetFaceOrientations(Fo); -+ doftrans.SetVDim(); - } -- return DoFTrans[mesh->GetBdrElementBaseGeometry(bel)]; -+ return; - } - -- Array V, E, Eo, Fo; // TODO: LocalArray -+ Array V, E, Eo; // TODO: LocalArray - int F, oF; - - int dim = mesh->Dimension(); -@@ -2917,11 +2858,14 @@ FiniteElementSpace::GetBdrElementDofs(int bel, Array &dofs) const - { - mesh->GetBdrElementFace(bel, &F, &oF); - -- if (DoFTrans[mesh->GetBdrElementBaseGeometry(bel)]) -+ if (DoFTransArray[mesh->GetBdrElementBaseGeometry(bel)]) - { -- Fo.Append(oF); -- DoFTrans[mesh->GetBdrElementBaseGeometry(bel)]-> -- SetFaceOrientations(Fo); -+ mfem::Array Fo(1); -+ Fo[0] = oF; -+ doftrans.SetDofTransformation( -+ *DoFTransArray[mesh->GetBdrElementBaseGeometry(bel)]); -+ doftrans.SetFaceOrientations(Fo); -+ doftrans.SetVDim(); - } - } - -@@ -2963,8 +2907,14 @@ FiniteElementSpace::GetBdrElementDofs(int bel, Array &dofs) const - dofs.Append(EncodeDof(nvdofs + nedofs + fbase, ind[j])); - } - } -+} - -- return DoFTrans[mesh->GetBdrElementBaseGeometry(bel)]; -+DofTransformation *FiniteElementSpace::GetBdrElementDofs(int bel, -+ Array &dofs) const -+{ -+ DoFTrans.SetDofTransformation(NULL); -+ GetBdrElementDofs(bel, dofs, DoFTrans); -+ return DoFTrans.GetDofTransformation() ? 
&DoFTrans : NULL; - } - - int FiniteElementSpace::GetFaceDofs(int face, Array &dofs, -@@ -3134,18 +3084,6 @@ int FiniteElementSpace::GetNumElementInteriorDofs(int i) const - GetElementOrderImpl(i)); - } - --void FiniteElementSpace::GetEdgeInteriorDofs(int i, Array &dofs) const --{ -- MFEM_VERIFY(!IsVariableOrder(), "not implemented"); -- -- int ne = fec->DofForGeometry(Geometry::SEGMENT); -- dofs.SetSize (ne); -- for (int j = 0, k = nvdofs+i*ne; j < ne; j++, k++) -- { -- dofs[j] = k; -- } --} -- - void FiniteElementSpace::GetFaceInteriorDofs(int i, Array &dofs) const - { - MFEM_VERIFY(!IsVariableOrder(), "not implemented"); -@@ -3170,6 +3108,61 @@ void FiniteElementSpace::GetFaceInteriorDofs(int i, Array &dofs) const - } - } - -+void FiniteElementSpace::GetEdgeInteriorDofs(int i, Array &dofs) const -+{ -+ MFEM_VERIFY(!IsVariableOrder(), "not implemented"); -+ -+ int ne = fec->DofForGeometry(Geometry::SEGMENT); -+ dofs.SetSize (ne); -+ for (int j = 0, k = nvdofs+i*ne; j < ne; j++, k++) -+ { -+ dofs[j] = k; -+ } -+} -+ -+void FiniteElementSpace::GetPatchDofs(int patch, Array &dofs) const -+{ -+ MFEM_ASSERT(NURBSext, -+ "FiniteElementSpace::GetPatchDofs needs a NURBSExtension"); -+ NURBSext->GetPatchDofs(patch, dofs); -+} -+ -+const FiniteElement *FiniteElementSpace::GetFE(int i) const -+{ -+ if (i < 0 || i >= mesh->GetNE()) -+ { -+ if (mesh->GetNE() == 0) -+ { -+ MFEM_ABORT("Empty MPI partitions are not permitted!"); -+ } -+ MFEM_ABORT("Invalid element id:" << i << "; minimum allowed:" << 0 << -+ ", maximum allowed:" << mesh->GetNE()-1); -+ } -+ -+ const FiniteElement *FE = -+ fec->GetFE(mesh->GetElementGeometry(i), GetElementOrderImpl(i)); -+ -+ if (NURBSext) -+ { -+ NURBSext->LoadFE(i, FE); -+ } -+ else -+ { -+#ifdef MFEM_DEBUG -+ // consistency check: fec->GetOrder() and FE->GetOrder() should return -+ // the same value (for standard, constant-order spaces) -+ if (!IsVariableOrder() && FE->GetDim() > 0) -+ { -+ MFEM_ASSERT(FE->GetOrder() == fec->GetOrder(), -+ "internal error: " << -+ FE->GetOrder() << " != " << fec->GetOrder()); -+ } -+#endif -+ } -+ -+ return FE; -+} -+ - const FiniteElement *FiniteElementSpace::GetBE(int i) const - { - int order = fec->GetOrder(); -@@ -3242,8 +3235,8 @@ const FiniteElement *FiniteElementSpace::GetEdgeElement(int i, - return fec->GetFE(Geometry::SEGMENT, eo); - } - --const FiniteElement *FiniteElementSpace --::GetTraceElement(int i, Geometry::Type geom_type) const -+const FiniteElement *FiniteElementSpace::GetTraceElement( -+ int i, Geometry::Type geom_type) const - { - return fec->TraceFiniteElementForGeometry(geom_type); - } -@@ -3283,7 +3276,7 @@ void FiniteElementSpace::Destroy() - } - E2BFQ_array.SetSize(0); - -- DestroyDoFTrans(); -+ DestroyDoFTransArray(); - - dof_elem_array.DeleteAll(); - dof_ldof_array.DeleteAll(); -@@ -3301,19 +3294,18 @@ void FiniteElementSpace::Destroy() - delete bdr_elem_dof; - delete bdr_elem_fos; - delete face_dof; -- - delete [] bdofs; - } - ceed::RemoveBasisAndRestriction(this); - } - --void FiniteElementSpace::DestroyDoFTrans() -+void FiniteElementSpace::DestroyDoFTransArray() - { -- for (int i = 0; i < DoFTrans.Size(); i++) -+ for (int i = 0; i < DoFTransArray.Size(); i++) - { -- delete DoFTrans[i]; -+ delete DoFTransArray[i]; - } -- DoFTrans.SetSize(0); -+ DoFTransArray.SetSize(0); - } - - void FiniteElementSpace::GetTransferOperator( -diff --git a/fem/fespace.hpp b/fem/fespace.hpp -index bcff9a6be..0fd44b613 100644 ---- a/fem/fespace.hpp -+++ b/fem/fespace.hpp -@@ -271,8 +271,8 @@ protected: - int own_ext; - mutable 
Array face_to_be; // NURBS FE space only - -- Array DoFTrans; -- mutable VDofTransformation VDoFTrans; -+ Array DoFTransArray; -+ mutable DofTransformation DoFTrans; - - /** Matrix representing the prolongation from the global conforming dofs to - a set of intermediate partially conforming dofs, e.g. the dofs associated -@@ -328,8 +328,8 @@ protected: - void Construct(); - void Destroy(); - -- void ConstructDoFTrans(); -- void DestroyDoFTrans(); -+ void ConstructDoFTransArray(); -+ void DestroyDoFTransArray(); - - void BuildElementToDofTable() const; - void BuildBdrElementToDofTable() const; -@@ -416,10 +416,10 @@ protected: - Table* old_elem_dof; // Owned. - Table* old_elem_fos; // Owned. - -- Array old_DoFTrans; -- mutable VDofTransformation old_VDoFTrans; -+ Array old_DoFTransArray; -+ mutable DofTransformation old_DoFTrans; - -- void ConstructDoFTrans(); -+ void ConstructDoFTransArray(); - - public: - /** Construct the operator based on the elem_dof table of the original -@@ -803,7 +803,16 @@ public: - /// with triangular faces. - /// - /// @note The returned object should NOT be deleted by the caller. -- virtual DofTransformation *GetElementDofs(int elem, Array &dofs) const; -+ DofTransformation *GetElementDofs(int elem, Array &dofs) const; -+ -+ /// @brief The same as GetElementDofs(), but with a user-allocated -+ /// DofTransformation object. @a doftrans must be allocated in advance and -+ /// will be owned by the caller. The user can use the -+ /// DofTransformation::GetDofTransformation method on the returned -+ /// @a doftrans object to detect if the DofTransformation should actually be -+ /// used. -+ virtual void GetElementDofs(int elem, Array &dofs, -+ DofTransformation &doftrans) const; - - /// @brief Returns indices of degrees of freedom for boundary element 'bel'. - /// The returned indices are offsets into an @ref ldof vector. See also -@@ -817,13 +826,16 @@ public: - /// with triangular faces. - /// - /// @note The returned object should NOT be deleted by the caller. -- virtual DofTransformation *GetBdrElementDofs(int bel, -- Array &dofs) const; -+ DofTransformation *GetBdrElementDofs(int bel, Array &dofs) const; - -- /** @brief Returns indices of degrees of freedom for NURBS patch index -- @a patch. Cartesian ordering is used, for the tensor-product degrees of -- freedom. */ -- void GetPatchDofs(int patch, Array &dofs) const; -+ /// @brief The same as GetBdrElementDofs(), but with a user-allocated -+ /// DofTransformation object. @a doftrans must be allocated in advance and -+ /// will be owned by the caller. The user can use the -+ /// DofTransformation::GetDofTransformation method on the returned -+ /// @a doftrans object to detect if the DofTransformation should actually be -+ /// used. -+ virtual void GetBdrElementDofs(int bel, Array &dofs, -+ DofTransformation &doftrans) const; - - /// @brief Returns the indices of the degrees of freedom for the specified - /// face, including the DOFs for the edges and the vertices of the face. -@@ -870,6 +882,13 @@ public: - /// GetElementInteriorVDofs(). - void GetElementInteriorDofs(int i, Array &dofs) const; - -+ /// @brief Returns the number of degrees of freedom associated with the -+ /// interior of the specified element. -+ /// -+ /// See GetElementInteriorDofs() for more information or to obtain the -+ /// relevant indices. -+ int GetNumElementInteriorDofs(int i) const; -+ - /// @brief Returns the indices of the degrees of freedom for the interior - /// of the specified face. 
- /// -@@ -882,13 +901,6 @@ public: - /// GetFaceInteriorVDofs(). - void GetFaceInteriorDofs(int i, Array &dofs) const; - -- /// @brief Returns the number of degrees of freedom associated with the -- /// interior of the specified element. -- /// -- /// See GetElementInteriorDofs() for more information or to obtain the -- /// relevant indices. -- int GetNumElementInteriorDofs(int i) const; -- - /// @brief Returns the indices of the degrees of freedom for the interior - /// of the specified edge. - /// -@@ -897,6 +909,11 @@ public: - void GetEdgeInteriorDofs(int i, Array &dofs) const; - ///@} - -+ /** @brief Returns indices of degrees of freedom for NURBS patch index -+ @a patch. Cartesian ordering is used, for the tensor-product degrees of -+ freedom. */ -+ void GetPatchDofs(int patch, Array &dofs) const; -+ - /// @anchor dof2vdof @name DoF To VDoF Conversion methods - /// These methods convert between local dof and local vector dof using the - /// appropriate relationship based on the Ordering::Type defined in this -@@ -1023,6 +1040,15 @@ public: - /// @note The returned object should NOT be deleted by the caller. - DofTransformation *GetElementVDofs(int i, Array &vdofs) const; - -+ /// @brief The same as GetElementVDofs(), but with a user-allocated -+ /// DofTransformation object. @a doftrans must be allocated in advance and -+ /// will be owned by the caller. The user can use the -+ /// DofTransformation::GetDofTransformation method on the returned -+ /// @a doftrans object to detect if the DofTransformation should actually be -+ /// used. -+ void GetElementVDofs(int i, Array &vdofs, -+ DofTransformation &doftrans) const; -+ - /// @brief Returns indices of degrees of freedom for @a i'th boundary - /// element. - /// The returned indices are offsets into an @ref ldof vector with @b vdim -@@ -1038,6 +1064,15 @@ public: - /// @note The returned object should NOT be deleted by the caller. - DofTransformation *GetBdrElementVDofs(int i, Array &vdofs) const; - -+ /// @brief The same as GetBdrElementVDofs(), but with a user-allocated -+ /// DofTransformation object. @a doftrans must be allocated in advance and -+ /// will be owned by the caller. The user can use the -+ /// DofTransformation::GetDofTransformation method on the returned -+ /// @a doftrans object to detect if the DofTransformation should actually be -+ /// used. -+ void GetBdrElementVDofs(int i, Array &vdofs, -+ DofTransformation &doftrans) const; -+ - /// Returns indices of degrees of freedom in @a vdofs for NURBS patch @a i. 
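To make the intended calling sequence concrete, here is a self-contained sketch of the user-allocated variants documented above. The mesh, order, and data are arbitrary; the API is the one added in this patch.

#include "mfem.hpp"
using namespace mfem;

int main()
{
   Mesh mesh = Mesh::MakeCartesian3D(2, 2, 2, Element::TETRAHEDRON);
   ND_FECollection fec(3, 3);
   FiniteElementSpace fes(&mesh, &fec);

   GridFunction x(&fes);
   x.Randomize(1);

   Array<int> vdofs;
   DofTransformation doftrans;               // allocated once, reused for every element
   Vector el_vec;

   for (int e = 0; e < mesh.GetNE(); e++)
   {
      fes.GetElementVDofs(e, vdofs, doftrans);
      x.GetSubVector(vdofs, el_vec);
      if (doftrans.GetDofTransformation())   // non-NULL => transformation applies
      {
         doftrans.InvTransformPrimal(el_vec);
      }
      // ... use el_vec for local interpolation, element assembly, etc.
   }
   return 0;
}

Reusing one caller-owned DofTransformation per loop avoids touching the shared mutable DoFTrans member that the pointer-returning overloads still rely on.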
- void GetPatchVDofs(int i, Array &vdofs) const; - -diff --git a/fem/pfespace.cpp b/fem/pfespace.cpp -index 81b5bd5b2..76ac230a1 100644 ---- a/fem/pfespace.cpp -+++ b/fem/pfespace.cpp -@@ -466,53 +466,54 @@ void ParFiniteElementSpace::ApplyLDofSigns(Table &el_dof) const - ApplyLDofSigns(all_dofs); - } - --DofTransformation * --ParFiniteElementSpace::GetElementDofs(int i, Array &dofs) const -+void ParFiniteElementSpace::GetElementDofs(int i, Array &dofs, -+ DofTransformation &doftrans) const - { - if (elem_dof) - { - elem_dof->GetRow(i, dofs); - -- if (DoFTrans[mesh->GetElementBaseGeometry(i)]) -+ if (DoFTransArray[mesh->GetElementBaseGeometry(i)]) - { - Array Fo; - elem_fos->GetRow(i, Fo); -- DoFTrans[mesh->GetElementBaseGeometry(i)]->SetFaceOrientations(Fo); -- return DoFTrans[mesh->GetElementBaseGeometry(i)]; -+ doftrans.SetDofTransformation( -+ *DoFTransArray[mesh->GetElementBaseGeometry(i)]); -+ doftrans.SetFaceOrientations(Fo); -+ doftrans.SetVDim(); - } -- return NULL; -+ return; - } -- DofTransformation * doftrans = FiniteElementSpace::GetElementDofs(i, dofs); -+ FiniteElementSpace::GetElementDofs(i, dofs, doftrans); - if (Conforming()) - { - ApplyLDofSigns(dofs); - } -- return doftrans; - } - --DofTransformation * --ParFiniteElementSpace::GetBdrElementDofs(int i, Array &dofs) const -+void ParFiniteElementSpace::GetBdrElementDofs(int i, Array &dofs, -+ DofTransformation &doftrans) const - { - if (bdr_elem_dof) - { - bdr_elem_dof->GetRow(i, dofs); - -- if (DoFTrans[mesh->GetBdrElementBaseGeometry(i)]) -+ if (DoFTransArray[mesh->GetBdrElementBaseGeometry(i)]) - { - Array Fo; -- bdr_elem_fos -> GetRow (i, Fo); -- DoFTrans[mesh->GetBdrElementBaseGeometry(i)]->SetFaceOrientations(Fo); -- return DoFTrans[mesh->GetBdrElementBaseGeometry(i)]; -+ bdr_elem_fos->GetRow(i, Fo); -+ doftrans.SetDofTransformation( -+ *DoFTransArray[mesh->GetBdrElementBaseGeometry(i)]); -+ doftrans.SetFaceOrientations(Fo); -+ doftrans.SetVDim(); - } -- return NULL; -+ return; - } -- DofTransformation * doftrans = -- FiniteElementSpace::GetBdrElementDofs(i, dofs); -+ FiniteElementSpace::GetBdrElementDofs(i, dofs, doftrans); - if (Conforming()) - { - ApplyLDofSigns(dofs); - } -- return doftrans; - } - - int ParFiniteElementSpace::GetFaceDofs(int i, Array &dofs, -@@ -939,8 +940,8 @@ void ParFiniteElementSpace::Build_Dof_TrueDof_Matrix() const // matrix P - } - else if (i_offd[i+1] == i_offd[i] + 2) - { -- const double * T = ND_StatelessDofTransformation -- ::GetFaceTransform(ltori[i]).GetData(); -+ const double *T = -+ ND_DofTransformation::GetFaceTransform(ltori[i]).GetData(); - j_offd[i_offd[i] + 1] = j_offd[i_offd[i]] + 1; - d_offd[i_offd[i]] = T[0]; d_offd[i_offd[i] + 1] = T[2]; - i++; -@@ -1454,31 +1455,30 @@ void ParFiniteElementSpace::ExchangeFaceNbrData() - delete [] requests; - } - --DofTransformation *ParFiniteElementSpace::GetFaceNbrElementVDofs( -- int i, Array &vdofs) const -+void ParFiniteElementSpace::GetFaceNbrElementVDofs( -+ int i, Array &vdofs, DofTransformation &doftrans) const - { - face_nbr_element_dof.GetRow(i, vdofs); - -- DofTransformation *doftrans = NULL; -- Geometry::Type geom = GetFaceNbrFE(i)->GetGeomType(); -- if (DoFTrans[geom]) -+ if (DoFTransArray[GetFaceNbrFE(i)->GetGeomType()]) - { - Array F, Fo; - pmesh->GetFaceNbrElementFaces(pmesh->GetNE() + i, F, Fo); -- doftrans = DoFTrans[geom]; -- doftrans->SetFaceOrientations(Fo); -- } -- if (vdim == 1 || doftrans == NULL) -- { -- return doftrans; -- } -- else -- { -- VDoFTrans.SetDofTransformation(*doftrans); -- return &VDoFTrans; -+ 
doftrans.SetDofTransformation( -+ *DoFTransArray[GetFaceNbrFE(i)->GetGeomType()]); -+ doftrans.SetFaceOrientations(Fo); -+ doftrans.SetVDim(vdim, ordering); - } - } - -+DofTransformation *ParFiniteElementSpace::GetFaceNbrElementVDofs( -+ int i, Array &vdofs) const -+{ -+ DoFTrans.SetDofTransformation(NULL); -+ GetFaceNbrElementVDofs(i, vdofs, DoFTrans); -+ return DoFTrans.GetDofTransformation() ? &DoFTrans : NULL; -+} -+ - void ParFiniteElementSpace::GetFaceNbrFaceVDofs(int i, Array &vdofs) const - { - // Works for NC mesh where 'i' is an index returned by -@@ -2235,19 +2235,13 @@ void NeighborRowMessage::Decode(int rank) - - // This is the second "fundamental unit" used in the transformation. - const auto initial_second_row = second_row; -+ const double *T = -+ ND_DofTransformation::GetFaceTransform(fo).GetData(); - -- const auto T = [&fo]() -- { -- auto T = ND_StatelessDofTransformation::GetFaceTransform(fo); -- T(0,0) -= 1; -- T(1,1) -= 1; -- return T; -- }(); -- -- first_row.AddRow(initial_first_row, T(0,0)); -- first_row.AddRow(initial_second_row, T(0,1)); -- second_row.AddRow(initial_first_row, T(1,0)); -- second_row.AddRow(initial_second_row, T(1,1)); -+ first_row.AddRow(initial_first_row, T[0] - 1.0); -+ first_row.AddRow(initial_second_row, T[2]); -+ second_row.AddRow(initial_first_row, T[1]); -+ second_row.AddRow(initial_second_row, T[3] - 1.0); - - first_row.Collapse(); - second_row.Collapse(); -diff --git a/fem/pfespace.hpp b/fem/pfespace.hpp -index 7c7b49b7e..72029be56 100644 ---- a/fem/pfespace.hpp -+++ b/fem/pfespace.hpp -@@ -281,11 +281,17 @@ public: - /// Return the number of local vector true dofs. - int GetTrueVSize() const override { return ltdof_size; } - -- /// Returns indexes of degrees of freedom in array dofs for i'th element. -- DofTransformation *GetElementDofs(int i, Array &dofs) const override; -- -- /// Returns indexes of degrees of freedom for i'th boundary element. -- DofTransformation *GetBdrElementDofs(int i, Array &dofs) const override; -+ /// Returns indexes of degrees of freedom in array dofs for i'th element and -+ /// returns the DofTransformation data in a user-provided object. -+ using FiniteElementSpace::GetElementDofs; -+ void GetElementDofs(int i, Array &dofs, -+ DofTransformation &doftrans) const override; -+ -+ /// Returns indexes of degrees of freedom for i'th boundary element and -+ /// returns the DofTransformation data in a user-provided object. -+ using FiniteElementSpace::GetBdrElementDofs; -+ void GetBdrElementDofs(int i, Array &dofs, -+ DofTransformation &doftrans) const override; - - /** Returns the indexes of the degrees of freedom for i'th face - including the dofs for the edges and the vertices of the face. 
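One detail worth noting in the declarations above is the pair of using-declarations (using FiniteElementSpace::GetElementDofs; and using FiniteElementSpace::GetBdrElementDofs;): without them, the new overrides taking a DofTransformation& would hide the pointer-returning base-class overloads from callers of ParFiniteElementSpace. A generic, non-mfem illustration of this C++ name-hiding rule:

struct Base
{
   void Get(int i) const {}                                // overload 1
   void Get(int i, double &out) const { out = i; }         // overload 2
};

struct Derived : Base
{
   using Base::Get;                                        // re-expose the base overloads
   void Get(int i, double &out) const { out = 2.0 * i; }   // replaces overload 2
};

int main()
{
   Derived d;
   double out;
   d.Get(3);        // resolves to Base::Get(int) only because of "using Base::Get;"
   d.Get(3, out);   // resolves to Derived::Get(int, double&)
   return 0;
}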
*/ -@@ -379,6 +385,8 @@ public: - // Face-neighbor functions - void ExchangeFaceNbrData(); - int GetFaceNbrVSize() const { return num_face_nbr_dofs; } -+ void GetFaceNbrElementVDofs(int i, Array &vdofs, -+ DofTransformation &doftrans) const; - DofTransformation *GetFaceNbrElementVDofs(int i, Array &vdofs) const; - void GetFaceNbrFaceVDofs(int i, Array &vdofs) const; - const FiniteElement *GetFaceNbrFE(int i) const; -diff --git a/mesh/mesh.cpp b/mesh/mesh.cpp -index 600f2fc2a..a8ec98649 100644 ---- a/mesh/mesh.cpp -+++ b/mesh/mesh.cpp -@@ -384,6 +384,12 @@ void Mesh::GetElementTransformation(int i, IsoparametricTransformation *ElTr) - } - } - -+ElementTransformation *Mesh::GetElementTransformation(int i) -+{ -+ GetElementTransformation(i, &Transformation); -+ return &Transformation; -+} -+ - void Mesh::GetElementTransformation(int i, const Vector &nodes, - IsoparametricTransformation *ElTr) - { -@@ -428,19 +434,6 @@ void Mesh::GetElementTransformation(int i, const Vector &nodes, - } - } - --ElementTransformation *Mesh::GetElementTransformation(int i) --{ -- GetElementTransformation(i, &Transformation); -- -- return &Transformation; --} -- --ElementTransformation *Mesh::GetBdrElementTransformation(int i) --{ -- GetBdrElementTransformation(i, &BdrTransformation); -- return &BdrTransformation; --} -- - void Mesh::GetBdrElementTransformation(int i, IsoparametricTransformation* ElTr) - { - ElTr->Attribute = GetBdrAttribute(i); -@@ -502,6 +495,12 @@ void Mesh::GetBdrElementTransformation(int i, IsoparametricTransformation* ElTr) - } - } - -+ElementTransformation *Mesh::GetBdrElementTransformation(int i) -+{ -+ GetBdrElementTransformation(i, &BdrTransformation); -+ return &BdrTransformation; -+} -+ - void Mesh::GetFaceTransformation(int FaceNo, IsoparametricTransformation *FTr) - { - FTr->Attribute = (Dim == 1) ? 1 : faces[FaceNo]->GetAttribute(); -diff --git a/mesh/submesh/ptransfermap.cpp b/mesh/submesh/ptransfermap.cpp -index d7c4334cc..7e2324668 100644 ---- a/mesh/submesh/ptransfermap.cpp -+++ b/mesh/submesh/ptransfermap.cpp -@@ -317,8 +317,7 @@ ParTransferMap::CorrectFaceOrientations(const ParFiniteElementSpace &fes, - - if (parent_face_ori.Size() == 0) { return; } - -- VDofTransformation vdoftrans(fes.GetVDim(), -- fes.GetOrdering()); -+ DofTransformation doftrans(fes.GetVDim(), fes.GetOrdering()); - - int dim = mesh->Dimension(); - bool face = (dim == 3); -@@ -334,15 +333,11 @@ ParTransferMap::CorrectFaceOrientations(const ParFiniteElementSpace &fes, - Geometry::Type geom = face ? 
mesh->GetFaceGeometry(i) : - mesh->GetElementGeometry(i); - -- StatelessDofTransformation * doftrans = -- fec->DofTransformationForGeometry(geom); -- -- if (doftrans == NULL) { continue; } -- -- vdoftrans.SetDofTransformation(*doftrans); -+ if (!fec->DofTransformationForGeometry(geom)) { continue; } -+ doftrans.SetDofTransformation(*fec->DofTransformationForGeometry(geom)); - - Fo[0] = parent_face_ori[i]; -- vdoftrans.SetFaceOrientations(Fo); -+ doftrans.SetFaceOrientations(Fo); - - if (face) - { -@@ -356,12 +351,12 @@ ParTransferMap::CorrectFaceOrientations(const ParFiniteElementSpace &fes, - if (sub_to_parent_map) - { - src.GetSubVector(vdofs, face_vector); -- vdoftrans.TransformPrimal(face_vector); -+ doftrans.TransformPrimal(face_vector); - } - else - { - dst.GetSubVector(vdofs, face_vector); -- vdoftrans.InvTransformPrimal(face_vector); -+ doftrans.InvTransformPrimal(face_vector); - } - - for (int j = 0; j < vdofs.Size(); j++) -diff --git a/mesh/submesh/transfermap.cpp b/mesh/submesh/transfermap.cpp -index c81f0cf77..1ddb8994c 100644 ---- a/mesh/submesh/transfermap.cpp -+++ b/mesh/submesh/transfermap.cpp -@@ -241,8 +241,7 @@ void TransferMap::CorrectFaceOrientations(const FiniteElementSpace &fes, - - if (parent_face_ori.Size() == 0) { return; } - -- VDofTransformation vdoftrans(fes.GetVDim(), -- fes.GetOrdering()); -+ DofTransformation doftrans(fes.GetVDim(), fes.GetOrdering()); - - int dim = mesh->Dimension(); - bool face = (dim == 3); -@@ -258,15 +257,11 @@ void TransferMap::CorrectFaceOrientations(const FiniteElementSpace &fes, - Geometry::Type geom = face ? mesh->GetFaceGeometry(i) : - mesh->GetElementGeometry(i); - -- StatelessDofTransformation * doftrans = -- fec->DofTransformationForGeometry(geom); -- -- if (doftrans == NULL) { continue; } -- -- vdoftrans.SetDofTransformation(*doftrans); -+ if (!fec->DofTransformationForGeometry(geom)) { continue; } -+ doftrans.SetDofTransformation(*fec->DofTransformationForGeometry(geom)); - - Fo[0] = parent_face_ori[i]; -- vdoftrans.SetFaceOrientations(Fo); -+ doftrans.SetFaceOrientations(Fo); - - if (face) - { -@@ -280,12 +275,12 @@ void TransferMap::CorrectFaceOrientations(const FiniteElementSpace &fes, - if (sub_to_parent_map) - { - src.GetSubVector(vdofs, face_vector); -- vdoftrans.TransformPrimal(face_vector); -+ doftrans.TransformPrimal(face_vector); - } - else - { - dst.GetSubVector(vdofs, face_vector); -- vdoftrans.InvTransformPrimal(face_vector); -+ doftrans.InvTransformPrimal(face_vector); - } - - for (int j = 0; j < vdofs.Size(); j++) -diff --git a/tests/unit/fem/test_doftrans.cpp b/tests/unit/fem/test_doftrans.cpp -index 4518b4a5a..b65b8724d 100644 ---- a/tests/unit/fem/test_doftrans.cpp -+++ b/tests/unit/fem/test_doftrans.cpp -@@ -22,13 +22,14 @@ TEST_CASE("DoF Transformation Classes", - "[ND_TetDofTransformation]") - { - int p = 4; -+ int vdim = 3; - int seed = 123; - - double tol = 1e-13; - - SECTION("Nedelec Tetrahedral Transformations") - { -- ND_TetDofTransformation T(p); -+ ND_TetDofTransformation Tnd(p); - - Array ori(4); - ori[0] = 1; -@@ -36,102 +37,191 @@ TEST_CASE("DoF Transformation Classes", - ori[2] = 5; - ori[3] = 1; - -- T.SetFaceOrientations(ori); -+ SECTION("VDim == 1") -+ { -+ DofTransformation T(Tnd); -+ T.SetFaceOrientations(ori); - -- Vector u(T.Width()); -- Vector v(T.Width()); -- Vector f(T.Width()); -- Vector ut; -- Vector vt; -- Vector ft; -+ Vector u(T.Width()); -+ Vector v(T.Width()); -+ Vector f(T.Width()); -+ Vector ut; -+ Vector vt; -+ Vector ft; - -- u.Randomize(seed); -- v.Randomize(seed+1); -- 
f.Randomize(seed+2); -+ u.Randomize(seed); -+ v.Randomize(seed+1); -+ f.Randomize(seed+2); - -- SECTION("Inverse DoF transformation") -- { -- Vector w; -+ SECTION("Inverse DoF transformation") -+ { -+ Vector w; - -- ut = u; T.TransformPrimal(ut); -- w = ut; T.InvTransformPrimal(w); -+ ut = u; T.TransformPrimal(ut); -+ w = ut; T.InvTransformPrimal(w); - -- w -= u; -+ w -= u; - -- REQUIRE(w.Norml2() < tol * u.Norml2()); -- } -- SECTION("Inverse Dual DoF transformation") -- { -- Vector w; -+ REQUIRE(w.Norml2() < tol * u.Norml2()); -+ } -+ SECTION("Inverse Dual DoF transformation") -+ { -+ Vector w; - -- ut = u; T.TransformDual(ut); -- w = ut; T.InvTransformDual(w); -+ ut = u; T.TransformDual(ut); -+ w = ut; T.InvTransformDual(w); - -- w -= u; -+ w -= u; - -- REQUIRE(w.Norml2() < tol * u.Norml2()); -- } -+ REQUIRE(w.Norml2() < tol * u.Norml2()); -+ } - -- SECTION("Inner product with linear form f(v)") -- { -- vt = v; T.TransformPrimal(vt); -- ft = f; T.TransformDual(ft); -+ SECTION("Inner product with linear form f(v)") -+ { -+ vt = v; T.TransformPrimal(vt); -+ ft = f; T.TransformDual(ft); - -- double fv = f * v; -+ double fv = f * v; - -- REQUIRE(fabs(fv - ft * vt) < tol * fabs(fv)); -- } -+ REQUIRE(fabs(fv - ft * vt) < tol * fabs(fv)); -+ } - -- DenseMatrix A(T.Width()); -- { -- Vector Ac; -- for (int i=0; i 1") - { -- // The matrix A in this case should be regarded as a -- // DiscreteLinearOperator. -- DenseMatrix tA; -- DenseMatrix At; -- DenseMatrix tAt; -+ Vector v(vdim * Tnd.Width()); -+ Vector f(vdim * Tnd.Width()); -+ Vector vt; -+ Vector ft; - -- ft = f; T.TransformDual(ft); -- vt = v; T.TransformPrimal(vt); -+ v.Randomize(seed); -+ f.Randomize(seed+1); - -- At = A; T.TransformDualRows(At); -- tA = A; T.TransformPrimalCols(tA); -- tAt = At; T.TransformPrimalCols(tAt); -+ SECTION("Ordering == byNODES") -+ { -+ DofTransformation T(Tnd, vdim, Ordering::byNODES); -+ T.SetFaceOrientations(ori); -+ -+ SECTION("Inverse DoF transformation") -+ { -+ Vector w; -+ -+ vt = v; T.TransformPrimal(vt); -+ w = vt; T.InvTransformPrimal(w); -+ -+ w -= v; -+ -+ REQUIRE(w.Norml2() < tol * v.Norml2()); -+ } -+ SECTION("Inverse Dual DoF transformation") -+ { -+ Vector w; -+ -+ vt = v; T.TransformDual(vt); -+ w = vt; T.InvTransformDual(w); - -- double fAv = A.InnerProduct(v, f); -+ w -= v; -+ -+ REQUIRE(w.Norml2() < tol * v.Norml2()); -+ } -+ SECTION("Inner product with linear form f(v)") -+ { -+ vt = v; T.TransformPrimal(vt); -+ ft = f; T.TransformDual(ft); -+ -+ double fv = f * v; -+ -+ REQUIRE(fabs(fv - ft * vt) < tol * fabs(fv)); -+ } -+ } -+ SECTION("Ordering == byVDIM") -+ { -+ DofTransformation T(Tnd, vdim, Ordering::byVDIM); -+ T.SetFaceOrientations(ori); - -- REQUIRE(fabs(fAv - At.InnerProduct(vt, f )) < tol * fabs(fAv)); -- REQUIRE(fabs(fAv - tA.InnerProduct(v, ft)) < tol * fabs(fAv)); -- REQUIRE(fabs(fAv - tAt.InnerProduct(vt, ft)) < tol * fabs(fAv)); -+ SECTION("Inverse DoF transformation") -+ { -+ Vector w; -+ -+ vt = v; T.TransformPrimal(vt); -+ w = vt; T.InvTransformPrimal(w); -+ -+ w -= v; -+ -+ REQUIRE(w.Norml2() < tol * v.Norml2()); -+ } -+ SECTION("Inverse Dual DoF transformation") -+ { -+ Vector w; -+ -+ vt = v; T.TransformDual(vt); -+ w = vt; T.InvTransformDual(w); -+ -+ w -= v; -+ -+ REQUIRE(w.Norml2() < tol * v.Norml2()); -+ } -+ SECTION("Inner product with linear form f(v)") -+ { -+ vt = v; T.TransformPrimal(vt); -+ ft = f; T.TransformDual(ft); -+ -+ double fv = f * v; -+ -+ REQUIRE(fabs(fv - ft * vt) < tol * fabs(fv)); -+ } -+ } - } - } - } -@@ -146,8 +236,8 @@ TEST_CASE("DoF 
Transformation Functions", - - double tol = 1e-13; - -- ND_TetDofTransformation Tp(p); -- ND_TetDofTransformation Tq(q); -+ ND_TetDofTransformation Tndp(p); -+ ND_TetDofTransformation Tndq(q); - - Array ori(4); - ori[0] = 1; -@@ -155,6 +245,7 @@ TEST_CASE("DoF Transformation Functions", - ori[2] = 5; - ori[3] = 1; - -+ DofTransformation Tp(Tndp), Tq(Tndq); - Tp.SetFaceOrientations(ori); - Tq.SetFaceOrientations(ori); - -@@ -235,153 +326,4 @@ TEST_CASE("DoF Transformation Functions", - } - } - --TEST_CASE("VDoF Transformation Class", -- "[DofTransformation]" -- "[VDofTransformation]") --{ -- int p = 4; -- int vdim = 3; -- int seed = 123; -- -- double tol = 1e-13; -- -- ND_TetDofTransformation Tnd(p); -- -- Array ori(4); -- ori[0] = 1; -- ori[1] = 3; -- ori[2] = 5; -- ori[3] = 1; -- -- Tnd.SetFaceOrientations(ori); -- -- SECTION("VDim == 1") -- { -- VDofTransformation T(Tnd); -- -- Vector v(T.Width()); -- Vector f(T.Width()); -- Vector vt; -- Vector ft; -- -- v.Randomize(seed); -- f.Randomize(seed+1); -- -- SECTION("Inverse DoF transformation") -- { -- Vector w; -- -- vt = v; T.TransformPrimal(vt); -- w = vt; T.InvTransformPrimal(w); -- -- w -= v; -- -- REQUIRE(w.Norml2() < tol * v.Norml2()); -- } -- SECTION("Inverse Dual DoF transformation") -- { -- Vector w; -- -- vt = v; T.TransformDual(vt); -- w = vt; T.InvTransformDual(w); -- -- w -= v; -- -- REQUIRE(w.Norml2() < tol * v.Norml2()); -- } -- SECTION("Inner product with linear form f(v)") -- { -- vt = v; T.TransformPrimal(vt); -- ft = f; T.TransformDual(ft); -- -- double fv = f * v; -- -- REQUIRE(fabs(fv - ft * vt) < tol * fabs(fv)); -- } -- } -- SECTION("VDim > 1") -- { -- Vector v(vdim * Tnd.Width()); -- Vector f(vdim * Tnd.Width()); -- Vector vt; -- Vector ft; -- -- v.Randomize(seed); -- f.Randomize(seed+1); -- -- SECTION("Ordering == byNODES") -- { -- VDofTransformation T(Tnd, vdim, Ordering::byNODES); -- -- SECTION("Inverse DoF transformation") -- { -- Vector w; -- -- vt = v; T.TransformPrimal(vt); -- w = vt; T.InvTransformPrimal(w); -- -- w -= v; -- -- REQUIRE(w.Norml2() < tol * v.Norml2()); -- } -- SECTION("Inverse Dual DoF transformation") -- { -- Vector w; -- -- vt = v; T.TransformDual(vt); -- w = vt; T.InvTransformDual(w); -- -- w -= v; -- -- REQUIRE(w.Norml2() < tol * v.Norml2()); -- } -- SECTION("Inner product with linear form f(v)") -- { -- vt = v; T.TransformPrimal(vt); -- ft = f; T.TransformDual(ft); -- -- double fv = f * v; -- -- REQUIRE(fabs(fv - ft * vt) < tol * fabs(fv)); -- } -- } -- SECTION("Ordering == byVDIM") -- { -- VDofTransformation T(Tnd, vdim, Ordering::byVDIM); -- -- SECTION("Inverse DoF transformation") -- { -- Vector w; -- -- vt = v; T.TransformPrimal(vt); -- w = vt; T.InvTransformPrimal(w); -- -- w -= v; -- -- REQUIRE(w.Norml2() < tol * v.Norml2()); -- } -- SECTION("Inverse Dual DoF transformation") -- { -- Vector w; -- -- vt = v; T.TransformDual(vt); -- w = vt; T.InvTransformDual(w); -- -- w -= v; -- -- REQUIRE(w.Norml2() < tol * v.Norml2()); -- } -- SECTION("Inner product with linear form f(v)") -- { -- vt = v; T.TransformPrimal(vt); -- ft = f; T.TransformDual(ft); -- -- double fv = f * v; -- -- REQUIRE(fabs(fv - ft * vt) < tol * fabs(fv)); -- } -- } -- } --} -- - } // namespace doftrans +diff --git a/fem/doftrans.cpp b/fem/doftrans.cpp +index 93d5588de..0b4dbcef7 100644 +--- a/fem/doftrans.cpp ++++ b/fem/doftrans.cpp +@@ -14,62 +14,18 @@ + namespace mfem + { + +-void TransformPrimal(const DofTransformation *ran_dof_trans, +- const DofTransformation *dom_dof_trans, +- DenseMatrix &elmat) +-{ +- if 
(ran_dof_trans && dom_dof_trans) +- { +- ran_dof_trans->TransformPrimalCols(elmat); +- dom_dof_trans->TransformDualRows(elmat); +- } +- else if (ran_dof_trans) +- { +- ran_dof_trans->TransformPrimalCols(elmat); +- } +- else if (dom_dof_trans) +- { +- dom_dof_trans->TransformDualRows(elmat); +- } +- else +- { +- // If both transformations are NULL this function should not be called +- } +-} +- +-void TransformDual(const DofTransformation *ran_dof_trans, +- const DofTransformation *dom_dof_trans, +- DenseMatrix &elmat) +-{ +- if (ran_dof_trans && dom_dof_trans) +- { +- ran_dof_trans->TransformDualCols(elmat); +- dom_dof_trans->TransformDualRows(elmat); +- } +- else if (ran_dof_trans) +- { +- ran_dof_trans->TransformDualCols(elmat); +- } +- else if (dom_dof_trans) +- { +- dom_dof_trans->TransformDualRows(elmat); +- } +- else +- { +- // If both transformations are NULL this function should not be called +- } +-} +- +-void StatelessVDofTransformation::TransformPrimal(const Array & face_ori, +- double *v) const ++void DofTransformation::TransformPrimal(double *v) const + { +- int size = sdoftrans_->Size(); ++ MFEM_ASSERT(dof_trans_, ++ "DofTransformation has no local transformation, call " ++ "SetDofTransformation first!"); ++ int size = dof_trans_->Size(); + +- if ((Ordering::Type)ordering_ == Ordering::byNODES || vdim_ == 1) ++ if (vdim_ == 1 || (Ordering::Type)ordering_ == Ordering::byNODES) + { + for (int i=0; iTransformPrimal(face_ori, &v[i*size]); ++ dof_trans_->TransformPrimal(Fo_, &v[i*size]); + } + } + else +@@ -81,7 +37,7 @@ void StatelessVDofTransformation::TransformPrimal(const Array & face_ori, + { + vec(j) = v[j*vdim_+i]; + } +- sdoftrans_->TransformPrimal(face_ori, vec); ++ dof_trans_->TransformPrimal(Fo_, vec); + for (int j=0; j & face_ori, + } + } + +-void StatelessVDofTransformation::InvTransformPrimal( +- const Array & face_ori, +- double *v) const ++void DofTransformation::InvTransformPrimal(double *v) const + { +- int size = sdoftrans_->Height(); ++ MFEM_ASSERT(dof_trans_, ++ "DofTransformation has no local transformation, call " ++ "SetDofTransformation first!"); ++ int size = dof_trans_->Height(); + +- if ((Ordering::Type)ordering_ == Ordering::byNODES) ++ if (vdim_ == 1 || (Ordering::Type)ordering_ == Ordering::byNODES) + { + for (int i=0; iInvTransformPrimal(face_ori, &v[i*size]); ++ dof_trans_->InvTransformPrimal(Fo_, &v[i*size]); + } + } + else +@@ -112,7 +69,7 @@ void StatelessVDofTransformation::InvTransformPrimal( + { + vec(j) = v[j*vdim_+i]; + } +- sdoftrans_->InvTransformPrimal(face_ori, vec); ++ dof_trans_->InvTransformPrimal(Fo_, vec); + for (int j=0; j & face_ori, +- double *v) const ++void DofTransformation::TransformDual(double *v) const + { +- int size = sdoftrans_->Size(); ++ MFEM_ASSERT(dof_trans_, ++ "DofTransformation has no local transformation, call " ++ "SetDofTransformation first!"); ++ int size = dof_trans_->Size(); + +- if ((Ordering::Type)ordering_ == Ordering::byNODES) ++ if (vdim_ == 1 || (Ordering::Type)ordering_ == Ordering::byNODES) + { + for (int i=0; iTransformDual(face_ori, &v[i*size]); ++ dof_trans_->TransformDual(Fo_, &v[i*size]); + } + } + else +@@ -142,7 +101,7 @@ void StatelessVDofTransformation::TransformDual(const Array & face_ori, + { + vec(j) = v[j*vdim_+i]; + } +- sdoftrans_->TransformDual(face_ori, vec); ++ dof_trans_->TransformDual(Fo_, vec); + for (int j=0; j & face_ori, + } + } + +-void StatelessVDofTransformation::InvTransformDual(const Array & face_ori, +- double *v) const ++void DofTransformation::InvTransformDual(double 
*v) const + { +- int size = sdoftrans_->Size(); ++ MFEM_ASSERT(dof_trans_, ++ "DofTransformation has no local transformation, call " ++ "SetDofTransformation first!"); ++ int size = dof_trans_->Size(); + +- if ((Ordering::Type)ordering_ == Ordering::byNODES) ++ if (vdim_ == 1 || (Ordering::Type)ordering_ == Ordering::byNODES) + { + for (int i=0; iInvTransformDual(face_ori, &v[i*size]); ++ dof_trans_->InvTransformDual(Fo_, &v[i*size]); + } + } + else +@@ -172,7 +133,7 @@ void StatelessVDofTransformation::InvTransformDual(const Array & face_ori, + { + vec(j) = v[j*vdim_+i]; + } +- sdoftrans_->InvTransformDual(face_ori, vec); ++ dof_trans_->InvTransformDual(Fo_, vec); + for (int j=0; j & face_ori, + } + } + ++void TransformPrimal(const DofTransformation *ran_dof_trans, ++ const DofTransformation *dom_dof_trans, ++ DenseMatrix &elmat) ++{ ++ // No action if both transformations are NULL ++ if (ran_dof_trans) ++ { ++ ran_dof_trans->TransformPrimalCols(elmat); ++ } ++ if (dom_dof_trans) ++ { ++ dom_dof_trans->TransformDualRows(elmat); ++ } ++} ++ ++void TransformDual(const DofTransformation *ran_dof_trans, ++ const DofTransformation *dom_dof_trans, ++ DenseMatrix &elmat) ++{ ++ // No action if both transformations are NULL ++ if (ran_dof_trans) ++ { ++ ran_dof_trans->TransformDualCols(elmat); ++ } ++ if (dom_dof_trans) ++ { ++ dom_dof_trans->TransformDualRows(elmat); ++ } ++} ++ + // ordering (i0j0, i1j0, i0j1, i1j1), each row is a column major matrix +-const double ND_StatelessDofTransformation::T_data[24] = ++const double ND_DofTransformation::T_data[24] = + { + 1.0, 0.0, 0.0, 1.0, + -1.0, -1.0, 0.0, 1.0, +@@ -192,11 +183,11 @@ const double ND_StatelessDofTransformation::T_data[24] = + 0.0, 1.0, 1.0, 0.0 + }; + +-const DenseTensor ND_StatelessDofTransformation +-::T(const_cast(ND_StatelessDofTransformation::T_data), 2, 2, 6); ++const DenseTensor ND_DofTransformation ++::T(const_cast(ND_DofTransformation::T_data), 2, 2, 6); + + // ordering (i0j0, i1j0, i0j1, i1j1), each row is a column major matrix +-const double ND_StatelessDofTransformation::TInv_data[24] = ++const double ND_DofTransformation::TInv_data[24] = + { + 1.0, 0.0, 0.0, 1.0, + -1.0, -1.0, 0.0, 1.0, +@@ -206,12 +197,11 @@ const double ND_StatelessDofTransformation::TInv_data[24] = + 0.0, 1.0, 1.0, 0.0 + }; + +-const DenseTensor ND_StatelessDofTransformation +-::TInv(const_cast(TInv_data), 2, 2, 6); ++const DenseTensor ND_DofTransformation ++::TInv(const_cast(TInv_data), 2, 2, 6); + +-ND_StatelessDofTransformation::ND_StatelessDofTransformation(int size, int p, +- int num_edges, +- int num_tri_faces) ++ND_DofTransformation::ND_DofTransformation(int size, int p, int num_edges, ++ int num_tri_faces) + : StatelessDofTransformation(size) + , order(p) + , nedofs(p) +@@ -221,18 +211,19 @@ ND_StatelessDofTransformation::ND_StatelessDofTransformation(int size, int p, + { + } + +-void ND_StatelessDofTransformation::TransformPrimal(const Array & Fo, +- double *v) const ++void ND_DofTransformation::TransformPrimal(const Array & Fo, ++ double *v) const + { + // Return immediately when no face DoFs are present +- if (nfdofs < 2) { return; } ++ if (IsIdentity()) { return; } + + MFEM_VERIFY(Fo.Size() >= nfaces, + "Face orientation array is shorter than the number of faces in " +- "ND_StatelessDofTransformation"); ++ "ND_DofTransformation"); + + double data[2]; + Vector v2(data, 2); ++ DenseMatrix T2; + + // Transform face DoFs + for (int f=0; f & Fo, + for (int i=0; i(T.GetData(Fo[f])), 2, 2); ++ T2.Mult(v2, &v[nedges*nedofs + f*nfdofs + 2*i]); + } 
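The transform loops above act on consecutive pairs of interior face DoFs through a 2x2 matrix selected by the face orientation. The same operation written out for one pair, using only the static accessor kept by this patch (the helper name is illustrative):

#include "mfem.hpp"
using namespace mfem;

// Apply the 2x2 ND face transformation for orientation 'ori' (0..5) to one
// pair of face DoFs. The matrix data is column-major, so for
// d = GetFaceTransform(ori).GetData(): d[0]=T(0,0), d[1]=T(1,0),
// d[2]=T(0,1), d[3]=T(1,1).
void TransformFaceDofPair(int ori, double &v0, double &v1)
{
   const DenseMatrix &T = ND_DofTransformation::GetFaceTransform(ori);
   const double t0 = T(0,0)*v0 + T(0,1)*v1;
   const double t1 = T(1,0)*v0 + T(1,1)*v1;
   v0 = t0;
   v1 = t1;
}

In the member functions above the same product is computed with DenseMatrix::Mult (or MultTranspose for the dual variants) on the slice starting at nedges*nedofs + f*nfdofs + 2*i.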
+ } + } + +-void ND_StatelessDofTransformation::InvTransformPrimal(const Array & Fo, +- double *v) const ++void ND_DofTransformation::InvTransformPrimal(const Array & Fo, ++ double *v) const + { + // Return immediately when no face DoFs are present +- if (nfdofs < 2) { return; } ++ if (IsIdentity()) { return; } + + MFEM_VERIFY(Fo.Size() >= nfaces, + "Face orientation array is shorter than the number of faces in " +- "ND_StatelessDofTransformation"); ++ "ND_DofTransformation"); + + double data[2]; + Vector v2(data, 2); ++ DenseMatrix T2Inv; + + // Transform face DoFs + for (int f=0; f & Fo, + for (int i=0; i(TInv.GetData(Fo[f])), 2, 2); ++ T2Inv.Mult(v2, &v[nedges*nedofs + f*nfdofs + 2*i]); + } + } + } + +-void ND_StatelessDofTransformation::TransformDual(const Array & Fo, +- double *v) const ++void ND_DofTransformation::TransformDual(const Array & Fo, double *v) const + { + // Return immediately when no face DoFs are present +- if (nfdofs < 2) { return; } ++ if (IsIdentity()) { return; } + + MFEM_VERIFY(Fo.Size() >= nfaces, + "Face orientation array is shorter than the number of faces in " +- "ND_StatelessDofTransformation"); ++ "ND_DofTransformation"); + + double data[2]; + Vector v2(data, 2); ++ DenseMatrix T2Inv; + + // Transform face DoFs + for (int f=0; f & Fo, + for (int i=0; i(TInv.GetData(Fo[f])), 2, 2); ++ T2Inv.MultTranspose(v2, &v[nedges*nedofs + f*nfdofs + 2*i]); + } + } + } + +-void ND_StatelessDofTransformation::InvTransformDual(const Array & Fo, +- double *v) const ++void ND_DofTransformation::InvTransformDual(const Array & Fo, ++ double *v) const + { + // Return immediately when no face DoFs are present +- if (nfdofs < 2) { return; } ++ if (IsIdentity()) { return; } + + MFEM_VERIFY(Fo.Size() >= nfaces, + "Face orientation array is shorter than the number of faces in " +- "ND_StatelessDofTransformation"); ++ "ND_DofTransformation"); + + double data[2]; + Vector v2(data, 2); ++ DenseMatrix T2; + + // Transform face DoFs + for (int f=0; f & Fo, + for (int i=0; i(T.GetData(Fo[f])), 2, 2); ++ T2.MultTranspose(v2, &v[nedges*nedofs + f*nfdofs + 2*i]); + } + } + } +diff --git a/fem/doftrans.hpp b/fem/doftrans.hpp +index 5111bbb3d..81956bdbf 100644 +--- a/fem/doftrans.hpp ++++ b/fem/doftrans.hpp +@@ -80,6 +80,9 @@ public: + inline int Width() const { return size_; } + inline int NumCols() const { return size_; } + ++ /// If the DofTransformation performs no transformation ++ virtual bool IsIdentity() const = 0; ++ + /** Transform local DoFs to align with the global DoFs. For example, this + transformation can be used to map the local vector computed by + FiniteElement::Project() to the transformed vector stored within a +@@ -115,6 +118,8 @@ public: + inline void InvTransformDual(const Array & face_orientation, + Vector &v) const + { InvTransformDual(face_orientation, v.GetData()); } ++ ++ virtual ~StatelessDofTransformation() = default; + }; + + /** The DofTransformation class is an extension of the +@@ -133,35 +138,76 @@ public: + transferring finite element degrees of freedom between different meshes. + For examples of its use see the TransferMap used by the SubMesh class. + */ +-class DofTransformation : virtual public StatelessDofTransformation ++class DofTransformation + { + protected: +- Array Fo; +- +- DofTransformation(int size) +- : StatelessDofTransformation(size) {} ++ Array Fo_; ++ const StatelessDofTransformation * dof_trans_; ++ int vdim_; ++ int ordering_; + + public: ++ /** @brief Default constructor which requires that SetDofTransformation be ++ called before use. 
*/ ++ DofTransformation(int vdim = 1, int ordering = 0) ++ : dof_trans_(NULL) ++ , vdim_(vdim) ++ , ordering_(ordering) ++ {} ++ ++ /// Constructor with a known StatelessDofTransformation ++ DofTransformation(const StatelessDofTransformation & dof_trans, ++ int vdim = 1, int ordering = 0) ++ : dof_trans_(&dof_trans) ++ , vdim_(vdim) ++ , ordering_(ordering) ++ {} + + /** @brief Configure the transformation using face orientations for the + current element. */ + /// The face_orientation array can be obtained from Mesh::GetElementFaces. +- inline void SetFaceOrientations(const Array & face_orientation) +- { Fo = face_orientation; } ++ inline void SetFaceOrientations(const Array & Fo) ++ { Fo_ = Fo; } + +- inline const Array & GetFaceOrientations() const { return Fo; } ++ /// Return the face orientations for the current element ++ inline const Array & GetFaceOrientations() const { return Fo_; } ++ ++ /// Set or change the nested StatelessDofTransformation object ++ inline void SetDofTransformation(const StatelessDofTransformation & dof_trans) ++ { ++ dof_trans_ = &dof_trans; ++ } ++ inline void SetDofTransformation(const StatelessDofTransformation * dof_trans) ++ { ++ dof_trans_ = dof_trans; ++ } ++ ++ /// Return the nested StatelessDofTransformation object ++ inline const StatelessDofTransformation * GetDofTransformation() const ++ { return dof_trans_; } ++ ++ /// Set or change the vdim and ordering parameter ++ inline void SetVDim(int vdim = 1, int ordering = 0) ++ { ++ vdim_ = vdim; ++ ordering_ = ordering; ++ } ++ ++ /// Return the current vdim value ++ inline int GetVDim() const { return vdim_; } + +- using StatelessDofTransformation::TransformPrimal; +- using StatelessDofTransformation::InvTransformPrimal; +- using StatelessDofTransformation::TransformDual; +- using StatelessDofTransformation::InvTransformDual; ++ inline int Size() const { return dof_trans_->Size(); } ++ inline int Height() const { return dof_trans_->Height(); } ++ inline int NumRows() const { return dof_trans_->NumRows(); } ++ inline int Width() const { return dof_trans_->Width(); } ++ inline int NumCols() const { return dof_trans_->NumCols(); } ++ inline bool IsIdentity() const { return dof_trans_->IsIdentity(); } + + /** Transform local DoFs to align with the global DoFs. For example, this + transformation can be used to map the local vector computed by + FiniteElement::Project() to the transformed vector stored within a + GridFunction object. */ +- inline void TransformPrimal(double *v) const +- { TransformPrimal(Fo, v); } ++ void TransformPrimal(double *v) const; + inline void TransformPrimal(Vector &v) const + { TransformPrimal(v.GetData()); } + +@@ -179,21 +225,18 @@ public: + transform the vector obtained using GridFunction::GetSubVector before it + can be used to compute a local interpolation. + */ +- inline void InvTransformPrimal(double *v) const +- { InvTransformPrimal(Fo, v); } ++ void InvTransformPrimal(double *v) const; + inline void InvTransformPrimal(Vector &v) const + { InvTransformPrimal(v.GetData()); } + + /** Transform dual DoFs as computed by a LinearFormIntegrator before summing + into a LinearForm object. 
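Taken together, the accessors above make DofTransformation a small configure-then-transform object. A sketch of that flow in isolation, assuming the interface introduced in this patch (the helper name and arguments are illustrative); this is what the FiniteElementSpace methods later in the patch do before handing the object back to the caller:

#include "mfem.hpp"
using namespace mfem;

// Configure a DofTransformation for one element and apply it to a local
// vector of length vdim * sdt.Width().
void TransformElementVector(const StatelessDofTransformation &sdt,
                            const Array<int> &face_ori, int vdim, int ordering,
                            Vector &v)
{
   DofTransformation doftrans;             // empty: SetDofTransformation first
   doftrans.SetDofTransformation(sdt);
   doftrans.SetFaceOrientations(face_ori);
   doftrans.SetVDim(vdim, ordering);

   if (!doftrans.IsIdentity())
   {
      doftrans.TransformPrimal(v);
   }
}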
*/ +- inline void TransformDual(double *v) const +- { TransformDual(Fo, v); } ++ void TransformDual(double *v) const; + inline void TransformDual(Vector &v) const + { TransformDual(v.GetData()); } + + /** Inverse Transform dual DoFs */ +- inline void InvTransformDual(double *v) const +- { InvTransformDual(Fo, v); } ++ void InvTransformDual(double *v) const; + inline void InvTransformDual(Vector &v) const + { InvTransformDual(v.GetData()); } + +@@ -225,8 +268,6 @@ public: + TransformDual(V.GetColumn(c)); + } + } +- +- virtual ~DofTransformation() = default; + }; + + /** Transform a matrix of DoFs entries from different finite element spaces as +@@ -245,145 +286,6 @@ void TransformDual(const DofTransformation *ran_dof_trans, + const DofTransformation *dom_dof_trans, + DenseMatrix &elmat); + +-/** The StatelessVDofTransformation class implements a nested transformation +- where an arbitrary StatelessDofTransformation is replicated with a +- vdim >= 1. +-*/ +-class StatelessVDofTransformation : virtual public StatelessDofTransformation +-{ +-protected: +- int vdim_; +- int ordering_; +- StatelessDofTransformation * sdoftrans_; +- +-public: +- /** @brief Default constructor which requires that SetDofTransformation be +- called before use. */ +- StatelessVDofTransformation(int vdim = 1, int ordering = 0) +- : StatelessDofTransformation(0) +- , vdim_(vdim) +- , ordering_(ordering) +- , sdoftrans_(NULL) +- {} +- +- /// Constructor with a known StatelessDofTransformation +- StatelessVDofTransformation(StatelessDofTransformation & doftrans, +- int vdim = 1, +- int ordering = 0) +- : StatelessDofTransformation(vdim * doftrans.Size()) +- , vdim_(vdim) +- , ordering_(ordering) +- , sdoftrans_(&doftrans) +- {} +- +- /// Set or change the vdim parameter +- inline void SetVDim(int vdim) +- { +- vdim_ = vdim; +- if (sdoftrans_) +- { +- size_ = vdim_ * sdoftrans_->Size(); +- } +- } +- +- /// Return the current vdim value +- inline int GetVDim() const { return vdim_; } +- +- /// Set or change the nested StatelessDofTransformation object +- inline void SetDofTransformation(StatelessDofTransformation & doftrans) +- { +- size_ = vdim_ * doftrans.Size(); +- sdoftrans_ = &doftrans; +- } +- +- /// Return the nested StatelessDofTransformation object +- inline StatelessDofTransformation * GetDofTransformation() const +- { return sdoftrans_; } +- +- using StatelessDofTransformation::TransformPrimal; +- using StatelessDofTransformation::InvTransformPrimal; +- using StatelessDofTransformation::TransformDual; +- using StatelessDofTransformation::InvTransformDual; +- +- /** Specializations of these base class methods which account for the vdim +- and ordering of the full set of DoFs. +- */ +- void TransformPrimal(const Array & face_ori, double *v) const; +- void InvTransformPrimal(const Array & face_ori, double *v) const; +- void TransformDual(const Array & face_ori, double *v) const; +- void InvTransformDual(const Array & face_ori, double *v) const; +-}; +- +-/** The VDofTransformation class implements a nested transformation where an +- arbitrary DofTransformation is replicated with a vdim >= 1. +-*/ +-class VDofTransformation : public StatelessVDofTransformation, +- public DofTransformation +-{ +-protected: +- DofTransformation * doftrans_; +- +-public: +- /** @brief Default constructor which requires that SetDofTransformation be +- called before use. 
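For element matrices, the free TransformPrimal/TransformDual helpers shown earlier in this patch accept NULL for either side (the old four-way branch in doftrans.cpp collapses to two independent checks). A hedged sketch of such a call site; the function name and the two spaces are illustrative, and the element matrix is assumed to map domain-space DoFs to range-space DoFs, as in a local interpolation matrix:

#include "mfem.hpp"
using namespace mfem;

// Transform a local range-by-domain matrix with the DofTransformations of the
// two spaces; either pointer may be NULL and is then simply skipped.
void TransformLocalMatrix(const FiniteElementSpace &dom_fes,
                          const FiniteElementSpace &ran_fes,
                          int e, DenseMatrix &elmat)
{
   Array<int> dom_vdofs, ran_vdofs;  // would be used to scatter elmat globally
   DofTransformation dom_dt, ran_dt;
   dom_fes.GetElementVDofs(e, dom_vdofs, dom_dt);
   ran_fes.GetElementVDofs(e, ran_vdofs, ran_dt);

   const DofTransformation *dom = dom_dt.GetDofTransformation() ? &dom_dt : NULL;
   const DofTransformation *ran = ran_dt.GetDofTransformation() ? &ran_dt : NULL;
   TransformPrimal(ran, dom, elmat);
}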
*/ +- VDofTransformation(int vdim = 1, int ordering = 0) +- : StatelessDofTransformation(0) +- , StatelessVDofTransformation(vdim, ordering) +- , DofTransformation(0) +- , doftrans_(NULL) +- {} +- +- /// Constructor with a known DofTransformation +- /// @note The face orientations in @a doftrans will be copied into the +- /// new VDofTransformation object. +- VDofTransformation(DofTransformation & doftrans, int vdim = 1, +- int ordering = 0) +- : StatelessDofTransformation(vdim * doftrans.Size()) +- , StatelessVDofTransformation(doftrans, vdim, ordering) +- , DofTransformation(vdim * doftrans.Size()) +- , doftrans_(&doftrans) +- { +- DofTransformation::SetFaceOrientations(doftrans.GetFaceOrientations()); +- } +- +- using StatelessVDofTransformation::SetDofTransformation; +- +- /// Set or change the nested DofTransformation object +- /// @note The face orientations in @a doftrans will be copied into the +- /// VDofTransformation object. +- void SetDofTransformation(DofTransformation & doftrans) +- { +- doftrans_ = &doftrans; +- StatelessVDofTransformation::SetDofTransformation(doftrans); +- DofTransformation::SetFaceOrientations(doftrans.GetFaceOrientations()); +- } +- +- /// Return the nested DofTransformation object +- inline DofTransformation * GetDofTransformation() const { return doftrans_; } +- +- /// Set new face orientations in both the VDofTransformation and the +- /// DofTransformation contained within (if there is one). +- inline void SetFaceOrientations(const Array & face_orientation) +- { +- DofTransformation::SetFaceOrientations(face_orientation); +- if (doftrans_) { doftrans_->SetFaceOrientations(face_orientation); } +- } +- +- using DofTransformation::TransformPrimal; +- using DofTransformation::InvTransformPrimal; +- using DofTransformation::TransformDual; +- using DofTransformation::InvTransformDual; +- +- inline void TransformPrimal(double *v) const +- { TransformPrimal(Fo, v); } +- inline void InvTransformPrimal(double *v) const +- { InvTransformPrimal(Fo, v); } +- inline void TransformDual(double *v) const +- { TransformDual(Fo, v); } +- inline void InvTransformDual(double *v) const +- { InvTransformDual(Fo, v); } +-}; +- + /** Abstract base class for high-order Nedelec spaces on elements with + triangular faces. + +@@ -396,7 +298,7 @@ public: + be accessed as DenseMatrices using the GetFaceTransform() and + GetFaceInverseTransform() methods. 
+ */ +-class ND_StatelessDofTransformation : virtual public StatelessDofTransformation ++class ND_DofTransformation : public StatelessDofTransformation + { + private: + static const double T_data[24]; +@@ -410,8 +312,7 @@ protected: + const int nedges; // number of edges per element + const int nfaces; // number of triangular faces per element + +- ND_StatelessDofTransformation(int size, int order, +- int num_edges, int num_tri_faces); ++ ND_DofTransformation(int size, int order, int num_edges, int num_tri_faces); + + public: + // Return the 2x2 transformation operator for the given face orientation +@@ -421,116 +322,41 @@ public: + static const DenseMatrix & GetFaceInverseTransform(int ori) + { return TInv(ori); } + +- void TransformPrimal(const Array & face_orientation, +- double *v) const; +- +- void InvTransformPrimal(const Array & face_orientation, +- double *v) const; ++ bool IsIdentity() const override { return nfdofs < 2; } + +- void TransformDual(const Array & face_orientation, +- double *v) const; +- +- void InvTransformDual(const Array & face_orientation, +- double *v) const; ++ void TransformPrimal(const Array & Fo, double *v) const override; ++ void InvTransformPrimal(const Array & Fo, double *v) const override; ++ void TransformDual(const Array & Fo, double *v) const override; ++ void InvTransformDual(const Array & Fo, double *v) const override; + }; + + /// Stateless DoF transformation implementation for the Nedelec basis on + /// triangles +-class ND_TriStatelessDofTransformation : public ND_StatelessDofTransformation +-{ +-public: +- ND_TriStatelessDofTransformation(int order) +- : StatelessDofTransformation(order*(order + 2)) +- , ND_StatelessDofTransformation(order*(order + 2), order, 3, 1) +- {} +-}; +- +-/// DoF transformation implementation for the Nedelec basis on triangles +-class ND_TriDofTransformation : public DofTransformation, +- public ND_TriStatelessDofTransformation ++class ND_TriDofTransformation : public ND_DofTransformation + { + public: + ND_TriDofTransformation(int order) +- : StatelessDofTransformation(order*(order + 2)) +- , DofTransformation(order*(order + 2)) +- , ND_TriStatelessDofTransformation(order) +- {} +- +- using DofTransformation::TransformPrimal; +- using DofTransformation::InvTransformPrimal; +- using DofTransformation::TransformDual; +- using DofTransformation::InvTransformDual; +- +- using ND_TriStatelessDofTransformation::TransformPrimal; +- using ND_TriStatelessDofTransformation::InvTransformPrimal; +- using ND_TriStatelessDofTransformation::TransformDual; +- using ND_TriStatelessDofTransformation::InvTransformDual; +-}; +- +-/// DoF transformation implementation for the Nedelec basis on tetrahedra +-class ND_TetStatelessDofTransformation : public ND_StatelessDofTransformation +-{ +-public: +- ND_TetStatelessDofTransformation(int order) +- : StatelessDofTransformation(order*(order + 2)*(order + 3)/2) +- , ND_StatelessDofTransformation(order*(order + 2)*(order + 3)/2, order, +- 6, 4) ++ : ND_DofTransformation(order*(order + 2), order, 3, 1) + {} + }; + + /// DoF transformation implementation for the Nedelec basis on tetrahedra +-class ND_TetDofTransformation : public DofTransformation, +- public ND_TetStatelessDofTransformation ++class ND_TetDofTransformation : public ND_DofTransformation + { + public: + ND_TetDofTransformation(int order) +- : StatelessDofTransformation(order*(order + 2)*(order + 3)/2) +- , DofTransformation(order*(order + 2)*(order + 3)/2) +- , ND_TetStatelessDofTransformation(order) +- {} +- +- using 
DofTransformation::TransformPrimal; +- using DofTransformation::InvTransformPrimal; +- using DofTransformation::TransformDual; +- using DofTransformation::InvTransformDual; +- +- using ND_TetStatelessDofTransformation::TransformPrimal; +- using ND_TetStatelessDofTransformation::InvTransformPrimal; +- using ND_TetStatelessDofTransformation::TransformDual; +- using ND_TetStatelessDofTransformation::InvTransformDual; +-}; +- +-/// DoF transformation implementation for the Nedelec basis on wedge elements +-class ND_WedgeStatelessDofTransformation : public ND_StatelessDofTransformation +-{ +-public: +- ND_WedgeStatelessDofTransformation(int order) +- : StatelessDofTransformation(3 * order * ((order + 1) * (order + 2))/2) +- , ND_StatelessDofTransformation(3 * order * ((order + 1) * (order + 2))/2, +- order, 9, 2) ++ : ND_DofTransformation(order*(order + 2)*(order + 3)/2, order, 6, 4) + {} + }; + + /// DoF transformation implementation for the Nedelec basis on wedge elements +-class ND_WedgeDofTransformation : public DofTransformation, +- public ND_WedgeStatelessDofTransformation ++class ND_WedgeDofTransformation : public ND_DofTransformation + { + public: + ND_WedgeDofTransformation(int order) +- : StatelessDofTransformation(3 * order * ((order + 1) * (order + 2))/2) +- , DofTransformation(3 * order * ((order + 1) * (order + 2))/2) +- , ND_WedgeStatelessDofTransformation(order) ++ : ND_DofTransformation(3 * order * ((order + 1) * (order + 2))/2, ++ order, 9, 2) + {} +- +- using DofTransformation::TransformPrimal; +- using DofTransformation::InvTransformPrimal; +- using DofTransformation::TransformDual; +- using DofTransformation::InvTransformDual; +- +- using ND_WedgeStatelessDofTransformation::TransformPrimal; +- using ND_WedgeStatelessDofTransformation::InvTransformPrimal; +- using ND_WedgeStatelessDofTransformation::TransformDual; +- using ND_WedgeStatelessDofTransformation::InvTransformDual; + }; + + } // namespace mfem +diff --git a/fem/fe/fe_base.hpp b/fem/fe/fe_base.hpp +index f9e31b457..55bfabaac 100644 +--- a/fem/fe/fe_base.hpp ++++ b/fem/fe/fe_base.hpp +@@ -596,7 +596,7 @@ public: + /** @brief Return a DoF transformation object for this particular type of + basis. 
+ */ +- virtual StatelessDofTransformation * GetDofTransformation() const ++ virtual const StatelessDofTransformation *GetDofTransformation() const + { return NULL; } + + /// Deconstruct the FiniteElement +diff --git a/fem/fe/fe_nd.hpp b/fem/fe/fe_nd.hpp +index 231c050a7..c01129aed 100644 +--- a/fem/fe/fe_nd.hpp ++++ b/fem/fe/fe_nd.hpp +@@ -179,7 +179,7 @@ class ND_TetrahedronElement : public VectorFiniteElement + Array dof2tk; + DenseMatrixInverse Ti; + +- mutable ND_TetStatelessDofTransformation doftrans; ++ ND_TetDofTransformation doftrans; + + public: + /// Construct the ND_TetrahedronElement of order @a p +@@ -201,7 +201,7 @@ public: + ElementTransformation &Trans, + DenseMatrix &I) const + { LocalInterpolation_ND(CheckVectorFE(fe), tk, dof2tk, Trans, I); } +- virtual StatelessDofTransformation * GetDofTransformation() const ++ virtual const StatelessDofTransformation *GetDofTransformation() const + { return &doftrans; } + using FiniteElement::Project; + virtual void Project(VectorCoefficient &vc, +@@ -242,7 +242,7 @@ class ND_TriangleElement : public VectorFiniteElement + Array dof2tk; + DenseMatrixInverse Ti; + +- mutable ND_TriStatelessDofTransformation doftrans; ++ ND_TriDofTransformation doftrans; + + public: + /// Construct the ND_TriangleElement of order @a p +@@ -264,7 +264,7 @@ public: + ElementTransformation &Trans, + DenseMatrix &I) const + { LocalInterpolation_ND(CheckVectorFE(fe), tk, dof2tk, Trans, I); } +- virtual StatelessDofTransformation * GetDofTransformation() const ++ virtual const StatelessDofTransformation *GetDofTransformation() const + { return &doftrans; } + using FiniteElement::Project; + virtual void Project(VectorCoefficient &vc, +@@ -346,7 +346,7 @@ private: + #endif + Array dof2tk, t_dof, s_dof; + +- mutable ND_WedgeStatelessDofTransformation doftrans; ++ ND_WedgeDofTransformation doftrans; + + H1_TriangleElement H1TriangleFE; + ND_TriangleElement NDTriangleFE; +@@ -379,7 +379,7 @@ public: + DenseMatrix &I) const + { LocalInterpolation_ND(CheckVectorFE(fe), tk, dof2tk, Trans, I); } + +- virtual StatelessDofTransformation * GetDofTransformation() const ++ virtual const StatelessDofTransformation *GetDofTransformation() const + { return &doftrans; } + + using FiniteElement::Project; +diff --git a/fem/fe_coll.cpp b/fem/fe_coll.cpp +index 2b5ed6f46..6556da637 100644 +--- a/fem/fe_coll.cpp ++++ b/fem/fe_coll.cpp +@@ -2896,7 +2896,7 @@ ND_FECollection::FiniteElementForGeometry(Geometry::Type GeomType) const + } + } + +-StatelessDofTransformation * ++const StatelessDofTransformation * + ND_FECollection::DofTransformationForGeometry(Geometry::Type GeomType) const + { + if (!Geometry::IsTensorProduct(GeomType) && this->GetOrder() > 1) +diff --git a/fem/fe_coll.hpp b/fem/fe_coll.hpp +index 96c7921c4..5d7a79dc3 100644 +--- a/fem/fe_coll.hpp ++++ b/fem/fe_coll.hpp +@@ -63,7 +63,7 @@ public: + /** @brief Returns a DoF transformation object compatible with this basis + and geometry type. 
+ */ +- virtual StatelessDofTransformation * ++ virtual const StatelessDofTransformation * + DofTransformationForGeometry(Geometry::Type GeomType) const + { return NULL; } + +@@ -483,7 +483,7 @@ public: + int DofForGeometry(Geometry::Type GeomType) const override + { return ND_dof[GeomType]; } + +- StatelessDofTransformation * ++ const StatelessDofTransformation * + DofTransformationForGeometry(Geometry::Type GeomType) const override; + + const int *DofOrderForOrientation(Geometry::Type GeomType, +diff --git a/fem/fespace.cpp b/fem/fespace.cpp +index ecff4e476..660fec17a 100644 +--- a/fem/fespace.cpp ++++ b/fem/fespace.cpp +@@ -63,7 +63,6 @@ FiniteElementSpace::FiniteElementSpace() + elem_dof(NULL), elem_fos(NULL), bdr_elem_dof(NULL), bdr_elem_fos(NULL), + face_dof(NULL), + NURBSext(NULL), own_ext(false), +- DoFTrans(0), VDoFTrans(vdim, ordering), + cP_is_set(false), + Th(Operator::ANY_TYPE), + sequence(0), mesh_sequence(0), orders_changed(false), relaxed_hp(false) +@@ -72,7 +71,6 @@ FiniteElementSpace::FiniteElementSpace() + FiniteElementSpace::FiniteElementSpace(const FiniteElementSpace &orig, + Mesh *mesh_, + const FiniteElementCollection *fec_) +- : VDoFTrans(orig.vdim, orig.ordering) + { + mesh_ = mesh_ ? mesh_ : orig.mesh; + fec_ = fec_ ? fec_ : orig.fec; +@@ -212,7 +210,7 @@ void FiniteElementSpace::GetVDofs(int vd, Array& dofs, int ndofs_) const + } + } + +-void FiniteElementSpace::DofsToVDofs (Array &dofs, int ndofs_) const ++void FiniteElementSpace::DofsToVDofs(Array &dofs, int ndofs_) const + { + if (vdim == 1) { return; } + if (ndofs_ < 0) { ndofs_ = this->ndofs; } +@@ -264,7 +262,7 @@ int FiniteElementSpace::DofToVDof(int dof, int vd, int ndofs_) const + } + + // static function +-void FiniteElementSpace::AdjustVDofs (Array &vdofs) ++void FiniteElementSpace::AdjustVDofs(Array &vdofs) + { + int n = vdofs.Size(), *vdof = vdofs; + for (int i = 0; i < n; i++) +@@ -277,36 +275,36 @@ void FiniteElementSpace::AdjustVDofs (Array &vdofs) + } + } + ++void FiniteElementSpace::GetElementVDofs(int i, Array &vdofs, ++ DofTransformation &doftrans) const ++{ ++ GetElementDofs(i, vdofs, doftrans); ++ DofsToVDofs(vdofs); ++ doftrans.SetVDim(vdim, ordering); ++} ++ + DofTransformation * + FiniteElementSpace::GetElementVDofs(int i, Array &vdofs) const + { +- DofTransformation * doftrans = GetElementDofs(i, vdofs); ++ DoFTrans.SetDofTransformation(NULL); ++ GetElementVDofs(i, vdofs, DoFTrans); ++ return DoFTrans.GetDofTransformation() ? &DoFTrans : NULL; ++} ++ ++void FiniteElementSpace::GetBdrElementVDofs(int i, Array &vdofs, ++ DofTransformation &doftrans) const ++{ ++ GetBdrElementDofs(i, vdofs, doftrans); + DofsToVDofs(vdofs); +- if (vdim == 1 || doftrans == NULL) +- { +- return doftrans; +- } +- else +- { +- VDoFTrans.SetDofTransformation(*doftrans); +- return &VDoFTrans; +- } ++ doftrans.SetVDim(vdim, ordering); + } + + DofTransformation * + FiniteElementSpace::GetBdrElementVDofs(int i, Array &vdofs) const + { +- DofTransformation * doftrans = GetBdrElementDofs(i, vdofs); +- DofsToVDofs(vdofs); +- if (vdim == 1 || doftrans == NULL) +- { +- return doftrans; +- } +- else +- { +- VDoFTrans.SetDofTransformation(*doftrans); +- return &VDoFTrans; +- } ++ DoFTrans.SetDofTransformation(NULL); ++ GetBdrElementVDofs(i, vdofs, DoFTrans); ++ return DoFTrans.GetDofTransformation() ? 
&DoFTrans : NULL; + } + + void FiniteElementSpace::GetPatchVDofs(int i, Array &vdofs) const +@@ -777,9 +775,9 @@ FiniteElementSpace::H2L_GlobalRestrictionMatrix (FiniteElementSpace *lfes) + return R; + } + +-void FiniteElementSpace +-::AddDependencies(SparseMatrix& deps, Array& master_dofs, +- Array& slave_dofs, DenseMatrix& I, int skipfirst) ++void FiniteElementSpace::AddDependencies( ++ SparseMatrix& deps, Array& master_dofs, Array& slave_dofs, ++ DenseMatrix& I, int skipfirst) + { + for (int i = skipfirst; i < slave_dofs.Size(); i++) + { +@@ -802,11 +800,9 @@ void FiniteElementSpace + } + } + +-void FiniteElementSpace +-::AddEdgeFaceDependencies(SparseMatrix &deps, Array &master_dofs, +- const FiniteElement *master_fe, +- Array &slave_dofs, int slave_face, +- const DenseMatrix *pm) const ++void FiniteElementSpace::AddEdgeFaceDependencies( ++ SparseMatrix &deps, Array &master_dofs, const FiniteElement *master_fe, ++ Array &slave_dofs, int slave_face, const DenseMatrix *pm) const + { + // In variable-order spaces in 3D, we need to only constrain interior face + // DOFs (this is done one level up), since edge dependencies can be more +@@ -1533,12 +1529,12 @@ SparseMatrix* FiniteElementSpace::RefinementMatrix(int old_ndofs, + localP); + } + +-FiniteElementSpace::RefinementOperator::RefinementOperator +-(const FiniteElementSpace* fespace, Table* old_elem_dof, Table* old_elem_fos, +- int old_ndofs) +- : fespace(fespace) +- , old_elem_dof(old_elem_dof) +- , old_elem_fos(old_elem_fos) ++FiniteElementSpace::RefinementOperator::RefinementOperator( ++ const FiniteElementSpace* fespace, Table* old_elem_dof, Table* old_elem_fos, ++ int old_ndofs) ++ : fespace(fespace), ++ old_elem_dof(old_elem_dof), ++ old_elem_fos(old_elem_fos) + { + MFEM_VERIFY(fespace->GetNE() >= old_elem_dof->Size(), + "Previous mesh is not coarser."); +@@ -1553,7 +1549,7 @@ FiniteElementSpace::RefinementOperator::RefinementOperator + fespace->GetLocalRefinementMatrices(elem_geoms[i], localP[elem_geoms[i]]); + } + +- ConstructDoFTrans(); ++ ConstructDoFTransArray(); + } + + FiniteElementSpace::RefinementOperator::RefinementOperator( +@@ -1578,59 +1574,58 @@ FiniteElementSpace::RefinementOperator::RefinementOperator( + old_elem_fos = new Table(*coarse_fes->GetElementToFaceOrientationTable()); + } + +- ConstructDoFTrans(); ++ ConstructDoFTransArray(); + } + + FiniteElementSpace::RefinementOperator::~RefinementOperator() + { + delete old_elem_dof; + delete old_elem_fos; +- for (int i=0; iFEColl(); + if (dynamic_cast(fec_ref)) + { +- const FiniteElement * nd_tri = ++ const FiniteElement *nd_tri = + fec_ref->FiniteElementForGeometry(Geometry::TRIANGLE); + if (nd_tri) + { +- old_DoFTrans[Geometry::TRIANGLE] = ++ old_DoFTransArray[Geometry::TRIANGLE] = + new ND_TriDofTransformation(nd_tri->GetOrder()); + } + +- const FiniteElement * nd_tet = ++ const FiniteElement *nd_tet = + fec_ref->FiniteElementForGeometry(Geometry::TETRAHEDRON); + if (nd_tet) + { +- old_DoFTrans[Geometry::TETRAHEDRON] = ++ old_DoFTransArray[Geometry::TETRAHEDRON] = + new ND_TetDofTransformation(nd_tet->GetOrder()); + } + +- const FiniteElement * nd_pri = ++ const FiniteElement *nd_pri = + fec_ref->FiniteElementForGeometry(Geometry::PRISM); + if (nd_pri) + { +- old_DoFTrans[Geometry::PRISM] = ++ old_DoFTransArray[Geometry::PRISM] = + new ND_WedgeDofTransformation(nd_pri->GetOrder()); + } + } + } + +-void FiniteElementSpace::RefinementOperator +-::Mult(const Vector &x, Vector &y) const ++void FiniteElementSpace::RefinementOperator::Mult(const Vector &x, ++ Vector &y) 
const + { + Mesh* mesh_ref = fespace->GetMesh(); + const CoarseFineTransformations &trans_ref = +@@ -1662,6 +1657,7 @@ void FiniteElementSpace::RefinementOperator + fespace->DofsToVDofs(vd, vdofs); + old_dofs.Copy(old_vdofs); + fespace->DofsToVDofs(vd, old_vdofs, old_ndofs); ++ + x.GetSubVector(old_vdofs, subX); + lP.Mult(subX, subY); + y.SetSubVector(vdofs, subY); +@@ -1670,40 +1666,30 @@ void FiniteElementSpace::RefinementOperator + else + { + old_elem_fos->GetRow(emb.parent, old_Fo); +- old_DoFTrans[geom]->SetFaceOrientations(old_Fo); +- +- DofTransformation *new_doftrans = NULL; +- VDofTransformation *vdoftrans = +- dynamic_cast(doftrans); +- if (vdoftrans) +- { +- new_doftrans = doftrans; +- doftrans = vdoftrans->GetDofTransformation(); +- } ++ old_DoFTrans.SetDofTransformation(*old_DoFTransArray[geom]); ++ old_DoFTrans.SetFaceOrientations(old_Fo); + ++ doftrans->SetVDim(); + for (int vd = 0; vd < rvdim; vd++) + { + dofs.Copy(vdofs); + fespace->DofsToVDofs(vd, vdofs); + old_dofs.Copy(old_vdofs); + fespace->DofsToVDofs(vd, old_vdofs, old_ndofs); ++ + x.GetSubVector(old_vdofs, subX); +- old_DoFTrans[geom]->InvTransformPrimal(subX); ++ old_DoFTrans.InvTransformPrimal(subX); + lP.Mult(subX, subY); + doftrans->TransformPrimal(subY); + y.SetSubVector(vdofs, subY); + } +- +- if (vdoftrans) +- { +- doftrans = new_doftrans; +- } ++ doftrans->SetVDim(rvdim, fespace->GetOrdering()); + } + } + } + +-void FiniteElementSpace::RefinementOperator +-::MultTranspose(const Vector &x, Vector &y) const ++void FiniteElementSpace::RefinementOperator::MultTranspose(const Vector &x, ++ Vector &y) const + { + y = 0.0; + +@@ -1727,7 +1713,7 @@ void FiniteElementSpace::RefinementOperator + const Geometry::Type geom = mesh_ref->GetElementBaseGeometry(k); + const DenseMatrix &lP = localP[geom](emb.matrix); + +- DofTransformation * doftrans = fespace->GetElementDofs(k, f_dofs); ++ DofTransformation *doftrans = fespace->GetElementDofs(k, f_dofs); + old_elem_dof->GetRow(emb.parent, c_dofs); + + if (!doftrans) +@@ -1742,7 +1728,6 @@ void FiniteElementSpace::RefinementOperator + fespace->DofsToVDofs(vd, c_vdofs, old_ndofs); + + x.GetSubVector(f_vdofs, subX); +- + for (int p = 0; p < f_dofs.Size(); ++p) + { + if (processed[DecodeDof(f_dofs[p])]) +@@ -1750,7 +1735,6 @@ void FiniteElementSpace::RefinementOperator + subX[p] = 0.0; + } + } +- + lP.MultTranspose(subX, subY); + y.AddElementVector(c_vdofs, subY); + } +@@ -1760,17 +1744,10 @@ void FiniteElementSpace::RefinementOperator + subYt.SetSize(lP.Width()); + + old_elem_fos->GetRow(emb.parent, old_Fo); +- old_DoFTrans[geom]->SetFaceOrientations(old_Fo); +- +- DofTransformation *new_doftrans = NULL; +- VDofTransformation *vdoftrans = +- dynamic_cast(doftrans); +- if (vdoftrans) +- { +- new_doftrans = doftrans; +- doftrans = vdoftrans->GetDofTransformation(); +- } ++ old_DoFTrans.SetDofTransformation(*old_DoFTransArray[geom]); ++ old_DoFTrans.SetFaceOrientations(old_Fo); + ++ doftrans->SetVDim(); + for (int vd = 0; vd < rvdim; vd++) + { + f_dofs.Copy(f_vdofs); +@@ -1787,16 +1764,11 @@ void FiniteElementSpace::RefinementOperator + subX[p] = 0.0; + } + } +- + lP.MultTranspose(subX, subYt); +- old_DoFTrans[geom]->TransformDual(subYt); ++ old_DoFTrans.TransformDual(subYt); + y.AddElementVector(c_vdofs, subYt); + } +- +- if (vdoftrans) +- { +- doftrans = new_doftrans; +- } ++ doftrans->SetVDim(rvdim, fespace->GetOrdering()); + } + + for (int p = 0; p < f_dofs.Size(); ++p) +@@ -2024,8 +1996,8 @@ FiniteElementSpace::DerefinementOperator::~DerefinementOperator() + delete 
coarse_elem_dof; + } + +-void FiniteElementSpace::DerefinementOperator +-::Mult(const Vector &x, Vector &y) const ++void FiniteElementSpace::DerefinementOperator::Mult(const Vector &x, ++ Vector &y) const + { + Array c_vdofs, f_vdofs; + Vector loc_x, loc_y; +@@ -2227,7 +2199,7 @@ void FiniteElementSpace::Constructor(Mesh *mesh_, NURBSExtension *NURBSext_, + R_transpose.reset(); + cP_is_set = false; + +- ConstructDoFTrans(); ++ ConstructDoFTransArray(); + } + else + { +@@ -2239,40 +2211,39 @@ void FiniteElementSpace::Constructor(Mesh *mesh_, NURBSExtension *NURBSext_, + BuildElementToDofTable(); + } + +-void FiniteElementSpace::ConstructDoFTrans() ++void FiniteElementSpace::ConstructDoFTransArray() + { +- DestroyDoFTrans(); ++ DestroyDoFTransArray(); + +- VDoFTrans.SetVDim(vdim); +- DoFTrans.SetSize(Geometry::NUM_GEOMETRIES); +- for (int i=0; iDimension() < 3) { return; } + if (dynamic_cast(fec)) + { +- const FiniteElement * nd_tri = ++ const FiniteElement *nd_tri = + fec->FiniteElementForGeometry(Geometry::TRIANGLE); + if (nd_tri) + { +- DoFTrans[Geometry::TRIANGLE] = ++ DoFTransArray[Geometry::TRIANGLE] = + new ND_TriDofTransformation(nd_tri->GetOrder()); + } + +- const FiniteElement * nd_tet = ++ const FiniteElement *nd_tet = + fec->FiniteElementForGeometry(Geometry::TETRAHEDRON); + if (nd_tet) + { +- DoFTrans[Geometry::TETRAHEDRON] = ++ DoFTransArray[Geometry::TETRAHEDRON] = + new ND_TetDofTransformation(nd_tet->GetOrder()); + } + +- const FiniteElement * nd_pri = ++ const FiniteElement *nd_pri = + fec->FiniteElementForGeometry(Geometry::PRISM); + if (nd_pri) + { +- DoFTrans[Geometry::PRISM] = ++ DoFTransArray[Geometry::PRISM] = + new ND_WedgeDofTransformation(nd_pri->GetOrder()); + } + } +@@ -2476,7 +2447,7 @@ void FiniteElementSpace::Construct() + + ndofs = nvdofs + nedofs + nfdofs + nbdofs; + +- ConstructDoFTrans(); ++ ConstructDoFTransArray(); + + // record the current mesh sequence number to detect refinement etc. 
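The array built by ConstructDoFTransArray above holds one shared StatelessDofTransformation per geometry type; per-element state (face orientations, vdim, ordering) is set on a separate DofTransformation each time. A stand-alone sketch of that lookup, mirroring the GetElementDofs hunk that follows (the helper name and the explicit array argument are illustrative):

#include "mfem.hpp"
using namespace mfem;

// Point a caller-provided DofTransformation at the per-geometry entry for
// this element and load the element's face orientations.
void ConfigureElementDofTrans(const FiniteElementSpace &fes,
                              const Array<StatelessDofTransformation *> &DoFTransArray,
                              int elem, DofTransformation &doftrans)
{
   Mesh *mesh = fes.GetMesh();
   Geometry::Type geom = mesh->GetElementBaseGeometry(elem);
   if (!DoFTransArray[geom]) { return; }   // geometry needs no transformation

   Array<int> faces, Fo;
   mesh->GetElementFaces(elem, faces, Fo); // Fo = face orientations
   doftrans.SetDofTransformation(*DoFTransArray[geom]);
   doftrans.SetFaceOrientations(Fo);
   doftrans.SetVDim();                     // scalar DoFs: vdim = 1, byNODES
}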
+ mesh_sequence = mesh->GetSequence(); +@@ -2501,9 +2472,8 @@ int FiniteElementSpace::MinOrder(VarOrderBits bits) + return 0; + } + +-void FiniteElementSpace +-::CalcEdgeFaceVarOrders(Array &edge_orders, +- Array &face_orders) const ++void FiniteElementSpace::CalcEdgeFaceVarOrders( ++ Array &edge_orders, Array &face_orders) const + { + MFEM_ASSERT(IsVariableOrder(), ""); + MFEM_ASSERT(Nonconforming(), ""); +@@ -2727,8 +2697,8 @@ int FiniteElementSpace::GetNVariants(int entity, int index) const + static const char* msg_orders_changed = + "Element orders changed, you need to Update() the space first."; + +-DofTransformation * +-FiniteElementSpace::GetElementDofs(int elem, Array &dofs) const ++void FiniteElementSpace::GetElementDofs(int elem, Array &dofs, ++ DofTransformation &doftrans) const + { + MFEM_VERIFY(!orders_changed, msg_orders_changed); + +@@ -2736,13 +2706,16 @@ FiniteElementSpace::GetElementDofs(int elem, Array &dofs) const + { + elem_dof->GetRow(elem, dofs); + +- if (DoFTrans[mesh->GetElementBaseGeometry(elem)]) ++ if (DoFTransArray[mesh->GetElementBaseGeometry(elem)]) + { + Array Fo; + elem_fos -> GetRow (elem, Fo); +- DoFTrans[mesh->GetElementBaseGeometry(elem)]->SetFaceOrientations(Fo); ++ doftrans.SetDofTransformation( ++ *DoFTransArray[mesh->GetElementBaseGeometry(elem)]); ++ doftrans.SetFaceOrientations(Fo); ++ doftrans.SetVDim(); + } +- return DoFTrans[mesh->GetElementBaseGeometry(elem)]; ++ return; + } + + Array V, E, Eo, F, Fo; // TODO: LocalArray +@@ -2766,10 +2739,12 @@ FiniteElementSpace::GetElementDofs(int elem, Array &dofs) const + { + nfd += fec->GetNumDof(mesh->GetFaceGeometry(F[i]), order); + } +- if (DoFTrans[mesh->GetElementBaseGeometry(elem)]) ++ if (DoFTransArray[mesh->GetElementBaseGeometry(elem)]) + { +- DoFTrans[mesh->GetElementBaseGeometry(elem)] +- -> SetFaceOrientations(Fo); ++ doftrans.SetDofTransformation( ++ *DoFTransArray[mesh->GetElementBaseGeometry(elem)]); ++ doftrans.SetFaceOrientations(Fo); ++ doftrans.SetVDim(); + } + } + +@@ -2828,54 +2803,18 @@ FiniteElementSpace::GetElementDofs(int elem, Array &dofs) const + dofs.Append(bbase + j); + } + } +- return DoFTrans[mesh->GetElementBaseGeometry(elem)]; + } + +-void FiniteElementSpace::GetPatchDofs(int patch, Array &dofs) const ++DofTransformation *FiniteElementSpace::GetElementDofs(int elem, ++ Array &dofs) const + { +- MFEM_ASSERT(NURBSext, +- "FiniteElementSpace::GetPatchDofs needs a NURBSExtension"); +- NURBSext->GetPatchDofs(patch, dofs); ++ DoFTrans.SetDofTransformation(NULL); ++ GetElementDofs(elem, dofs, DoFTrans); ++ return DoFTrans.GetDofTransformation() ? 
&DoFTrans : NULL; + } + +-const FiniteElement *FiniteElementSpace::GetFE(int i) const +-{ +- if (i < 0 || i >= mesh->GetNE()) +- { +- if (mesh->GetNE() == 0) +- { +- MFEM_ABORT("Empty MPI partitions are not permitted!"); +- } +- MFEM_ABORT("Invalid element id:" << i << "; minimum allowed:" << 0 << +- ", maximum allowed:" << mesh->GetNE()-1); +- } +- +- const FiniteElement *FE = +- fec->GetFE(mesh->GetElementGeometry(i), GetElementOrderImpl(i)); +- +- if (NURBSext) +- { +- NURBSext->LoadFE(i, FE); +- } +- else +- { +-#ifdef MFEM_DEBUG +- // consistency check: fec->GetOrder() and FE->GetOrder() should return +- // the same value (for standard, constant-order spaces) +- if (!IsVariableOrder() && FE->GetDim() > 0) +- { +- MFEM_ASSERT(FE->GetOrder() == fec->GetOrder(), +- "internal error: " << +- FE->GetOrder() << " != " << fec->GetOrder()); +- } +-#endif +- } +- +- return FE; +-} +- +-DofTransformation * +-FiniteElementSpace::GetBdrElementDofs(int bel, Array &dofs) const ++void FiniteElementSpace::GetBdrElementDofs(int bel, Array &dofs, ++ DofTransformation &doftrans) const + { + MFEM_VERIFY(!orders_changed, msg_orders_changed); + +@@ -2883,17 +2822,19 @@ FiniteElementSpace::GetBdrElementDofs(int bel, Array &dofs) const + { + bdr_elem_dof->GetRow(bel, dofs); + +- if (DoFTrans[mesh->GetBdrElementBaseGeometry(bel)]) ++ if (DoFTransArray[mesh->GetBdrElementBaseGeometry(bel)]) + { + Array Fo; + bdr_elem_fos -> GetRow (bel, Fo); +- DoFTrans[mesh->GetBdrElementBaseGeometry(bel)]-> +- SetFaceOrientations(Fo); ++ doftrans.SetDofTransformation( ++ *DoFTransArray[mesh->GetBdrElementBaseGeometry(bel)]); ++ doftrans.SetFaceOrientations(Fo); ++ doftrans.SetVDim(); + } +- return DoFTrans[mesh->GetBdrElementBaseGeometry(bel)]; ++ return; + } + +- Array V, E, Eo, Fo; // TODO: LocalArray ++ Array V, E, Eo; // TODO: LocalArray + int F, oF; + + int dim = mesh->Dimension(); +@@ -2917,11 +2858,14 @@ FiniteElementSpace::GetBdrElementDofs(int bel, Array &dofs) const + { + mesh->GetBdrElementFace(bel, &F, &oF); + +- if (DoFTrans[mesh->GetBdrElementBaseGeometry(bel)]) ++ if (DoFTransArray[mesh->GetBdrElementBaseGeometry(bel)]) + { +- Fo.Append(oF); +- DoFTrans[mesh->GetBdrElementBaseGeometry(bel)]-> +- SetFaceOrientations(Fo); ++ mfem::Array Fo(1); ++ Fo[0] = oF; ++ doftrans.SetDofTransformation( ++ *DoFTransArray[mesh->GetBdrElementBaseGeometry(bel)]); ++ doftrans.SetFaceOrientations(Fo); ++ doftrans.SetVDim(); + } + } + +@@ -2963,8 +2907,14 @@ FiniteElementSpace::GetBdrElementDofs(int bel, Array &dofs) const + dofs.Append(EncodeDof(nvdofs + nedofs + fbase, ind[j])); + } + } ++} + +- return DoFTrans[mesh->GetBdrElementBaseGeometry(bel)]; ++DofTransformation *FiniteElementSpace::GetBdrElementDofs(int bel, ++ Array &dofs) const ++{ ++ DoFTrans.SetDofTransformation(NULL); ++ GetBdrElementDofs(bel, dofs, DoFTrans); ++ return DoFTrans.GetDofTransformation() ? 
&DoFTrans : NULL; + } + + int FiniteElementSpace::GetFaceDofs(int face, Array &dofs, +@@ -3134,18 +3084,6 @@ int FiniteElementSpace::GetNumElementInteriorDofs(int i) const + GetElementOrderImpl(i)); + } + +-void FiniteElementSpace::GetEdgeInteriorDofs(int i, Array &dofs) const +-{ +- MFEM_VERIFY(!IsVariableOrder(), "not implemented"); +- +- int ne = fec->DofForGeometry(Geometry::SEGMENT); +- dofs.SetSize (ne); +- for (int j = 0, k = nvdofs+i*ne; j < ne; j++, k++) +- { +- dofs[j] = k; +- } +-} +- + void FiniteElementSpace::GetFaceInteriorDofs(int i, Array &dofs) const + { + MFEM_VERIFY(!IsVariableOrder(), "not implemented"); +@@ -3170,6 +3108,61 @@ void FiniteElementSpace::GetFaceInteriorDofs(int i, Array &dofs) const + } + } + ++void FiniteElementSpace::GetEdgeInteriorDofs(int i, Array &dofs) const ++{ ++ MFEM_VERIFY(!IsVariableOrder(), "not implemented"); ++ ++ int ne = fec->DofForGeometry(Geometry::SEGMENT); ++ dofs.SetSize (ne); ++ for (int j = 0, k = nvdofs+i*ne; j < ne; j++, k++) ++ { ++ dofs[j] = k; ++ } ++} ++ ++void FiniteElementSpace::GetPatchDofs(int patch, Array &dofs) const ++{ ++ MFEM_ASSERT(NURBSext, ++ "FiniteElementSpace::GetPatchDofs needs a NURBSExtension"); ++ NURBSext->GetPatchDofs(patch, dofs); ++} ++ ++const FiniteElement *FiniteElementSpace::GetFE(int i) const ++{ ++ if (i < 0 || i >= mesh->GetNE()) ++ { ++ if (mesh->GetNE() == 0) ++ { ++ MFEM_ABORT("Empty MPI partitions are not permitted!"); ++ } ++ MFEM_ABORT("Invalid element id:" << i << "; minimum allowed:" << 0 << ++ ", maximum allowed:" << mesh->GetNE()-1); ++ } ++ ++ const FiniteElement *FE = ++ fec->GetFE(mesh->GetElementGeometry(i), GetElementOrderImpl(i)); ++ ++ if (NURBSext) ++ { ++ NURBSext->LoadFE(i, FE); ++ } ++ else ++ { ++#ifdef MFEM_DEBUG ++ // consistency check: fec->GetOrder() and FE->GetOrder() should return ++ // the same value (for standard, constant-order spaces) ++ if (!IsVariableOrder() && FE->GetDim() > 0) ++ { ++ MFEM_ASSERT(FE->GetOrder() == fec->GetOrder(), ++ "internal error: " << ++ FE->GetOrder() << " != " << fec->GetOrder()); ++ } ++#endif ++ } ++ ++ return FE; ++} ++ + const FiniteElement *FiniteElementSpace::GetBE(int i) const + { + int order = fec->GetOrder(); +@@ -3242,8 +3235,8 @@ const FiniteElement *FiniteElementSpace::GetEdgeElement(int i, + return fec->GetFE(Geometry::SEGMENT, eo); + } + +-const FiniteElement *FiniteElementSpace +-::GetTraceElement(int i, Geometry::Type geom_type) const ++const FiniteElement *FiniteElementSpace::GetTraceElement( ++ int i, Geometry::Type geom_type) const + { + return fec->TraceFiniteElementForGeometry(geom_type); + } +@@ -3283,7 +3276,7 @@ void FiniteElementSpace::Destroy() + } + E2BFQ_array.SetSize(0); + +- DestroyDoFTrans(); ++ DestroyDoFTransArray(); + + dof_elem_array.DeleteAll(); + dof_ldof_array.DeleteAll(); +@@ -3301,19 +3294,18 @@ void FiniteElementSpace::Destroy() + delete bdr_elem_dof; + delete bdr_elem_fos; + delete face_dof; +- + delete [] bdofs; + } + ceed::RemoveBasisAndRestriction(this); + } + +-void FiniteElementSpace::DestroyDoFTrans() ++void FiniteElementSpace::DestroyDoFTransArray() + { +- for (int i = 0; i < DoFTrans.Size(); i++) ++ for (int i = 0; i < DoFTransArray.Size(); i++) + { +- delete DoFTrans[i]; ++ delete DoFTransArray[i]; + } +- DoFTrans.SetSize(0); ++ DoFTransArray.SetSize(0); + } + + void FiniteElementSpace::GetTransferOperator( +diff --git a/fem/fespace.hpp b/fem/fespace.hpp +index bcff9a6be..0fd44b613 100644 +--- a/fem/fespace.hpp ++++ b/fem/fespace.hpp +@@ -271,8 +271,8 @@ protected: + int own_ext; + mutable 
Array face_to_be; // NURBS FE space only + +- Array DoFTrans; +- mutable VDofTransformation VDoFTrans; ++ Array DoFTransArray; ++ mutable DofTransformation DoFTrans; + + /** Matrix representing the prolongation from the global conforming dofs to + a set of intermediate partially conforming dofs, e.g. the dofs associated +@@ -328,8 +328,8 @@ protected: + void Construct(); + void Destroy(); + +- void ConstructDoFTrans(); +- void DestroyDoFTrans(); ++ void ConstructDoFTransArray(); ++ void DestroyDoFTransArray(); + + void BuildElementToDofTable() const; + void BuildBdrElementToDofTable() const; +@@ -416,10 +416,10 @@ protected: + Table* old_elem_dof; // Owned. + Table* old_elem_fos; // Owned. + +- Array old_DoFTrans; +- mutable VDofTransformation old_VDoFTrans; ++ Array old_DoFTransArray; ++ mutable DofTransformation old_DoFTrans; + +- void ConstructDoFTrans(); ++ void ConstructDoFTransArray(); + + public: + /** Construct the operator based on the elem_dof table of the original +@@ -803,7 +803,16 @@ public: + /// with triangular faces. + /// + /// @note The returned object should NOT be deleted by the caller. +- virtual DofTransformation *GetElementDofs(int elem, Array &dofs) const; ++ DofTransformation *GetElementDofs(int elem, Array &dofs) const; ++ ++ /// @brief The same as GetElementDofs(), but with a user-allocated ++ /// DofTransformation object. @a doftrans must be allocated in advance and ++ /// will be owned by the caller. The user can use the ++ /// DofTransformation::GetDofTransformation method on the returned ++ /// @a doftrans object to detect if the DofTransformation should actually be ++ /// used. ++ virtual void GetElementDofs(int elem, Array &dofs, ++ DofTransformation &doftrans) const; + + /// @brief Returns indices of degrees of freedom for boundary element 'bel'. + /// The returned indices are offsets into an @ref ldof vector. See also +@@ -817,13 +826,16 @@ public: + /// with triangular faces. + /// + /// @note The returned object should NOT be deleted by the caller. +- virtual DofTransformation *GetBdrElementDofs(int bel, +- Array &dofs) const; ++ DofTransformation *GetBdrElementDofs(int bel, Array &dofs) const; + +- /** @brief Returns indices of degrees of freedom for NURBS patch index +- @a patch. Cartesian ordering is used, for the tensor-product degrees of +- freedom. */ +- void GetPatchDofs(int patch, Array &dofs) const; ++ /// @brief The same as GetBdrElementDofs(), but with a user-allocated ++ /// DofTransformation object. @a doftrans must be allocated in advance and ++ /// will be owned by the caller. The user can use the ++ /// DofTransformation::GetDofTransformation method on the returned ++ /// @a doftrans object to detect if the DofTransformation should actually be ++ /// used. ++ virtual void GetBdrElementDofs(int bel, Array &dofs, ++ DofTransformation &doftrans) const; + + /// @brief Returns the indices of the degrees of freedom for the specified + /// face, including the DOFs for the edges and the vertices of the face. +@@ -870,6 +882,13 @@ public: + /// GetElementInteriorVDofs(). + void GetElementInteriorDofs(int i, Array &dofs) const; + ++ /// @brief Returns the number of degrees of freedom associated with the ++ /// interior of the specified element. ++ /// ++ /// See GetElementInteriorDofs() for more information or to obtain the ++ /// relevant indices. ++ int GetNumElementInteriorDofs(int i) const; ++ + /// @brief Returns the indices of the degrees of freedom for the interior + /// of the specified face. 
+ /// +@@ -882,13 +901,6 @@ public: + /// GetFaceInteriorVDofs(). + void GetFaceInteriorDofs(int i, Array &dofs) const; + +- /// @brief Returns the number of degrees of freedom associated with the +- /// interior of the specified element. +- /// +- /// See GetElementInteriorDofs() for more information or to obtain the +- /// relevant indices. +- int GetNumElementInteriorDofs(int i) const; +- + /// @brief Returns the indices of the degrees of freedom for the interior + /// of the specified edge. + /// +@@ -897,6 +909,11 @@ public: + void GetEdgeInteriorDofs(int i, Array &dofs) const; + ///@} + ++ /** @brief Returns indices of degrees of freedom for NURBS patch index ++ @a patch. Cartesian ordering is used, for the tensor-product degrees of ++ freedom. */ ++ void GetPatchDofs(int patch, Array &dofs) const; ++ + /// @anchor dof2vdof @name DoF To VDoF Conversion methods + /// These methods convert between local dof and local vector dof using the + /// appropriate relationship based on the Ordering::Type defined in this +@@ -1023,6 +1040,15 @@ public: + /// @note The returned object should NOT be deleted by the caller. + DofTransformation *GetElementVDofs(int i, Array &vdofs) const; + ++ /// @brief The same as GetElementVDofs(), but with a user-allocated ++ /// DofTransformation object. @a doftrans must be allocated in advance and ++ /// will be owned by the caller. The user can use the ++ /// DofTransformation::GetDofTransformation method on the returned ++ /// @a doftrans object to detect if the DofTransformation should actually be ++ /// used. ++ void GetElementVDofs(int i, Array &vdofs, ++ DofTransformation &doftrans) const; ++ + /// @brief Returns indices of degrees of freedom for @a i'th boundary + /// element. + /// The returned indices are offsets into an @ref ldof vector with @b vdim +@@ -1038,6 +1064,15 @@ public: + /// @note The returned object should NOT be deleted by the caller. + DofTransformation *GetBdrElementVDofs(int i, Array &vdofs) const; + ++ /// @brief The same as GetBdrElementVDofs(), but with a user-allocated ++ /// DofTransformation object. @a doftrans must be allocated in advance and ++ /// will be owned by the caller. The user can use the ++ /// DofTransformation::GetDofTransformation method on the returned ++ /// @a doftrans object to detect if the DofTransformation should actually be ++ /// used. ++ void GetBdrElementVDofs(int i, Array &vdofs, ++ DofTransformation &doftrans) const; ++ + /// Returns indices of degrees of freedom in @a vdofs for NURBS patch @a i. 
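A usage sketch for the user-allocated overloads documented above: the caller owns the DofTransformation and checks GetDofTransformation() to see whether it has to be applied (the grid function and helper name are illustrative):

#include "mfem.hpp"
using namespace mfem;

// Gather the element-local DoF values of x in reference ordering, using the
// caller-provided DofTransformation instead of the space's mutable member.
void GetElementLocalValues(const FiniteElementSpace &fes, const GridFunction &x,
                           int e, Vector &loc)
{
   Array<int> vdofs;
   DofTransformation doftrans;
   fes.GetElementVDofs(e, vdofs, doftrans);

   x.GetSubVector(vdofs, loc);
   if (doftrans.GetDofTransformation())    // set only when a transform applies
   {
      doftrans.InvTransformPrimal(loc);
   }
}

If one DofTransformation object is reused across an element loop, reset it with SetDofTransformation(NULL) between elements, as the pointer-returning overloads above do before delegating to these methods.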
+ void GetPatchVDofs(int i, Array &vdofs) const; + +diff --git a/fem/pfespace.cpp b/fem/pfespace.cpp +index 81b5bd5b2..76ac230a1 100644 +--- a/fem/pfespace.cpp ++++ b/fem/pfespace.cpp +@@ -466,53 +466,54 @@ void ParFiniteElementSpace::ApplyLDofSigns(Table &el_dof) const + ApplyLDofSigns(all_dofs); + } + +-DofTransformation * +-ParFiniteElementSpace::GetElementDofs(int i, Array &dofs) const ++void ParFiniteElementSpace::GetElementDofs(int i, Array &dofs, ++ DofTransformation &doftrans) const + { + if (elem_dof) + { + elem_dof->GetRow(i, dofs); + +- if (DoFTrans[mesh->GetElementBaseGeometry(i)]) ++ if (DoFTransArray[mesh->GetElementBaseGeometry(i)]) + { + Array Fo; + elem_fos->GetRow(i, Fo); +- DoFTrans[mesh->GetElementBaseGeometry(i)]->SetFaceOrientations(Fo); +- return DoFTrans[mesh->GetElementBaseGeometry(i)]; ++ doftrans.SetDofTransformation( ++ *DoFTransArray[mesh->GetElementBaseGeometry(i)]); ++ doftrans.SetFaceOrientations(Fo); ++ doftrans.SetVDim(); + } +- return NULL; ++ return; + } +- DofTransformation * doftrans = FiniteElementSpace::GetElementDofs(i, dofs); ++ FiniteElementSpace::GetElementDofs(i, dofs, doftrans); + if (Conforming()) + { + ApplyLDofSigns(dofs); + } +- return doftrans; + } + +-DofTransformation * +-ParFiniteElementSpace::GetBdrElementDofs(int i, Array &dofs) const ++void ParFiniteElementSpace::GetBdrElementDofs(int i, Array &dofs, ++ DofTransformation &doftrans) const + { + if (bdr_elem_dof) + { + bdr_elem_dof->GetRow(i, dofs); + +- if (DoFTrans[mesh->GetBdrElementBaseGeometry(i)]) ++ if (DoFTransArray[mesh->GetBdrElementBaseGeometry(i)]) + { + Array Fo; +- bdr_elem_fos -> GetRow (i, Fo); +- DoFTrans[mesh->GetBdrElementBaseGeometry(i)]->SetFaceOrientations(Fo); +- return DoFTrans[mesh->GetBdrElementBaseGeometry(i)]; ++ bdr_elem_fos->GetRow(i, Fo); ++ doftrans.SetDofTransformation( ++ *DoFTransArray[mesh->GetBdrElementBaseGeometry(i)]); ++ doftrans.SetFaceOrientations(Fo); ++ doftrans.SetVDim(); + } +- return NULL; ++ return; + } +- DofTransformation * doftrans = +- FiniteElementSpace::GetBdrElementDofs(i, dofs); ++ FiniteElementSpace::GetBdrElementDofs(i, dofs, doftrans); + if (Conforming()) + { + ApplyLDofSigns(dofs); + } +- return doftrans; + } + + int ParFiniteElementSpace::GetFaceDofs(int i, Array &dofs, +@@ -939,8 +940,8 @@ void ParFiniteElementSpace::Build_Dof_TrueDof_Matrix() const // matrix P + } + else if (i_offd[i+1] == i_offd[i] + 2) + { +- const double * T = ND_StatelessDofTransformation +- ::GetFaceTransform(ltori[i]).GetData(); ++ const double *T = ++ ND_DofTransformation::GetFaceTransform(ltori[i]).GetData(); + j_offd[i_offd[i] + 1] = j_offd[i_offd[i]] + 1; + d_offd[i_offd[i]] = T[0]; d_offd[i_offd[i] + 1] = T[2]; + i++; +@@ -1454,31 +1455,30 @@ void ParFiniteElementSpace::ExchangeFaceNbrData() + delete [] requests; + } + +-DofTransformation *ParFiniteElementSpace::GetFaceNbrElementVDofs( +- int i, Array &vdofs) const ++void ParFiniteElementSpace::GetFaceNbrElementVDofs( ++ int i, Array &vdofs, DofTransformation &doftrans) const + { + face_nbr_element_dof.GetRow(i, vdofs); + +- DofTransformation *doftrans = NULL; +- Geometry::Type geom = GetFaceNbrFE(i)->GetGeomType(); +- if (DoFTrans[geom]) ++ if (DoFTransArray[GetFaceNbrFE(i)->GetGeomType()]) + { + Array F, Fo; + pmesh->GetFaceNbrElementFaces(pmesh->GetNE() + i, F, Fo); +- doftrans = DoFTrans[geom]; +- doftrans->SetFaceOrientations(Fo); +- } +- if (vdim == 1 || doftrans == NULL) +- { +- return doftrans; +- } +- else +- { +- VDoFTrans.SetDofTransformation(*doftrans); +- return &VDoFTrans; ++ 
doftrans.SetDofTransformation( ++ *DoFTransArray[GetFaceNbrFE(i)->GetGeomType()]); ++ doftrans.SetFaceOrientations(Fo); ++ doftrans.SetVDim(vdim, ordering); + } + } + ++DofTransformation *ParFiniteElementSpace::GetFaceNbrElementVDofs( ++ int i, Array &vdofs) const ++{ ++ DoFTrans.SetDofTransformation(NULL); ++ GetFaceNbrElementVDofs(i, vdofs, DoFTrans); ++ return DoFTrans.GetDofTransformation() ? &DoFTrans : NULL; ++} ++ + void ParFiniteElementSpace::GetFaceNbrFaceVDofs(int i, Array &vdofs) const + { + // Works for NC mesh where 'i' is an index returned by +@@ -2235,19 +2235,13 @@ void NeighborRowMessage::Decode(int rank) + + // This is the second "fundamental unit" used in the transformation. + const auto initial_second_row = second_row; ++ const double *T = ++ ND_DofTransformation::GetFaceTransform(fo).GetData(); + +- const auto T = [&fo]() +- { +- auto T = ND_StatelessDofTransformation::GetFaceTransform(fo); +- T(0,0) -= 1; +- T(1,1) -= 1; +- return T; +- }(); +- +- first_row.AddRow(initial_first_row, T(0,0)); +- first_row.AddRow(initial_second_row, T(0,1)); +- second_row.AddRow(initial_first_row, T(1,0)); +- second_row.AddRow(initial_second_row, T(1,1)); ++ first_row.AddRow(initial_first_row, T[0] - 1.0); ++ first_row.AddRow(initial_second_row, T[2]); ++ second_row.AddRow(initial_first_row, T[1]); ++ second_row.AddRow(initial_second_row, T[3] - 1.0); + + first_row.Collapse(); + second_row.Collapse(); +diff --git a/fem/pfespace.hpp b/fem/pfespace.hpp +index 7c7b49b7e..72029be56 100644 +--- a/fem/pfespace.hpp ++++ b/fem/pfespace.hpp +@@ -281,11 +281,17 @@ public: + /// Return the number of local vector true dofs. + int GetTrueVSize() const override { return ltdof_size; } + +- /// Returns indexes of degrees of freedom in array dofs for i'th element. +- DofTransformation *GetElementDofs(int i, Array &dofs) const override; +- +- /// Returns indexes of degrees of freedom for i'th boundary element. +- DofTransformation *GetBdrElementDofs(int i, Array &dofs) const override; ++ /// Returns indexes of degrees of freedom in array dofs for i'th element and ++ /// returns the DofTransformation data in a user-provided object. ++ using FiniteElementSpace::GetElementDofs; ++ void GetElementDofs(int i, Array &dofs, ++ DofTransformation &doftrans) const override; ++ ++ /// Returns indexes of degrees of freedom for i'th boundary element and ++ /// returns the DofTransformation data in a user-provided object. ++ using FiniteElementSpace::GetBdrElementDofs; ++ void GetBdrElementDofs(int i, Array &dofs, ++ DofTransformation &doftrans) const override; + + /** Returns the indexes of the degrees of freedom for i'th face + including the dofs for the edges and the vertices of the face. 
*/ +@@ -379,6 +385,8 @@ public: + // Face-neighbor functions + void ExchangeFaceNbrData(); + int GetFaceNbrVSize() const { return num_face_nbr_dofs; } ++ void GetFaceNbrElementVDofs(int i, Array &vdofs, ++ DofTransformation &doftrans) const; + DofTransformation *GetFaceNbrElementVDofs(int i, Array &vdofs) const; + void GetFaceNbrFaceVDofs(int i, Array &vdofs) const; + const FiniteElement *GetFaceNbrFE(int i) const; +diff --git a/mesh/mesh.cpp b/mesh/mesh.cpp +index 600f2fc2a..a8ec98649 100644 +--- a/mesh/mesh.cpp ++++ b/mesh/mesh.cpp +@@ -384,6 +384,12 @@ void Mesh::GetElementTransformation(int i, IsoparametricTransformation *ElTr) + } + } + ++ElementTransformation *Mesh::GetElementTransformation(int i) ++{ ++ GetElementTransformation(i, &Transformation); ++ return &Transformation; ++} ++ + void Mesh::GetElementTransformation(int i, const Vector &nodes, + IsoparametricTransformation *ElTr) + { +@@ -428,19 +434,6 @@ void Mesh::GetElementTransformation(int i, const Vector &nodes, + } + } + +-ElementTransformation *Mesh::GetElementTransformation(int i) +-{ +- GetElementTransformation(i, &Transformation); +- +- return &Transformation; +-} +- +-ElementTransformation *Mesh::GetBdrElementTransformation(int i) +-{ +- GetBdrElementTransformation(i, &BdrTransformation); +- return &BdrTransformation; +-} +- + void Mesh::GetBdrElementTransformation(int i, IsoparametricTransformation* ElTr) + { + ElTr->Attribute = GetBdrAttribute(i); +@@ -502,6 +495,12 @@ void Mesh::GetBdrElementTransformation(int i, IsoparametricTransformation* ElTr) + } + } + ++ElementTransformation *Mesh::GetBdrElementTransformation(int i) ++{ ++ GetBdrElementTransformation(i, &BdrTransformation); ++ return &BdrTransformation; ++} ++ + void Mesh::GetFaceTransformation(int FaceNo, IsoparametricTransformation *FTr) + { + FTr->Attribute = (Dim == 1) ? 1 : faces[FaceNo]->GetAttribute(); +diff --git a/mesh/submesh/ptransfermap.cpp b/mesh/submesh/ptransfermap.cpp +index d7c4334cc..7e2324668 100644 +--- a/mesh/submesh/ptransfermap.cpp ++++ b/mesh/submesh/ptransfermap.cpp +@@ -317,8 +317,7 @@ ParTransferMap::CorrectFaceOrientations(const ParFiniteElementSpace &fes, + + if (parent_face_ori.Size() == 0) { return; } + +- VDofTransformation vdoftrans(fes.GetVDim(), +- fes.GetOrdering()); ++ DofTransformation doftrans(fes.GetVDim(), fes.GetOrdering()); + + int dim = mesh->Dimension(); + bool face = (dim == 3); +@@ -334,15 +333,11 @@ ParTransferMap::CorrectFaceOrientations(const ParFiniteElementSpace &fes, + Geometry::Type geom = face ? 
mesh->GetFaceGeometry(i) : + mesh->GetElementGeometry(i); + +- StatelessDofTransformation * doftrans = +- fec->DofTransformationForGeometry(geom); +- +- if (doftrans == NULL) { continue; } +- +- vdoftrans.SetDofTransformation(*doftrans); ++ if (!fec->DofTransformationForGeometry(geom)) { continue; } ++ doftrans.SetDofTransformation(*fec->DofTransformationForGeometry(geom)); + + Fo[0] = parent_face_ori[i]; +- vdoftrans.SetFaceOrientations(Fo); ++ doftrans.SetFaceOrientations(Fo); + + if (face) + { +@@ -356,12 +351,12 @@ ParTransferMap::CorrectFaceOrientations(const ParFiniteElementSpace &fes, + if (sub_to_parent_map) + { + src.GetSubVector(vdofs, face_vector); +- vdoftrans.TransformPrimal(face_vector); ++ doftrans.TransformPrimal(face_vector); + } + else + { + dst.GetSubVector(vdofs, face_vector); +- vdoftrans.InvTransformPrimal(face_vector); ++ doftrans.InvTransformPrimal(face_vector); + } + + for (int j = 0; j < vdofs.Size(); j++) +diff --git a/mesh/submesh/transfermap.cpp b/mesh/submesh/transfermap.cpp +index c81f0cf77..1ddb8994c 100644 +--- a/mesh/submesh/transfermap.cpp ++++ b/mesh/submesh/transfermap.cpp +@@ -241,8 +241,7 @@ void TransferMap::CorrectFaceOrientations(const FiniteElementSpace &fes, + + if (parent_face_ori.Size() == 0) { return; } + +- VDofTransformation vdoftrans(fes.GetVDim(), +- fes.GetOrdering()); ++ DofTransformation doftrans(fes.GetVDim(), fes.GetOrdering()); + + int dim = mesh->Dimension(); + bool face = (dim == 3); +@@ -258,15 +257,11 @@ void TransferMap::CorrectFaceOrientations(const FiniteElementSpace &fes, + Geometry::Type geom = face ? mesh->GetFaceGeometry(i) : + mesh->GetElementGeometry(i); + +- StatelessDofTransformation * doftrans = +- fec->DofTransformationForGeometry(geom); +- +- if (doftrans == NULL) { continue; } +- +- vdoftrans.SetDofTransformation(*doftrans); ++ if (!fec->DofTransformationForGeometry(geom)) { continue; } ++ doftrans.SetDofTransformation(*fec->DofTransformationForGeometry(geom)); + + Fo[0] = parent_face_ori[i]; +- vdoftrans.SetFaceOrientations(Fo); ++ doftrans.SetFaceOrientations(Fo); + + if (face) + { +@@ -280,12 +275,12 @@ void TransferMap::CorrectFaceOrientations(const FiniteElementSpace &fes, + if (sub_to_parent_map) + { + src.GetSubVector(vdofs, face_vector); +- vdoftrans.TransformPrimal(face_vector); ++ doftrans.TransformPrimal(face_vector); + } + else + { + dst.GetSubVector(vdofs, face_vector); +- vdoftrans.InvTransformPrimal(face_vector); ++ doftrans.InvTransformPrimal(face_vector); + } + + for (int j = 0; j < vdofs.Size(); j++) +diff --git a/tests/unit/fem/test_doftrans.cpp b/tests/unit/fem/test_doftrans.cpp +index 4518b4a5a..b65b8724d 100644 +--- a/tests/unit/fem/test_doftrans.cpp ++++ b/tests/unit/fem/test_doftrans.cpp +@@ -22,13 +22,14 @@ TEST_CASE("DoF Transformation Classes", + "[ND_TetDofTransformation]") + { + int p = 4; ++ int vdim = 3; + int seed = 123; + + double tol = 1e-13; + + SECTION("Nedelec Tetrahedral Transformations") + { +- ND_TetDofTransformation T(p); ++ ND_TetDofTransformation Tnd(p); + + Array ori(4); + ori[0] = 1; +@@ -36,102 +37,191 @@ TEST_CASE("DoF Transformation Classes", + ori[2] = 5; + ori[3] = 1; + +- T.SetFaceOrientations(ori); ++ SECTION("VDim == 1") ++ { ++ DofTransformation T(Tnd); ++ T.SetFaceOrientations(ori); + +- Vector u(T.Width()); +- Vector v(T.Width()); +- Vector f(T.Width()); +- Vector ut; +- Vector vt; +- Vector ft; ++ Vector u(T.Width()); ++ Vector v(T.Width()); ++ Vector f(T.Width()); ++ Vector ut; ++ Vector vt; ++ Vector ft; + +- u.Randomize(seed); +- v.Randomize(seed+1); +- 
f.Randomize(seed+2); ++ u.Randomize(seed); ++ v.Randomize(seed+1); ++ f.Randomize(seed+2); + +- SECTION("Inverse DoF transformation") +- { +- Vector w; ++ SECTION("Inverse DoF transformation") ++ { ++ Vector w; + +- ut = u; T.TransformPrimal(ut); +- w = ut; T.InvTransformPrimal(w); ++ ut = u; T.TransformPrimal(ut); ++ w = ut; T.InvTransformPrimal(w); + +- w -= u; ++ w -= u; + +- REQUIRE(w.Norml2() < tol * u.Norml2()); +- } +- SECTION("Inverse Dual DoF transformation") +- { +- Vector w; ++ REQUIRE(w.Norml2() < tol * u.Norml2()); ++ } ++ SECTION("Inverse Dual DoF transformation") ++ { ++ Vector w; + +- ut = u; T.TransformDual(ut); +- w = ut; T.InvTransformDual(w); ++ ut = u; T.TransformDual(ut); ++ w = ut; T.InvTransformDual(w); + +- w -= u; ++ w -= u; + +- REQUIRE(w.Norml2() < tol * u.Norml2()); +- } ++ REQUIRE(w.Norml2() < tol * u.Norml2()); ++ } + +- SECTION("Inner product with linear form f(v)") +- { +- vt = v; T.TransformPrimal(vt); +- ft = f; T.TransformDual(ft); ++ SECTION("Inner product with linear form f(v)") ++ { ++ vt = v; T.TransformPrimal(vt); ++ ft = f; T.TransformDual(ft); + +- double fv = f * v; ++ double fv = f * v; + +- REQUIRE(fabs(fv - ft * vt) < tol * fabs(fv)); +- } ++ REQUIRE(fabs(fv - ft * vt) < tol * fabs(fv)); ++ } + +- DenseMatrix A(T.Width()); +- { +- Vector Ac; +- for (int i=0; i 1") + { +- // The matrix A in this case should be regarded as a +- // DiscreteLinearOperator. +- DenseMatrix tA; +- DenseMatrix At; +- DenseMatrix tAt; ++ Vector v(vdim * Tnd.Width()); ++ Vector f(vdim * Tnd.Width()); ++ Vector vt; ++ Vector ft; + +- ft = f; T.TransformDual(ft); +- vt = v; T.TransformPrimal(vt); ++ v.Randomize(seed); ++ f.Randomize(seed+1); + +- At = A; T.TransformDualRows(At); +- tA = A; T.TransformPrimalCols(tA); +- tAt = At; T.TransformPrimalCols(tAt); ++ SECTION("Ordering == byNODES") ++ { ++ DofTransformation T(Tnd, vdim, Ordering::byNODES); ++ T.SetFaceOrientations(ori); ++ ++ SECTION("Inverse DoF transformation") ++ { ++ Vector w; ++ ++ vt = v; T.TransformPrimal(vt); ++ w = vt; T.InvTransformPrimal(w); ++ ++ w -= v; ++ ++ REQUIRE(w.Norml2() < tol * v.Norml2()); ++ } ++ SECTION("Inverse Dual DoF transformation") ++ { ++ Vector w; ++ ++ vt = v; T.TransformDual(vt); ++ w = vt; T.InvTransformDual(w); + +- double fAv = A.InnerProduct(v, f); ++ w -= v; ++ ++ REQUIRE(w.Norml2() < tol * v.Norml2()); ++ } ++ SECTION("Inner product with linear form f(v)") ++ { ++ vt = v; T.TransformPrimal(vt); ++ ft = f; T.TransformDual(ft); ++ ++ double fv = f * v; ++ ++ REQUIRE(fabs(fv - ft * vt) < tol * fabs(fv)); ++ } ++ } ++ SECTION("Ordering == byVDIM") ++ { ++ DofTransformation T(Tnd, vdim, Ordering::byVDIM); ++ T.SetFaceOrientations(ori); + +- REQUIRE(fabs(fAv - At.InnerProduct(vt, f )) < tol * fabs(fAv)); +- REQUIRE(fabs(fAv - tA.InnerProduct(v, ft)) < tol * fabs(fAv)); +- REQUIRE(fabs(fAv - tAt.InnerProduct(vt, ft)) < tol * fabs(fAv)); ++ SECTION("Inverse DoF transformation") ++ { ++ Vector w; ++ ++ vt = v; T.TransformPrimal(vt); ++ w = vt; T.InvTransformPrimal(w); ++ ++ w -= v; ++ ++ REQUIRE(w.Norml2() < tol * v.Norml2()); ++ } ++ SECTION("Inverse Dual DoF transformation") ++ { ++ Vector w; ++ ++ vt = v; T.TransformDual(vt); ++ w = vt; T.InvTransformDual(w); ++ ++ w -= v; ++ ++ REQUIRE(w.Norml2() < tol * v.Norml2()); ++ } ++ SECTION("Inner product with linear form f(v)") ++ { ++ vt = v; T.TransformPrimal(vt); ++ ft = f; T.TransformDual(ft); ++ ++ double fv = f * v; ++ ++ REQUIRE(fabs(fv - ft * vt) < tol * fabs(fv)); ++ } ++ } + } + } + } +@@ -146,8 +236,8 @@ TEST_CASE("DoF 
Transformation Functions", + + double tol = 1e-13; + +- ND_TetDofTransformation Tp(p); +- ND_TetDofTransformation Tq(q); ++ ND_TetDofTransformation Tndp(p); ++ ND_TetDofTransformation Tndq(q); + + Array ori(4); + ori[0] = 1; +@@ -155,6 +245,7 @@ TEST_CASE("DoF Transformation Functions", + ori[2] = 5; + ori[3] = 1; + ++ DofTransformation Tp(Tndp), Tq(Tndq); + Tp.SetFaceOrientations(ori); + Tq.SetFaceOrientations(ori); + +@@ -235,153 +326,4 @@ TEST_CASE("DoF Transformation Functions", + } + } + +-TEST_CASE("VDoF Transformation Class", +- "[DofTransformation]" +- "[VDofTransformation]") +-{ +- int p = 4; +- int vdim = 3; +- int seed = 123; +- +- double tol = 1e-13; +- +- ND_TetDofTransformation Tnd(p); +- +- Array ori(4); +- ori[0] = 1; +- ori[1] = 3; +- ori[2] = 5; +- ori[3] = 1; +- +- Tnd.SetFaceOrientations(ori); +- +- SECTION("VDim == 1") +- { +- VDofTransformation T(Tnd); +- +- Vector v(T.Width()); +- Vector f(T.Width()); +- Vector vt; +- Vector ft; +- +- v.Randomize(seed); +- f.Randomize(seed+1); +- +- SECTION("Inverse DoF transformation") +- { +- Vector w; +- +- vt = v; T.TransformPrimal(vt); +- w = vt; T.InvTransformPrimal(w); +- +- w -= v; +- +- REQUIRE(w.Norml2() < tol * v.Norml2()); +- } +- SECTION("Inverse Dual DoF transformation") +- { +- Vector w; +- +- vt = v; T.TransformDual(vt); +- w = vt; T.InvTransformDual(w); +- +- w -= v; +- +- REQUIRE(w.Norml2() < tol * v.Norml2()); +- } +- SECTION("Inner product with linear form f(v)") +- { +- vt = v; T.TransformPrimal(vt); +- ft = f; T.TransformDual(ft); +- +- double fv = f * v; +- +- REQUIRE(fabs(fv - ft * vt) < tol * fabs(fv)); +- } +- } +- SECTION("VDim > 1") +- { +- Vector v(vdim * Tnd.Width()); +- Vector f(vdim * Tnd.Width()); +- Vector vt; +- Vector ft; +- +- v.Randomize(seed); +- f.Randomize(seed+1); +- +- SECTION("Ordering == byNODES") +- { +- VDofTransformation T(Tnd, vdim, Ordering::byNODES); +- +- SECTION("Inverse DoF transformation") +- { +- Vector w; +- +- vt = v; T.TransformPrimal(vt); +- w = vt; T.InvTransformPrimal(w); +- +- w -= v; +- +- REQUIRE(w.Norml2() < tol * v.Norml2()); +- } +- SECTION("Inverse Dual DoF transformation") +- { +- Vector w; +- +- vt = v; T.TransformDual(vt); +- w = vt; T.InvTransformDual(w); +- +- w -= v; +- +- REQUIRE(w.Norml2() < tol * v.Norml2()); +- } +- SECTION("Inner product with linear form f(v)") +- { +- vt = v; T.TransformPrimal(vt); +- ft = f; T.TransformDual(ft); +- +- double fv = f * v; +- +- REQUIRE(fabs(fv - ft * vt) < tol * fabs(fv)); +- } +- } +- SECTION("Ordering == byVDIM") +- { +- VDofTransformation T(Tnd, vdim, Ordering::byVDIM); +- +- SECTION("Inverse DoF transformation") +- { +- Vector w; +- +- vt = v; T.TransformPrimal(vt); +- w = vt; T.InvTransformPrimal(w); +- +- w -= v; +- +- REQUIRE(w.Norml2() < tol * v.Norml2()); +- } +- SECTION("Inverse Dual DoF transformation") +- { +- Vector w; +- +- vt = v; T.TransformDual(vt); +- w = vt; T.InvTransformDual(w); +- +- w -= v; +- +- REQUIRE(w.Norml2() < tol * v.Norml2()); +- } +- SECTION("Inner product with linear form f(v)") +- { +- vt = v; T.TransformPrimal(vt); +- ft = f; T.TransformDual(ft); +- +- double fv = f * v; +- +- REQUIRE(fabs(fv - ft * vt) < tol * fabs(fv)); +- } +- } +- } +-} +- + } // namespace doftrans diff --git a/extern/patch/mfem/patch_strumpack_solver_dev.diff b/extern/patch/mfem/patch_strumpack_solver_dev.diff index 3ca3069cb2..ac5d181b96 100644 --- a/extern/patch/mfem/patch_strumpack_solver_dev.diff +++ b/extern/patch/mfem/patch_strumpack_solver_dev.diff @@ -1,1317 +1,1317 @@ -diff --git a/INSTALL b/INSTALL -index 
b7fdfa3af..aad927a56 100644 ---- a/INSTALL -+++ b/INSTALL -@@ -659,8 +659,7 @@ The specific libraries and their options are: - requires the PT-Scotch and Scalapack libraries as well as ParMETIS, which - includes METIS 5 in its distribution. Starting with STRUMPACK v2.2.0, ParMETIS - and PT-Scotch are optional dependencies. -- The support for STRUMPACK was added in MFEM v3.3.2 and it requires STRUMPACK -- 2.0.0 or later. -+ The support for STRUMPACK was added in MFEM v3.3.2. - URL: http://portal.nersc.gov/project/sparse/strumpack - Options: STRUMPACK_OPT, STRUMPACK_LIB. - Versions: STRUMPACK >= 3.0.0. -diff --git a/config/defaults.cmake b/config/defaults.cmake -index 4bd4cdf8d..3985ebd93 100644 ---- a/config/defaults.cmake -+++ b/config/defaults.cmake -@@ -152,7 +152,8 @@ set(STRUMPACK_DIR "${MFEM_DIR}/../STRUMPACK-build" CACHE PATH - # STRUMPACK may also depend on "OpenMP", depending on how it was compiled. - # Starting with v2.2.0 of STRUMPACK, ParMETIS and Scotch are optional. - set(STRUMPACK_REQUIRED_PACKAGES "MPI" "MPI_Fortran" "ParMETIS" "METIS" -- "ScaLAPACK" "Scotch/ptscotch/ptscotcherr/scotch/scotcherr" CACHE STRING -+ "Scotch/ptscotch/ptscotcherr/scotch/scotcherr" -+ "ScaLAPACK" "LAPACK" "BLAS" CACHE STRING - "Additional packages required by STRUMPACK.") - # If the MPI package does not find all required Fortran libraries: - # set(STRUMPACK_REQUIRED_LIBRARIES "gfortran" "mpi_mpifh" CACHE STRING -diff --git a/examples/ex11p.cpp b/examples/ex11p.cpp -index 216a6f443..eca3ce929 100644 ---- a/examples/ex11p.cpp -+++ b/examples/ex11p.cpp -@@ -262,12 +262,13 @@ int main(int argc, char *argv[]) - #ifdef MFEM_USE_STRUMPACK - if (sp_solver) - { -- STRUMPACKSolver * strumpack = new STRUMPACKSolver(argc, argv, MPI_COMM_WORLD); -+ STRUMPACKSolver * strumpack = new STRUMPACKSolver(MPI_COMM_WORLD, argc, argv); - strumpack->SetPrintFactorStatistics(true); - strumpack->SetPrintSolveStatistics(false); - strumpack->SetKrylovSolver(strumpack::KrylovSolver::DIRECT); - strumpack->SetReorderingStrategy(strumpack::ReorderingStrategy::METIS); -- strumpack->DisableMatching(); -+ strumpack->SetMatching(strumpack::MatchingJob::NONE); -+ strumpack->SetCompression(strumpack::CompressionType::NONE); - strumpack->SetOperator(*Arow); - strumpack->SetFromCommandLine(); - precond = strumpack; -diff --git a/examples/ex25p.cpp b/examples/ex25p.cpp -index e3848b848..cf5daf412 100644 ---- a/examples/ex25p.cpp -+++ b/examples/ex25p.cpp -@@ -170,6 +170,7 @@ int main(int argc, char *argv[]) - bool herm_conv = true; - bool slu_solver = false; - bool mumps_solver = false; -+ bool strumpack_solver = false; - bool visualization = 1; - bool pa = false; - const char *device_config = "cpu"; -@@ -200,6 +201,11 @@ int main(int argc, char *argv[]) - #ifdef MFEM_USE_MUMPS - args.AddOption(&mumps_solver, "-mumps", "--mumps-solver", "-no-mumps", - "--no-mumps-solver", "Use the MUMPS Solver."); -+#endif -+#ifdef MFEM_USE_STRUMPACK -+ args.AddOption(&strumpack_solver, "-strumpack", "--strumpack-solver", -+ "-no-strumpack", "--no-strumpack-solver", -+ "Use the STRUMPACK Solver."); - #endif - args.AddOption(&visualization, "-vis", "--visualization", "-no-vis", - "--no-visualization", -@@ -209,13 +215,14 @@ int main(int argc, char *argv[]) - args.AddOption(&device_config, "-d", "--device", - "Device configuration string, see Device::Configure()."); - args.Parse(); -- if (slu_solver && mumps_solver) -+ if (slu_solver + mumps_solver + strumpack_solver > 1) - { - if (myid == 0) -- cout << "WARNING: Both SuperLU and MUMPS have been selected," 
-- << " please choose either one." << endl -+ cout << "WARNING: More than one of SuperLU, MUMPS, and STRUMPACK have" -+ << " been selected, please choose only one." << endl - << " Defaulting to SuperLU." << endl; - mumps_solver = false; -+ strumpack_solver = false; - } - - if (iprob > 4) { iprob = 4; } -@@ -474,6 +481,24 @@ int main(int argc, char *argv[]) - delete A; - } - #endif -+#ifdef MFEM_USE_STRUMPACK -+ if (!pa && strumpack_solver) -+ { -+ HypreParMatrix *A = Ah.As()->GetSystemMatrix(); -+ STRUMPACKRowLocMatrix SA(*A); -+ STRUMPACKSolver strumpack(MPI_COMM_WORLD, argc, argv); -+ strumpack.SetPrintFactorStatistics(false); -+ strumpack.SetPrintSolveStatistics(false); -+ strumpack.SetKrylovSolver(strumpack::KrylovSolver::DIRECT); -+ strumpack.SetReorderingStrategy(strumpack::ReorderingStrategy::METIS); -+ strumpack.SetMatching(strumpack::MatchingJob::NONE); -+ strumpack.SetCompression(strumpack::CompressionType::NONE); -+ strumpack.SetFromCommandLine(); -+ strumpack.SetOperator(SA); -+ strumpack.Mult(B, X); -+ delete A; -+ } -+#endif - #ifdef MFEM_USE_MUMPS - if (!pa && mumps_solver) - { -@@ -493,7 +518,7 @@ int main(int argc, char *argv[]) - // - // In PML: 1/mu (abs(1/det(J) J^T J) Curl E, Curl F) - // + omega^2 * epsilon (abs(det(J) * (J^T J)^-1) * E, F) -- if (pa || (!slu_solver && !mumps_solver)) -+ if (pa || (!slu_solver && !mumps_solver && !strumpack_solver)) - { - ConstantCoefficient absomeg(pow(omega, 2) * epsilon); - RestrictedCoefficient restr_absomeg(absomeg,attr); -diff --git a/examples/petsc/ex11p.cpp b/examples/petsc/ex11p.cpp -index 51238c4d7..e6f4730fe 100644 ---- a/examples/petsc/ex11p.cpp -+++ b/examples/petsc/ex11p.cpp -@@ -273,12 +273,13 @@ int main(int argc, char *argv[]) - #ifdef MFEM_USE_STRUMPACK - if (sp_solver) - { -- STRUMPACKSolver * strumpack = new STRUMPACKSolver(argc, argv, MPI_COMM_WORLD); -+ STRUMPACKSolver * strumpack = new STRUMPACKSolver(MPI_COMM_WORLD, argc, argv); - strumpack->SetPrintFactorStatistics(true); - strumpack->SetPrintSolveStatistics(false); - strumpack->SetKrylovSolver(strumpack::KrylovSolver::DIRECT); - strumpack->SetReorderingStrategy(strumpack::ReorderingStrategy::METIS); -- strumpack->DisableMatching(); -+ strumpack->SetMatching(strumpack::MatchingJob::NONE); -+ strumpack->SetCompression(strumpack::CompressionType::NONE); - strumpack->SetOperator(*Arow); - strumpack->SetFromCommandLine(); - precond = strumpack; -diff --git a/general/communication.cpp b/general/communication.cpp -index 10fa6988c..0c2fffc1f 100644 ---- a/general/communication.cpp -+++ b/general/communication.cpp -@@ -26,6 +26,10 @@ - #include "sort_pairs.hpp" - #include "globals.hpp" - -+#ifdef MFEM_USE_STRUMPACK -+#include // STRUMPACK_USE_PTSCOTCH, etc. 
-+#endif -+ - #include - #include - -@@ -34,6 +38,14 @@ using namespace std; - namespace mfem - { - -+#if defined(MFEM_USE_STRUMPACK) && \ -+ (defined(STRUMPACK_USE_PTSCOTCH) || defined(STRUMPACK_USE_SLATE_SCALAPACK)) -+int Mpi::default_thread_required = MPI_THREAD_MULTIPLE; -+#else -+int Mpi::default_thread_required = MPI_THREAD_SINGLE; -+#endif -+ -+ - GroupTopology::GroupTopology(const GroupTopology >) - : MyComm(gt.MyComm), - group_lproc(gt.group_lproc) -diff --git a/general/communication.hpp b/general/communication.hpp -index 474486f1b..46d4f9f21 100644 ---- a/general/communication.hpp -+++ b/general/communication.hpp -@@ -22,7 +22,6 @@ - #include "globals.hpp" - #include - -- - namespace mfem - { - -@@ -32,10 +31,25 @@ namespace mfem - class Mpi - { - public: -- /// Singleton creation with Mpi::Init(); -- static void Init() { Init_(NULL, NULL); } -- /// Singleton creation with Mpi::Init(argc,argv); -- static void Init(int &argc, char **&argv) { Init_(&argc, &argv); } -+ /// Singleton creation with Mpi::Init(argc, argv). -+ static void Init(int &argc, char **&argv, -+ int required = default_thread_required, -+ int *provided = nullptr) -+ { Init(&argc, &argv, required, provided); } -+ /// Singleton creation with Mpi::Init(). -+ static void Init(int *argc = nullptr, char ***argv = nullptr, -+ int required = default_thread_required, -+ int *provided = nullptr) -+ { -+ MFEM_VERIFY(!IsInitialized(), "MPI already initialized!"); -+ int mpi_provided; -+ int mpi_err = MPI_Init_thread(argc, argv, required, &mpi_provided); -+ MFEM_VERIFY(!mpi_err, "error in MPI_Init()!"); -+ if (provided) { *provided = mpi_provided; } -+ // The Mpi singleton object below needs to be created after MPI_Init() for -+ // some MPI implementations. -+ Singleton(); -+ } - /// Finalize MPI (if it has been initialized and not yet already finalized). - static void Finalize() - { -@@ -71,20 +85,19 @@ public: - } - /// Return true if the rank in MPI_COMM_WORLD is zero. - static bool Root() { return WorldRank() == 0; } -+ /// Default level of thread support for MPI_Init_thread. -+ static MFEM_EXPORT int default_thread_required; - private: -- /// Initialize MPI -- static void Init_(int *argc, char ***argv) -+ /// Initialize the Mpi singleton. -+ static Mpi &Singleton() - { -- MFEM_VERIFY(!IsInitialized(), "MPI already initialized!") -- MPI_Init(argc, argv); -- // The "mpi" object below needs to be created after MPI_Init() for some -- // MPI implementations - static Mpi mpi; -+ return mpi; - } -- /// Finalize MPI -+ /// Finalize MPI. - ~Mpi() { Finalize(); } -- /// Prevent direct construction of objects of this class -- Mpi() { } -+ /// Prevent direct construction of objects of this class. -+ Mpi() {} - }; - - /** @brief A simple convenience class based on the Mpi singleton class above. 
-diff --git a/linalg/strumpack.cpp b/linalg/strumpack.cpp -index f0ff11ab4..270a4483a 100644 ---- a/linalg/strumpack.cpp -+++ b/linalg/strumpack.cpp -@@ -16,238 +16,471 @@ - - #include "strumpack.hpp" - --using namespace std; --using namespace strumpack; -- - namespace mfem - { - - STRUMPACKRowLocMatrix::STRUMPACKRowLocMatrix(MPI_Comm comm, -- int num_loc_rows, int first_loc_row, -- int glob_nrows, int glob_ncols, -- int *I, int *J, double *data) -- : comm_(comm), A_(NULL) -+ int num_loc_rows, -+ HYPRE_BigInt first_loc_row, -+ HYPRE_BigInt glob_nrows, -+ HYPRE_BigInt glob_ncols, -+ int *I, HYPRE_BigInt *J, -+ double *data, bool sym_sparse) - { - // Set mfem::Operator member data - height = num_loc_rows; - width = num_loc_rows; - -- // Allocate STRUMPACK's CSRMatrixMPI -- int nprocs, rank; -- MPI_Comm_rank(comm_, &rank); -- MPI_Comm_size(comm_, &nprocs); -- int * dist = new int[nprocs + 1]; -- dist[rank + 1] = first_loc_row + num_loc_rows; -+ // Allocate STRUMPACK's CSRMatrixMPI (copies all inputs) -+ int rank, nprocs; -+ MPI_Comm_rank(comm, &rank); -+ MPI_Comm_size(comm, &nprocs); -+ Array dist(nprocs + 1); - dist[0] = 0; -- MPI_Allgather(MPI_IN_PLACE, 0, MPI_INT, dist + 1, 1, MPI_INT, comm_); -- A_ = new CSRMatrixMPI(num_loc_rows, I, J, data, dist, comm_, false); -- delete[] dist; -+ dist[rank + 1] = first_loc_row + (HYPRE_BigInt)num_loc_rows; -+ MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, -+ dist.GetData() + 1, 1, HYPRE_MPI_BIG_INT, comm); -+ -+#if !(defined(HYPRE_BIGINT) || defined(HYPRE_MIXEDINT)) -+ A_ = new strumpack::CSRMatrixMPI( -+ (HYPRE_BigInt)num_loc_rows, I, J, data, dist.GetData(), -+ comm, sym_sparse); -+#else -+ Array II(num_loc_rows+1); -+ for (int i = 0; i <= num_loc_rows; i++) { II[i] = (HYPRE_BigInt)I[i]; } -+ A_ = new strumpack::CSRMatrixMPI( -+ (HYPRE_BigInt)num_loc_rows, II.GetData(), J, data, dist.GetData(), -+ comm, sym_sparse); -+#endif - } - --STRUMPACKRowLocMatrix::STRUMPACKRowLocMatrix(const HypreParMatrix & hypParMat) -- : comm_(hypParMat.GetComm()), -- A_(NULL) -+STRUMPACKRowLocMatrix::STRUMPACKRowLocMatrix(const Operator &op, -+ bool sym_sparse) - { -- // First cast the parameter to a hypre_ParCSRMatrix -- hypre_ParCSRMatrix * parcsr_op = -- (hypre_ParCSRMatrix *)const_cast(hypParMat); -+ const HypreParMatrix *APtr = dynamic_cast(&op); -+ MFEM_VERIFY(APtr, "Not a compatible matrix type"); -+ MPI_Comm comm = APtr->GetComm(); - -- MFEM_ASSERT(parcsr_op != NULL,"STRUMPACK: const_cast failed in SetOperator"); -+ // Set mfem::Operator member data -+ height = op.Height(); -+ width = op.Width(); - -- // Create the CSRMatrixMPI A_ by borrowing the internal data from a -- // hypre_CSRMatrix. -- hypParMat.HostRead(); -- hypre_CSRMatrix * csr_op = hypre_MergeDiagAndOffd(parcsr_op); -- hypParMat.HypreRead(); -- hypre_CSRMatrixSetDataOwner(csr_op,0); -+ // First cast the parameter to a hypre_ParCSRMatrix -+ hypre_ParCSRMatrix *parcsr_op = -+ (hypre_ParCSRMatrix *)const_cast(*APtr); -+ -+ // Create the CSRMatrixMPI A by taking the internal data from a -+ // hypre_CSRMatrix -+ APtr->HostRead(); -+ hypre_CSRMatrix *csr_op = hypre_MergeDiagAndOffd(parcsr_op); -+ APtr->HypreRead(); -+ HYPRE_Int *Iptr = csr_op->i; - #if MFEM_HYPRE_VERSION >= 21600 -- // For now, this method assumes that HYPRE_Int is int. Also, csr_op->num_cols -- // is of type HYPRE_Int, so if we want to check for big indices in -- // csr_op->big_j, we'll have to check all entries and that check will only be -- // necessary in HYPRE_MIXEDINT mode which is not supported at the moment. 
-- hypre_CSRMatrixBigJtoJ(csr_op); -+ HYPRE_BigInt *Jptr = csr_op->big_j; -+#else -+ HYPRE_Int *Jptr = csr_op->j; - #endif -+ double *data = csr_op->data; - -- height = csr_op->num_rows; -- width = csr_op->num_rows; -+ HYPRE_BigInt fst_row = parcsr_op->first_row_index; -+ HYPRE_Int m_loc = csr_op->num_rows; - -- int nprocs, rank; -- MPI_Comm_rank(comm_, &rank); -- MPI_Comm_size(comm_, &nprocs); -- int * dist = new int[nprocs + 1]; -- dist[rank + 1] = parcsr_op->first_row_index + csr_op->num_rows; -+ // Allocate STRUMPACK's CSRMatrixMPI -+ int rank, nprocs; -+ MPI_Comm_rank(comm, &rank); -+ MPI_Comm_size(comm, &nprocs); -+ Array dist(nprocs + 1); - dist[0] = 0; -- MPI_Allgather(MPI_IN_PLACE, 0, MPI_INT, dist + 1, 1, MPI_INT, comm_); -- A_ = new CSRMatrixMPI(csr_op->num_rows, csr_op->i, csr_op->j, -- csr_op->data, dist, comm_, false); -- delete[] dist; -+ dist[rank + 1] = fst_row + (HYPRE_BigInt)m_loc; -+ MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, -+ dist.GetData() + 1, 1, HYPRE_MPI_BIG_INT, comm); -+ -+#if !defined(HYPRE_MIXEDINT) -+ A_ = new strumpack::CSRMatrixMPI( -+ (HYPRE_BigInt)m_loc, Iptr, Jptr, data, dist.GetData(), -+ comm, sym_sparse); -+#else -+ Array II(m_loc+1); -+ for (int i = 0; i <= m_loc; i++) { II[i] = (HYPRE_BigInt)Iptr[i]; } -+ A_ = new strumpack::CSRMatrixMPI( -+ (HYPRE_BigInt)m_loc, II.GetData(), Jptr, data, dist.GetData(), -+ comm, sym_sparse); -+#endif - -- // Everything has been copied or abducted so delete the structure -+ // Everything has been copied so delete the structure - hypre_CSRMatrixDestroy(csr_op); - } - - STRUMPACKRowLocMatrix::~STRUMPACKRowLocMatrix() - { -- // Delete the struct -- if ( A_ != NULL ) { delete A_; } -+ delete A_; - } - --STRUMPACKSolver::STRUMPACKSolver( int argc, char* argv[], MPI_Comm comm ) -- : comm_(comm), -- APtr_(NULL), -- solver_(NULL) -+template -+STRUMPACKSolverBase:: -+STRUMPACKSolverBase(MPI_Comm comm, int argc, char *argv[]) -+ : APtr_(NULL), -+ factor_verbose_(false), -+ solve_verbose_(false), -+ reorder_reuse_(false), -+ nrhs_(-1) - { -- this->Init(argc, argv); -+ solver_ = new STRUMPACKSolverType(comm, argc, argv, false); - } - --STRUMPACKSolver::STRUMPACKSolver( STRUMPACKRowLocMatrix & A ) -- : comm_(A.GetComm()), -- APtr_(&A), -- solver_(NULL) -+template -+STRUMPACKSolverBase:: -+STRUMPACKSolverBase(STRUMPACKRowLocMatrix &A, int argc, char *argv[]) -+ : APtr_(&A), -+ factor_verbose_(false), -+ solve_verbose_(false), -+ reorder_reuse_(false), -+ nrhs_(-1) - { -- height = A.Height(); -- width = A.Width(); -+ solver_ = new STRUMPACKSolverType(A.GetComm(), argc, argv, false); -+ SetOperator(A); -+} - -- this->Init(0, NULL); -+template -+STRUMPACKSolverBase:: -+~STRUMPACKSolverBase() -+{ -+ delete solver_; - } - --STRUMPACKSolver::~STRUMPACKSolver() -+template -+void STRUMPACKSolverBase:: -+SetFromCommandLine() - { -- if ( solver_ != NULL ) { delete solver_; } -+ solver_->options().set_from_command_line(); - } - --void STRUMPACKSolver::Init( int argc, char* argv[] ) -+template -+void STRUMPACKSolverBase:: -+SetPrintFactorStatistics(bool print_stat) - { -- MPI_Comm_size(comm_, &numProcs_); -- MPI_Comm_rank(comm_, &myid_); -+ factor_verbose_ = print_stat; -+} - -- factor_verbose_ = false; -- solve_verbose_ = false; -+template -+void STRUMPACKSolverBase:: -+SetPrintSolveStatistics(bool print_stat) -+{ -+ solve_verbose_ = print_stat; -+} - -- solver_ = new StrumpackSparseSolverMPIDist(comm_, argc, argv, -- false); -+template -+void STRUMPACKSolverBase -+::SetRelTol(double rtol) -+{ -+ solver_->options().set_rel_tol(rtol); 
- } - --void STRUMPACKSolver::SetFromCommandLine( ) -+template -+void STRUMPACKSolverBase -+::SetAbsTol(double atol) - { -- solver_->options().set_from_command_line( ); -+ solver_->options().set_abs_tol(atol); - } - --void STRUMPACKSolver::SetPrintFactorStatistics( bool print_stat ) -+template -+void STRUMPACKSolverBase -+::SetMaxIter(int max_it) - { -- factor_verbose_ = print_stat; -+ solver_->options().set_maxit(max_it); - } - --void STRUMPACKSolver::SetPrintSolveStatistics( bool print_stat ) -+template -+void STRUMPACKSolverBase -+::SetReorderingReuse(bool reuse) - { -- solve_verbose_ = print_stat; -+ reorder_reuse_ = reuse; -+} -+ -+template -+void STRUMPACKSolverBase -+::EnableGPU() -+{ -+ solver_->options().enable_gpu(); - } - --void STRUMPACKSolver::SetKrylovSolver( strumpack::KrylovSolver method ) -+template -+void STRUMPACKSolverBase -+::DisableGPU() - { -- solver_->options().set_Krylov_solver( method ); -+ solver_->options().disable_gpu(); - } - --void STRUMPACKSolver::SetReorderingStrategy( strumpack::ReorderingStrategy -- method ) -+template -+void STRUMPACKSolverBase:: -+SetKrylovSolver(strumpack::KrylovSolver method) - { -- solver_->options().set_reordering_method( method ); -+ solver_->options().set_Krylov_solver(method); - } - --void STRUMPACKSolver::DisableMatching( ) -+template -+void STRUMPACKSolverBase:: -+SetReorderingStrategy(strumpack::ReorderingStrategy method) - { --#if STRUMPACK_VERSION_MAJOR >= 3 -- solver_->options().set_matching( strumpack::MatchingJob::NONE ); -+ solver_->options().set_reordering_method(method); -+} -+ -+template -+void STRUMPACKSolverBase:: -+SetMatching(strumpack::MatchingJob job) -+{ -+ solver_->options().set_matching(job); -+} -+ -+template -+void STRUMPACKSolverBase:: -+SetCompression(strumpack::CompressionType type) -+{ -+#if STRUMPACK_VERSION_MAJOR >= 5 -+ solver_->options().set_compression(type); - #else -- solver_->options().set_mc64job( strumpack::MC64Job::NONE ); -+ switch (type) -+ { -+ case strumpack::NONE: -+ solver_->options().disable_BLR(); -+ solver_->options().disable_HSS(); -+ break; -+ case strumpack::BLR: -+ solver_->options().enable_BLR(); -+ break; -+ case strumpack::HSS: -+ solver_->options().enable_HSS(); -+ break; -+ default: -+ MFEM_ABORT("Invalid compression type for STRUMPACK version " << -+ STRUMPACK_VERSION_MAJOR << "!"); -+ break; -+ } - #endif - } - --void STRUMPACKSolver::EnableMatching( ) -+template -+void STRUMPACKSolverBase:: -+SetCompressionRelTol(double rtol) - { --#if STRUMPACK_VERSION_MAJOR >= 3 -- solver_->options().set_matching -- ( strumpack::MatchingJob::MAX_DIAGONAL_PRODUCT_SCALING ); -+#if STRUMPACK_VERSION_MAJOR >= 5 -+ solver_->options().set_compression_rel_tol(rtol); - #else -- solver_->options().set_mc64job -- ( strumpack::MC64Job::MAX_DIAGONAL_PRODUCT_SCALING ); -+ solver_->options().BLR_options().set_rel_tol(rtol); -+ solver_->options().HSS_options().set_rel_tol(rtol); - #endif - } - --#if STRUMPACK_VERSION_MAJOR >= 3 --void STRUMPACKSolver::EnableParallelMatching( ) -+template -+void STRUMPACKSolverBase:: -+SetCompressionAbsTol(double atol) - { -- solver_->options().set_matching -- ( strumpack::MatchingJob::COMBBLAS ); --} -+#if STRUMPACK_VERSION_MAJOR >= 5 -+ solver_->options().set_compression_abs_tol(atol); -+#else -+ solver_->options().BLR_options().set_abs_tol(atol); -+ solver_->options().HSS_options().set_abs_tol(atol); - #endif -+} - --void STRUMPACKSolver::SetRelTol( double rtol ) -+#if STRUMPACK_VERSION_MAJOR >= 5 -+template -+void STRUMPACKSolverBase:: 
-+SetCompressionLossyPrecision(int precision) - { -- solver_->options().set_rel_tol( rtol ); -+ solver_->options().set_lossy_precision(precision); - } - --void STRUMPACKSolver::SetAbsTol( double atol ) -+template -+void STRUMPACKSolverBase:: -+SetCompressionButterflyLevels(int levels) - { -- solver_->options().set_abs_tol( atol ); -+ solver_->options().HODLR_options().set_butterfly_levels(levels); - } -+#endif - -- --void STRUMPACKSolver::Mult( const Vector & x, Vector & y ) const -+template -+void STRUMPACKSolverBase:: -+SetOperator(const Operator &op) - { -- MFEM_ASSERT(APtr_ != NULL, -- "STRUMPACK Error: The operator must be set before" -- " the system can be solved."); -- MFEM_ASSERT(x.Size() == Width(), "invalid x.Size() = " << x.Size() -- << ", expected size = " << Width()); -- MFEM_ASSERT(y.Size() == Height(), "invalid y.Size() = " << y.Size() -- << ", expected size = " << Height()); -+ // Verify that we have a compatible operator -+ bool first_mat = !APtr_; -+ APtr_ = dynamic_cast(&op); -+ MFEM_VERIFY(APtr_, -+ "STRUMPACK: Operator is not a STRUMPACKRowLocMatrix!"); - -- double* yPtr = y.HostWrite(); -- const double* xPtr = x.HostRead(); -+ // Set mfem::Operator member data -+ height = op.Height(); -+ width = op.Width(); - -- solver_->options().set_verbose( factor_verbose_ ); -- ReturnCode ret = solver_->factor(); -- switch (ret) -+ if (first_mat || !reorder_reuse_) - { -- case ReturnCode::SUCCESS: break; -- case ReturnCode::MATRIX_NOT_SET: -- { -- MFEM_ABORT("STRUMPACK: Matrix was not set!"); -- } -- break; -- case ReturnCode::REORDERING_ERROR: -- { -- MFEM_ABORT("STRUMPACK: Matrix reordering failed!"); -- } -- break; -- default: -- { -- MFEM_ABORT("STRUMPACK: 'factor()' error code = " << ret); -- } -+ solver_->set_matrix(*(APtr_->GetA())); -+ } -+ else -+ { -+ solver_->update_matrix_values(*(APtr_->GetA())); - } -- solver_->options().set_verbose( solve_verbose_ ); -- solver_->solve(xPtr, yPtr); -+} - -+template -+void STRUMPACKSolverBase:: -+FactorInternal() const -+{ -+ MFEM_ASSERT(APtr_, -+ "STRUMPACK: Operator must be set before the system can be " -+ "solved!"); -+ solver_->options().set_verbose(factor_verbose_); -+ strumpack::ReturnCode ret = solver_->factor(); -+ if (ret != strumpack::ReturnCode::SUCCESS) -+ { -+#if STRUMPACK_VERSION_MAJOR >= 7 -+ MFEM_ABORT("STRUMPACK: Factor failed with return code " << ret << "!"); -+#else -+ MFEM_ABORT("STRUMPACK: Factor failed!"); -+#endif -+ } - } - --void STRUMPACKSolver::SetOperator( const Operator & op ) -+template -+void STRUMPACKSolverBase:: -+Mult(const Vector &x, Vector &y) const - { -- // Verify that we have a compatible operator -- APtr_ = dynamic_cast(&op); -- if ( APtr_ == NULL ) -+ MFEM_ASSERT(x.Size() == Width(), -+ "STRUMPACK: Invalid x.Size() = " << x.Size() << -+ ", expected size = " << Width() << "!"); -+ MFEM_ASSERT(y.Size() == Height(), -+ "STRUMPACK: Invalid y.Size() = " << y.Size() << -+ ", expected size = " << Height() << "!"); -+ -+ const double *xPtr = x.HostRead(); -+ double *yPtr = y.HostReadWrite(); -+ -+ FactorInternal(); -+ solver_->options().set_verbose(solve_verbose_); -+ strumpack::ReturnCode ret = solver_->solve(xPtr, yPtr, false); -+ if (ret != strumpack::ReturnCode::SUCCESS) - { -- mfem_error("STRUMPACKSolver::SetOperator : not STRUMPACKRowLocMatrix!"); -+#if STRUMPACK_VERSION_MAJOR >= 7 -+ MFEM_ABORT("STRUMPACK: Solve failed with return code " << ret << "!"); -+#else -+ MFEM_ABORT("STRUMPACK: Solve failed!"); -+#endif - } -+} - -- solver_->set_matrix( *(APtr_->getA()) ); -+template -+void 
STRUMPACKSolverBase:: -+ArrayMult(const Array &X, Array &Y) const -+{ -+ MFEM_ASSERT(X.Size() == Y.Size(), -+ "Number of columns mismatch in STRUMPACK solve!"); -+ if (X.Size() == 1) -+ { -+ nrhs_ = 1; -+ MFEM_ASSERT(X[0] && Y[0], "Missing Vector in STRUMPACK solve!"); -+ Mult(*X[0], *Y[0]); -+ return; -+ } - -- // Set mfem::Operator member data -- height = op.Height(); -- width = op.Width(); -+ // Multiple RHS case -+ int ldx = Height(); -+ if (nrhs_ != X.Size()) -+ { -+ rhs_.SetSize(X.Size() * ldx); -+ sol_.SetSize(X.Size() * ldx); -+ nrhs_ = X.Size(); -+ } -+ for (int i = 0; i < nrhs_; i++) -+ { -+ MFEM_ASSERT(X[i] && X[i]->Size() == Width(), -+ "STRUMPACK: Missing or invalid sized RHS Vector in solve!"); -+ Vector s(rhs_, i * ldx, ldx); -+ s = *X[i]; -+ rhs_.SyncMemory(s); // Update flags for rhs_ if updated on device -+ } -+ const double *xPtr = rhs_.HostRead(); -+ double *yPtr = sol_.HostReadWrite(); -+ -+ FactorInternal(); -+ solver_->options().set_verbose(solve_verbose_); -+ strumpack::ReturnCode ret = solver_->solve(nrhs_, xPtr, ldx, yPtr, ldx, -+ false); -+ if (ret != strumpack::ReturnCode::SUCCESS) -+ { -+#if STRUMPACK_VERSION_MAJOR >= 7 -+ MFEM_ABORT("STRUMPACK: Solve failed with return code " << ret << "!"); -+#else -+ MFEM_ABORT("STRUMPACK: Solve failed!"); -+#endif -+ } - -+ for (int i = 0; i < nrhs_; i++) -+ { -+ MFEM_ASSERT(Y[i] && Y[i]->Size() == Width(), -+ "STRUMPACK: Missing or invalid sized solution Vector in solve!"); -+ Vector s(sol_, i * ldx, ldx); -+ *Y[i] = s; -+ } - } - -+STRUMPACKSolver:: -+STRUMPACKSolver(MPI_Comm comm) -+ : STRUMPACKSolverBase> -+ (comm, 0, NULL) {} -+ -+STRUMPACKSolver:: -+STRUMPACKSolver(STRUMPACKRowLocMatrix &A) -+ : STRUMPACKSolverBase> -+ (A, 0, NULL) {} -+ -+STRUMPACKSolver:: -+STRUMPACKSolver(MPI_Comm comm, int argc, char *argv[]) -+ : STRUMPACKSolverBase> -+ (comm, argc, argv) {} -+ -+STRUMPACKSolver:: -+STRUMPACKSolver(STRUMPACKRowLocMatrix &A, int argc, char *argv[]) -+ : STRUMPACKSolverBase> -+ (A, argc, argv) {} -+ -+#if STRUMPACK_VERSION_MAJOR >= 7 -+STRUMPACKMixedPrecisionSolver:: -+STRUMPACKMixedPrecisionSolver(MPI_Comm comm) -+ : STRUMPACKSolverBase> -+ (comm, 0, NULL) {} -+ -+STRUMPACKMixedPrecisionSolver:: -+STRUMPACKMixedPrecisionSolver(STRUMPACKRowLocMatrix &A) -+ : STRUMPACKSolverBase> -+ (A, 0, NULL) {} -+ -+STRUMPACKMixedPrecisionSolver:: -+STRUMPACKMixedPrecisionSolver(MPI_Comm comm, int argc, char *argv[]) -+ : STRUMPACKSolverBase> -+ (comm, argc, argv) {} -+ -+STRUMPACKMixedPrecisionSolver:: -+STRUMPACKMixedPrecisionSolver(STRUMPACKRowLocMatrix &A, int argc, char *argv[]) -+ : STRUMPACKSolverBase> -+ (A, argc, argv) {} -+#endif -+ -+template class STRUMPACKSolverBase>; -+#if STRUMPACK_VERSION_MAJOR >= 7 -+template class STRUMPACKSolverBase>; -+#endif -+ - } // mfem namespace - - #endif // MFEM_USE_MPI -diff --git a/linalg/strumpack.hpp b/linalg/strumpack.hpp -index 300b8415e..42ae555c7 100644 ---- a/linalg/strumpack.hpp -+++ b/linalg/strumpack.hpp -@@ -16,12 +16,14 @@ - - #ifdef MFEM_USE_STRUMPACK - #ifdef MFEM_USE_MPI -+ - #include "operator.hpp" - #include "hypre.hpp" -- - #include - -+// STRUMPACK headers - #include "StrumpackSparseSolverMPIDist.hpp" -+#include "StrumpackSparseSolverMixedPrecisionMPIDist.hpp" - - namespace mfem - { -@@ -34,63 +36,80 @@ public: - be of size (local) nrows by (global) glob_ncols. The new parallel matrix - contains copies of all input arrays (so they can be deleted). 
*/ - STRUMPACKRowLocMatrix(MPI_Comm comm, -- int num_loc_rows, int first_loc_row, -- int glob_nrows, int glob_ncols, -- int *I, int *J, double *data); -+ int num_loc_rows, HYPRE_BigInt first_loc_row, -+ HYPRE_BigInt glob_nrows, HYPRE_BigInt glob_ncols, -+ int *I, HYPRE_BigInt *J, double *data, -+ bool sym_sparse = false); - - /** Creates a copy of the parallel matrix hypParMat in STRUMPACK's RowLoc - format. All data is copied so the original matrix may be deleted. */ -- STRUMPACKRowLocMatrix(const HypreParMatrix & hypParMat); -+ STRUMPACKRowLocMatrix(const Operator &op, bool sym_sparse = false); - - ~STRUMPACKRowLocMatrix(); - - void Mult(const Vector &x, Vector &y) const - { -- mfem_error("STRUMPACKRowLocMatrix::Mult(...)\n" -- " matrix vector products are not supported."); -+ MFEM_ABORT("STRUMPACKRowLocMatrix::Mult: Matrix vector products are not " -+ "supported!"); - } - -- MPI_Comm GetComm() const { return comm_; } -+ MPI_Comm GetComm() const { return A_->comm(); } - -- strumpack::CSRMatrixMPI* getA() const { return A_; } -+ strumpack::CSRMatrixMPI *GetA() const { return A_; } - - private: -- MPI_Comm comm_; -- strumpack::CSRMatrixMPI* A_; -- --}; // mfem::STRUMPACKRowLocMatrix -+ strumpack::CSRMatrixMPI *A_; -+}; - - /** The MFEM STRUMPACK Direct Solver class. - - The mfem::STRUMPACKSolver class uses the STRUMPACK library to perform LU - factorization of a parallel sparse matrix. The solver is capable of handling -- double precision types. See http://portal.nersc.gov/project/sparse/strumpack -+ double precision types. See -+ http://portal.nersc.gov/project/sparse/strumpack/. - */ --class STRUMPACKSolver : public mfem::Solver -+template -+class STRUMPACKSolverBase : public Solver - { --public: -- // Constructor with MPI_Comm parameter. -- STRUMPACKSolver( int argc, char* argv[], MPI_Comm comm ); -+protected: -+ // Constructor with MPI_Comm parameter and command line arguments. -+ STRUMPACKSolverBase(MPI_Comm comm, int argc, char *argv[]); - -- // Constructor with STRUMPACK Matrix Object. -- STRUMPACKSolver( STRUMPACKRowLocMatrix & A); -+ // Constructor with STRUMPACK matrix object and command line arguments. -+ STRUMPACKSolverBase(STRUMPACKRowLocMatrix &A, int argc, char *argv[]); - -+public: - // Default destructor. -- ~STRUMPACKSolver( void ); -+ virtual ~STRUMPACKSolverBase(); - - // Factor and solve the linear system y = Op^{-1} x. -- void Mult( const Vector & x, Vector & y ) const; -+ void Mult(const Vector &x, Vector &y) const; -+ void ArrayMult(const Array &X, Array &Y) const; - - // Set the operator. -- void SetOperator( const Operator & op ); -+ void SetOperator(const Operator &op); - - // Set various solver options. Refer to STRUMPACK documentation for - // details. -- void SetFromCommandLine( ); -- void SetPrintFactorStatistics( bool print_stat ); -- void SetPrintSolveStatistics( bool print_stat ); -- void SetRelTol( double rtol ); -- void SetAbsTol( double atol ); -+ void SetFromCommandLine(); -+ void SetPrintFactorStatistics(bool print_stat); -+ void SetPrintSolveStatistics(bool print_stat); -+ -+ // Set tolerances and iterations for iterative solvers. Compression -+ // tolerance is handled below. -+ void SetRelTol(double rtol); -+ void SetAbsTol(double atol); -+ void SetMaxIter(int max_it); -+ -+ // Set the flag controlling reuse of the symbolic factorization for multiple -+ // operators. This method has to be called before repeated calls to -+ // SetOperator. 
-+ void SetReorderingReuse(bool reuse); -+ -+ // Enable or not GPU off-loading available if STRUMPACK was compiled with CUDA. Note -+ // that input/output from MFEM to STRUMPACK is all still through host memory. -+ void EnableGPU(); -+ void DisableGPU(); - - /** - * STRUMPACK is an (approximate) direct solver. It can be used as a direct -@@ -100,70 +119,153 @@ public: - * used without preconditioner. - * - * Supported values are: -- * AUTO: Use iterative refinement if no HSS compression is used, -- * otherwise use GMRes. -- * DIRECT: No outer iterative solver, just a single application of -- * the multifrontal solver. -- * REFINE: Iterative refinement. -- * PREC_GMRES: Preconditioned GMRes. -- * The preconditioner is the (approx) multifrontal solver. -- * GMRES: UN-preconditioned GMRes. (for testing mainly) -- * PREC_BICGSTAB: Preconditioned BiCGStab. -- * The preconditioner is the (approx) multifrontal solver. -+ * AUTO: Use iterative refinement if no HSS compression is -+ * used, otherwise use GMRes -+ * DIRECT: No outer iterative solver, just a single application -+ * of the multifrontal solver -+ * REFINE: Iterative refinement -+ * PREC_GMRES: Preconditioned GMRes -+ * The preconditioner is the (approx) multifrontal solver -+ * GMRES: UN-preconditioned GMRes (for testing mainly) -+ * PREC_BICGSTAB: Preconditioned BiCGStab -+ * The preconditioner is the (approx) multifrontal solver - * BICGSTAB: UN-preconditioned BiCGStab. (for testing mainly) - */ -- void SetKrylovSolver( strumpack::KrylovSolver method ); -+ void SetKrylovSolver(strumpack::KrylovSolver method); - - /** - * Supported reorderings are: -- * METIS, PARMETIS, SCOTCH, PTSCOTCH, RCM -+ * NATURAL: Do not reorder the system -+ * METIS: Use Metis nested-dissection reordering (default) -+ * PARMETIS: Use ParMetis nested-dissection reordering -+ * SCOTCH: Use Scotch nested-dissection reordering -+ * PTSCOTCH: Use PT-Scotch nested-dissection reordering -+ * RCM: Use RCM reordering -+ * GEOMETRIC: A simple geometric nested dissection code that -+ * only works for regular meshes -+ * AMD: Approximate minimum degree -+ * MMD: Multiple minimum degree -+ * AND: Nested dissection -+ * MLF: Minimum local fill -+ * SPECTRAL: Spectral nested dissection - */ -- void SetReorderingStrategy( strumpack::ReorderingStrategy method ); -+ void SetReorderingStrategy(strumpack::ReorderingStrategy method); - - /** -- * Disable static pivoting for stability. The static pivoting in strumpack -+ * Configure static pivoting for stability. The static pivoting in STRUMPACK - * permutes the sparse input matrix in order to get large (nonzero) elements - * on the diagonal. If the input matrix is already diagonally dominant, this - * reordering can be disabled. -+ * -+ * Supported matching algorithms are: -+ * NONE: Don't do anything -+ * MAX_CARDINALITY: Maximum cardinality -+ * MAX_SMALLEST_DIAGONAL: Maximum smallest diagonal value -+ * MAX_SMALLEST_DIAGONAL_2: Same as MAX_SMALLEST_DIAGONAL -+ * but different algorithm -+ * MAX_DIAGONAL_SUM: Maximum sum of diagonal values -+ * MAX_DIAGONAL_PRODUCT_SCALING: Maximum product of diagonal values -+ * and row and column scaling (default) -+ * COMBBLAS: Use AWPM from CombBLAS (only with -+ * version >= 3) - */ -- void DisableMatching(); -- -- /** -- * Enable static pivoting for stability using the MC64 algorithm with -- * job=5. Using a matching algorithm, this will permute the sparse input -- * matrix in order to get nonzero elements (as large as possible) on the -- * diagonal. 
And will also scale the rows and columns of the matrix. -- */ -- void EnableMatching(); -+ void SetMatching(strumpack::MatchingJob job); - --#if STRUMPACK_VERSION_MAJOR >= 3 - /** -- * Use the AWPM (approximate weight perfect matching) algorithm from the -- * Combinatorial BLAS library for static pivoting, i.e. getting large -- * nonzeros on the diagonal. This requires that strumpack was compiled with -- * support for Combinatorial BLAS. -+ * Enable support for rank-structured data formats, which can be used -+ * for compression within the sparse solver. -+ * -+ * Supported compression types are: -+ * NONE: No compression, purely direct solver (default) -+ * HSS: HSS compression of frontal matrices -+ * BLR: Block low-rank compression of fronts -+ * HODLR: Hierarchically Off-diagonal Low-Rank -+ * compression of frontal matrices -+ * BLR_HODLR: Block low-rank compression of medium -+ * fronts and Hierarchically Off-diagonal -+ * Low-Rank compression of large fronts -+ * ZFP_BLR_HODLR: ZFP compression for small fronts, -+ * Block low-rank compression of medium -+ * fronts and Hierarchically Off-diagonal -+ * Low-Rank compression of large fronts -+ * LOSSLESS: Lossless compression -+ * LOSSY: Lossy compression -+ * -+ * For versions of STRUMPACK < 5, we support only NONE, HSS, and BLR. -+ * BLR_HODLR and ZPR_BLR_HODLR are supported in STRUMPACK >= 6. - */ -- void EnableParallelMatching(); -+ void SetCompression(strumpack::CompressionType type); -+ void SetCompressionRelTol(double rtol); -+ void SetCompressionAbsTol(double atol); -+#if STRUMPACK_VERSION_MAJOR >= 5 -+ void SetCompressionLossyPrecision(int precision); -+ void SetCompressionButterflyLevels(int levels); - #endif - - private: -- void Init( int argc, char* argv[] ); -+ // Helper method for calling the STRUMPACK factoriation routine. -+ void FactorInternal() const; - - protected: -- -- MPI_Comm comm_; -- int numProcs_; -- int myid_; -+ const STRUMPACKRowLocMatrix *APtr_; -+ STRUMPACKSolverType *solver_; - - bool factor_verbose_; - bool solve_verbose_; -+ bool reorder_reuse_; -+ -+ mutable Vector rhs_, sol_; -+ mutable int nrhs_; -+}; - -- const STRUMPACKRowLocMatrix * APtr_; -- strumpack::StrumpackSparseSolverMPIDist * solver_; -+class STRUMPACKSolver : -+ public STRUMPACKSolverBase> -+{ -+public: -+ // Constructor with MPI_Comm parameter. -+ STRUMPACKSolver(MPI_Comm comm); -+ -+ // Constructor with STRUMPACK matrix object. -+ STRUMPACKSolver(STRUMPACKRowLocMatrix &A); - --}; // mfem::STRUMPACKSolver class -+ // Constructor with MPI_Comm parameter and command line arguments. -+ STRUMPACKSolver(MPI_Comm comm, int argc, char *argv[]); -+ MFEM_DEPRECATED STRUMPACKSolver(int argc, char *argv[], MPI_Comm comm) -+ : STRUMPACKSolver(comm, argc, argv) {} -+ -+ // Constructor with STRUMPACK matrix object and command line arguments. -+ STRUMPACKSolver(STRUMPACKRowLocMatrix &A, int argc, char *argv[]); -+ -+ // Destructor. -+ ~STRUMPACKSolver() {} -+}; -+ -+#if STRUMPACK_VERSION_MAJOR >= 7 -+class STRUMPACKMixedPrecisionSolver : -+ public STRUMPACKSolverBase> -+{ -+public: -+ // Constructor with MPI_Comm parameter. -+ STRUMPACKMixedPrecisionSolver(MPI_Comm comm); -+ -+ // Constructor with STRUMPACK matrix object. -+ STRUMPACKMixedPrecisionSolver(STRUMPACKRowLocMatrix &A); -+ -+ // Constructor with MPI_Comm parameter and command line arguments. -+ STRUMPACKMixedPrecisionSolver(MPI_Comm comm, int argc, char *argv[]); -+ -+ // Constructor with STRUMPACK matrix object and command line arguments. 
-+ STRUMPACKMixedPrecisionSolver(STRUMPACKRowLocMatrix &A, -+ int argc, char *argv[]); -+ -+ // Destructor. -+ ~STRUMPACKMixedPrecisionSolver() {} -+}; -+#endif - --} // mfem namespace -+} // namespace mfem - - #endif // MFEM_USE_MPI - #endif // MFEM_USE_STRUMPACK -diff --git a/linalg/superlu.cpp b/linalg/superlu.cpp -index 4633eb9ef..c120c4f22 100644 ---- a/linalg/superlu.cpp -+++ b/linalg/superlu.cpp -@@ -650,6 +650,7 @@ void SuperLUSolver::ArrayMult(const Array &X, - MFEM_ASSERT(X[i], "Missing Vector in SuperLUSolver::Mult!"); - Vector s(sol_, i * ldx, ldx); - s = *X[i]; -+ sol_.SyncMemory(s); // Update flags for sol_ if updated on device - } - } - -diff --git a/miniapps/multidomain/multidomain.cpp b/miniapps/multidomain/multidomain.cpp -index dfb669224..83ebb9813 100644 ---- a/miniapps/multidomain/multidomain.cpp -+++ b/miniapps/multidomain/multidomain.cpp -@@ -322,17 +322,19 @@ int main(int argc, char *argv[]) - - char vishost[] = "localhost"; - int visport = 19916; -- socketstream cyl_sol_sock(vishost, visport); -+ socketstream cyl_sol_sock; - if (visualization) - { -+ cyl_sol_sock.open(vishost, visport); - cyl_sol_sock << "parallel " << num_procs << " " << myid << "\n"; - cyl_sol_sock.precision(8); - cyl_sol_sock << "solution\n" << cylinder_submesh << temperature_cylinder_gf << - "pause\n" << std::flush; - } -- socketstream block_sol_sock(vishost, visport); -+ socketstream block_sol_sock; - if (visualization) - { -+ block_sol_sock.open(vishost, visport); - block_sol_sock << "parallel " << num_procs << " " << myid << "\n"; - block_sol_sock.precision(8); - block_sol_sock << "solution\n" << block_submesh << temperature_block_gf << -diff --git a/miniapps/nurbs/nurbs_ex11p.cpp b/miniapps/nurbs/nurbs_ex11p.cpp -index 7b8e3bd2d..e5cf95062 100644 ---- a/miniapps/nurbs/nurbs_ex11p.cpp -+++ b/miniapps/nurbs/nurbs_ex11p.cpp -@@ -281,12 +281,13 @@ int main(int argc, char *argv[]) - #ifdef MFEM_USE_STRUMPACK - if (sp_solver) - { -- STRUMPACKSolver * strumpack = new STRUMPACKSolver(argc, argv, MPI_COMM_WORLD); -+ STRUMPACKSolver * strumpack = new STRUMPACKSolver(MPI_COMM_WORLD, argc, argv); - strumpack->SetPrintFactorStatistics(true); - strumpack->SetPrintSolveStatistics(false); - strumpack->SetKrylovSolver(strumpack::KrylovSolver::DIRECT); - strumpack->SetReorderingStrategy(strumpack::ReorderingStrategy::METIS); -- strumpack->DisableMatching(); -+ strumpack->SetMatching(strumpack::MatchingJob::NONE); -+ strumpack->SetCompression(strumpack::CompressionType::NONE); - strumpack->SetOperator(*Arow); - strumpack->SetFromCommandLine(); - precond = strumpack; -diff --git a/tests/unit/linalg/test_direct_solvers.cpp b/tests/unit/linalg/test_direct_solvers.cpp -index 48a1ac30e..5b74a98fa 100644 ---- a/tests/unit/linalg/test_direct_solvers.cpp -+++ b/tests/unit/linalg/test_direct_solvers.cpp -@@ -26,6 +26,9 @@ using namespace mfem; - #ifdef MFEM_USE_SUPERLU - #define DIRECT_SOLVE_PARALLEL - #endif -+#ifdef MFEM_USE_STRUMPACK -+#define DIRECT_SOLVE_PARALLEL -+#endif - - #if defined(DIRECT_SOLVE_SERIAL) || defined(DIRECT_SOLVE_PARALLEL) - -@@ -103,7 +106,7 @@ TEST_CASE("Serial Direct Solvers", "[CUDA]") - Mesh mesh; - if (dim == 1) - { -- mesh = Mesh::MakeCartesian1D(ne, 1.0); -+ mesh = Mesh::MakeCartesian1D(ne, 1.0); - } - else if (dim == 2) - { -@@ -187,13 +190,13 @@ TEST_CASE("Parallel Direct Solvers", "[Parallel], [CUDA]") - { - int rank; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); -- const int ne = 2; -+ const int ne = 4; - for (int dim = 1; dim < 4; ++dim) - { - Mesh mesh; - if (dim == 1) - { -- mesh = 
Mesh::MakeCartesian1D(ne, 1.0); -+ mesh = Mesh::MakeCartesian1D(ne, 1.0); - } - else if (dim == 2) - { -@@ -312,6 +315,39 @@ TEST_CASE("Parallel Direct Solvers", "[Parallel], [CUDA]") - REQUIRE(error < 1.e-12); - } - #endif -+#ifdef MFEM_USE_STRUMPACK -+ // Transform to monolithic HypreParMatrix -+ { -+ STRUMPACKRowLocMatrix SA(*A.As()); -+ STRUMPACKSolver strumpack(MPI_COMM_WORLD); -+ strumpack.SetPrintFactorStatistics(false); -+ strumpack.SetPrintSolveStatistics(false); -+ strumpack.SetKrylovSolver(strumpack::KrylovSolver::DIRECT); -+ strumpack.SetReorderingStrategy(dim > 1 ? strumpack::ReorderingStrategy::METIS : -+ strumpack::ReorderingStrategy::NATURAL); -+ strumpack.SetOperator(SA); -+ strumpack.Mult(B, X); -+ -+ Vector Y(X.Size()); -+ A->Mult(X, Y); -+ Y -= B; -+ REQUIRE(Y.Norml2() < 1.e-12); -+ -+ strumpack.ArrayMult(BB, XX); -+ -+ for (int i = 0; i < XX.Size(); i++) -+ { -+ A->Mult(*XX[i], Y); -+ Y -= *BB[i]; -+ REQUIRE(Y.Norml2() < 1.e-12); -+ } -+ -+ a.RecoverFEMSolution(X, b, x); -+ VectorFunctionCoefficient grad(dim, gradexact); -+ double error = x.ComputeH1Error(&uex, &grad); -+ REQUIRE(error < 1.e-12); -+ } -+#endif - } - } - +diff --git a/INSTALL b/INSTALL +index b7fdfa3af..aad927a56 100644 +--- a/INSTALL ++++ b/INSTALL +@@ -659,8 +659,7 @@ The specific libraries and their options are: + requires the PT-Scotch and Scalapack libraries as well as ParMETIS, which + includes METIS 5 in its distribution. Starting with STRUMPACK v2.2.0, ParMETIS + and PT-Scotch are optional dependencies. +- The support for STRUMPACK was added in MFEM v3.3.2 and it requires STRUMPACK +- 2.0.0 or later. ++ The support for STRUMPACK was added in MFEM v3.3.2. + URL: http://portal.nersc.gov/project/sparse/strumpack + Options: STRUMPACK_OPT, STRUMPACK_LIB. + Versions: STRUMPACK >= 3.0.0. +diff --git a/config/defaults.cmake b/config/defaults.cmake +index 4bd4cdf8d..3985ebd93 100644 +--- a/config/defaults.cmake ++++ b/config/defaults.cmake +@@ -152,7 +152,8 @@ set(STRUMPACK_DIR "${MFEM_DIR}/../STRUMPACK-build" CACHE PATH + # STRUMPACK may also depend on "OpenMP", depending on how it was compiled. + # Starting with v2.2.0 of STRUMPACK, ParMETIS and Scotch are optional. 
+ set(STRUMPACK_REQUIRED_PACKAGES "MPI" "MPI_Fortran" "ParMETIS" "METIS" +- "ScaLAPACK" "Scotch/ptscotch/ptscotcherr/scotch/scotcherr" CACHE STRING ++ "Scotch/ptscotch/ptscotcherr/scotch/scotcherr" ++ "ScaLAPACK" "LAPACK" "BLAS" CACHE STRING + "Additional packages required by STRUMPACK.") + # If the MPI package does not find all required Fortran libraries: + # set(STRUMPACK_REQUIRED_LIBRARIES "gfortran" "mpi_mpifh" CACHE STRING +diff --git a/examples/ex11p.cpp b/examples/ex11p.cpp +index 216a6f443..eca3ce929 100644 +--- a/examples/ex11p.cpp ++++ b/examples/ex11p.cpp +@@ -262,12 +262,13 @@ int main(int argc, char *argv[]) + #ifdef MFEM_USE_STRUMPACK + if (sp_solver) + { +- STRUMPACKSolver * strumpack = new STRUMPACKSolver(argc, argv, MPI_COMM_WORLD); ++ STRUMPACKSolver * strumpack = new STRUMPACKSolver(MPI_COMM_WORLD, argc, argv); + strumpack->SetPrintFactorStatistics(true); + strumpack->SetPrintSolveStatistics(false); + strumpack->SetKrylovSolver(strumpack::KrylovSolver::DIRECT); + strumpack->SetReorderingStrategy(strumpack::ReorderingStrategy::METIS); +- strumpack->DisableMatching(); ++ strumpack->SetMatching(strumpack::MatchingJob::NONE); ++ strumpack->SetCompression(strumpack::CompressionType::NONE); + strumpack->SetOperator(*Arow); + strumpack->SetFromCommandLine(); + precond = strumpack; +diff --git a/examples/ex25p.cpp b/examples/ex25p.cpp +index e3848b848..cf5daf412 100644 +--- a/examples/ex25p.cpp ++++ b/examples/ex25p.cpp +@@ -170,6 +170,7 @@ int main(int argc, char *argv[]) + bool herm_conv = true; + bool slu_solver = false; + bool mumps_solver = false; ++ bool strumpack_solver = false; + bool visualization = 1; + bool pa = false; + const char *device_config = "cpu"; +@@ -200,6 +201,11 @@ int main(int argc, char *argv[]) + #ifdef MFEM_USE_MUMPS + args.AddOption(&mumps_solver, "-mumps", "--mumps-solver", "-no-mumps", + "--no-mumps-solver", "Use the MUMPS Solver."); ++#endif ++#ifdef MFEM_USE_STRUMPACK ++ args.AddOption(&strumpack_solver, "-strumpack", "--strumpack-solver", ++ "-no-strumpack", "--no-strumpack-solver", ++ "Use the STRUMPACK Solver."); + #endif + args.AddOption(&visualization, "-vis", "--visualization", "-no-vis", + "--no-visualization", +@@ -209,13 +215,14 @@ int main(int argc, char *argv[]) + args.AddOption(&device_config, "-d", "--device", + "Device configuration string, see Device::Configure()."); + args.Parse(); +- if (slu_solver && mumps_solver) ++ if (slu_solver + mumps_solver + strumpack_solver > 1) + { + if (myid == 0) +- cout << "WARNING: Both SuperLU and MUMPS have been selected," +- << " please choose either one." << endl ++ cout << "WARNING: More than one of SuperLU, MUMPS, and STRUMPACK have" ++ << " been selected, please choose only one." << endl + << " Defaulting to SuperLU." 
<< endl; + mumps_solver = false; ++ strumpack_solver = false; + } + + if (iprob > 4) { iprob = 4; } +@@ -474,6 +481,24 @@ int main(int argc, char *argv[]) + delete A; + } + #endif ++#ifdef MFEM_USE_STRUMPACK ++ if (!pa && strumpack_solver) ++ { ++ HypreParMatrix *A = Ah.As()->GetSystemMatrix(); ++ STRUMPACKRowLocMatrix SA(*A); ++ STRUMPACKSolver strumpack(MPI_COMM_WORLD, argc, argv); ++ strumpack.SetPrintFactorStatistics(false); ++ strumpack.SetPrintSolveStatistics(false); ++ strumpack.SetKrylovSolver(strumpack::KrylovSolver::DIRECT); ++ strumpack.SetReorderingStrategy(strumpack::ReorderingStrategy::METIS); ++ strumpack.SetMatching(strumpack::MatchingJob::NONE); ++ strumpack.SetCompression(strumpack::CompressionType::NONE); ++ strumpack.SetFromCommandLine(); ++ strumpack.SetOperator(SA); ++ strumpack.Mult(B, X); ++ delete A; ++ } ++#endif + #ifdef MFEM_USE_MUMPS + if (!pa && mumps_solver) + { +@@ -493,7 +518,7 @@ int main(int argc, char *argv[]) + // + // In PML: 1/mu (abs(1/det(J) J^T J) Curl E, Curl F) + // + omega^2 * epsilon (abs(det(J) * (J^T J)^-1) * E, F) +- if (pa || (!slu_solver && !mumps_solver)) ++ if (pa || (!slu_solver && !mumps_solver && !strumpack_solver)) + { + ConstantCoefficient absomeg(pow(omega, 2) * epsilon); + RestrictedCoefficient restr_absomeg(absomeg,attr); +diff --git a/examples/petsc/ex11p.cpp b/examples/petsc/ex11p.cpp +index 51238c4d7..e6f4730fe 100644 +--- a/examples/petsc/ex11p.cpp ++++ b/examples/petsc/ex11p.cpp +@@ -273,12 +273,13 @@ int main(int argc, char *argv[]) + #ifdef MFEM_USE_STRUMPACK + if (sp_solver) + { +- STRUMPACKSolver * strumpack = new STRUMPACKSolver(argc, argv, MPI_COMM_WORLD); ++ STRUMPACKSolver * strumpack = new STRUMPACKSolver(MPI_COMM_WORLD, argc, argv); + strumpack->SetPrintFactorStatistics(true); + strumpack->SetPrintSolveStatistics(false); + strumpack->SetKrylovSolver(strumpack::KrylovSolver::DIRECT); + strumpack->SetReorderingStrategy(strumpack::ReorderingStrategy::METIS); +- strumpack->DisableMatching(); ++ strumpack->SetMatching(strumpack::MatchingJob::NONE); ++ strumpack->SetCompression(strumpack::CompressionType::NONE); + strumpack->SetOperator(*Arow); + strumpack->SetFromCommandLine(); + precond = strumpack; +diff --git a/general/communication.cpp b/general/communication.cpp +index 10fa6988c..0c2fffc1f 100644 +--- a/general/communication.cpp ++++ b/general/communication.cpp +@@ -26,6 +26,10 @@ + #include "sort_pairs.hpp" + #include "globals.hpp" + ++#ifdef MFEM_USE_STRUMPACK ++#include // STRUMPACK_USE_PTSCOTCH, etc. 
++#endif ++ + #include + #include + +@@ -34,6 +38,14 @@ using namespace std; + namespace mfem + { + ++#if defined(MFEM_USE_STRUMPACK) && \ ++ (defined(STRUMPACK_USE_PTSCOTCH) || defined(STRUMPACK_USE_SLATE_SCALAPACK)) ++int Mpi::default_thread_required = MPI_THREAD_MULTIPLE; ++#else ++int Mpi::default_thread_required = MPI_THREAD_SINGLE; ++#endif ++ ++ + GroupTopology::GroupTopology(const GroupTopology >) + : MyComm(gt.MyComm), + group_lproc(gt.group_lproc) +diff --git a/general/communication.hpp b/general/communication.hpp +index 474486f1b..46d4f9f21 100644 +--- a/general/communication.hpp ++++ b/general/communication.hpp +@@ -22,7 +22,6 @@ + #include "globals.hpp" + #include + +- + namespace mfem + { + +@@ -32,10 +31,25 @@ namespace mfem + class Mpi + { + public: +- /// Singleton creation with Mpi::Init(); +- static void Init() { Init_(NULL, NULL); } +- /// Singleton creation with Mpi::Init(argc,argv); +- static void Init(int &argc, char **&argv) { Init_(&argc, &argv); } ++ /// Singleton creation with Mpi::Init(argc, argv). ++ static void Init(int &argc, char **&argv, ++ int required = default_thread_required, ++ int *provided = nullptr) ++ { Init(&argc, &argv, required, provided); } ++ /// Singleton creation with Mpi::Init(). ++ static void Init(int *argc = nullptr, char ***argv = nullptr, ++ int required = default_thread_required, ++ int *provided = nullptr) ++ { ++ MFEM_VERIFY(!IsInitialized(), "MPI already initialized!"); ++ int mpi_provided; ++ int mpi_err = MPI_Init_thread(argc, argv, required, &mpi_provided); ++ MFEM_VERIFY(!mpi_err, "error in MPI_Init()!"); ++ if (provided) { *provided = mpi_provided; } ++ // The Mpi singleton object below needs to be created after MPI_Init() for ++ // some MPI implementations. ++ Singleton(); ++ } + /// Finalize MPI (if it has been initialized and not yet already finalized). + static void Finalize() + { +@@ -71,20 +85,19 @@ public: + } + /// Return true if the rank in MPI_COMM_WORLD is zero. + static bool Root() { return WorldRank() == 0; } ++ /// Default level of thread support for MPI_Init_thread. ++ static MFEM_EXPORT int default_thread_required; + private: +- /// Initialize MPI +- static void Init_(int *argc, char ***argv) ++ /// Initialize the Mpi singleton. ++ static Mpi &Singleton() + { +- MFEM_VERIFY(!IsInitialized(), "MPI already initialized!") +- MPI_Init(argc, argv); +- // The "mpi" object below needs to be created after MPI_Init() for some +- // MPI implementations + static Mpi mpi; ++ return mpi; + } +- /// Finalize MPI ++ /// Finalize MPI. + ~Mpi() { Finalize(); } +- /// Prevent direct construction of objects of this class +- Mpi() { } ++ /// Prevent direct construction of objects of this class. ++ Mpi() {} + }; + + /** @brief A simple convenience class based on the Mpi singleton class above. 
+diff --git a/linalg/strumpack.cpp b/linalg/strumpack.cpp +index f0ff11ab4..270a4483a 100644 +--- a/linalg/strumpack.cpp ++++ b/linalg/strumpack.cpp +@@ -16,238 +16,471 @@ + + #include "strumpack.hpp" + +-using namespace std; +-using namespace strumpack; +- + namespace mfem + { + + STRUMPACKRowLocMatrix::STRUMPACKRowLocMatrix(MPI_Comm comm, +- int num_loc_rows, int first_loc_row, +- int glob_nrows, int glob_ncols, +- int *I, int *J, double *data) +- : comm_(comm), A_(NULL) ++ int num_loc_rows, ++ HYPRE_BigInt first_loc_row, ++ HYPRE_BigInt glob_nrows, ++ HYPRE_BigInt glob_ncols, ++ int *I, HYPRE_BigInt *J, ++ double *data, bool sym_sparse) + { + // Set mfem::Operator member data + height = num_loc_rows; + width = num_loc_rows; + +- // Allocate STRUMPACK's CSRMatrixMPI +- int nprocs, rank; +- MPI_Comm_rank(comm_, &rank); +- MPI_Comm_size(comm_, &nprocs); +- int * dist = new int[nprocs + 1]; +- dist[rank + 1] = first_loc_row + num_loc_rows; ++ // Allocate STRUMPACK's CSRMatrixMPI (copies all inputs) ++ int rank, nprocs; ++ MPI_Comm_rank(comm, &rank); ++ MPI_Comm_size(comm, &nprocs); ++ Array dist(nprocs + 1); + dist[0] = 0; +- MPI_Allgather(MPI_IN_PLACE, 0, MPI_INT, dist + 1, 1, MPI_INT, comm_); +- A_ = new CSRMatrixMPI(num_loc_rows, I, J, data, dist, comm_, false); +- delete[] dist; ++ dist[rank + 1] = first_loc_row + (HYPRE_BigInt)num_loc_rows; ++ MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, ++ dist.GetData() + 1, 1, HYPRE_MPI_BIG_INT, comm); ++ ++#if !(defined(HYPRE_BIGINT) || defined(HYPRE_MIXEDINT)) ++ A_ = new strumpack::CSRMatrixMPI( ++ (HYPRE_BigInt)num_loc_rows, I, J, data, dist.GetData(), ++ comm, sym_sparse); ++#else ++ Array II(num_loc_rows+1); ++ for (int i = 0; i <= num_loc_rows; i++) { II[i] = (HYPRE_BigInt)I[i]; } ++ A_ = new strumpack::CSRMatrixMPI( ++ (HYPRE_BigInt)num_loc_rows, II.GetData(), J, data, dist.GetData(), ++ comm, sym_sparse); ++#endif + } + +-STRUMPACKRowLocMatrix::STRUMPACKRowLocMatrix(const HypreParMatrix & hypParMat) +- : comm_(hypParMat.GetComm()), +- A_(NULL) ++STRUMPACKRowLocMatrix::STRUMPACKRowLocMatrix(const Operator &op, ++ bool sym_sparse) + { +- // First cast the parameter to a hypre_ParCSRMatrix +- hypre_ParCSRMatrix * parcsr_op = +- (hypre_ParCSRMatrix *)const_cast(hypParMat); ++ const HypreParMatrix *APtr = dynamic_cast(&op); ++ MFEM_VERIFY(APtr, "Not a compatible matrix type"); ++ MPI_Comm comm = APtr->GetComm(); + +- MFEM_ASSERT(parcsr_op != NULL,"STRUMPACK: const_cast failed in SetOperator"); ++ // Set mfem::Operator member data ++ height = op.Height(); ++ width = op.Width(); + +- // Create the CSRMatrixMPI A_ by borrowing the internal data from a +- // hypre_CSRMatrix. +- hypParMat.HostRead(); +- hypre_CSRMatrix * csr_op = hypre_MergeDiagAndOffd(parcsr_op); +- hypParMat.HypreRead(); +- hypre_CSRMatrixSetDataOwner(csr_op,0); ++ // First cast the parameter to a hypre_ParCSRMatrix ++ hypre_ParCSRMatrix *parcsr_op = ++ (hypre_ParCSRMatrix *)const_cast(*APtr); ++ ++ // Create the CSRMatrixMPI A by taking the internal data from a ++ // hypre_CSRMatrix ++ APtr->HostRead(); ++ hypre_CSRMatrix *csr_op = hypre_MergeDiagAndOffd(parcsr_op); ++ APtr->HypreRead(); ++ HYPRE_Int *Iptr = csr_op->i; + #if MFEM_HYPRE_VERSION >= 21600 +- // For now, this method assumes that HYPRE_Int is int. Also, csr_op->num_cols +- // is of type HYPRE_Int, so if we want to check for big indices in +- // csr_op->big_j, we'll have to check all entries and that check will only be +- // necessary in HYPRE_MIXEDINT mode which is not supported at the moment. 
+- hypre_CSRMatrixBigJtoJ(csr_op); ++ HYPRE_BigInt *Jptr = csr_op->big_j; ++#else ++ HYPRE_Int *Jptr = csr_op->j; + #endif ++ double *data = csr_op->data; + +- height = csr_op->num_rows; +- width = csr_op->num_rows; ++ HYPRE_BigInt fst_row = parcsr_op->first_row_index; ++ HYPRE_Int m_loc = csr_op->num_rows; + +- int nprocs, rank; +- MPI_Comm_rank(comm_, &rank); +- MPI_Comm_size(comm_, &nprocs); +- int * dist = new int[nprocs + 1]; +- dist[rank + 1] = parcsr_op->first_row_index + csr_op->num_rows; ++ // Allocate STRUMPACK's CSRMatrixMPI ++ int rank, nprocs; ++ MPI_Comm_rank(comm, &rank); ++ MPI_Comm_size(comm, &nprocs); ++ Array dist(nprocs + 1); + dist[0] = 0; +- MPI_Allgather(MPI_IN_PLACE, 0, MPI_INT, dist + 1, 1, MPI_INT, comm_); +- A_ = new CSRMatrixMPI(csr_op->num_rows, csr_op->i, csr_op->j, +- csr_op->data, dist, comm_, false); +- delete[] dist; ++ dist[rank + 1] = fst_row + (HYPRE_BigInt)m_loc; ++ MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, ++ dist.GetData() + 1, 1, HYPRE_MPI_BIG_INT, comm); ++ ++#if !defined(HYPRE_MIXEDINT) ++ A_ = new strumpack::CSRMatrixMPI( ++ (HYPRE_BigInt)m_loc, Iptr, Jptr, data, dist.GetData(), ++ comm, sym_sparse); ++#else ++ Array II(m_loc+1); ++ for (int i = 0; i <= m_loc; i++) { II[i] = (HYPRE_BigInt)Iptr[i]; } ++ A_ = new strumpack::CSRMatrixMPI( ++ (HYPRE_BigInt)m_loc, II.GetData(), Jptr, data, dist.GetData(), ++ comm, sym_sparse); ++#endif + +- // Everything has been copied or abducted so delete the structure ++ // Everything has been copied so delete the structure + hypre_CSRMatrixDestroy(csr_op); + } + + STRUMPACKRowLocMatrix::~STRUMPACKRowLocMatrix() + { +- // Delete the struct +- if ( A_ != NULL ) { delete A_; } ++ delete A_; + } + +-STRUMPACKSolver::STRUMPACKSolver( int argc, char* argv[], MPI_Comm comm ) +- : comm_(comm), +- APtr_(NULL), +- solver_(NULL) ++template ++STRUMPACKSolverBase:: ++STRUMPACKSolverBase(MPI_Comm comm, int argc, char *argv[]) ++ : APtr_(NULL), ++ factor_verbose_(false), ++ solve_verbose_(false), ++ reorder_reuse_(false), ++ nrhs_(-1) + { +- this->Init(argc, argv); ++ solver_ = new STRUMPACKSolverType(comm, argc, argv, false); + } + +-STRUMPACKSolver::STRUMPACKSolver( STRUMPACKRowLocMatrix & A ) +- : comm_(A.GetComm()), +- APtr_(&A), +- solver_(NULL) ++template ++STRUMPACKSolverBase:: ++STRUMPACKSolverBase(STRUMPACKRowLocMatrix &A, int argc, char *argv[]) ++ : APtr_(&A), ++ factor_verbose_(false), ++ solve_verbose_(false), ++ reorder_reuse_(false), ++ nrhs_(-1) + { +- height = A.Height(); +- width = A.Width(); ++ solver_ = new STRUMPACKSolverType(A.GetComm(), argc, argv, false); ++ SetOperator(A); ++} + +- this->Init(0, NULL); ++template ++STRUMPACKSolverBase:: ++~STRUMPACKSolverBase() ++{ ++ delete solver_; + } + +-STRUMPACKSolver::~STRUMPACKSolver() ++template ++void STRUMPACKSolverBase:: ++SetFromCommandLine() + { +- if ( solver_ != NULL ) { delete solver_; } ++ solver_->options().set_from_command_line(); + } + +-void STRUMPACKSolver::Init( int argc, char* argv[] ) ++template ++void STRUMPACKSolverBase:: ++SetPrintFactorStatistics(bool print_stat) + { +- MPI_Comm_size(comm_, &numProcs_); +- MPI_Comm_rank(comm_, &myid_); ++ factor_verbose_ = print_stat; ++} + +- factor_verbose_ = false; +- solve_verbose_ = false; ++template ++void STRUMPACKSolverBase:: ++SetPrintSolveStatistics(bool print_stat) ++{ ++ solve_verbose_ = print_stat; ++} + +- solver_ = new StrumpackSparseSolverMPIDist(comm_, argc, argv, +- false); ++template ++void STRUMPACKSolverBase ++::SetRelTol(double rtol) ++{ ++ solver_->options().set_rel_tol(rtol); 
+ } + +-void STRUMPACKSolver::SetFromCommandLine( ) ++template ++void STRUMPACKSolverBase ++::SetAbsTol(double atol) + { +- solver_->options().set_from_command_line( ); ++ solver_->options().set_abs_tol(atol); + } + +-void STRUMPACKSolver::SetPrintFactorStatistics( bool print_stat ) ++template ++void STRUMPACKSolverBase ++::SetMaxIter(int max_it) + { +- factor_verbose_ = print_stat; ++ solver_->options().set_maxit(max_it); + } + +-void STRUMPACKSolver::SetPrintSolveStatistics( bool print_stat ) ++template ++void STRUMPACKSolverBase ++::SetReorderingReuse(bool reuse) + { +- solve_verbose_ = print_stat; ++ reorder_reuse_ = reuse; ++} ++ ++template ++void STRUMPACKSolverBase ++::EnableGPU() ++{ ++ solver_->options().enable_gpu(); + } + +-void STRUMPACKSolver::SetKrylovSolver( strumpack::KrylovSolver method ) ++template ++void STRUMPACKSolverBase ++::DisableGPU() + { +- solver_->options().set_Krylov_solver( method ); ++ solver_->options().disable_gpu(); + } + +-void STRUMPACKSolver::SetReorderingStrategy( strumpack::ReorderingStrategy +- method ) ++template ++void STRUMPACKSolverBase:: ++SetKrylovSolver(strumpack::KrylovSolver method) + { +- solver_->options().set_reordering_method( method ); ++ solver_->options().set_Krylov_solver(method); + } + +-void STRUMPACKSolver::DisableMatching( ) ++template ++void STRUMPACKSolverBase:: ++SetReorderingStrategy(strumpack::ReorderingStrategy method) + { +-#if STRUMPACK_VERSION_MAJOR >= 3 +- solver_->options().set_matching( strumpack::MatchingJob::NONE ); ++ solver_->options().set_reordering_method(method); ++} ++ ++template ++void STRUMPACKSolverBase:: ++SetMatching(strumpack::MatchingJob job) ++{ ++ solver_->options().set_matching(job); ++} ++ ++template ++void STRUMPACKSolverBase:: ++SetCompression(strumpack::CompressionType type) ++{ ++#if STRUMPACK_VERSION_MAJOR >= 5 ++ solver_->options().set_compression(type); + #else +- solver_->options().set_mc64job( strumpack::MC64Job::NONE ); ++ switch (type) ++ { ++ case strumpack::NONE: ++ solver_->options().disable_BLR(); ++ solver_->options().disable_HSS(); ++ break; ++ case strumpack::BLR: ++ solver_->options().enable_BLR(); ++ break; ++ case strumpack::HSS: ++ solver_->options().enable_HSS(); ++ break; ++ default: ++ MFEM_ABORT("Invalid compression type for STRUMPACK version " << ++ STRUMPACK_VERSION_MAJOR << "!"); ++ break; ++ } + #endif + } + +-void STRUMPACKSolver::EnableMatching( ) ++template ++void STRUMPACKSolverBase:: ++SetCompressionRelTol(double rtol) + { +-#if STRUMPACK_VERSION_MAJOR >= 3 +- solver_->options().set_matching +- ( strumpack::MatchingJob::MAX_DIAGONAL_PRODUCT_SCALING ); ++#if STRUMPACK_VERSION_MAJOR >= 5 ++ solver_->options().set_compression_rel_tol(rtol); + #else +- solver_->options().set_mc64job +- ( strumpack::MC64Job::MAX_DIAGONAL_PRODUCT_SCALING ); ++ solver_->options().BLR_options().set_rel_tol(rtol); ++ solver_->options().HSS_options().set_rel_tol(rtol); + #endif + } + +-#if STRUMPACK_VERSION_MAJOR >= 3 +-void STRUMPACKSolver::EnableParallelMatching( ) ++template ++void STRUMPACKSolverBase:: ++SetCompressionAbsTol(double atol) + { +- solver_->options().set_matching +- ( strumpack::MatchingJob::COMBBLAS ); +-} ++#if STRUMPACK_VERSION_MAJOR >= 5 ++ solver_->options().set_compression_abs_tol(atol); ++#else ++ solver_->options().BLR_options().set_abs_tol(atol); ++ solver_->options().HSS_options().set_abs_tol(atol); + #endif ++} + +-void STRUMPACKSolver::SetRelTol( double rtol ) ++#if STRUMPACK_VERSION_MAJOR >= 5 ++template ++void STRUMPACKSolverBase:: 
++SetCompressionLossyPrecision(int precision) + { +- solver_->options().set_rel_tol( rtol ); ++ solver_->options().set_lossy_precision(precision); + } + +-void STRUMPACKSolver::SetAbsTol( double atol ) ++template ++void STRUMPACKSolverBase:: ++SetCompressionButterflyLevels(int levels) + { +- solver_->options().set_abs_tol( atol ); ++ solver_->options().HODLR_options().set_butterfly_levels(levels); + } ++#endif + +- +-void STRUMPACKSolver::Mult( const Vector & x, Vector & y ) const ++template ++void STRUMPACKSolverBase:: ++SetOperator(const Operator &op) + { +- MFEM_ASSERT(APtr_ != NULL, +- "STRUMPACK Error: The operator must be set before" +- " the system can be solved."); +- MFEM_ASSERT(x.Size() == Width(), "invalid x.Size() = " << x.Size() +- << ", expected size = " << Width()); +- MFEM_ASSERT(y.Size() == Height(), "invalid y.Size() = " << y.Size() +- << ", expected size = " << Height()); ++ // Verify that we have a compatible operator ++ bool first_mat = !APtr_; ++ APtr_ = dynamic_cast(&op); ++ MFEM_VERIFY(APtr_, ++ "STRUMPACK: Operator is not a STRUMPACKRowLocMatrix!"); + +- double* yPtr = y.HostWrite(); +- const double* xPtr = x.HostRead(); ++ // Set mfem::Operator member data ++ height = op.Height(); ++ width = op.Width(); + +- solver_->options().set_verbose( factor_verbose_ ); +- ReturnCode ret = solver_->factor(); +- switch (ret) ++ if (first_mat || !reorder_reuse_) + { +- case ReturnCode::SUCCESS: break; +- case ReturnCode::MATRIX_NOT_SET: +- { +- MFEM_ABORT("STRUMPACK: Matrix was not set!"); +- } +- break; +- case ReturnCode::REORDERING_ERROR: +- { +- MFEM_ABORT("STRUMPACK: Matrix reordering failed!"); +- } +- break; +- default: +- { +- MFEM_ABORT("STRUMPACK: 'factor()' error code = " << ret); +- } ++ solver_->set_matrix(*(APtr_->GetA())); ++ } ++ else ++ { ++ solver_->update_matrix_values(*(APtr_->GetA())); + } +- solver_->options().set_verbose( solve_verbose_ ); +- solver_->solve(xPtr, yPtr); ++} + ++template ++void STRUMPACKSolverBase:: ++FactorInternal() const ++{ ++ MFEM_ASSERT(APtr_, ++ "STRUMPACK: Operator must be set before the system can be " ++ "solved!"); ++ solver_->options().set_verbose(factor_verbose_); ++ strumpack::ReturnCode ret = solver_->factor(); ++ if (ret != strumpack::ReturnCode::SUCCESS) ++ { ++#if STRUMPACK_VERSION_MAJOR >= 7 ++ MFEM_ABORT("STRUMPACK: Factor failed with return code " << ret << "!"); ++#else ++ MFEM_ABORT("STRUMPACK: Factor failed!"); ++#endif ++ } + } + +-void STRUMPACKSolver::SetOperator( const Operator & op ) ++template ++void STRUMPACKSolverBase:: ++Mult(const Vector &x, Vector &y) const + { +- // Verify that we have a compatible operator +- APtr_ = dynamic_cast(&op); +- if ( APtr_ == NULL ) ++ MFEM_ASSERT(x.Size() == Width(), ++ "STRUMPACK: Invalid x.Size() = " << x.Size() << ++ ", expected size = " << Width() << "!"); ++ MFEM_ASSERT(y.Size() == Height(), ++ "STRUMPACK: Invalid y.Size() = " << y.Size() << ++ ", expected size = " << Height() << "!"); ++ ++ const double *xPtr = x.HostRead(); ++ double *yPtr = y.HostReadWrite(); ++ ++ FactorInternal(); ++ solver_->options().set_verbose(solve_verbose_); ++ strumpack::ReturnCode ret = solver_->solve(xPtr, yPtr, false); ++ if (ret != strumpack::ReturnCode::SUCCESS) + { +- mfem_error("STRUMPACKSolver::SetOperator : not STRUMPACKRowLocMatrix!"); ++#if STRUMPACK_VERSION_MAJOR >= 7 ++ MFEM_ABORT("STRUMPACK: Solve failed with return code " << ret << "!"); ++#else ++ MFEM_ABORT("STRUMPACK: Solve failed!"); ++#endif + } ++} + +- solver_->set_matrix( *(APtr_->getA()) ); ++template ++void 
STRUMPACKSolverBase:: ++ArrayMult(const Array &X, Array &Y) const ++{ ++ MFEM_ASSERT(X.Size() == Y.Size(), ++ "Number of columns mismatch in STRUMPACK solve!"); ++ if (X.Size() == 1) ++ { ++ nrhs_ = 1; ++ MFEM_ASSERT(X[0] && Y[0], "Missing Vector in STRUMPACK solve!"); ++ Mult(*X[0], *Y[0]); ++ return; ++ } + +- // Set mfem::Operator member data +- height = op.Height(); +- width = op.Width(); ++ // Multiple RHS case ++ int ldx = Height(); ++ if (nrhs_ != X.Size()) ++ { ++ rhs_.SetSize(X.Size() * ldx); ++ sol_.SetSize(X.Size() * ldx); ++ nrhs_ = X.Size(); ++ } ++ for (int i = 0; i < nrhs_; i++) ++ { ++ MFEM_ASSERT(X[i] && X[i]->Size() == Width(), ++ "STRUMPACK: Missing or invalid sized RHS Vector in solve!"); ++ Vector s(rhs_, i * ldx, ldx); ++ s = *X[i]; ++ rhs_.SyncMemory(s); // Update flags for rhs_ if updated on device ++ } ++ const double *xPtr = rhs_.HostRead(); ++ double *yPtr = sol_.HostReadWrite(); ++ ++ FactorInternal(); ++ solver_->options().set_verbose(solve_verbose_); ++ strumpack::ReturnCode ret = solver_->solve(nrhs_, xPtr, ldx, yPtr, ldx, ++ false); ++ if (ret != strumpack::ReturnCode::SUCCESS) ++ { ++#if STRUMPACK_VERSION_MAJOR >= 7 ++ MFEM_ABORT("STRUMPACK: Solve failed with return code " << ret << "!"); ++#else ++ MFEM_ABORT("STRUMPACK: Solve failed!"); ++#endif ++ } + ++ for (int i = 0; i < nrhs_; i++) ++ { ++ MFEM_ASSERT(Y[i] && Y[i]->Size() == Width(), ++ "STRUMPACK: Missing or invalid sized solution Vector in solve!"); ++ Vector s(sol_, i * ldx, ldx); ++ *Y[i] = s; ++ } + } + ++STRUMPACKSolver:: ++STRUMPACKSolver(MPI_Comm comm) ++ : STRUMPACKSolverBase> ++ (comm, 0, NULL) {} ++ ++STRUMPACKSolver:: ++STRUMPACKSolver(STRUMPACKRowLocMatrix &A) ++ : STRUMPACKSolverBase> ++ (A, 0, NULL) {} ++ ++STRUMPACKSolver:: ++STRUMPACKSolver(MPI_Comm comm, int argc, char *argv[]) ++ : STRUMPACKSolverBase> ++ (comm, argc, argv) {} ++ ++STRUMPACKSolver:: ++STRUMPACKSolver(STRUMPACKRowLocMatrix &A, int argc, char *argv[]) ++ : STRUMPACKSolverBase> ++ (A, argc, argv) {} ++ ++#if STRUMPACK_VERSION_MAJOR >= 7 ++STRUMPACKMixedPrecisionSolver:: ++STRUMPACKMixedPrecisionSolver(MPI_Comm comm) ++ : STRUMPACKSolverBase> ++ (comm, 0, NULL) {} ++ ++STRUMPACKMixedPrecisionSolver:: ++STRUMPACKMixedPrecisionSolver(STRUMPACKRowLocMatrix &A) ++ : STRUMPACKSolverBase> ++ (A, 0, NULL) {} ++ ++STRUMPACKMixedPrecisionSolver:: ++STRUMPACKMixedPrecisionSolver(MPI_Comm comm, int argc, char *argv[]) ++ : STRUMPACKSolverBase> ++ (comm, argc, argv) {} ++ ++STRUMPACKMixedPrecisionSolver:: ++STRUMPACKMixedPrecisionSolver(STRUMPACKRowLocMatrix &A, int argc, char *argv[]) ++ : STRUMPACKSolverBase> ++ (A, argc, argv) {} ++#endif ++ ++template class STRUMPACKSolverBase>; ++#if STRUMPACK_VERSION_MAJOR >= 7 ++template class STRUMPACKSolverBase>; ++#endif ++ + } // mfem namespace + + #endif // MFEM_USE_MPI +diff --git a/linalg/strumpack.hpp b/linalg/strumpack.hpp +index 300b8415e..42ae555c7 100644 +--- a/linalg/strumpack.hpp ++++ b/linalg/strumpack.hpp +@@ -16,12 +16,14 @@ + + #ifdef MFEM_USE_STRUMPACK + #ifdef MFEM_USE_MPI ++ + #include "operator.hpp" + #include "hypre.hpp" +- + #include + ++// STRUMPACK headers + #include "StrumpackSparseSolverMPIDist.hpp" ++#include "StrumpackSparseSolverMixedPrecisionMPIDist.hpp" + + namespace mfem + { +@@ -34,63 +36,80 @@ public: + be of size (local) nrows by (global) glob_ncols. The new parallel matrix + contains copies of all input arrays (so they can be deleted). 
*/ + STRUMPACKRowLocMatrix(MPI_Comm comm, +- int num_loc_rows, int first_loc_row, +- int glob_nrows, int glob_ncols, +- int *I, int *J, double *data); ++ int num_loc_rows, HYPRE_BigInt first_loc_row, ++ HYPRE_BigInt glob_nrows, HYPRE_BigInt glob_ncols, ++ int *I, HYPRE_BigInt *J, double *data, ++ bool sym_sparse = false); + + /** Creates a copy of the parallel matrix hypParMat in STRUMPACK's RowLoc + format. All data is copied so the original matrix may be deleted. */ +- STRUMPACKRowLocMatrix(const HypreParMatrix & hypParMat); ++ STRUMPACKRowLocMatrix(const Operator &op, bool sym_sparse = false); + + ~STRUMPACKRowLocMatrix(); + + void Mult(const Vector &x, Vector &y) const + { +- mfem_error("STRUMPACKRowLocMatrix::Mult(...)\n" +- " matrix vector products are not supported."); ++ MFEM_ABORT("STRUMPACKRowLocMatrix::Mult: Matrix vector products are not " ++ "supported!"); + } + +- MPI_Comm GetComm() const { return comm_; } ++ MPI_Comm GetComm() const { return A_->comm(); } + +- strumpack::CSRMatrixMPI* getA() const { return A_; } ++ strumpack::CSRMatrixMPI *GetA() const { return A_; } + + private: +- MPI_Comm comm_; +- strumpack::CSRMatrixMPI* A_; +- +-}; // mfem::STRUMPACKRowLocMatrix ++ strumpack::CSRMatrixMPI *A_; ++}; + + /** The MFEM STRUMPACK Direct Solver class. + + The mfem::STRUMPACKSolver class uses the STRUMPACK library to perform LU + factorization of a parallel sparse matrix. The solver is capable of handling +- double precision types. See http://portal.nersc.gov/project/sparse/strumpack ++ double precision types. See ++ http://portal.nersc.gov/project/sparse/strumpack/. + */ +-class STRUMPACKSolver : public mfem::Solver ++template ++class STRUMPACKSolverBase : public Solver + { +-public: +- // Constructor with MPI_Comm parameter. +- STRUMPACKSolver( int argc, char* argv[], MPI_Comm comm ); ++protected: ++ // Constructor with MPI_Comm parameter and command line arguments. ++ STRUMPACKSolverBase(MPI_Comm comm, int argc, char *argv[]); + +- // Constructor with STRUMPACK Matrix Object. +- STRUMPACKSolver( STRUMPACKRowLocMatrix & A); ++ // Constructor with STRUMPACK matrix object and command line arguments. ++ STRUMPACKSolverBase(STRUMPACKRowLocMatrix &A, int argc, char *argv[]); + ++public: + // Default destructor. +- ~STRUMPACKSolver( void ); ++ virtual ~STRUMPACKSolverBase(); + + // Factor and solve the linear system y = Op^{-1} x. +- void Mult( const Vector & x, Vector & y ) const; ++ void Mult(const Vector &x, Vector &y) const; ++ void ArrayMult(const Array &X, Array &Y) const; + + // Set the operator. +- void SetOperator( const Operator & op ); ++ void SetOperator(const Operator &op); + + // Set various solver options. Refer to STRUMPACK documentation for + // details. +- void SetFromCommandLine( ); +- void SetPrintFactorStatistics( bool print_stat ); +- void SetPrintSolveStatistics( bool print_stat ); +- void SetRelTol( double rtol ); +- void SetAbsTol( double atol ); ++ void SetFromCommandLine(); ++ void SetPrintFactorStatistics(bool print_stat); ++ void SetPrintSolveStatistics(bool print_stat); ++ ++ // Set tolerances and iterations for iterative solvers. Compression ++ // tolerance is handled below. ++ void SetRelTol(double rtol); ++ void SetAbsTol(double atol); ++ void SetMaxIter(int max_it); ++ ++ // Set the flag controlling reuse of the symbolic factorization for multiple ++ // operators. This method has to be called before repeated calls to ++ // SetOperator. 
++ void SetReorderingReuse(bool reuse); ++ ++ // Enable or not GPU off-loading available if STRUMPACK was compiled with CUDA. Note ++ // that input/output from MFEM to STRUMPACK is all still through host memory. ++ void EnableGPU(); ++ void DisableGPU(); + + /** + * STRUMPACK is an (approximate) direct solver. It can be used as a direct +@@ -100,70 +119,153 @@ public: + * used without preconditioner. + * + * Supported values are: +- * AUTO: Use iterative refinement if no HSS compression is used, +- * otherwise use GMRes. +- * DIRECT: No outer iterative solver, just a single application of +- * the multifrontal solver. +- * REFINE: Iterative refinement. +- * PREC_GMRES: Preconditioned GMRes. +- * The preconditioner is the (approx) multifrontal solver. +- * GMRES: UN-preconditioned GMRes. (for testing mainly) +- * PREC_BICGSTAB: Preconditioned BiCGStab. +- * The preconditioner is the (approx) multifrontal solver. ++ * AUTO: Use iterative refinement if no HSS compression is ++ * used, otherwise use GMRes ++ * DIRECT: No outer iterative solver, just a single application ++ * of the multifrontal solver ++ * REFINE: Iterative refinement ++ * PREC_GMRES: Preconditioned GMRes ++ * The preconditioner is the (approx) multifrontal solver ++ * GMRES: UN-preconditioned GMRes (for testing mainly) ++ * PREC_BICGSTAB: Preconditioned BiCGStab ++ * The preconditioner is the (approx) multifrontal solver + * BICGSTAB: UN-preconditioned BiCGStab. (for testing mainly) + */ +- void SetKrylovSolver( strumpack::KrylovSolver method ); ++ void SetKrylovSolver(strumpack::KrylovSolver method); + + /** + * Supported reorderings are: +- * METIS, PARMETIS, SCOTCH, PTSCOTCH, RCM ++ * NATURAL: Do not reorder the system ++ * METIS: Use Metis nested-dissection reordering (default) ++ * PARMETIS: Use ParMetis nested-dissection reordering ++ * SCOTCH: Use Scotch nested-dissection reordering ++ * PTSCOTCH: Use PT-Scotch nested-dissection reordering ++ * RCM: Use RCM reordering ++ * GEOMETRIC: A simple geometric nested dissection code that ++ * only works for regular meshes ++ * AMD: Approximate minimum degree ++ * MMD: Multiple minimum degree ++ * AND: Nested dissection ++ * MLF: Minimum local fill ++ * SPECTRAL: Spectral nested dissection + */ +- void SetReorderingStrategy( strumpack::ReorderingStrategy method ); ++ void SetReorderingStrategy(strumpack::ReorderingStrategy method); + + /** +- * Disable static pivoting for stability. The static pivoting in strumpack ++ * Configure static pivoting for stability. The static pivoting in STRUMPACK + * permutes the sparse input matrix in order to get large (nonzero) elements + * on the diagonal. If the input matrix is already diagonally dominant, this + * reordering can be disabled. ++ * ++ * Supported matching algorithms are: ++ * NONE: Don't do anything ++ * MAX_CARDINALITY: Maximum cardinality ++ * MAX_SMALLEST_DIAGONAL: Maximum smallest diagonal value ++ * MAX_SMALLEST_DIAGONAL_2: Same as MAX_SMALLEST_DIAGONAL ++ * but different algorithm ++ * MAX_DIAGONAL_SUM: Maximum sum of diagonal values ++ * MAX_DIAGONAL_PRODUCT_SCALING: Maximum product of diagonal values ++ * and row and column scaling (default) ++ * COMBBLAS: Use AWPM from CombBLAS (only with ++ * version >= 3) + */ +- void DisableMatching(); +- +- /** +- * Enable static pivoting for stability using the MC64 algorithm with +- * job=5. Using a matching algorithm, this will permute the sparse input +- * matrix in order to get nonzero elements (as large as possible) on the +- * diagonal. 
And will also scale the rows and columns of the matrix. +- */ +- void EnableMatching(); ++ void SetMatching(strumpack::MatchingJob job); + +-#if STRUMPACK_VERSION_MAJOR >= 3 + /** +- * Use the AWPM (approximate weight perfect matching) algorithm from the +- * Combinatorial BLAS library for static pivoting, i.e. getting large +- * nonzeros on the diagonal. This requires that strumpack was compiled with +- * support for Combinatorial BLAS. ++ * Enable support for rank-structured data formats, which can be used ++ * for compression within the sparse solver. ++ * ++ * Supported compression types are: ++ * NONE: No compression, purely direct solver (default) ++ * HSS: HSS compression of frontal matrices ++ * BLR: Block low-rank compression of fronts ++ * HODLR: Hierarchically Off-diagonal Low-Rank ++ * compression of frontal matrices ++ * BLR_HODLR: Block low-rank compression of medium ++ * fronts and Hierarchically Off-diagonal ++ * Low-Rank compression of large fronts ++ * ZFP_BLR_HODLR: ZFP compression for small fronts, ++ * Block low-rank compression of medium ++ * fronts and Hierarchically Off-diagonal ++ * Low-Rank compression of large fronts ++ * LOSSLESS: Lossless compression ++ * LOSSY: Lossy compression ++ * ++ * For versions of STRUMPACK < 5, we support only NONE, HSS, and BLR. ++ * BLR_HODLR and ZFP_BLR_HODLR are supported in STRUMPACK >= 6. + */ +- void EnableParallelMatching(); ++ void SetCompression(strumpack::CompressionType type); ++ void SetCompressionRelTol(double rtol); ++ void SetCompressionAbsTol(double atol); ++#if STRUMPACK_VERSION_MAJOR >= 5 ++ void SetCompressionLossyPrecision(int precision); ++ void SetCompressionButterflyLevels(int levels); + #endif + + private: +- void Init( int argc, char* argv[] ); ++ // Helper method for calling the STRUMPACK factorization routine. ++ void FactorInternal() const; + + protected: +- +- MPI_Comm comm_; +- int numProcs_; +- int myid_; ++ const STRUMPACKRowLocMatrix *APtr_; ++ STRUMPACKSolverType *solver_; + + bool factor_verbose_; + bool solve_verbose_; ++ bool reorder_reuse_; ++ ++ mutable Vector rhs_, sol_; ++ mutable int nrhs_; ++}; + +- const STRUMPACKRowLocMatrix * APtr_; +- strumpack::StrumpackSparseSolverMPIDist * solver_; ++class STRUMPACKSolver : ++ public STRUMPACKSolverBase> ++{ ++public: ++ // Constructor with MPI_Comm parameter. ++ STRUMPACKSolver(MPI_Comm comm); ++ ++ // Constructor with STRUMPACK matrix object. ++ STRUMPACKSolver(STRUMPACKRowLocMatrix &A); + +-}; // mfem::STRUMPACKSolver class ++ // Constructor with MPI_Comm parameter and command line arguments. ++ STRUMPACKSolver(MPI_Comm comm, int argc, char *argv[]); ++ MFEM_DEPRECATED STRUMPACKSolver(int argc, char *argv[], MPI_Comm comm) ++ : STRUMPACKSolver(comm, argc, argv) {} ++ ++ // Constructor with STRUMPACK matrix object and command line arguments. ++ STRUMPACKSolver(STRUMPACKRowLocMatrix &A, int argc, char *argv[]); ++ ++ // Destructor. ++ ~STRUMPACKSolver() {} ++}; ++ ++#if STRUMPACK_VERSION_MAJOR >= 7 ++class STRUMPACKMixedPrecisionSolver : ++ public STRUMPACKSolverBase> ++{ ++public: ++ // Constructor with MPI_Comm parameter. ++ STRUMPACKMixedPrecisionSolver(MPI_Comm comm); ++ ++ // Constructor with STRUMPACK matrix object. ++ STRUMPACKMixedPrecisionSolver(STRUMPACKRowLocMatrix &A); ++ ++ // Constructor with MPI_Comm parameter and command line arguments. ++ STRUMPACKMixedPrecisionSolver(MPI_Comm comm, int argc, char *argv[]); ++ ++ // Constructor with STRUMPACK matrix object and command line arguments.
++ STRUMPACKMixedPrecisionSolver(STRUMPACKRowLocMatrix &A, ++ int argc, char *argv[]); ++ ++ // Destructor. ++ ~STRUMPACKMixedPrecisionSolver() {} ++}; ++#endif + +-} // mfem namespace ++} // namespace mfem + + #endif // MFEM_USE_MPI + #endif // MFEM_USE_STRUMPACK +diff --git a/linalg/superlu.cpp b/linalg/superlu.cpp +index 4633eb9ef..c120c4f22 100644 +--- a/linalg/superlu.cpp ++++ b/linalg/superlu.cpp +@@ -650,6 +650,7 @@ void SuperLUSolver::ArrayMult(const Array &X, + MFEM_ASSERT(X[i], "Missing Vector in SuperLUSolver::Mult!"); + Vector s(sol_, i * ldx, ldx); + s = *X[i]; ++ sol_.SyncMemory(s); // Update flags for sol_ if updated on device + } + } + +diff --git a/miniapps/multidomain/multidomain.cpp b/miniapps/multidomain/multidomain.cpp +index dfb669224..83ebb9813 100644 +--- a/miniapps/multidomain/multidomain.cpp ++++ b/miniapps/multidomain/multidomain.cpp +@@ -322,17 +322,19 @@ int main(int argc, char *argv[]) + + char vishost[] = "localhost"; + int visport = 19916; +- socketstream cyl_sol_sock(vishost, visport); ++ socketstream cyl_sol_sock; + if (visualization) + { ++ cyl_sol_sock.open(vishost, visport); + cyl_sol_sock << "parallel " << num_procs << " " << myid << "\n"; + cyl_sol_sock.precision(8); + cyl_sol_sock << "solution\n" << cylinder_submesh << temperature_cylinder_gf << + "pause\n" << std::flush; + } +- socketstream block_sol_sock(vishost, visport); ++ socketstream block_sol_sock; + if (visualization) + { ++ block_sol_sock.open(vishost, visport); + block_sol_sock << "parallel " << num_procs << " " << myid << "\n"; + block_sol_sock.precision(8); + block_sol_sock << "solution\n" << block_submesh << temperature_block_gf << +diff --git a/miniapps/nurbs/nurbs_ex11p.cpp b/miniapps/nurbs/nurbs_ex11p.cpp +index 7b8e3bd2d..e5cf95062 100644 +--- a/miniapps/nurbs/nurbs_ex11p.cpp ++++ b/miniapps/nurbs/nurbs_ex11p.cpp +@@ -281,12 +281,13 @@ int main(int argc, char *argv[]) + #ifdef MFEM_USE_STRUMPACK + if (sp_solver) + { +- STRUMPACKSolver * strumpack = new STRUMPACKSolver(argc, argv, MPI_COMM_WORLD); ++ STRUMPACKSolver * strumpack = new STRUMPACKSolver(MPI_COMM_WORLD, argc, argv); + strumpack->SetPrintFactorStatistics(true); + strumpack->SetPrintSolveStatistics(false); + strumpack->SetKrylovSolver(strumpack::KrylovSolver::DIRECT); + strumpack->SetReorderingStrategy(strumpack::ReorderingStrategy::METIS); +- strumpack->DisableMatching(); ++ strumpack->SetMatching(strumpack::MatchingJob::NONE); ++ strumpack->SetCompression(strumpack::CompressionType::NONE); + strumpack->SetOperator(*Arow); + strumpack->SetFromCommandLine(); + precond = strumpack; +diff --git a/tests/unit/linalg/test_direct_solvers.cpp b/tests/unit/linalg/test_direct_solvers.cpp +index 48a1ac30e..5b74a98fa 100644 +--- a/tests/unit/linalg/test_direct_solvers.cpp ++++ b/tests/unit/linalg/test_direct_solvers.cpp +@@ -26,6 +26,9 @@ using namespace mfem; + #ifdef MFEM_USE_SUPERLU + #define DIRECT_SOLVE_PARALLEL + #endif ++#ifdef MFEM_USE_STRUMPACK ++#define DIRECT_SOLVE_PARALLEL ++#endif + + #if defined(DIRECT_SOLVE_SERIAL) || defined(DIRECT_SOLVE_PARALLEL) + +@@ -103,7 +106,7 @@ TEST_CASE("Serial Direct Solvers", "[CUDA]") + Mesh mesh; + if (dim == 1) + { +- mesh = Mesh::MakeCartesian1D(ne, 1.0); ++ mesh = Mesh::MakeCartesian1D(ne, 1.0); + } + else if (dim == 2) + { +@@ -187,13 +190,13 @@ TEST_CASE("Parallel Direct Solvers", "[Parallel], [CUDA]") + { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); +- const int ne = 2; ++ const int ne = 4; + for (int dim = 1; dim < 4; ++dim) + { + Mesh mesh; + if (dim == 1) + { +- mesh = 
Mesh::MakeCartesian1D(ne, 1.0); ++ mesh = Mesh::MakeCartesian1D(ne, 1.0); + } + else if (dim == 2) + { +@@ -312,6 +315,39 @@ TEST_CASE("Parallel Direct Solvers", "[Parallel], [CUDA]") + REQUIRE(error < 1.e-12); + } + #endif ++#ifdef MFEM_USE_STRUMPACK ++ // Transform to monolithic HypreParMatrix ++ { ++ STRUMPACKRowLocMatrix SA(*A.As()); ++ STRUMPACKSolver strumpack(MPI_COMM_WORLD); ++ strumpack.SetPrintFactorStatistics(false); ++ strumpack.SetPrintSolveStatistics(false); ++ strumpack.SetKrylovSolver(strumpack::KrylovSolver::DIRECT); ++ strumpack.SetReorderingStrategy(dim > 1 ? strumpack::ReorderingStrategy::METIS : ++ strumpack::ReorderingStrategy::NATURAL); ++ strumpack.SetOperator(SA); ++ strumpack.Mult(B, X); ++ ++ Vector Y(X.Size()); ++ A->Mult(X, Y); ++ Y -= B; ++ REQUIRE(Y.Norml2() < 1.e-12); ++ ++ strumpack.ArrayMult(BB, XX); ++ ++ for (int i = 0; i < XX.Size(); i++) ++ { ++ A->Mult(*XX[i], Y); ++ Y -= *BB[i]; ++ REQUIRE(Y.Norml2() < 1.e-12); ++ } ++ ++ a.RecoverFEMSolution(X, b, x); ++ VectorFunctionCoefficient grad(dim, gradexact); ++ double error = x.ComputeH1Error(&uex, &grad); ++ REQUIRE(error < 1.e-12); ++ } ++#endif + } + } + diff --git a/extern/patch/mumps/patch_build.diff b/extern/patch/mumps/patch_build.diff index 389cbf3930..ccd4ca11cb 100644 --- a/extern/patch/mumps/patch_build.diff +++ b/extern/patch/mumps/patch_build.diff @@ -1,845 +1,877 @@ -diff --git a/CMakeLists.txt b/CMakeLists.txt -index e81d19d..1cdd28e 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -48,10 +48,7 @@ find_package(Threads) - - if(parallel) - find_package(MPI COMPONENTS C Fortran REQUIRED) -- if(NOT DEFINED ENV{MKLROOT}) -- # oneMKL MKLConfig.cmake must be invoked only once -- include(cmake/lapack.cmake) -- endif() -+ include(cmake/lapack.cmake) - include(cmake/scalapack.cmake) - - set(NUMERIC_LIBS SCALAPACK::SCALAPACK LAPACK::LAPACK) -@@ -87,7 +84,7 @@ if(scotch) - endif() - - if(parmetis) -- find_package(METIS REQUIRED COMPONENTS parallel) -+ find_package(METIS REQUIRED COMPONENTS ParMETIS) - list(APPEND ORDERING_DEFS parmetis metis) - list(APPEND ORDERING_LIBS METIS::METIS) - elseif(metis) -@@ -98,15 +95,6 @@ endif() - - list(APPEND ORDERING_LIBS pord) - --install(FILES --${CMAKE_CURRENT_SOURCE_DIR}/cmake/FindLAPACK.cmake --${CMAKE_CURRENT_SOURCE_DIR}/cmake/FindSCALAPACK.cmake --${CMAKE_CURRENT_SOURCE_DIR}/cmake/FindMETIS.cmake --${CMAKE_CURRENT_SOURCE_DIR}/cmake/FindScotch.cmake --${CMAKE_CURRENT_SOURCE_DIR}/cmake/FindMUMPS.cmake --DESTINATION cmake --) -- - message(STATUS "MUMPS ORDERING_DEFS: ${ORDERING_DEFS}") - message(STATUS "MUMPS ORDERING_LIBS: ${ORDERING_LIBS}") - -diff --git a/cmake/FindLAPACK.cmake b/cmake/FindLAPACK.cmake -deleted file mode 100644 -index 136c16d..0000000 ---- a/cmake/FindLAPACK.cmake -+++ /dev/null -@@ -1,501 +0,0 @@ --# Distributed under the OSI-approved BSD 3-Clause License. See accompanying --# file Copyright.txt or https://cmake.org/licensing for details. -- --#[=======================================================================[.rst: -- --FindLapack ------------ -- --* Michael Hirsch, Ph.D. www.scivision.dev --* David Eklund -- --Let Michael know if there are more MKL / Lapack / compiler combination you want. --Refer to https://software.intel.com/en-us/articles/intel-mkl-link-line-advisor -- --Finds LAPACK libraries for C / C++ / Fortran. --Works with Netlib Lapack / LapackE, Atlas and Intel MKL. --Intel MKL relies on having environment variable MKLROOT set, typically by sourcing --mklvars.sh beforehand. 
-- --Why not the FindLapack.cmake built into CMake? It has a lot of old code for --infrequently used Lapack libraries and is unreliable for me. -- --Tested on Linux, MacOS and Windows with: --* GCC / Gfortran --* Clang / Flang --* Intel (icc, ifort) --* Cray -- -- --Parameters --^^^^^^^^^^ -- --COMPONENTS default to Netlib LAPACK / LapackE, otherwise: -- --``MKL`` -- Intel MKL -- sequential by default, or add TBB or MPI as well --``MKL64`` -- MKL only: 64-bit integers (default is 32-bit integers) --``TBB`` -- Intel MPI + TBB for MKL --``AOCL`` -- AMD Optimizing CPU Libraries -- --``LAPACKE`` -- Netlib LapackE for C / C++ --``Netlib`` -- Netlib Lapack for Fortran --``OpenBLAS`` -- OpenBLAS Lapack for Fortran -- --``LAPACK95`` -- get Lapack95 interfaces for MKL or Netlib (must also specify one of MKL, Netlib) -- --``STATIC`` -- Library search default on non-Windows is shared then static. On Windows default search is static only. -- Specifying STATIC component searches for static libraries only. -- -- --Result Variables --^^^^^^^^^^^^^^^^ -- --``LAPACK_FOUND`` -- Lapack libraries were found --``LAPACK__FOUND`` -- LAPACK specified was found --``LAPACK_LIBRARIES`` -- Lapack library files (including BLAS --``LAPACK_INCLUDE_DIRS`` -- Lapack include directories (for C/C++) -- -- --References --^^^^^^^^^^ -- --* Pkg-Config and MKL: https://software.intel.com/en-us/articles/intel-math-kernel-library-intel-mkl-and-pkg-config-tool --* MKL for Windows: https://software.intel.com/en-us/mkl-windows-developer-guide-static-libraries-in-the-lib-intel64-win-directory --* MKL Windows directories: https://software.intel.com/en-us/mkl-windows-developer-guide-high-level-directory-structure --* Atlas http://math-atlas.sourceforge.net/errata.html#LINK --* MKL LAPACKE (C, C++): https://software.intel.com/en-us/mkl-linux-developer-guide-calling-lapack-blas-and-cblas-routines-from-c-c-language-environments --#]=======================================================================] -- --include(CheckFortranSourceCompiles) -- --# clear to avoid endless appending on subsequent calls --set(LAPACK_LIBRARY) --unset(LAPACK_INCLUDE_DIR) -- --# ===== functions ========== -- --function(atlas_libs) -- --find_library(ATLAS_LIB --NAMES atlas --PATH_SUFFIXES atlas --DOC "ATLAS library" --) -- --find_library(LAPACK_ATLAS --NAMES ptlapack lapack_atlas lapack --NAMES_PER_DIR --PATH_SUFFIXES atlas --DOC "LAPACK ATLAS library" --) -- --find_library(BLAS_LIBRARY --NAMES ptf77blas f77blas blas --NAMES_PER_DIR --PATH_SUFFIXES atlas --DOC "BLAS ATLAS library" --) -- --# === C === --find_library(BLAS_C_ATLAS --NAMES ptcblas cblas --NAMES_PER_DIR --PATH_SUFFIXES atlas --DOC "BLAS C ATLAS library" --) -- --find_path(LAPACK_INCLUDE_DIR --NAMES cblas-atlas.h cblas.h clapack.h --DOC "ATLAS headers" --) -- --#=========== --if(LAPACK_ATLAS AND BLAS_C_ATLAS AND BLAS_LIBRARY AND ATLAS_LIB) -- set(LAPACK_Atlas_FOUND true PARENT_SCOPE) -- set(LAPACK_LIBRARY ${LAPACK_ATLAS} ${BLAS_C_ATLAS} ${BLAS_LIBRARY} ${ATLAS_LIB}) -- list(APPEND LAPACK_LIBRARY ${CMAKE_THREAD_LIBS_INIT}) --endif() -- --set(LAPACK_LIBRARY ${LAPACK_LIBRARY} PARENT_SCOPE) -- --endfunction(atlas_libs) -- --#======================= -- --function(netlib_libs) -- --if(LAPACK95 IN_LIST LAPACK_FIND_COMPONENTS) -- find_path(LAPACK95_INCLUDE_DIR -- NAMES f95_lapack.mod -- HINTS ${LAPACK95_ROOT} ENV LAPACK95_ROOT -- PATH_SUFFIXES include -- DOC "LAPACK95 Fortran module" -- ) -- -- find_library(LAPACK95_LIBRARY -- NAMES lapack95 -- HINTS ${LAPACK95_ROOT} ENV LAPACK95_ROOT -- DOC "LAPACK95 
library" -- ) -- -- if(NOT (LAPACK95_LIBRARY AND LAPACK95_INCLUDE_DIR)) -- return() -- endif() -- -- set(LAPACK95_LIBRARY ${LAPACK95_LIBRARY} PARENT_SCOPE) -- set(LAPACK_LAPACK95_FOUND true PARENT_SCOPE) --endif(LAPACK95 IN_LIST LAPACK_FIND_COMPONENTS) -- --find_library(LAPACK_LIBRARY --NAMES lapack --PATH_SUFFIXES lapack lapack/lib --DOC "LAPACK library" --) --if(NOT LAPACK_LIBRARY) -- return() --endif() -- --if(LAPACKE IN_LIST LAPACK_FIND_COMPONENTS) -- -- find_library(LAPACKE_LIBRARY -- NAMES lapacke -- PATH_SUFFIXES lapack lapack/lib -- DOC "LAPACKE library" -- ) -- -- # lapack/include for Homebrew -- find_path(LAPACKE_INCLUDE_DIR -- NAMES lapacke.h -- PATH_SUFFIXES lapack lapack/include -- DOC "LAPACKE include directory" -- ) -- if(NOT (LAPACKE_LIBRARY AND LAPACKE_INCLUDE_DIR)) -- return() -- endif() -- -- set(LAPACK_LAPACKE_FOUND true PARENT_SCOPE) -- list(APPEND LAPACK_INCLUDE_DIR ${LAPACKE_INCLUDE_DIR}) -- list(APPEND LAPACK_LIBRARY ${LAPACKE_LIBRARY}) -- mark_as_advanced(LAPACKE_LIBRARY LAPACKE_INCLUDE_DIR) --endif(LAPACKE IN_LIST LAPACK_FIND_COMPONENTS) -- --# Netlib on Cygwin and others -- --find_library(BLAS_LIBRARY --NAMES refblas blas --NAMES_PER_DIR --PATH_SUFFIXES lapack lapack/lib blas --DOC "BLAS library" --) -- --if(NOT BLAS_LIBRARY) -- return() --endif() -- --list(APPEND LAPACK_LIBRARY ${BLAS_LIBRARY}) --set(LAPACK_Netlib_FOUND true PARENT_SCOPE) -- --list(APPEND LAPACK_LIBRARY ${CMAKE_THREAD_LIBS_INIT}) -- --set(LAPACK_LIBRARY ${LAPACK_LIBRARY} PARENT_SCOPE) -- --endfunction(netlib_libs) -- --#=============================== --function(openblas_libs) -- --find_library(LAPACK_LIBRARY --NAMES openblas --PATH_SUFFIXES openblas --DOC "OpenBLAS library" --) -- --find_path(LAPACK_INCLUDE_DIR --NAMES openblas_config.h cblas-openblas.h --DOC "OpenBLAS include directory" --) -- --if(NOT LAPACK_LIBRARY) -- return() --endif() -- --set(BLAS_LIBRARY ${LAPACK_LIBRARY} CACHE FILEPATH "OpenBLAS library") -- --set(LAPACK_OpenBLAS_FOUND true PARENT_SCOPE) -- --list(APPEND LAPACK_LIBRARY ${CMAKE_THREAD_LIBS_INIT}) -- --set(LAPACK_LIBRARY ${LAPACK_LIBRARY} PARENT_SCOPE) -- --endfunction(openblas_libs) -- -- --function(aocl_libs) -- --set(_names flame) --if(WIN32) -- if(BUILD_SHARED_LIBS) -- list(APPEND _names AOCL-LibFlame-Win-MT-dll AOCL-LibFlame-Win-dll) -- else() -- list(APPEND _names AOCL-LibFlame-Win-MT AOCL-LibFlame-Win) -- endif() --endif() -- --find_library(LAPACK_LIBRARY --NAMES ${_names} --NAMES_PER_DIR --PATH_SUFFIXES LP64 --DOC "LAPACK Flame library" --) -- --set(_names blis-mt blis) --if(WIN32) -- if(BUILD_SHARED_LIBS) -- list(APPEND _names AOCL-LibBlis-Win-MT-dll AOCL-LibBlis-Win-dll) -- else() -- list(APPEND _names AOCL-LibBlis-Win-MT AOCL-LibBlis-Win) -- endif() --endif() -- --find_library(BLAS_LIBRARY --NAMES ${_names} --NAMES_PER_DIR --PATH_SUFFIXES LP64 --DOC "BLAS Blis library" --) -- --if(NOT (LAPACK_LIBRARY AND BLAS_LIBRARY)) -- return() --endif() -- --find_path(LAPACK_INCLUDE_DIR --NAMES FLAME.h --PATH_SUFFIXES LP64 --DOC "Flame header" --) -- --find_path(BLAS_INCLUDE_DIR --NAMES blis.h --PATH_SUFFIXES LP64 --DOC "Blis header" --) -- --if(NOT (LAPACK_INCLUDE_DIR AND BLAS_INCLUDE_DIR)) -- return() --endif() -- -- --set(LAPACK_AOCL_FOUND true PARENT_SCOPE) --set(LAPACK_LIBRARY ${LAPACK_LIBRARY} ${BLAS_LIBRARY} ${CMAKE_THREAD_LIBS_INIT} PARENT_SCOPE) --set(LAPACK_INCLUDE_DIR ${LAPACK_INCLUDE_DIR} ${BLAS_INCLUDE_DIR} PARENT_SCOPE) -- --endfunction(aocl_libs) -- --#=============================== -- --macro(find_mkl_libs) --# 
https://www.intel.com/content/www/us/en/docs/onemkl/developer-guide-linux/2023-2/cmake-config-for-onemkl.html -- --set(MKL_INTERFACE "lp64") --if(MKL64 IN_LIST LAPACK_FIND_COMPONENTS) -- string(PREPEND MKL_INTERFACE "i") --endif() -- --if(LAPACK95 IN_LIST LAPACK_FIND_COMPONENTS) -- set(ENABLE_BLAS95 true) -- set(ENABLE_LAPACK95 true) --endif() -- --# MKL_THREADING default: "intel_thread" which is Intel OpenMP --if(TBB IN_LIST LAPACK_FIND_COMPONENTS) -- set(MKL_THREADING "tbb_thread") --endif() -- --# default: dynamic --if(STATIC IN_LIST LAPACK_FIND_COMPONENTS) -- set(MKL_LINK "static") --endif() -- --find_package(MKL CONFIG HINTS $ENV{MKLROOT}) -- --if(NOT MKL_FOUND) -- return() --endif() -- --# get_property(LAPACK_COMPILE_OPTIONS TARGET MKL::MKL PROPERTY INTERFACE_COMPILE_OPTIONS) --# flags are empty generator expressions that trip up check_source_compiles -- --get_property(LAPACK_INCLUDE_DIR TARGET MKL::MKL PROPERTY INTERFACE_INCLUDE_DIRECTORIES) --get_property(LAPACK_LIBRARY TARGET MKL::MKL PROPERTY INTERFACE_LINK_LIBRARIES) -- -- --set(LAPACK_MKL_FOUND true) -- --foreach(c IN ITEMS TBB LAPACK95 MKL64) -- if(${c} IN_LIST LAPACK_FIND_COMPONENTS) -- set(LAPACK_${c}_FOUND true) -- endif() --endforeach() -- --endmacro(find_mkl_libs) -- --# ========== main program -- --set(lapack_cray false) --if(DEFINED ENV{CRAYPE_VERSION}) -- set(lapack_cray true) --endif() -- --if(NOT (lapack_cray -- OR OpenBLAS IN_LIST LAPACK_FIND_COMPONENTS -- OR Netlib IN_LIST LAPACK_FIND_COMPONENTS -- OR Atlas IN_LIST LAPACK_FIND_COMPONENTS -- OR MKL IN_LIST LAPACK_FIND_COMPONENTS -- OR AOCL IN_LIST LAPACK_FIND_COMPONENTS)) -- if(DEFINED ENV{MKLROOT}) -- list(APPEND LAPACK_FIND_COMPONENTS MKL) -- else() -- list(APPEND LAPACK_FIND_COMPONENTS Netlib) -- endif() --endif() -- --find_package(Threads) -- --if(STATIC IN_LIST LAPACK_FIND_COMPONENTS) -- set(_orig_suff ${CMAKE_FIND_LIBRARY_SUFFIXES}) -- set(CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_STATIC_LIBRARY_SUFFIX}) --endif() -- --if(MKL IN_LIST LAPACK_FIND_COMPONENTS OR MKL64 IN_LIST LAPACK_FIND_COMPONENTS) -- find_mkl_libs() --elseif(Atlas IN_LIST LAPACK_FIND_COMPONENTS) -- atlas_libs() --elseif(Netlib IN_LIST LAPACK_FIND_COMPONENTS) -- netlib_libs() --elseif(OpenBLAS IN_LIST LAPACK_FIND_COMPONENTS) -- openblas_libs() --elseif(AOCL IN_LIST LAPACK_FIND_COMPONENTS) -- aocl_libs() --elseif(lapack_cray) -- # LAPACK is implicitly part of Cray PE LibSci, use Cray compiler wrapper. 
--endif() -- --if(STATIC IN_LIST LAPACK_FIND_COMPONENTS) -- if(LAPACK_LIBRARY) -- set(LAPACK_STATIC_FOUND true) -- endif() -- set(CMAKE_FIND_LIBRARY_SUFFIXES ${_orig_suff}) --endif() -- --# -- verify library works -- --function(lapack_check) -- --get_property(enabled_langs GLOBAL PROPERTY ENABLED_LANGUAGES) --if(NOT Fortran IN_LIST enabled_langs) -- set(LAPACK_links true PARENT_SCOPE) -- return() --endif() -- --set(CMAKE_REQUIRED_FLAGS) --set(CMAKE_REQUIRED_LINK_OPTIONS) --set(CMAKE_REQUIRED_INCLUDES ${LAPACK_INCLUDE_DIR}) --set(CMAKE_REQUIRED_LIBRARIES ${LAPACK_LIBRARY}) -- --check_fortran_source_compiles( --"program check_lapack --use, intrinsic :: iso_fortran_env, only : real32 --implicit none --real(real32), external :: snrm2 --print *, snrm2(1, [0._real32], 1) --end program" --LAPACK_s_FOUND --SRC_EXT f90 --) -- --check_fortran_source_compiles( --"program check_lapack --use, intrinsic :: iso_fortran_env, only : real64 --implicit none --real(real64), external :: dnrm2 --print *, dnrm2(1, [0._real64], 1) --end program" --LAPACK_d_FOUND --SRC_EXT f90 --) -- --if(LAPACK_s_FOUND OR LAPACK_d_FOUND) -- set(LAPACK_links true PARENT_SCOPE) --endif() -- --endfunction(lapack_check) -- --# --- Check library links --if(lapack_cray OR LAPACK_LIBRARY) -- lapack_check() --endif() -- -- --include(FindPackageHandleStandardArgs) -- --if(lapack_cray) -- find_package_handle_standard_args(LAPACK HANDLE_COMPONENTS -- REQUIRED_VARS LAPACK_links -- ) --else() -- find_package_handle_standard_args(LAPACK HANDLE_COMPONENTS -- REQUIRED_VARS LAPACK_LIBRARY LAPACK_links -- ) --endif() -- -- --set(BLAS_LIBRARIES ${BLAS_LIBRARY}) --set(LAPACK_LIBRARIES ${LAPACK_LIBRARY}) --set(LAPACK_INCLUDE_DIRS ${LAPACK_INCLUDE_DIR}) -- --if(LAPACK_FOUND) --# need if _FOUND guard as can't overwrite imported target even if bad -- -- --message(VERBOSE "Lapack libraries: ${LAPACK_LIBRARIES} --Lapack include directories: ${LAPACK_INCLUDE_DIRS}") -- --if(NOT TARGET BLAS::BLAS) -- add_library(BLAS::BLAS INTERFACE IMPORTED) -- set_property(TARGET BLAS::BLAS PROPERTY INTERFACE_LINK_LIBRARIES "${BLAS_LIBRARY}") --endif() -- --if(NOT TARGET LAPACK::LAPACK) -- add_library(LAPACK::LAPACK INTERFACE IMPORTED) -- set_property(TARGET LAPACK::LAPACK PROPERTY INTERFACE_COMPILE_OPTIONS "${LAPACK_COMPILE_OPTIONS}") -- set_property(TARGET LAPACK::LAPACK PROPERTY INTERFACE_LINK_LIBRARIES "${LAPACK_LIBRARY}") -- set_property(TARGET LAPACK::LAPACK PROPERTY INTERFACE_INCLUDE_DIRECTORIES "${LAPACK_INCLUDE_DIR}") --endif() -- --if(LAPACK_LAPACK95_FOUND) -- set(LAPACK95_LIBRARIES ${LAPACK95_LIBRARY}) -- set(LAPACK95_INCLUDE_DIRS ${LAPACK95_INCLUDE_DIR}) -- -- if(NOT TARGET LAPACK::LAPACK95) -- add_library(LAPACK::LAPACK95 INTERFACE IMPORTED) -- set_property(TARGET LAPACK::LAPACK95 PROPERTY INTERFACE_LINK_LIBRARIES "${LAPACK95_LIBRARY}") -- set_property(TARGET LAPACK::LAPACK95 PROPERTY INTERFACE_INCLUDE_DIRECTORIES "${LAPACK95_INCLUDE_DIR}") -- endif() --endif() -- --endif(LAPACK_FOUND) -- --mark_as_advanced(LAPACK_LIBRARY LAPACK_INCLUDE_DIR) -diff --git a/cmake/FindMETIS.cmake b/cmake/FindMETIS.cmake -index 0d866df..9a39a15 100644 ---- a/cmake/FindMETIS.cmake -+++ b/cmake/FindMETIS.cmake -@@ -26,24 +26,24 @@ METIS_INCLUDE_DIRS - #]=======================================================================] - - --if(parallel IN_LIST METIS_FIND_COMPONENTS) -+if(ParMETIS IN_LIST METIS_FIND_COMPONENTS) - find_library(PARMETIS_LIBRARY -- NAMES parmetis -- PATH_SUFFIXES METIS libmetis -- DOC "ParMETIS library" -- ) -+ NAMES parmetis -+ PATH_SUFFIXES METIS libmetis -+ 
DOC "ParMETIS library" -+ ) - if(PARMETIS_LIBRARY) -- set(METIS_parallel_FOUND true) -+ set(METIS_ParMETIS_FOUND true) - endif() - endif() - - find_library(METIS_LIBRARY -- NAMES metis -- PATH_SUFFIXES METIS libmetis -- DOC "METIS library" -- ) -+NAMES metis -+PATH_SUFFIXES METIS libmetis -+DOC "METIS library" -+) - --if(parallel IN_LIST METIS_FIND_COMPONENTS) -+if(ParMETIS IN_LIST METIS_FIND_COMPONENTS) - set(metis_inc parmetis.h) - else() - set(metis_inc metis.h) -@@ -55,25 +55,25 @@ PATH_SUFFIXES METIS openmpi-x86_64 mpich-x86_64 - DOC "METIS include directory" - ) - -+set(METIS_LIBRARIES ${PARMETIS_LIBRARY} ${METIS_LIBRARY}) -+ - include(FindPackageHandleStandardArgs) - find_package_handle_standard_args(METIS --REQUIRED_VARS METIS_LIBRARY METIS_INCLUDE_DIR -+REQUIRED_VARS METIS_LIBRARIES METIS_INCLUDE_DIR - HANDLE_COMPONENTS - ) - - if(METIS_FOUND) -+ set(METIS_INCLUDE_DIRS ${METIS_INCLUDE_DIR}) - --set(METIS_LIBRARIES ${PARMETIS_LIBRARY} ${METIS_LIBRARY}) --set(METIS_INCLUDE_DIRS ${METIS_INCLUDE_DIR}) -- --message(VERBOSE "METIS libraries: ${METIS_LIBRARIES} --METIS include directories: ${METIS_INCLUDE_DIRS}") -+ message(VERBOSE "METIS libraries: ${METIS_LIBRARIES} -+ METIS include directories: ${METIS_INCLUDE_DIRS}") - --if(NOT TARGET METIS::METIS) -- add_library(METIS::METIS INTERFACE IMPORTED) -- set_property(TARGET METIS::METIS PROPERTY INTERFACE_LINK_LIBRARIES "${METIS_LIBRARIES}") -- set_property(TARGET METIS::METIS PROPERTY INTERFACE_INCLUDE_DIRECTORIES "${METIS_INCLUDE_DIR}") --endif() -+ if(NOT TARGET METIS::METIS) -+ add_library(METIS::METIS INTERFACE IMPORTED) -+ set_property(TARGET METIS::METIS PROPERTY INTERFACE_LINK_LIBRARIES "${METIS_LIBRARIES}") -+ set_property(TARGET METIS::METIS PROPERTY INTERFACE_INCLUDE_DIRECTORIES "${METIS_INCLUDE_DIR}") -+ endif() - endif(METIS_FOUND) - - mark_as_advanced(METIS_INCLUDE_DIR METIS_LIBRARY PARMETIS_LIBRARY) -diff --git a/cmake/FindSCALAPACK.cmake b/cmake/FindSCALAPACK.cmake -index 56d7d9c..7a68153 100644 ---- a/cmake/FindSCALAPACK.cmake -+++ b/cmake/FindSCALAPACK.cmake -@@ -52,8 +52,6 @@ References - - include(CheckFortranSourceCompiles) - --set(SCALAPACK_LIBRARY) # avoids appending to prior FindScalapack -- - #===== functions - - function(scalapack_check) -@@ -64,11 +62,8 @@ find_package(Threads) - - set(CMAKE_REQUIRED_FLAGS) - set(CMAKE_REQUIRED_LINK_OPTIONS) --set(CMAKE_REQUIRED_INCLUDES ${SCALAPACK_INCLUDE_DIR} ${LAPACK_INCLUDE_DIRS} ${MPI_Fortran_INCLUDE_DIRS}) --set(CMAKE_REQUIRED_LIBRARIES ${SCALAPACK_LIBRARY}) --if(BLACS_LIBRARY) -- list(APPEND CMAKE_REQUIRED_LIBRARIES ${BLACS_LIBRARY}) --endif() -+set(CMAKE_REQUIRED_INCLUDES ${SCALAPACK_INCLUDE_DIRS} ${LAPACK_INCLUDE_DIRS} ${MPI_Fortran_INCLUDE_DIRS}) -+set(CMAKE_REQUIRED_LIBRARIES ${SCALAPACK_LIBRARIES}) - list(APPEND CMAKE_REQUIRED_LIBRARIES ${LAPACK_LIBRARIES} ${MPI_Fortran_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT}) - - if(STATIC IN_LIST SCALAPACK_FIND_COMPONENTS AND -@@ -188,40 +183,13 @@ endfunction(scalapack_lib) - - # === main - --set(scalapack_cray false) --if(DEFINED ENV{CRAYPE_VERSION}) -- set(scalapack_cray true) --endif() - --if(NOT scalapack_cray) -- if(NOT MKL IN_LIST SCALAPACK_FIND_COMPONENTS AND DEFINED ENV{MKLROOT}) -- list(APPEND SCALAPACK_FIND_COMPONENTS MKL) -- endif() --endif() - --if(STATIC IN_LIST SCALAPACK_FIND_COMPONENTS) -- set(_orig_suff ${CMAKE_FIND_LIBRARY_SUFFIXES}) -- set(CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_STATIC_LIBRARY_SUFFIX}) --endif() - --if(MKL IN_LIST SCALAPACK_FIND_COMPONENTS OR MKL64 IN_LIST SCALAPACK_FIND_COMPONENTS) -- scalapack_mkl() 
--elseif(scalapack_cray) -- # Cray PE has Scalapack build into LibSci. Use Cray compiler wrapper. --else() -- scalapack_lib() --endif() -- --if(STATIC IN_LIST SCALAPACK_FIND_COMPONENTS) -- if(SCALAPACK_LIBRARY) -- set(SCALAPACK_STATIC_FOUND true) -- endif() -- set(CMAKE_FIND_LIBRARY_SUFFIXES ${_orig_suff}) --endif() - - # --- Check that Scalapack links - --if(scalapack_cray OR SCALAPACK_LIBRARY) -+if(SCALAPACK_LIBRARIES) - scalapack_check() - endif() - -@@ -229,32 +197,19 @@ endif() - - include(FindPackageHandleStandardArgs) - --if(scalapack_cray) -- find_package_handle_standard_args(SCALAPACK HANDLE_COMPONENTS -- REQUIRED_VARS SCALAPACK_links -- ) --else() -- find_package_handle_standard_args(SCALAPACK HANDLE_COMPONENTS -- REQUIRED_VARS SCALAPACK_LIBRARY SCALAPACK_links -- ) --endif() -+find_package_handle_standard_args(SCALAPACK HANDLE_COMPONENTS -+REQUIRED_VARS SCALAPACK_LIBRARIES SCALAPACK_links -+) - - if(SCALAPACK_FOUND) - # need if _FOUND guard as can't overwrite imported target even if bad -- set(SCALAPACK_LIBRARIES ${SCALAPACK_LIBRARY}) -- if(BLACS_LIBRARY) -- list(APPEND SCALAPACK_LIBRARIES ${BLACS_LIBRARY}) -- endif() -- -- set(SCALAPACK_INCLUDE_DIRS ${SCALAPACK_INCLUDE_DIR}) -- - message(VERBOSE "Scalapack libraries: ${SCALAPACK_LIBRARIES} - Scalapack include directories: ${SCALAPACK_INCLUDE_DIRS}") - - if(NOT TARGET SCALAPACK::SCALAPACK) - add_library(SCALAPACK::SCALAPACK INTERFACE IMPORTED) - set_property(TARGET SCALAPACK::SCALAPACK PROPERTY INTERFACE_LINK_LIBRARIES "${SCALAPACK_LIBRARIES}") -- set_property(TARGET SCALAPACK::SCALAPACK PROPERTY INTERFACE_INCLUDE_DIRECTORIES "${SCALAPACK_INCLUDE_DIR}") -+ set_property(TARGET SCALAPACK::SCALAPACK PROPERTY INTERFACE_INCLUDE_DIRECTORIES "${SCALAPACK_INCLUDE_DIRS}") - - # For MKL, we don't use FindLapack, so define LAPACK::LAPACK as alias - if(MKL_FOUND AND NOT TARGET LAPACK::LAPACK) -@@ -262,5 +217,3 @@ Scalapack include directories: ${SCALAPACK_INCLUDE_DIRS}") - endif() - endif() - endif() -- --mark_as_advanced(SCALAPACK_LIBRARY SCALAPACK_INCLUDE_DIR) -diff --git a/cmake/FindScotch.cmake b/cmake/FindScotch.cmake -index 39e378d..1066d99 100644 ---- a/cmake/FindScotch.cmake -+++ b/cmake/FindScotch.cmake -@@ -18,7 +18,7 @@ - # COMPONENTS: - # - # * ESMUMPS: detect Scotch esmumps interface --# * parallel: detect parallel (MPI) Scotch -+# * PTScotch: detect parallel (MPI) Scotch - # - # This module finds headers and scotch library. 
- # Results are reported in variables: -@@ -61,7 +61,7 @@ if(ESMUMPS IN_LIST Scotch_FIND_COMPONENTS) - list(INSERT scotch_names 0 esmumps) - endif() - --if(parallel IN_LIST Scotch_FIND_COMPONENTS) -+if(PTScotch IN_LIST Scotch_FIND_COMPONENTS) - list(INSERT scotch_names 0 ptscotch ptscotcherr) - if(ESMUMPS IN_LIST Scotch_FIND_COMPONENTS) - list(INSERT scotch_names 0 ptesmumps) -@@ -79,10 +79,10 @@ foreach(l IN LISTS scotch_names) - mark_as_advanced(Scotch_${l}_LIBRARY) - endforeach() - --if(parallel IN_LIST Scotch_FIND_COMPONENTS) -+if(PTScotch IN_LIST Scotch_FIND_COMPONENTS) - if(Scotch_ptesmumps_LIBRARY AND Scotch_ptscotch_LIBRARY) - set(Scotch_ESMUMPS_FOUND true) -- set(Scotch_parallel_FOUND true) -+ set(Scotch_PTScotch_FOUND true) - endif() - elseif(Scotch_esmumps_LIBRARY) - set(Scotch_ESMUMPS_FOUND true) -@@ -95,16 +95,16 @@ HANDLE_COMPONENTS - ) - - if(Scotch_FOUND) --set(Scotch_INCLUDE_DIRS ${Scotch_INCLUDE_DIR}) -+ set(Scotch_INCLUDE_DIRS ${Scotch_INCLUDE_DIR}) - --message(VERBOSE "Scotch libraries: ${Scotch_LIBRARIES} --Scotch include directories: ${Scotch_INCLUDE_DIRS}") -+ message(VERBOSE "Scotch libraries: ${Scotch_LIBRARIES} -+ Scotch include directories: ${Scotch_INCLUDE_DIRS}") - --if(NOT TARGET Scotch::Scotch) -- add_library(Scotch::Scotch INTERFACE IMPORTED) -- set_property(TARGET Scotch::Scotch PROPERTY INTERFACE_LINK_LIBRARIES "${Scotch_LIBRARIES}") -- set_property(TARGET Scotch::Scotch PROPERTY INTERFACE_INCLUDE_DIRECTORIES "${Scotch_INCLUDE_DIR}") --endif() -+ if(NOT TARGET Scotch::Scotch) -+ add_library(Scotch::Scotch INTERFACE IMPORTED) -+ set_property(TARGET Scotch::Scotch PROPERTY INTERFACE_LINK_LIBRARIES "${Scotch_LIBRARIES}") -+ set_property(TARGET Scotch::Scotch PROPERTY INTERFACE_INCLUDE_DIRECTORIES "${Scotch_INCLUDE_DIR}") -+ endif() - endif(Scotch_FOUND) - - mark_as_advanced(Scotch_INCLUDE_DIR) -diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake -index 9d5b09a..40a0ad3 100644 ---- a/cmake/lapack.cmake -+++ b/cmake/lapack.cmake -@@ -2,15 +2,7 @@ - - include(CheckSourceCompiles) - --if(NOT DEFINED LAPACK_VENDOR AND DEFINED ENV{MKLROOT}) -- set(LAPACK_VENDOR MKL) --endif() -- --if(find_static) -- list(APPEND LAPACK_VENDOR STATIC) --endif() -- --find_package(LAPACK REQUIRED COMPONENTS ${LAPACK_VENDOR}) -+find_package(LAPACK REQUIRED) - - # GEMMT is recommeded in MUMPS User Manual if available - if(gemmt) -diff --git a/cmake/scalapack.cmake b/cmake/scalapack.cmake -index b4fbd3b..5c66e4e 100644 ---- a/cmake/scalapack.cmake -+++ b/cmake/scalapack.cmake -@@ -3,21 +3,7 @@ include(GNUInstallDirs) - - if(find) - --if(NOT DEFINED SCALAPACK_VENDOR AND DEFINED ENV{MKLROOT}) -- set(SCALAPACK_VENDOR MKL) --endif() -- --if(MKL IN_LIST SCALAPACK_VENDOR) -- if(intsize64) -- list(APPEND SCALAPACK_VENDOR MKL64) -- endif() --endif() -- --if(find_static) -- list(APPEND SCALAPACK_VENDOR STATIC) --endif() -- --find_package(SCALAPACK COMPONENTS ${SCALAPACK_VENDOR}) -+find_package(SCALAPACK) - - endif() - +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 4e0bb93..e4aca6f 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -48,10 +48,7 @@ find_package(Threads) + + if(MUMPS_parallel) + find_package(MPI COMPONENTS C Fortran REQUIRED) +- if(NOT DEFINED ENV{MKLROOT} AND NOT LAPACK_VENDOR MATCHES "^MKL" OR NOT MUMPS_scalapack) +- # oneMKL MKLConfig.cmake must be invoked only once +- include(cmake/lapack.cmake) +- endif() ++ include(cmake/lapack.cmake) + set(NUMERIC_LIBS LAPACK::LAPACK) + + if(MUMPS_scalapack) +@@ -99,14 +96,6 @@ endif() + + list(APPEND ORDERING_LIBS pord) + 
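With the custom FindLAPACK.cmake deleted later in this diff, the now-unconditional include(cmake/lapack.cmake) in the CMakeLists.txt hunk above resolves LAPACK through CMake's stock FindLAPACK module, so vendor and linkage selection move from the old COMPONENTS to CMake's standard BLA_* cache variables. A hedged sketch of what the simplified path amounts to; the set() lines stand in for values a user would normally pass on the command line:

    # Sketch only: stock FindLAPACK driven by CMake's own BLA_* cache variables.
    set(BLA_VENDOR "OpenBLAS" CACHE STRING "BLAS/LAPACK vendor hint for FindLAPACK")
    set(BLA_STATIC OFF CACHE BOOL "Prefer static BLAS/LAPACK libraries")
    find_package(LAPACK REQUIRED)      # what cmake/lapack.cmake now reduces to
    set(NUMERIC_LIBS LAPACK::LAPACK)   # as in the CMakeLists.txt hunk above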
+-install(FILES +-${CMAKE_CURRENT_SOURCE_DIR}/cmake/FindLAPACK.cmake +-${CMAKE_CURRENT_SOURCE_DIR}/cmake/FindSCALAPACK.cmake +-${CMAKE_CURRENT_SOURCE_DIR}/cmake/FindMETIS.cmake +-${CMAKE_CURRENT_SOURCE_DIR}/cmake/FindScotch.cmake +-DESTINATION cmake +-) +- + message(STATUS "MUMPS ORDERING_DEFS: ${ORDERING_DEFS}") + message(STATUS "MUMPS ORDERING_LIBS: ${ORDERING_LIBS}") + message(STATUS "MUMPS LAPACK_VENDOR: ${LAPACK_VENDOR}") +diff --git a/cmake/FindLAPACK.cmake b/cmake/FindLAPACK.cmake +deleted file mode 100644 +index 56dde6f..0000000 +--- a/cmake/FindLAPACK.cmake ++++ /dev/null +@@ -1,570 +0,0 @@ +-# Distributed under the OSI-approved BSD 3-Clause License. See accompanying +-# file Copyright.txt or https://cmake.org/licensing for details. +- +-#[=======================================================================[.rst: +- +-FindLapack +----------- +- +-* Michael Hirsch, Ph.D. www.scivision.dev +-* David Eklund +- +-Let Michael know if there are more MKL / Lapack / compiler combination you want. +-Refer to https://software.intel.com/en-us/articles/intel-mkl-link-line-advisor +- +-Finds LAPACK libraries for C / C++ / Fortran. +-Works with Netlib Lapack / LapackE, Atlas and Intel MKL. +-Intel MKL relies on having environment variable MKLROOT set, typically by sourcing +-mklvars.sh beforehand. +- +-Why not the FindLapack.cmake built into CMake? It has a lot of old code for +-infrequently used Lapack libraries and is unreliable for me. +- +-Tested on Linux, MacOS and Windows with: +-* GCC / Gfortran +-* Clang / Flang +-* Intel (icc, ifort) +-* Cray +- +- +-Parameters +-^^^^^^^^^^ +- +-COMPONENTS default to Netlib LAPACK / LapackE, otherwise: +- +-``MKL`` +- Intel MKL -- sequential by default, or add TBB or MPI as well +-``MKL64`` +- MKL only: 64-bit integers (default is 32-bit integers) +-``TBB`` +- Intel MPI + TBB for MKL +-``OpenMP`` +- MKL only: use OpenMP (default is sequential) +- +- +-``AOCL`` +- AMD ScaLAPACK fork of Netlib ScaLAPACK. +- Requires LAPACK AOCL +- https://www.amd.com/en/developer/aocl/scalapack.html +-``AOCL64`` +- AOCL 64-bit integers (default is 32-bit integers) +- +-``LAPACKE`` +- LapackE C / C++ interface +- +-``Netlib`` +- Netlib Lapack for Fortran +-``OpenBLAS`` +- OpenBLAS Lapack for Fortran +- +-``LAPACK95`` +- get Lapack95 interfaces for MKL or Netlib (must also specify one of MKL, Netlib) +- +-``STATIC`` +- Library search default on non-Windows is shared then static. On Windows default search is static only. +- Specifying STATIC component searches for static libraries only. 
+- +- +-Result Variables +-^^^^^^^^^^^^^^^^ +- +-``LAPACK_FOUND`` +- Lapack libraries were found +-``LAPACK__FOUND`` +- LAPACK specified was found +-``LAPACK_LIBRARIES`` +- Lapack library files (including BLAS +-``LAPACK_INCLUDE_DIRS`` +- Lapack include directories (for C/C++) +- +- +-References +-^^^^^^^^^^ +- +-* Pkg-Config and MKL: https://software.intel.com/en-us/articles/intel-math-kernel-library-intel-mkl-and-pkg-config-tool +-* MKL for Windows: https://software.intel.com/en-us/mkl-windows-developer-guide-static-libraries-in-the-lib-intel64-win-directory +-* MKL Windows directories: https://software.intel.com/en-us/mkl-windows-developer-guide-high-level-directory-structure +-* Atlas http://math-atlas.sourceforge.net/errata.html#LINK +-* MKL LAPACKE (C, C++): https://software.intel.com/en-us/mkl-linux-developer-guide-calling-lapack-blas-and-cblas-routines-from-c-c-language-environments +-#]=======================================================================] +- +-include(CheckSourceCompiles) +- +-# clear to avoid endless appending on subsequent calls +-set(LAPACK_LIBRARY) +-unset(LAPACK_INCLUDE_DIR) +- +-# ===== functions ========== +- +-function(lapack_atlas) +- +-find_library(ATLAS_LIB +-NAMES atlas +-PATH_SUFFIXES atlas +-DOC "ATLAS library" +-) +- +-find_library(LAPACK_ATLAS +-NAMES ptlapack lapack_atlas lapack +-NAMES_PER_DIR +-PATH_SUFFIXES atlas +-DOC "LAPACK ATLAS library" +-) +- +-find_library(BLAS_LIBRARY +-NAMES ptf77blas f77blas blas +-NAMES_PER_DIR +-PATH_SUFFIXES atlas +-DOC "BLAS ATLAS library" +-) +- +-# === C === +-find_library(BLAS_C_ATLAS +-NAMES ptcblas cblas +-NAMES_PER_DIR +-PATH_SUFFIXES atlas +-DOC "BLAS C ATLAS library" +-) +- +-find_path(LAPACK_INCLUDE_DIR +-NAMES cblas-atlas.h cblas.h clapack.h +-DOC "ATLAS headers" +-) +- +-#=========== +-if(LAPACK_ATLAS AND BLAS_C_ATLAS AND BLAS_LIBRARY AND ATLAS_LIB) +- set(LAPACK_Atlas_FOUND true PARENT_SCOPE) +- set(LAPACK_LIBRARY ${LAPACK_ATLAS} ${BLAS_C_ATLAS} ${BLAS_LIBRARY} ${ATLAS_LIB}) +- list(APPEND LAPACK_LIBRARY ${CMAKE_THREAD_LIBS_INIT}) +-endif() +- +-set(LAPACK_LIBRARY ${LAPACK_LIBRARY} PARENT_SCOPE) +- +-endfunction() +- +-#======================= +- +-function(lapack_netlib) +- +-if(LAPACK95 IN_LIST LAPACK_FIND_COMPONENTS) +- find_path(LAPACK95_INCLUDE_DIR +- NAMES f95_lapack.mod +- HINTS ${LAPACK95_ROOT} ENV LAPACK95_ROOT +- PATH_SUFFIXES include +- DOC "LAPACK95 Fortran module" +- ) +- +- find_library(LAPACK95_LIBRARY +- NAMES lapack95 +- HINTS ${LAPACK95_ROOT} ENV LAPACK95_ROOT +- DOC "LAPACK95 library" +- ) +- +- if(NOT (LAPACK95_LIBRARY AND LAPACK95_INCLUDE_DIR)) +- return() +- endif() +- +- set(LAPACK95_LIBRARY ${LAPACK95_LIBRARY} PARENT_SCOPE) +- set(LAPACK_LAPACK95_FOUND true PARENT_SCOPE) +-endif(LAPACK95 IN_LIST LAPACK_FIND_COMPONENTS) +- +-find_library(LAPACK_LIBRARY +-NAMES lapack +-PATH_SUFFIXES lapack lapack/lib +-DOC "LAPACK library" +-) +-if(NOT LAPACK_LIBRARY) +- return() +-endif() +- +-if(LAPACKE IN_LIST LAPACK_FIND_COMPONENTS) +- +- find_library(LAPACKE_LIBRARY +- NAMES lapacke +- PATH_SUFFIXES lapack lapack/lib +- DOC "LAPACKE library" +- ) +- +- # lapack/include for Homebrew +- find_path(LAPACKE_INCLUDE_DIR +- NAMES lapacke.h +- PATH_SUFFIXES lapack lapack/include +- DOC "LAPACKE include directory" +- ) +- if(NOT (LAPACKE_LIBRARY AND LAPACKE_INCLUDE_DIR)) +- return() +- endif() +- +- set(LAPACK_LAPACKE_FOUND true PARENT_SCOPE) +- list(APPEND LAPACK_INCLUDE_DIR ${LAPACKE_INCLUDE_DIR}) +- list(APPEND LAPACK_LIBRARY ${LAPACKE_LIBRARY}) +- mark_as_advanced(LAPACKE_LIBRARY 
LAPACKE_INCLUDE_DIR) +-endif(LAPACKE IN_LIST LAPACK_FIND_COMPONENTS) +- +-# Netlib on Cygwin and others +- +-find_library(BLAS_LIBRARY +-NAMES refblas blas +-NAMES_PER_DIR +-PATH_SUFFIXES lapack lapack/lib blas +-DOC "BLAS library" +-) +- +-if(NOT BLAS_LIBRARY) +- return() +-endif() +- +-list(APPEND LAPACK_LIBRARY ${BLAS_LIBRARY}) +-set(LAPACK_Netlib_FOUND true PARENT_SCOPE) +- +-list(APPEND LAPACK_LIBRARY ${CMAKE_THREAD_LIBS_INIT}) +- +-set(LAPACK_LIBRARY ${LAPACK_LIBRARY} PARENT_SCOPE) +- +-endfunction() +- +-#=============================== +-function(lapack_openblas) +- +-find_library(LAPACK_LIBRARY +-NAMES openblas +-PATH_SUFFIXES openblas +-DOC "OpenBLAS library" +-) +- +-find_path(LAPACK_INCLUDE_DIR +-NAMES openblas_config.h cblas-openblas.h +-DOC "OpenBLAS include directory" +-) +- +-if(NOT LAPACK_LIBRARY) +- return() +-endif() +- +-set(BLAS_LIBRARY ${LAPACK_LIBRARY} CACHE FILEPATH "OpenBLAS library") +- +-set(LAPACK_OpenBLAS_FOUND true PARENT_SCOPE) +- +-list(APPEND LAPACK_LIBRARY ${CMAKE_THREAD_LIBS_INIT}) +- +-set(LAPACK_LIBRARY ${LAPACK_LIBRARY} PARENT_SCOPE) +- +-endfunction() +- +- +-function(lapack_aocl) +- +-set(_nodef_lapack) +-if(DEFINED LAPACK_ROOT) +- set(_nodef_lapack NO_DEFAULT_PATH) +-endif() +- +-set(_names flame) +-if(WIN32) +- if(BUILD_SHARED_LIBS) +- list(APPEND _names AOCL-LibFlame-Win-MT-dll AOCL-LibFlame-Win-dll) +- else() +- list(APPEND _names AOCL-LibFlame-Win-MT AOCL-LibFlame-Win) +- endif() +-endif() +- +-set(_s "LP64") +-if(AOCL64 IN_LIST SCALAPACK_FIND_COMPONENTS) +- string(PREPEND _s "I") +-endif() +- +-find_library(LAPACK_LIBRARY +-NAMES ${_names} +-NAMES_PER_DIR +-PATH_SUFFIXES lib/${_s} +-HINTS ${LAPACK_ROOT} $ENV{LAPACK_ROOT} +-${_nodef_lapack} +-DOC "AOCL Flame library" +-) +- +-find_path(LAPACK_INCLUDE_DIR +-NAMES FLAME.h +-PATH_SUFFIXES include/${_s} +-HINTS ${LAPACK_ROOT} $ENV{LAPACK_ROOT} +-${_nodef_lapack} +-DOC "Flame header" +-) +- +-if(NOT LAPACK_LIBRARY AND LAPACK_INCLUDE_DIR) +- return() +-endif() +- +-# --- BLIS +-set(_nodef_blas) +-if(DEFINED BLAS_ROOT) +- set(_nodef_blas NO_DEFAULT_PATH) +-endif() +- +-set(_names blis-mt blis) +-if(WIN32) +- if(BUILD_SHARED_LIBS) +- list(APPEND _names AOCL-LibBlis-Win-MT-dll AOCL-LibBlis-Win-dll) +- else() +- list(APPEND _names AOCL-LibBlis-Win-MT AOCL-LibBlis-Win) +- endif() +-endif() +- +-find_library(BLAS_LIBRARY +-NAMES ${_names} +-NAMES_PER_DIR +-HINTS ${BLAS_ROOT} +-PATH_SUFFIXES lib/${_s} +-HINTS ${BLAS_ROOT} $ENV{BLAS_ROOT} +-${_nodef_blas} +-DOC "AOCL Blis library" +-) +- +-find_path(BLAS_INCLUDE_DIR +-NAMES blis.h +-HINTS ${BLAS_ROOT} +-PATH_SUFFIXES include/${_s} +-HINTS ${BLAS_ROOT} $ENV{BLAS_ROOT} +-${_nodef_blas} +-DOC "Blis header" +-) +- +-if(NOT BLAS_LIBRARY AND BLAS_INCLUDE_DIR) +- return() +-endif() +- +- +-if(LAPACKE IN_LIST LAPACK_FIND_COMPONENTS) +- +- find_library(LAPACKE_LIBRARY +- NAMES lapacke +- PATH_SUFFIXES lib/${_s} +- HINTS ${LAPACK_ROOT} $ENV{LAPACK_ROOT} +- ${_nodef_lapack} +- DOC "AOCL LAPACKE library" +- ) +- +- # lapack/include for Homebrew +- find_path(LAPACKE_INCLUDE_DIR +- NAMES lapacke.h +- PATH_SUFFIXES include/${_s} +- HINTS ${LAPACK_ROOT} $ENV{LAPACK_ROOT} +- ${_nodef_lapack} +- DOC "AOCL LAPACKE include directory" +- ) +- if(NOT (LAPACKE_LIBRARY AND LAPACKE_INCLUDE_DIR)) +- return() +- endif() +- +- set(LAPACK_LAPACKE_FOUND true PARENT_SCOPE) +- list(APPEND LAPACK_INCLUDE_DIR ${LAPACKE_INCLUDE_DIR}) +- list(APPEND LAPACK_LIBRARY ${LAPACKE_LIBRARY}) +- mark_as_advanced(LAPACKE_LIBRARY LAPACKE_INCLUDE_DIR) +-endif() +- +- +-set(LAPACK_AOCL_FOUND true 
PARENT_SCOPE) +-set(LAPACK_LIBRARY ${LAPACK_LIBRARY} ${BLAS_LIBRARY} ${CMAKE_THREAD_LIBS_INIT} PARENT_SCOPE) +-set(LAPACK_INCLUDE_DIR ${LAPACK_INCLUDE_DIR} ${BLAS_INCLUDE_DIR} PARENT_SCOPE) +- +-endfunction() +- +-#=============================== +- +-macro(lapack_mkl) +-# https://www.intel.com/content/www/us/en/docs/onemkl/developer-guide-linux/2025-0/cmake-config-for-onemkl.html +- +-set(MKL_ARCH "intel64") +- +-set(MKL_INTERFACE "lp64") +-if(MKL64 IN_LIST LAPACK_FIND_COMPONENTS) +- string(PREPEND MKL_INTERFACE "i") +-endif() +- +-if(LAPACK95 IN_LIST LAPACK_FIND_COMPONENTS) +- set(ENABLE_BLAS95 true) +- set(ENABLE_LAPACK95 true) +-endif() +- +-# MKL_THREADING default: "intel_thread" which is Intel OpenMP +-# some systems have messed up OpenMP, so sequential unless requested +-if(TBB IN_LIST SCALAPACK_FIND_COMPONENTS) +- set(MKL_THREADING "tbb_thread") +-elseif(OpenMP IN_LIST SCALAPACK_FIND_COMPONENTS) +- set(MKL_THREADING "intel_thread") +-else() +- set(MKL_THREADING "sequential") +-endif() +- +-# default: dynamic +-if(STATIC IN_LIST LAPACK_FIND_COMPONENTS) +- set(MKL_LINK "static") +-endif() +- +-find_package(MKL CONFIG HINTS $ENV{MKLROOT}) +- +-if(NOT MKL_FOUND) +- return() +-endif() +- +-# get_property(LAPACK_COMPILE_OPTIONS TARGET MKL::MKL PROPERTY INTERFACE_COMPILE_OPTIONS) +-# flags are empty generator expressions that trip up check_source_compiles +- +-get_property(LAPACK_INCLUDE_DIR TARGET MKL::MKL PROPERTY INTERFACE_INCLUDE_DIRECTORIES) +-get_property(LAPACK_LIBRARY TARGET MKL::MKL PROPERTY INTERFACE_LINK_LIBRARIES) +- +- +-set(LAPACK_MKL_FOUND true) +- +-foreach(c IN ITEMS TBB LAPACK95 MKL64 OpenMP) +- if(${c} IN_LIST LAPACK_FIND_COMPONENTS) +- set(LAPACK_${c}_FOUND true) +- endif() +-endforeach() +- +-endmacro() +- +-# ========== main program +- +-if(NOT DEFINED LAPACK_CRAY AND DEFINED ENV{CRAYPE_VERSION}) +- set(LAPACK_CRAY true) +-endif() +- +-if(NOT (LAPACK_CRAY +- OR OpenBLAS IN_LIST LAPACK_FIND_COMPONENTS +- OR Netlib IN_LIST LAPACK_FIND_COMPONENTS +- OR Atlas IN_LIST LAPACK_FIND_COMPONENTS +- OR MKL IN_LIST LAPACK_FIND_COMPONENTS +- OR MKL64 IN_LIST LAPACK_FIND_COMPONENTS +- OR AOCL IN_LIST LAPACK_FIND_COMPONENTS)) +- if(DEFINED ENV{MKLROOT} AND IS_DIRECTORY "$ENV{MKLROOT}") +- list(APPEND LAPACK_FIND_COMPONENTS MKL) +- else() +- list(APPEND LAPACK_FIND_COMPONENTS Netlib) +- endif() +-endif() +- +-find_package(Threads) +- +-if(STATIC IN_LIST LAPACK_FIND_COMPONENTS) +- set(_orig_suff ${CMAKE_FIND_LIBRARY_SUFFIXES}) +- set(CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_STATIC_LIBRARY_SUFFIX}) +-endif() +- +-if(MKL IN_LIST LAPACK_FIND_COMPONENTS OR MKL64 IN_LIST LAPACK_FIND_COMPONENTS) +- lapack_mkl() +-elseif(Atlas IN_LIST LAPACK_FIND_COMPONENTS) +- lapack_atlas() +-elseif(Netlib IN_LIST LAPACK_FIND_COMPONENTS) +- lapack_netlib() +-elseif(OpenBLAS IN_LIST LAPACK_FIND_COMPONENTS) +- lapack_openblas() +-elseif(AOCL IN_LIST LAPACK_FIND_COMPONENTS) +- lapack_aocl() +-elseif(LAPACK_CRAY) +- # LAPACK is implicitly part of Cray PE LibSci, use Cray compiler wrapper. 
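The lapack_mkl() macro in the hunk above is essentially a thin adapter from this module's COMPONENTS onto oneMKL's MKLConfig.cmake variables. A condensed sketch of that mapping, assuming MKLROOT points at a oneMKL install; the values are the ones the hunk itself sets:

    # Sketch: what COMPONENTS MKL MKL64 TBB STATIC translate to before MKLConfig.cmake runs.
    set(MKL_ARCH "intel64")
    set(MKL_INTERFACE "ilp64")        # "lp64" unless the MKL64 component is requested
    set(MKL_THREADING "tbb_thread")   # "intel_thread" for OpenMP, "sequential" otherwise
    set(MKL_LINK "static")            # "dynamic" is the MKLConfig default
    find_package(MKL CONFIG HINTS $ENV{MKLROOT})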
+-endif() +- +-if(STATIC IN_LIST LAPACK_FIND_COMPONENTS) +- if(LAPACK_LIBRARY) +- set(LAPACK_STATIC_FOUND true) +- endif() +- set(CMAKE_FIND_LIBRARY_SUFFIXES ${_orig_suff}) +-endif() +- +-# -- verify library works +- +-function(lapack_check) +- +-get_property(enabled_langs GLOBAL PROPERTY ENABLED_LANGUAGES) +-if(NOT Fortran IN_LIST enabled_langs) +- set(LAPACK_links true PARENT_SCOPE) +- return() +-endif() +- +-set(CMAKE_REQUIRED_FLAGS) +-set(CMAKE_REQUIRED_LINK_OPTIONS) +-set(CMAKE_REQUIRED_INCLUDES ${LAPACK_INCLUDE_DIR}) +-set(CMAKE_REQUIRED_LIBRARIES ${LAPACK_LIBRARY}) +- +-check_source_compiles(Fortran +-"program check_lapack +-use, intrinsic :: iso_fortran_env, only : real32 +-implicit none +-real(real32), external :: snrm2 +-print *, snrm2(1, [0._real32], 1) +-end program" +-LAPACK_s_FOUND +-) +- +-check_source_compiles(Fortran +-"program check_lapack +-use, intrinsic :: iso_fortran_env, only : real64 +-implicit none +-real(real64), external :: dnrm2 +-print *, dnrm2(1, [0._real64], 1) +-end program" +-LAPACK_d_FOUND +-) +- +-if(LAPACK_s_FOUND OR LAPACK_d_FOUND) +- set(LAPACK_links true PARENT_SCOPE) +-endif() +- +-endfunction() +- +-# --- Check library links +-if(LAPACK_CRAY OR LAPACK_LIBRARY) +- lapack_check() +-endif() +- +- +-include(FindPackageHandleStandardArgs) +- +-if(LAPACK_CRAY) +- find_package_handle_standard_args(LAPACK HANDLE_COMPONENTS +- REQUIRED_VARS LAPACK_links +- ) +-else() +- find_package_handle_standard_args(LAPACK HANDLE_COMPONENTS +- REQUIRED_VARS LAPACK_LIBRARY LAPACK_links +- ) +-endif() +- +- +-set(BLAS_LIBRARIES ${BLAS_LIBRARY}) +-set(LAPACK_LIBRARIES ${LAPACK_LIBRARY}) +-set(LAPACK_INCLUDE_DIRS ${LAPACK_INCLUDE_DIR}) +- +-if(LAPACK_FOUND) +-# need if _FOUND guard as can't overwrite imported target even if bad +- +- +-message(VERBOSE "Lapack libraries: ${LAPACK_LIBRARIES} +-Lapack include directories: ${LAPACK_INCLUDE_DIRS}") +- +-if(NOT TARGET BLAS::BLAS) +- add_library(BLAS::BLAS INTERFACE IMPORTED) +- set_property(TARGET BLAS::BLAS PROPERTY INTERFACE_LINK_LIBRARIES "${BLAS_LIBRARY}") +-endif() +- +-if(NOT TARGET LAPACK::LAPACK) +- add_library(LAPACK::LAPACK INTERFACE IMPORTED) +- set_property(TARGET LAPACK::LAPACK PROPERTY INTERFACE_COMPILE_OPTIONS "${LAPACK_COMPILE_OPTIONS}") +- set_property(TARGET LAPACK::LAPACK PROPERTY INTERFACE_LINK_LIBRARIES "${LAPACK_LIBRARY}") +- set_property(TARGET LAPACK::LAPACK PROPERTY INTERFACE_INCLUDE_DIRECTORIES "${LAPACK_INCLUDE_DIR}") +-endif() +- +-if(LAPACK_LAPACK95_FOUND) +- set(LAPACK95_LIBRARIES ${LAPACK95_LIBRARY}) +- set(LAPACK95_INCLUDE_DIRS ${LAPACK95_INCLUDE_DIR}) +- +- if(NOT TARGET LAPACK::LAPACK95) +- add_library(LAPACK::LAPACK95 INTERFACE IMPORTED) +- set_property(TARGET LAPACK::LAPACK95 PROPERTY INTERFACE_LINK_LIBRARIES "${LAPACK95_LIBRARY}") +- set_property(TARGET LAPACK::LAPACK95 PROPERTY INTERFACE_INCLUDE_DIRECTORIES "${LAPACK95_INCLUDE_DIR}") +- endif() +-endif() +- +-endif(LAPACK_FOUND) +- +-mark_as_advanced(LAPACK_LIBRARY LAPACK_INCLUDE_DIR) +diff --git a/cmake/FindPARMETIS.cmake b/cmake/FindPARMETIS.cmake +new file mode 100644 +index 0000000..8d680c5 +--- /dev/null ++++ b/cmake/FindPARMETIS.cmake +@@ -0,0 +1,82 @@ ++# FindPARMETIS.cmake ++# Finds the PARMETIS library and sets the following variables: ++# PARMETIS_FOUND - True if PARMETIS was found ++# PARMETIS_INCLUDE_DIRS - PARMETIS include directories ++# PARMETIS_LIBRARIES - PARMETIS libraries ++# PARMETIS_VERSION - PARMETIS version if found ++ ++# Try to find PARMETIS using pkg-config first ++find_package(PkgConfig QUIET) ++if(PKG_CONFIG_FOUND) 
++ pkg_check_modules(PC_PARMETIS QUIET parmetis) ++endif() ++ ++# Find MPI as PARMETIS requires it ++find_package(MPI REQUIRED) ++ ++# Find METIS as PARMETIS requires it ++find_package(METIS REQUIRED) ++ ++# Find the PARMETIS header ++find_path(PARMETIS_INCLUDE_DIR ++ NAMES parmetis.h ++ PATHS ++ ${PC_PARMETIS_INCLUDE_DIRS} ++ /usr/include ++ /usr/local/include ++ $ENV{PARMETIS_DIR}/include ++ PATH_SUFFIXES parmetis ++) ++ ++# Find the PARMETIS library ++find_library(PARMETIS_LIBRARY ++ NAMES parmetis ++ PATHS ++ ${PC_PARMETIS_LIBRARY_DIRS} ++ /usr/lib ++ /usr/local/lib ++ $ENV{PARMETIS_DIR}/lib ++) ++ ++# Try to find version information ++if(PARMETIS_INCLUDE_DIR) ++ file(STRINGS "${PARMETIS_INCLUDE_DIR}/parmetis.h" PARMETIS_VERSION_MAJOR_LINE REGEX "^#define[\t ]+PARMETIS_MAJOR_VERSION[\t ]+[0-9]+") ++ file(STRINGS "${PARMETIS_INCLUDE_DIR}/parmetis.h" PARMETIS_VERSION_MINOR_LINE REGEX "^#define[\t ]+PARMETIS_MINOR_VERSION[\t ]+[0-9]+") ++ ++ string(REGEX REPLACE "^#define[\t ]+PARMETIS_MAJOR_VERSION[\t ]+([0-9]+)" "\\1" PARMETIS_VERSION_MAJOR "${PARMETIS_VERSION_MAJOR_LINE}") ++ string(REGEX REPLACE "^#define[\t ]+PARMETIS_MINOR_VERSION[\t ]+([0-9]+)" "\\1" PARMETIS_VERSION_MINOR "${PARMETIS_VERSION_MINOR_LINE}") ++ ++ set(PARMETIS_VERSION "${PARMETIS_VERSION_MAJOR}.${PARMETIS_VERSION_MINOR}") ++endif() ++ ++# Handle standard arguments ++include(FindPackageHandleStandardArgs) ++find_package_handle_standard_args(PARMETIS ++ REQUIRED_VARS ++ PARMETIS_LIBRARY ++ PARMETIS_INCLUDE_DIR ++ MPI_FOUND ++ METIS_FOUND ++ VERSION_VAR PARMETIS_VERSION ++) ++ ++# Set output variables ++if(PARMETIS_FOUND) ++ set(PARMETIS_LIBRARIES ${PARMETIS_LIBRARY} ${METIS_LIBRARIES} ${MPI_LIBRARIES}) ++ set(PARMETIS_INCLUDE_DIRS ${PARMETIS_INCLUDE_DIR} ${METIS_INCLUDE_DIRS} ${MPI_INCLUDE_PATH}) ++ ++ if(NOT TARGET parmetis::parmetis) ++ add_library(parmetis::parmetis UNKNOWN IMPORTED) ++ set_target_properties(parmetis::parmetis PROPERTIES ++ IMPORTED_LOCATION "${PARMETIS_LIBRARY}" ++ INTERFACE_INCLUDE_DIRECTORIES "${PARMETIS_INCLUDE_DIR}" ++ INTERFACE_LINK_LIBRARIES "metis::metis;MPI::MPI_C" ++ ) ++ endif() ++endif() ++ ++# Mark advanced variables ++mark_as_advanced( ++ PARMETIS_INCLUDE_DIR ++ PARMETIS_LIBRARY ++) +diff --git a/cmake/FindSCALAPACK.cmake b/cmake/FindSCALAPACK.cmake +index d1880b7..d2ea331 100644 +--- a/cmake/FindSCALAPACK.cmake ++++ b/cmake/FindSCALAPACK.cmake +@@ -62,8 +62,6 @@ References + + include(CheckSourceCompiles) + +-set(SCALAPACK_LIBRARY) # avoids appending to prior FindScalapack +- + #===== functions + + function(scalapack_check) +@@ -74,11 +72,8 @@ find_package(Threads) + + set(CMAKE_REQUIRED_FLAGS) + set(CMAKE_REQUIRED_LINK_OPTIONS) +-set(CMAKE_REQUIRED_INCLUDES ${SCALAPACK_INCLUDE_DIR} ${LAPACK_INCLUDE_DIRS} ${MPI_Fortran_INCLUDE_DIRS}) +-set(CMAKE_REQUIRED_LIBRARIES ${SCALAPACK_LIBRARY}) +-if(BLACS_LIBRARY) +- list(APPEND CMAKE_REQUIRED_LIBRARIES ${BLACS_LIBRARY}) +-endif() ++set(CMAKE_REQUIRED_INCLUDES ${SCALAPACK_INCLUDE_DIRS} ${LAPACK_INCLUDE_DIRS} ${MPI_Fortran_INCLUDE_DIRS}) ++set(CMAKE_REQUIRED_LIBRARIES ${SCALAPACK_LIBRARIES}) + list(APPEND CMAKE_REQUIRED_LIBRARIES ${LAPACK_LIBRARIES} ${MPI_Fortran_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT}) + + if(STATIC IN_LIST SCALAPACK_FIND_COMPONENTS AND +@@ -285,20 +280,13 @@ endif() + + if(SCALAPACK_FOUND) + # need if _FOUND guard as can't overwrite imported target even if bad +- set(SCALAPACK_LIBRARIES ${SCALAPACK_LIBRARY}) +- if(BLACS_LIBRARY) +- list(APPEND SCALAPACK_LIBRARIES ${BLACS_LIBRARY}) +- endif() +- +- set(SCALAPACK_INCLUDE_DIRS 
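The new FindPARMETIS.cmake above follows the usual find-module shape: optional pkg-config hints, hard requirements on MPI and METIS, header and library discovery, version parsing from parmetis.h, and an imported parmetis::parmetis target. A hypothetical consumer, relying only on the variables and target the module defines; the executable and source names are placeholders:

    # Illustrative use of the new module; assumes it is vendored under cmake/.
    list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake)
    find_package(PARMETIS REQUIRED)
    message(STATUS "ParMETIS ${PARMETIS_VERSION}: ${PARMETIS_LIBRARIES}")
    add_executable(orderer main.c)
    target_link_libraries(orderer PRIVATE parmetis::parmetis)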
${SCALAPACK_INCLUDE_DIR}) +- + message(VERBOSE "Scalapack libraries: ${SCALAPACK_LIBRARIES} + Scalapack include directories: ${SCALAPACK_INCLUDE_DIRS}") + + if(NOT TARGET SCALAPACK::SCALAPACK) + add_library(SCALAPACK::SCALAPACK INTERFACE IMPORTED) + set_property(TARGET SCALAPACK::SCALAPACK PROPERTY INTERFACE_LINK_LIBRARIES "${SCALAPACK_LIBRARIES}") +- set_property(TARGET SCALAPACK::SCALAPACK PROPERTY INTERFACE_INCLUDE_DIRECTORIES "${SCALAPACK_INCLUDE_DIR}") ++ set_property(TARGET SCALAPACK::SCALAPACK PROPERTY INTERFACE_INCLUDE_DIRECTORIES "${SCALAPACK_INCLUDE_DIRS}") + + # For MKL, we don't use FindLapack, so define LAPACK::LAPACK as alias + if(MKL_FOUND AND NOT TARGET LAPACK::LAPACK) +@@ -306,5 +294,3 @@ Scalapack include directories: ${SCALAPACK_INCLUDE_DIRS}") + endif() + endif() + endif() +- +-mark_as_advanced(SCALAPACK_LIBRARY SCALAPACK_INCLUDE_DIR) +diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake +index 3f0997e..6776359 100644 +--- a/cmake/lapack.cmake ++++ b/cmake/lapack.cmake +@@ -1,11 +1,2 @@ + # Handle options for finding LAPACK +- +-if(NOT DEFINED LAPACK_VENDOR AND DEFINED ENV{MKLROOT} AND IS_DIRECTORY "$ENV{MKLROOT}") +- set(LAPACK_VENDOR MKL) +-endif() +- +-if(find_static) +- list(APPEND LAPACK_VENDOR STATIC) +-endif() +- +-find_package(LAPACK REQUIRED COMPONENTS ${LAPACK_VENDOR}) ++find_package(LAPACK REQUIRED) +diff --git a/cmake/scalapack.cmake b/cmake/scalapack.cmake +index 8727508..3762473 100644 +--- a/cmake/scalapack.cmake ++++ b/cmake/scalapack.cmake +@@ -1,103 +1 @@ +-include(ExternalProject) +-include(GNUInstallDirs) +- +-if(find AND NOT TARGET SCALAPACK::SCALAPACK) +- +-# Make SCALAPACK_VENDOR match LAPACK_VENDOR +- +-if(NOT DEFINED SCALAPACK_VENDOR) +- if(LAPACK_VENDOR MATCHES "^MKL") +- set(SCALAPACK_VENDOR MKL) +- elseif(NOT DEFINED LAPACK_VENDOR AND +- (DEFINED ENV{MKLROOT} AND IS_DIRECTORY "$ENV{MKLROOT}")) +- set(SCALAPACK_VENDOR MKL) +- set(LAPACK_VENDOR MKL) +- endif() +- +- if(LAPACK_VENDOR STREQUAL "AOCL") +- set(SCALAPACK_VENDOR AOCL) +- endif() +-endif() +- +-if(MKL IN_LIST SCALAPACK_VENDOR) +- if(MUMPS_openmp) +- list(APPEND SCALAPACK_VENDOR OpenMP) +- endif() +-endif() +- +-if(MKL IN_LIST SCALAPACK_VENDOR AND NOT MKL64 IN_LIST SCALAPACK_VENDOR) +- if(intsize64) +- list(APPEND SCALAPACK_VENDOR MKL64) +- endif() +-endif() +- +-if(find_static) +- list(APPEND SCALAPACK_VENDOR STATIC) +-endif() +- +-find_package(SCALAPACK COMPONENTS ${SCALAPACK_VENDOR}) +- +-endif() +- +-if(SCALAPACK_FOUND OR TARGET SCALAPACK::SCALAPACK) +- return() +-elseif(DEFINED SCALAPACK_VENDOR) +- message(FATAL_ERROR "Scalapack from ${SCALAPACK_VENDOR} not found.") +-endif() +- +-set(scalapack_cmake_args +--DBUILD_SINGLE:BOOL=${BUILD_SINGLE} +--DBUILD_DOUBLE:BOOL=${BUILD_DOUBLE} +--DBUILD_COMPLEX:BOOL=${BUILD_COMPLEX} +--DBUILD_COMPLEX16:BOOL=${BUILD_COMPLEX16} +--DBUILD_SHARED_LIBS:BOOL=${BUILD_SHARED_LIBS} +--DCMAKE_INSTALL_PREFIX:PATH=${CMAKE_INSTALL_PREFIX} +--DCMAKE_C_COMPILER:PATH=${CMAKE_C_COMPILER} +--DCMAKE_Fortran_COMPILER:PATH=${CMAKE_Fortran_COMPILER} +--DBUILD_TESTING:BOOL=off +--DCMAKE_BUILD_TYPE:STRING=Release +-) +- +-file(READ ${CMAKE_CURRENT_LIST_DIR}/libraries.json json) +- +-string(JSON scalapack_url GET ${json} scalapack git) +-string(JSON scalapack_tag GET ${json} scalapack tag) +- +-set(SCALAPACK_INCLUDE_DIRS ${CMAKE_INSTALL_FULL_INCLUDEDIR}) +-file(MAKE_DIRECTORY ${SCALAPACK_INCLUDE_DIRS}) +-if(NOT IS_DIRECTORY ${SCALAPACK_INCLUDE_DIRS}) +- message(FATAL_ERROR "Could not create directory: ${SCALAPACK_INCLUDE_DIRS}") +-endif() +- +-if(BUILD_SHARED_LIBS) +- 
set(SCALAPACK_LIBRARIES ${CMAKE_INSTALL_FULL_LIBDIR}/${CMAKE_SHARED_LIBRARY_PREFIX}scalapack${CMAKE_SHARED_LIBRARY_SUFFIX} +- ${CMAKE_INSTALL_FULL_LIBDIR}/${CMAKE_SHARED_LIBRARY_PREFIX}blacs${CMAKE_SHARED_LIBRARY_SUFFIX} +- ) +-else() +- set(SCALAPACK_LIBRARIES ${CMAKE_INSTALL_FULL_LIBDIR}/${CMAKE_STATIC_LIBRARY_PREFIX}scalapack${CMAKE_STATIC_LIBRARY_SUFFIX} +- ${CMAKE_INSTALL_FULL_LIBDIR}/${CMAKE_STATIC_LIBRARY_PREFIX}blacs${CMAKE_STATIC_LIBRARY_SUFFIX} +- ) +-endif() +- +-ExternalProject_Add(scalapack +-GIT_REPOSITORY ${scalapack_url} +-GIT_TAG ${scalapack_tag} +-GIT_SHALLOW true +-CMAKE_ARGS ${scalapack_cmake_args} +-TEST_COMMAND "" +-BUILD_BYPRODUCTS ${SCALAPACK_LIBRARIES} +-CONFIGURE_HANDLED_BY_BUILD true +-USES_TERMINAL_DOWNLOAD true +-USES_TERMINAL_UPDATE true +-USES_TERMINAL_PATCH true +-USES_TERMINAL_CONFIGURE true +-USES_TERMINAL_BUILD true +-USES_TERMINAL_INSTALL true +-USES_TERMINAL_TEST true +-) +- +-add_library(SCALAPACK::SCALAPACK INTERFACE IMPORTED GLOBAL) +-target_include_directories(SCALAPACK::SCALAPACK INTERFACE ${SCALAPACK_INCLUDE_DIRS}) +-target_link_libraries(SCALAPACK::SCALAPACK INTERFACE ${SCALAPACK_LIBRARIES}) +- +-add_dependencies(SCALAPACK::SCALAPACK scalapack) ++find_package(SCALAPACK) +\ No newline at end of file diff --git a/extern/patch/petsc/patch_build.diff b/extern/patch/petsc/patch_build.diff index bbd7f7d128..24c16794d9 100644 --- a/extern/patch/petsc/patch_build.diff +++ b/extern/patch/petsc/patch_build.diff @@ -1,49 +1,49 @@ -diff --git a/config/BuildSystem/config/packages/openmp.py b/config/BuildSystem/config/packages/openmp.py -index 38e9f1c1695..616228ac5e0 100644 ---- a/config/BuildSystem/config/packages/openmp.py -+++ b/config/BuildSystem/config/packages/openmp.py -@@ -28,6 +28,7 @@ class Configure(config.package.Package): - "-mp", # Portland Group - "-Qopenmp", # Intel windows - "-openmp", # Intel -+ "-qopenmp", # Intel - "-xopenmp", # Sun - "+Oopenmp", # HP - "/openmp" # Microsoft Visual Studio -diff --git a/config/BuildSystem/config/setCompilers.py b/config/BuildSystem/config/setCompilers.py -index 5824b52ed23..44b4cb5abbf 100644 ---- a/config/BuildSystem/config/setCompilers.py -+++ b/config/BuildSystem/config/setCompilers.py -@@ -1950,7 +1950,8 @@ class Configure(config.base.Configure): - 'invalid option','invalid suboption','bad ',' option','petsc error', - 'unbekannte option','linker input file unused because linking not done', - 'warning: // comments are not allowed in this language', -- 'no se reconoce la opci','non reconnue','warning: unsupported linker arg:','ignoring unknown option') -+ 'no se reconoce la opci','non reconnue','warning: unsupported linker arg:','ignoring unknown option', -+ 'warning: Use of \'-qopenmp\' recommended over \'-fopenmp\'') - outlo = output.lower() - return any(sub.lower() in outlo for sub in substrings) - -diff --git a/src/sys/objects/version.c b/src/sys/objects/version.c -index 176da9ba1ad..9a28b765748 100644 ---- a/src/sys/objects/version.c -+++ b/src/sys/objects/version.c -@@ -58,13 +58,12 @@ PetscErrorCode PetscGetVersionNumber(PetscInt *major, PetscInt *minor, PetscInt - if (release) *release = PETSC_VERSION_RELEASE; - return PETSC_SUCCESS; - } --#if defined(PETSC_HAVE_MKL_SET_NUM_THREADS) -+#if defined(PETSC_HAVE_BLI_THREAD_SET_NUM_THREADS) -+EXTERN_C_BEGIN -+void bli_thread_set_num_threads(int); -+EXTERN_C_END -+#elif defined(PETSC_HAVE_MKL_SET_NUM_THREADS) - #include --#elif defined(PETSC_HAVE_BLI_THREAD_SET_NUM_THREADS) -- #pragma GCC diagnostic push -- #pragma GCC diagnostic ignored 
"-Wunused-function" -- #include -- #pragma GCC diagnostic pop - #elif defined(PETSC_HAVE_OPENBLAS_SET_NUM_THREADS) - EXTERN_C_BEGIN - void openblas_set_num_threads(int); +diff --git a/config/BuildSystem/config/packages/openmp.py b/config/BuildSystem/config/packages/openmp.py +index 38e9f1c1695..616228ac5e0 100644 +--- a/config/BuildSystem/config/packages/openmp.py ++++ b/config/BuildSystem/config/packages/openmp.py +@@ -28,6 +28,7 @@ class Configure(config.package.Package): + "-mp", # Portland Group + "-Qopenmp", # Intel windows + "-openmp", # Intel ++ "-qopenmp", # Intel + "-xopenmp", # Sun + "+Oopenmp", # HP + "/openmp" # Microsoft Visual Studio +diff --git a/config/BuildSystem/config/setCompilers.py b/config/BuildSystem/config/setCompilers.py +index 5824b52ed23..44b4cb5abbf 100644 +--- a/config/BuildSystem/config/setCompilers.py ++++ b/config/BuildSystem/config/setCompilers.py +@@ -1950,7 +1950,8 @@ class Configure(config.base.Configure): + 'invalid option','invalid suboption','bad ',' option','petsc error', + 'unbekannte option','linker input file unused because linking not done', + 'warning: // comments are not allowed in this language', +- 'no se reconoce la opci','non reconnue','warning: unsupported linker arg:','ignoring unknown option') ++ 'no se reconoce la opci','non reconnue','warning: unsupported linker arg:','ignoring unknown option', ++ 'warning: Use of \'-qopenmp\' recommended over \'-fopenmp\'') + outlo = output.lower() + return any(sub.lower() in outlo for sub in substrings) + +diff --git a/src/sys/objects/version.c b/src/sys/objects/version.c +index 176da9ba1ad..9a28b765748 100644 +--- a/src/sys/objects/version.c ++++ b/src/sys/objects/version.c +@@ -58,13 +58,12 @@ PetscErrorCode PetscGetVersionNumber(PetscInt *major, PetscInt *minor, PetscInt + if (release) *release = PETSC_VERSION_RELEASE; + return PETSC_SUCCESS; + } +-#if defined(PETSC_HAVE_MKL_SET_NUM_THREADS) ++#if defined(PETSC_HAVE_BLI_THREAD_SET_NUM_THREADS) ++EXTERN_C_BEGIN ++void bli_thread_set_num_threads(int); ++EXTERN_C_END ++#elif defined(PETSC_HAVE_MKL_SET_NUM_THREADS) + #include +-#elif defined(PETSC_HAVE_BLI_THREAD_SET_NUM_THREADS) +- #pragma GCC diagnostic push +- #pragma GCC diagnostic ignored "-Wunused-function" +- #include +- #pragma GCC diagnostic pop + #elif defined(PETSC_HAVE_OPENBLAS_SET_NUM_THREADS) + EXTERN_C_BEGIN + void openblas_set_num_threads(int); diff --git a/extern/patch/scalapack/patch_build.diff b/extern/patch/scalapack/patch_build.diff index caa6600f91..51440e1397 100644 --- a/extern/patch/scalapack/patch_build.diff +++ b/extern/patch/scalapack/patch_build.diff @@ -1,42 +1,42 @@ -diff --git a/CMakeLists.txt b/CMakeLists.txt -index 78f4560..356a5ac 100644 ---- a/CMakeLists.txt -+++ b/CMakeLists.txt -@@ -68,11 +68,8 @@ endif() - - install(TARGETS scalapack EXPORT ${PROJECT_NAME}-targets) - --install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/FindLAPACK.cmake DESTINATION cmake) -- - include(cmake/install.cmake) - -- - include(FeatureSummary) - - add_feature_info(real32 ${BUILD_SINGLE} "Build with single precision") -diff --git a/cmake/config.cmake.in b/cmake/config.cmake.in -index 694fc2d..98ea544 100644 ---- a/cmake/config.cmake.in -+++ b/cmake/config.cmake.in -@@ -4,7 +4,7 @@ include(CMakeFindDependencyMacro) - - find_dependency(MPI COMPONENTS C Fortran) - --list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}) -+# list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}) - - find_dependency(LAPACK) - -diff --git a/options.cmake b/options.cmake -index b987651..4593deb 100644 ---- 
a/options.cmake -+++ b/options.cmake -@@ -28,7 +28,7 @@ option(find "find LAPACK" on) - set(FETCHCONTENT_UPDATES_DISCONNECTED true) - set_property(DIRECTORY PROPERTY EP_UPDATE_DISCONNECTED true) - --list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake) -+# list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake) - - # Necessary for shared library with Visual Studio / Windows oneAPI - set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS true) +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 78f4560..356a5ac 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -68,11 +68,8 @@ endif() + + install(TARGETS scalapack EXPORT ${PROJECT_NAME}-targets) + +-install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/FindLAPACK.cmake DESTINATION cmake) +- + include(cmake/install.cmake) + +- + include(FeatureSummary) + + add_feature_info(real32 ${BUILD_SINGLE} "Build with single precision") +diff --git a/cmake/config.cmake.in b/cmake/config.cmake.in +index 694fc2d..98ea544 100644 +--- a/cmake/config.cmake.in ++++ b/cmake/config.cmake.in +@@ -4,7 +4,7 @@ include(CMakeFindDependencyMacro) + + find_dependency(MPI COMPONENTS C Fortran) + +-list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}) ++# list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}) + + find_dependency(LAPACK) + +diff --git a/options.cmake b/options.cmake +index b987651..4593deb 100644 +--- a/options.cmake ++++ b/options.cmake +@@ -28,7 +28,7 @@ option(find "find LAPACK" on) + set(FETCHCONTENT_UPDATES_DISCONNECTED true) + set_property(DIRECTORY PROPERTY EP_UPDATE_DISCONNECTED true) + +-list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake) ++# list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake) + + # Necessary for shared library with Visual Studio / Windows oneAPI + set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS true) diff --git a/extern/patch/scalapack/patch_version.diff b/extern/patch/scalapack/patch_version.diff index 7e1288d860..dd3754670c 100644 --- a/extern/patch/scalapack/patch_version.diff +++ b/extern/patch/scalapack/patch_version.diff @@ -1,16 +1,16 @@ -diff --git a/cmake/libraries.json b/cmake/libraries.json -index 49d2a65..14e09cf 100644 ---- a/cmake/libraries.json -+++ b/cmake/libraries.json -@@ -1,10 +1,6 @@ - { -- "lapack": { -- "git": "https://github.com/Reference-LAPACK/lapack.git", -- "tag": "v3.11.0" -- }, - "scalapack_src": { -- "tag": "v2.2.1", -+ "tag": "2072b8602f0a5a84d77a712121f7715c58a2e80d", - "url": "https://github.com/Reference-ScaLAPACK/scalapack.git" - } - } +diff --git a/cmake/libraries.json b/cmake/libraries.json +index 49d2a65..14e09cf 100644 +--- a/cmake/libraries.json ++++ b/cmake/libraries.json +@@ -1,10 +1,6 @@ + { +- "lapack": { +- "git": "https://github.com/Reference-LAPACK/lapack.git", +- "tag": "v3.11.0" +- }, + "scalapack_src": { +- "tag": "v2.2.1", ++ "tag": "2072b8602f0a5a84d77a712121f7715c58a2e80d", + "url": "https://github.com/Reference-ScaLAPACK/scalapack.git" + } + } diff --git a/extern/patch/superlu_dist/patch_metis.diff b/extern/patch/superlu_dist/patch_metis.diff index bd9c723197..1016e81331 100644 --- a/extern/patch/superlu_dist/patch_metis.diff +++ b/extern/patch/superlu_dist/patch_metis.diff @@ -1,387 +1,387 @@ -diff --git a/SRC/get_perm_c.c b/SRC/get_perm_c.c -index 2f098b50..c945c152 100644 ---- a/SRC/get_perm_c.c -+++ b/SRC/get_perm_c.c -@@ -35,75 +35,39 @@ get_metis_dist( - int_t bnz, /* number of nonzeros in matrix A. */ - int_t *b_colptr, /* column pointer of size n+1 for matrix B. */ - int_t *b_rowind, /* row indices of size bnz for matrix B. 
*/ -- int_t *perm_c /* out - the column permutation vector. */ -+ int_t *perm_c, /* out - the column permutation vector. */ -+ MPI_Comm comm /* MPI communicator to broadcast the permutation. */ - ) - { - #ifdef HAVE_PARMETIS -- /*#define METISOPTIONS 8*/ --#define METISOPTIONS 40 -- int_t metis_options[METISOPTIONS]; -- int_t i, nm, numflag = 0; /* C-Style ordering */ -- int_t *perm, *iperm; -- int_t *b_colptr_int, *b_rowind_int; -- -- extern int METIS_NodeND(int_t*, int_t*, int_t*, int_t*, int_t*, -- int_t*, int_t*); -- -- metis_options[0] = 0; /* Use Defaults for now */ -- -- perm = (int_t*) SUPERLU_MALLOC(2*n * sizeof(int_t)); -- if (!perm) ABORT("SUPERLU_MALLOC fails for perm."); -- iperm = perm + n; -- nm = n; -- --#if 0 --#if defined(_LONGINT) -- /* Metis can only take 32-bit integers */ -- -- if ( !(b_colptr_int = (int*) SUPERLU_MALLOC((n+1) * sizeof(int))) ) -- ABORT("SUPERLU_MALLOC fails for b_colptr_int."); -- for (i = 0; i < n+1; ++i) b_colptr_int[i] = b_colptr[i]; -- SUPERLU_FREE(b_colptr); -- -- if ( !(b_rowind_int = (int*) SUPERLU_MALLOC(bnz * sizeof(int))) ) -- ABORT("SUPERLU_MALLOC fails for b_rowind_int."); -- -- for (i = 0; i < bnz; ++i) b_rowind_int[i] = b_rowind[i]; -- SUPERLU_FREE(b_rowind); --#else -- b_colptr_int = b_colptr; -- b_rowind_int = b_rowind; --#endif --#endif -+ int iam; -+ MPI_Comm_rank( comm, &iam ); -+ if ( !iam ) { -+ int_t i, nm; -+ int_t *perm, *iperm; - -- /* Call metis */ --#undef USEEND --#ifdef USEEND -- METIS_EdgeND(&nm, b_colptr_int, b_rowind_int, &numflag, metis_options, -- perm, iperm); --#else -+ extern int METIS_NodeND(int_t*, int_t*, int_t*, int_t*, int_t*, -+ int_t*, int_t*); - -- /* Earlier version 3.x.x */ -- /* METIS_NodeND(&nm, b_colptr, b_rowind, &numflag, metis_options, -- perm, iperm);*/ -+ perm = (int_t*) SUPERLU_MALLOC(2*n * sizeof(int_t)); -+ if (!perm) ABORT("SUPERLU_MALLOC fails for perm."); -+ iperm = perm + n; -+ nm = n; - -- /* Latest version 4.x.x */ -- METIS_NodeND(&nm, b_colptr, b_rowind, NULL, NULL, perm, iperm); -+ /* Call metis */ -+ METIS_NodeND(&nm, b_colptr, b_rowind, NULL, NULL, perm, iperm); - -- /*check_perm_dist("metis perm", n, perm);*/ --#endif -+ /* Copy the permutation vector into SuperLU data structure. */ -+ for (i = 0; i < n; ++i) perm_c[i] = iperm[i]; - -- /* Copy the permutation vector into SuperLU data structure. */ -- for (i = 0; i < n; ++i) perm_c[i] = iperm[i]; -+ SUPERLU_FREE(perm); -+ } -+ MPI_Bcast( perm_c, n, mpi_int_t, 0, comm); - --#if 0 -- SUPERLU_FREE(b_colptr_int); -- SUPERLU_FREE(b_rowind_int); --#else - SUPERLU_FREE(b_colptr); - SUPERLU_FREE(b_rowind); --#endif -- SUPERLU_FREE(perm); -+#else -+ for (int i = 0; i < n; ++i) perm_c[i] = i; - #endif /* HAVE_PARMETIS */ - } - -@@ -114,31 +78,40 @@ get_colamd_dist( - const int nnz,/* number of nonzeros in matrix A. */ - int_t *colptr, /* column pointer of size n+1 for matrix A. */ - int_t *rowind, /* row indices of size nz for matrix A. */ -- int_t *perm_c /* out - the column permutation vector. */ -+ int_t *perm_c, /* out - the column permutation vector. */ -+ MPI_Comm comm /* MPI communicator to broadcast the permutation. 
*/ - ) - { - #ifdef HAVE_COLAMD -- int Alen, *A, i, info, *p; -- double knobs[COLAMD_KNOBS]; -- int stats[COLAMD_STATS]; -- -- Alen = colamd_recommended(nnz, m, n); -- -- colamd_set_defaults(knobs); -- -- if (!(A = (int *) SUPERLU_MALLOC(Alen * sizeof(int))) ) -- ABORT("Malloc fails for A[]"); -- if (!(p = (int *) SUPERLU_MALLOC((n+1) * sizeof(int))) ) -- ABORT("Malloc fails for p[]"); -- for (i = 0; i <= n; ++i) p[i] = colptr[i]; -- for (i = 0; i < nnz; ++i) A[i] = rowind[i]; -- info = colamd(m, n, Alen, A, p, knobs, stats); -- if ( info == FALSE ) ABORT("COLAMD failed"); -- -- for (i = 0; i < n; ++i) perm_c[p[i]] = i; -+ int iam; -+ MPI_Comm_rank( comm, &iam ); -+ if ( !iam ) { -+ int Alen, *A, i, info, *p; -+ double knobs[COLAMD_KNOBS]; -+ int stats[COLAMD_STATS]; -+ -+ Alen = colamd_recommended(nnz, m, n); -+ -+ colamd_set_defaults(knobs); -+ -+ if (!(A = (int *) SUPERLU_MALLOC(Alen * sizeof(int))) ) -+ ABORT("Malloc fails for A[]"); -+ if (!(p = (int *) SUPERLU_MALLOC((n+1) * sizeof(int))) ) -+ ABORT("Malloc fails for p[]"); -+ for (i = 0; i <= n; ++i) p[i] = colptr[i]; -+ for (i = 0; i < nnz; ++i) A[i] = rowind[i]; -+ info = colamd(m, n, Alen, A, p, knobs, stats); -+ if ( info == FALSE ) ABORT("COLAMD failed"); -+ -+ for (i = 0; i < n; ++i) perm_c[p[i]] = i; -+ -+ SUPERLU_FREE(A); -+ SUPERLU_FREE(p); -+ } -+ MPI_Bcast( perm_c, n, mpi_int_t, 0, comm); - -- SUPERLU_FREE(A); -- SUPERLU_FREE(p); -+ SUPERLU_FREE(colptr); -+ SUPERLU_FREE(rowind); - #else - for (int i = 0; i < n; ++i) perm_c[i] = i; - #endif // HAVE_COLAMD -@@ -466,7 +439,13 @@ at_plus_a_dist( - * - */ - void --get_perm_c_dist(int_t pnum, int_t ispec, SuperMatrix *A, int_t *perm_c) -+get_perm_c_dist( -+ int_t pnum, -+ int_t ispec, -+ SuperMatrix *A, -+ int_t *perm_c, -+ MPI_Comm comm -+ ) - - { - NCformat *Astore = A->Store; -@@ -516,7 +495,7 @@ get_perm_c_dist(int_t pnum, int_t ispec, SuperMatrix *A, int_t *perm_c) - - case (COLAMD): /* Approximate minimum degree column ordering. */ - get_colamd_dist(m, n, Astore->nnz, Astore->colptr, Astore->rowind, -- perm_c); -+ perm_c, comm); - #if ( PRNTlevel>=1 ) - printf(".. 
Use approximate minimum degree column ordering.\n"); - #endif -@@ -528,7 +507,7 @@ get_perm_c_dist(int_t pnum, int_t ispec, SuperMatrix *A, int_t *perm_c) - &bnz, &b_colptr, &b_rowind); - - if ( bnz ) { /* non-empty adjacency structure */ -- get_metis_dist(n, bnz, b_colptr, b_rowind, perm_c); -+ get_metis_dist(n, bnz, b_colptr, b_rowind, perm_c, comm); - } else { /* e.g., diagonal matrix */ - for (i = 0; i < n; ++i) perm_c[i] = i; - SUPERLU_FREE(b_colptr); -diff --git a/SRC/pdgssvx.c b/SRC/pdgssvx.c -index c0bd343d..1abb5e8b 100644 ---- a/SRC/pdgssvx.c -+++ b/SRC/pdgssvx.c -@@ -1021,7 +1021,7 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A, - return; - } - } else { -- get_perm_c_dist(iam, permc_spec, &GA, perm_c); -+ get_perm_c_dist(iam, permc_spec, &GA, perm_c, grid->comm); - } - } - -diff --git a/SRC/pdgssvx3d.c b/SRC/pdgssvx3d.c -index 75da7dca..2fbdf665 100644 ---- a/SRC/pdgssvx3d.c -+++ b/SRC/pdgssvx3d.c -@@ -1073,7 +1073,7 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, - if (flinfo > 0) - ABORT ("ERROR in get perm_c parmetis."); - } else { -- get_perm_c_dist (iam, permc_spec, &GA, perm_c); -+ get_perm_c_dist (iam, permc_spec, &GA, perm_c, grid->comm); - } - } - -diff --git a/SRC/pdgssvx_ABglobal.c b/SRC/pdgssvx_ABglobal.c -index 24f20776..d0c0d996 100644 ---- a/SRC/pdgssvx_ABglobal.c -+++ b/SRC/pdgssvx_ABglobal.c -@@ -855,7 +855,7 @@ pdgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A, - permc_spec = options->ColPerm; - if ( permc_spec != MY_PERMC && Fact == DOFACT ) - /* Use an ordering provided by SuperLU */ -- get_perm_c_dist(iam, permc_spec, A, perm_c); -+ get_perm_c_dist(iam, permc_spec, A, perm_c, grid->comm); - - /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' - (a.k.a. column etree), depending on the choice of ColPerm. -diff --git a/SRC/psgssvx.c b/SRC/psgssvx.c -index 620c3584..c3c18d3f 100644 ---- a/SRC/psgssvx.c -+++ b/SRC/psgssvx.c -@@ -1021,7 +1021,7 @@ psgssvx(superlu_dist_options_t *options, SuperMatrix *A, - return; - } - } else { -- get_perm_c_dist(iam, permc_spec, &GA, perm_c); -+ get_perm_c_dist(iam, permc_spec, &GA, perm_c, grid->comm); - } - } - -diff --git a/SRC/psgssvx3d.c b/SRC/psgssvx3d.c -index 8ac7e954..d2f7bf0e 100644 ---- a/SRC/psgssvx3d.c -+++ b/SRC/psgssvx3d.c -@@ -1068,7 +1068,7 @@ psgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, - if (flinfo > 0) - ABORT ("ERROR in get perm_c parmetis."); - } else { -- get_perm_c_dist (iam, permc_spec, &GA, perm_c); -+ get_perm_c_dist (iam, permc_spec, &GA, perm_c, grid->comm); - } - } - -diff --git a/SRC/psgssvx_ABglobal.c b/SRC/psgssvx_ABglobal.c -index 8c83409c..1ea7a78c 100644 ---- a/SRC/psgssvx_ABglobal.c -+++ b/SRC/psgssvx_ABglobal.c -@@ -855,7 +855,7 @@ psgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A, - permc_spec = options->ColPerm; - if ( permc_spec != MY_PERMC && Fact == DOFACT ) - /* Use an ordering provided by SuperLU */ -- get_perm_c_dist(iam, permc_spec, A, perm_c); -+ get_perm_c_dist(iam, permc_spec, A, perm_c, grid->comm); - - /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' - (a.k.a. column etree), depending on the choice of ColPerm. 
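The SuperLU_DIST hunks above all make the same change: the column ordering (METIS or COLAMD) is computed on rank 0 only and then broadcast over the grid communicator, instead of being recomputed redundantly on every rank. A minimal, self-contained C sketch of that pattern, not taken from SuperLU_DIST (it uses plain int/MPI_INT where the patch uses int_t/mpi_int_t):

    #include <mpi.h>
    #include <stdio.h>

    /* Compute the ordering on rank 0 only, then broadcast it, mirroring the
       pattern the patch introduces in get_metis_dist()/get_colamd_dist(). */
    static void compute_and_share_perm(int n, int *perm_c, MPI_Comm comm)
    {
      int rank;
      MPI_Comm_rank(comm, &rank);
      if (rank == 0) {
        for (int i = 0; i < n; ++i)  /* stand-in for the METIS/COLAMD ordering */
          perm_c[i] = i;
      }
      /* Every rank receives the same column permutation. */
      MPI_Bcast(perm_c, n, MPI_INT, 0, comm);
    }

    int main(int argc, char **argv)
    {
      MPI_Init(&argc, &argv);
      int perm[4];
      compute_and_share_perm(4, perm, MPI_COMM_WORLD);
      printf("perm[0] = %d\n", perm[0]);
      MPI_Finalize();
      return 0;
    }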
-diff --git a/SRC/psgssvx_d2.c b/SRC/psgssvx_d2.c -index 94d04e4c..a839da0f 100644 ---- a/SRC/psgssvx_d2.c -+++ b/SRC/psgssvx_d2.c -@@ -1075,7 +1075,7 @@ psgssvx_d2(superlu_dist_options_t *options, SuperMatrix *A, - return; - } - } else { -- get_perm_c_dist(iam, permc_spec, &GA, perm_c); -+ get_perm_c_dist(iam, permc_spec, &GA, perm_c, grid->comm); - } - } - -diff --git a/SRC/pzgssvx.c b/SRC/pzgssvx.c -index 390c9709..5906a927 100644 ---- a/SRC/pzgssvx.c -+++ b/SRC/pzgssvx.c -@@ -1022,7 +1022,7 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A, - return; - } - } else { -- get_perm_c_dist(iam, permc_spec, &GA, perm_c); -+ get_perm_c_dist(iam, permc_spec, &GA, perm_c, grid->comm); - } - } - -diff --git a/SRC/pzgssvx3d.c b/SRC/pzgssvx3d.c -index 0f8c5aa7..8b13bcf2 100644 ---- a/SRC/pzgssvx3d.c -+++ b/SRC/pzgssvx3d.c -@@ -1069,7 +1069,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, - if (flinfo > 0) - ABORT ("ERROR in get perm_c parmetis."); - } else { -- get_perm_c_dist (iam, permc_spec, &GA, perm_c); -+ get_perm_c_dist (iam, permc_spec, &GA, perm_c, grid->comm); - } - } - -diff --git a/SRC/pzgssvx_ABglobal.c b/SRC/pzgssvx_ABglobal.c -index 644d35eb..3786e13d 100644 ---- a/SRC/pzgssvx_ABglobal.c -+++ b/SRC/pzgssvx_ABglobal.c -@@ -854,7 +854,7 @@ pzgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A, - permc_spec = options->ColPerm; - if ( permc_spec != MY_PERMC && Fact == DOFACT ) - /* Use an ordering provided by SuperLU */ -- get_perm_c_dist(iam, permc_spec, A, perm_c); -+ get_perm_c_dist(iam, permc_spec, A, perm_c, grid->comm); - - /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' - (a.k.a. column etree), depending on the choice of ColPerm. -diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h -index 27db0107..5e504fb5 100644 ---- a/SRC/superlu_defs.h -+++ b/SRC/superlu_defs.h -@@ -1064,10 +1064,9 @@ extern "C" { - extern void superlu_gridinit(MPI_Comm, int, int, gridinfo_t *); - extern void superlu_gridmap(MPI_Comm, int, int, int [], int, gridinfo_t *); - extern void superlu_gridexit(gridinfo_t *); --extern void superlu_gridinit3d(MPI_Comm Bcomm, int nprow, int npcol, int npdep, -- gridinfo3d_t *grid) ; -+extern void superlu_gridinit3d(MPI_Comm, int, int, int, gridinfo3d_t *) ; - extern void superlu_gridmap3d(MPI_Comm, int, int, int, int [], gridinfo3d_t *); --extern void superlu_gridexit3d(gridinfo3d_t *grid); -+extern void superlu_gridexit3d(gridinfo3d_t *); - - extern void set_default_options_dist(superlu_dist_options_t *); - extern void print_options_dist(superlu_dist_options_t *); -@@ -1082,17 +1081,17 @@ extern void sp_colorder (superlu_dist_options_t*, SuperMatrix*, int_t*, int_t* - SuperMatrix*); - extern int sp_symetree_dist(int_t *, int_t *, int_t *, int_t, int_t *); - extern int sp_coletree_dist (int_t *, int_t *, int_t *, int_t, int_t, int_t *); --extern void get_perm_c_dist(int_t, int_t, SuperMatrix *, int_t *); -+extern void get_perm_c_dist(int_t, int_t, SuperMatrix *, int_t *, MPI_Comm); - extern void at_plus_a_dist(const int_t, const int_t, int_t *, int_t *, - int_t *, int_t **, int_t **); --extern int genmmd_dist_(int_t *, int_t *, int_t *a, -+extern int genmmd_dist_(int_t *, int_t *, int_t *, - int_t *, int_t *, int_t *, int_t *, - int_t *, int_t *, int_t *, int_t *, int_t *); - extern void bcast_tree(void *, int, MPI_Datatype, int, int, - gridinfo_t *, int, int *); - extern int_t symbfact(superlu_dist_options_t *, int, SuperMatrix *, int_t *, - int_t *, Glu_persist_t *, Glu_freeable_t *); --extern int_t 
symbfact_SubInit(superlu_dist_options_t *options, -+extern int_t symbfact_SubInit(superlu_dist_options_t *, - fact_t, void *, int_t, int_t, int_t, int_t, - Glu_persist_t *, Glu_freeable_t *); - extern int_t symbfact_SubXpand(int_t, int_t, int_t, MemType, int_t *, -@@ -1190,10 +1189,10 @@ extern int_t get_num_gpu_streams (void); - extern int getnGPUStreams(void); - extern int get_mpi_process_per_gpu (void); - /*to print out various statistics from GPU activities*/ --extern void printGPUStats(int nsupers, SuperLUStat_t *stat, gridinfo3d_t*); -+extern void printGPUStats(int, SuperLUStat_t *, gridinfo3d_t *); - #endif - --extern double estimate_cpu_time(int m, int n , int k); -+extern double estimate_cpu_time(int, int, int k); - - extern int get_thread_per_process(void); - extern int_t get_max_buffer_size (void); -@@ -1208,7 +1207,7 @@ extern int get_acc_offload(void); - extern void print_panel_seg_dist(int_t, int_t, int_t, int_t, int_t *, int_t *); - extern void check_repfnz_dist(int_t, int_t, int_t, int_t *); - extern int_t CheckZeroDiagonal(int_t, int_t *, int_t *, int_t *); --extern int check_perm_dist(char *what, int_t n, int_t *perm); -+extern int check_perm_dist(char *, int_t, int_t *); - extern void PrintDouble5(char *, int_t, double *); - extern void PrintInt10(char *, int_t, int_t *); - extern void PrintInt32(char *, int, int *); +diff --git a/SRC/get_perm_c.c b/SRC/get_perm_c.c +index 2f098b50..c945c152 100644 +--- a/SRC/get_perm_c.c ++++ b/SRC/get_perm_c.c +@@ -35,75 +35,39 @@ get_metis_dist( + int_t bnz, /* number of nonzeros in matrix A. */ + int_t *b_colptr, /* column pointer of size n+1 for matrix B. */ + int_t *b_rowind, /* row indices of size bnz for matrix B. */ +- int_t *perm_c /* out - the column permutation vector. */ ++ int_t *perm_c, /* out - the column permutation vector. */ ++ MPI_Comm comm /* MPI communicator to broadcast the permutation. 
*/ + ) + { + #ifdef HAVE_PARMETIS +- /*#define METISOPTIONS 8*/ +-#define METISOPTIONS 40 +- int_t metis_options[METISOPTIONS]; +- int_t i, nm, numflag = 0; /* C-Style ordering */ +- int_t *perm, *iperm; +- int_t *b_colptr_int, *b_rowind_int; +- +- extern int METIS_NodeND(int_t*, int_t*, int_t*, int_t*, int_t*, +- int_t*, int_t*); +- +- metis_options[0] = 0; /* Use Defaults for now */ +- +- perm = (int_t*) SUPERLU_MALLOC(2*n * sizeof(int_t)); +- if (!perm) ABORT("SUPERLU_MALLOC fails for perm."); +- iperm = perm + n; +- nm = n; +- +-#if 0 +-#if defined(_LONGINT) +- /* Metis can only take 32-bit integers */ +- +- if ( !(b_colptr_int = (int*) SUPERLU_MALLOC((n+1) * sizeof(int))) ) +- ABORT("SUPERLU_MALLOC fails for b_colptr_int."); +- for (i = 0; i < n+1; ++i) b_colptr_int[i] = b_colptr[i]; +- SUPERLU_FREE(b_colptr); +- +- if ( !(b_rowind_int = (int*) SUPERLU_MALLOC(bnz * sizeof(int))) ) +- ABORT("SUPERLU_MALLOC fails for b_rowind_int."); +- +- for (i = 0; i < bnz; ++i) b_rowind_int[i] = b_rowind[i]; +- SUPERLU_FREE(b_rowind); +-#else +- b_colptr_int = b_colptr; +- b_rowind_int = b_rowind; +-#endif +-#endif ++ int iam; ++ MPI_Comm_rank( comm, &iam ); ++ if ( !iam ) { ++ int_t i, nm; ++ int_t *perm, *iperm; + +- /* Call metis */ +-#undef USEEND +-#ifdef USEEND +- METIS_EdgeND(&nm, b_colptr_int, b_rowind_int, &numflag, metis_options, +- perm, iperm); +-#else ++ extern int METIS_NodeND(int_t*, int_t*, int_t*, int_t*, int_t*, ++ int_t*, int_t*); + +- /* Earlier version 3.x.x */ +- /* METIS_NodeND(&nm, b_colptr, b_rowind, &numflag, metis_options, +- perm, iperm);*/ ++ perm = (int_t*) SUPERLU_MALLOC(2*n * sizeof(int_t)); ++ if (!perm) ABORT("SUPERLU_MALLOC fails for perm."); ++ iperm = perm + n; ++ nm = n; + +- /* Latest version 4.x.x */ +- METIS_NodeND(&nm, b_colptr, b_rowind, NULL, NULL, perm, iperm); ++ /* Call metis */ ++ METIS_NodeND(&nm, b_colptr, b_rowind, NULL, NULL, perm, iperm); + +- /*check_perm_dist("metis perm", n, perm);*/ +-#endif ++ /* Copy the permutation vector into SuperLU data structure. */ ++ for (i = 0; i < n; ++i) perm_c[i] = iperm[i]; + +- /* Copy the permutation vector into SuperLU data structure. */ +- for (i = 0; i < n; ++i) perm_c[i] = iperm[i]; ++ SUPERLU_FREE(perm); ++ } ++ MPI_Bcast( perm_c, n, mpi_int_t, 0, comm); + +-#if 0 +- SUPERLU_FREE(b_colptr_int); +- SUPERLU_FREE(b_rowind_int); +-#else + SUPERLU_FREE(b_colptr); + SUPERLU_FREE(b_rowind); +-#endif +- SUPERLU_FREE(perm); ++#else ++ for (int i = 0; i < n; ++i) perm_c[i] = i; + #endif /* HAVE_PARMETIS */ + } + +@@ -114,31 +78,40 @@ get_colamd_dist( + const int nnz,/* number of nonzeros in matrix A. */ + int_t *colptr, /* column pointer of size n+1 for matrix A. */ + int_t *rowind, /* row indices of size nz for matrix A. */ +- int_t *perm_c /* out - the column permutation vector. */ ++ int_t *perm_c, /* out - the column permutation vector. */ ++ MPI_Comm comm /* MPI communicator to broadcast the permutation. 
*/ + ) + { + #ifdef HAVE_COLAMD +- int Alen, *A, i, info, *p; +- double knobs[COLAMD_KNOBS]; +- int stats[COLAMD_STATS]; +- +- Alen = colamd_recommended(nnz, m, n); +- +- colamd_set_defaults(knobs); +- +- if (!(A = (int *) SUPERLU_MALLOC(Alen * sizeof(int))) ) +- ABORT("Malloc fails for A[]"); +- if (!(p = (int *) SUPERLU_MALLOC((n+1) * sizeof(int))) ) +- ABORT("Malloc fails for p[]"); +- for (i = 0; i <= n; ++i) p[i] = colptr[i]; +- for (i = 0; i < nnz; ++i) A[i] = rowind[i]; +- info = colamd(m, n, Alen, A, p, knobs, stats); +- if ( info == FALSE ) ABORT("COLAMD failed"); +- +- for (i = 0; i < n; ++i) perm_c[p[i]] = i; ++ int iam; ++ MPI_Comm_rank( comm, &iam ); ++ if ( !iam ) { ++ int Alen, *A, i, info, *p; ++ double knobs[COLAMD_KNOBS]; ++ int stats[COLAMD_STATS]; ++ ++ Alen = colamd_recommended(nnz, m, n); ++ ++ colamd_set_defaults(knobs); ++ ++ if (!(A = (int *) SUPERLU_MALLOC(Alen * sizeof(int))) ) ++ ABORT("Malloc fails for A[]"); ++ if (!(p = (int *) SUPERLU_MALLOC((n+1) * sizeof(int))) ) ++ ABORT("Malloc fails for p[]"); ++ for (i = 0; i <= n; ++i) p[i] = colptr[i]; ++ for (i = 0; i < nnz; ++i) A[i] = rowind[i]; ++ info = colamd(m, n, Alen, A, p, knobs, stats); ++ if ( info == FALSE ) ABORT("COLAMD failed"); ++ ++ for (i = 0; i < n; ++i) perm_c[p[i]] = i; ++ ++ SUPERLU_FREE(A); ++ SUPERLU_FREE(p); ++ } ++ MPI_Bcast( perm_c, n, mpi_int_t, 0, comm); + +- SUPERLU_FREE(A); +- SUPERLU_FREE(p); ++ SUPERLU_FREE(colptr); ++ SUPERLU_FREE(rowind); + #else + for (int i = 0; i < n; ++i) perm_c[i] = i; + #endif // HAVE_COLAMD +@@ -466,7 +439,13 @@ at_plus_a_dist( + * + */ + void +-get_perm_c_dist(int_t pnum, int_t ispec, SuperMatrix *A, int_t *perm_c) ++get_perm_c_dist( ++ int_t pnum, ++ int_t ispec, ++ SuperMatrix *A, ++ int_t *perm_c, ++ MPI_Comm comm ++ ) + + { + NCformat *Astore = A->Store; +@@ -516,7 +495,7 @@ get_perm_c_dist(int_t pnum, int_t ispec, SuperMatrix *A, int_t *perm_c) + + case (COLAMD): /* Approximate minimum degree column ordering. */ + get_colamd_dist(m, n, Astore->nnz, Astore->colptr, Astore->rowind, +- perm_c); ++ perm_c, comm); + #if ( PRNTlevel>=1 ) + printf(".. 
Use approximate minimum degree column ordering.\n"); + #endif +@@ -528,7 +507,7 @@ get_perm_c_dist(int_t pnum, int_t ispec, SuperMatrix *A, int_t *perm_c) + &bnz, &b_colptr, &b_rowind); + + if ( bnz ) { /* non-empty adjacency structure */ +- get_metis_dist(n, bnz, b_colptr, b_rowind, perm_c); ++ get_metis_dist(n, bnz, b_colptr, b_rowind, perm_c, comm); + } else { /* e.g., diagonal matrix */ + for (i = 0; i < n; ++i) perm_c[i] = i; + SUPERLU_FREE(b_colptr); +diff --git a/SRC/pdgssvx.c b/SRC/pdgssvx.c +index c0bd343d..1abb5e8b 100644 +--- a/SRC/pdgssvx.c ++++ b/SRC/pdgssvx.c +@@ -1021,7 +1021,7 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A, + return; + } + } else { +- get_perm_c_dist(iam, permc_spec, &GA, perm_c); ++ get_perm_c_dist(iam, permc_spec, &GA, perm_c, grid->comm); + } + } + +diff --git a/SRC/pdgssvx3d.c b/SRC/pdgssvx3d.c +index 75da7dca..2fbdf665 100644 +--- a/SRC/pdgssvx3d.c ++++ b/SRC/pdgssvx3d.c +@@ -1073,7 +1073,7 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, + if (flinfo > 0) + ABORT ("ERROR in get perm_c parmetis."); + } else { +- get_perm_c_dist (iam, permc_spec, &GA, perm_c); ++ get_perm_c_dist (iam, permc_spec, &GA, perm_c, grid->comm); + } + } + +diff --git a/SRC/pdgssvx_ABglobal.c b/SRC/pdgssvx_ABglobal.c +index 24f20776..d0c0d996 100644 +--- a/SRC/pdgssvx_ABglobal.c ++++ b/SRC/pdgssvx_ABglobal.c +@@ -855,7 +855,7 @@ pdgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A, + permc_spec = options->ColPerm; + if ( permc_spec != MY_PERMC && Fact == DOFACT ) + /* Use an ordering provided by SuperLU */ +- get_perm_c_dist(iam, permc_spec, A, perm_c); ++ get_perm_c_dist(iam, permc_spec, A, perm_c, grid->comm); + + /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' + (a.k.a. column etree), depending on the choice of ColPerm. +diff --git a/SRC/psgssvx.c b/SRC/psgssvx.c +index 620c3584..c3c18d3f 100644 +--- a/SRC/psgssvx.c ++++ b/SRC/psgssvx.c +@@ -1021,7 +1021,7 @@ psgssvx(superlu_dist_options_t *options, SuperMatrix *A, + return; + } + } else { +- get_perm_c_dist(iam, permc_spec, &GA, perm_c); ++ get_perm_c_dist(iam, permc_spec, &GA, perm_c, grid->comm); + } + } + +diff --git a/SRC/psgssvx3d.c b/SRC/psgssvx3d.c +index 8ac7e954..d2f7bf0e 100644 +--- a/SRC/psgssvx3d.c ++++ b/SRC/psgssvx3d.c +@@ -1068,7 +1068,7 @@ psgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, + if (flinfo > 0) + ABORT ("ERROR in get perm_c parmetis."); + } else { +- get_perm_c_dist (iam, permc_spec, &GA, perm_c); ++ get_perm_c_dist (iam, permc_spec, &GA, perm_c, grid->comm); + } + } + +diff --git a/SRC/psgssvx_ABglobal.c b/SRC/psgssvx_ABglobal.c +index 8c83409c..1ea7a78c 100644 +--- a/SRC/psgssvx_ABglobal.c ++++ b/SRC/psgssvx_ABglobal.c +@@ -855,7 +855,7 @@ psgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A, + permc_spec = options->ColPerm; + if ( permc_spec != MY_PERMC && Fact == DOFACT ) + /* Use an ordering provided by SuperLU */ +- get_perm_c_dist(iam, permc_spec, A, perm_c); ++ get_perm_c_dist(iam, permc_spec, A, perm_c, grid->comm); + + /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' + (a.k.a. column etree), depending on the choice of ColPerm. 
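[Editorial note] The get_perm_c.c hunk above and the driver hunks below all make the same change: get_perm_c_dist() and its METIS/COLAMD helpers now take an MPI communicator, compute the fill-reducing column ordering on rank 0 only, and broadcast the result to the remaining ranks. The following is a minimal standalone sketch of that compute-on-root-then-broadcast pattern, for reference only; it is not SuperLU_DIST code, the helper name compute_ordering_on_root() is hypothetical, and it uses plain int/MPI_INT where the real code uses int_t/mpi_int_t.

/* Sketch of the pattern applied by the patch: root computes, all ranks receive. */
#include <mpi.h>

/* Hypothetical stand-in for the METIS_NodeND()/colamd() call made on rank 0 only. */
static void compute_ordering_on_root(int n, int *perm_c)
{
    for (int i = 0; i < n; ++i) perm_c[i] = i;  /* identity ordering as a placeholder */
}

static void get_perm_bcast(int n, int *perm_c, MPI_Comm comm)
{
    int rank;
    MPI_Comm_rank(comm, &rank);
    if (rank == 0)
        compute_ordering_on_root(n, perm_c);    /* only the root does the ordering work */
    MPI_Bcast(perm_c, n, MPI_INT, 0, comm);     /* every rank leaves with the same perm_c */
}

int main(int argc, char **argv)
{
    int n = 8, perm_c[8];
    MPI_Init(&argc, &argv);
    get_perm_bcast(n, perm_c, MPI_COMM_WORLD);
    MPI_Finalize();
    return 0;
}
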
+diff --git a/SRC/psgssvx_d2.c b/SRC/psgssvx_d2.c +index 94d04e4c..a839da0f 100644 +--- a/SRC/psgssvx_d2.c ++++ b/SRC/psgssvx_d2.c +@@ -1075,7 +1075,7 @@ psgssvx_d2(superlu_dist_options_t *options, SuperMatrix *A, + return; + } + } else { +- get_perm_c_dist(iam, permc_spec, &GA, perm_c); ++ get_perm_c_dist(iam, permc_spec, &GA, perm_c, grid->comm); + } + } + +diff --git a/SRC/pzgssvx.c b/SRC/pzgssvx.c +index 390c9709..5906a927 100644 +--- a/SRC/pzgssvx.c ++++ b/SRC/pzgssvx.c +@@ -1022,7 +1022,7 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A, + return; + } + } else { +- get_perm_c_dist(iam, permc_spec, &GA, perm_c); ++ get_perm_c_dist(iam, permc_spec, &GA, perm_c, grid->comm); + } + } + +diff --git a/SRC/pzgssvx3d.c b/SRC/pzgssvx3d.c +index 0f8c5aa7..8b13bcf2 100644 +--- a/SRC/pzgssvx3d.c ++++ b/SRC/pzgssvx3d.c +@@ -1069,7 +1069,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, + if (flinfo > 0) + ABORT ("ERROR in get perm_c parmetis."); + } else { +- get_perm_c_dist (iam, permc_spec, &GA, perm_c); ++ get_perm_c_dist (iam, permc_spec, &GA, perm_c, grid->comm); + } + } + +diff --git a/SRC/pzgssvx_ABglobal.c b/SRC/pzgssvx_ABglobal.c +index 644d35eb..3786e13d 100644 +--- a/SRC/pzgssvx_ABglobal.c ++++ b/SRC/pzgssvx_ABglobal.c +@@ -854,7 +854,7 @@ pzgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A, + permc_spec = options->ColPerm; + if ( permc_spec != MY_PERMC && Fact == DOFACT ) + /* Use an ordering provided by SuperLU */ +- get_perm_c_dist(iam, permc_spec, A, perm_c); ++ get_perm_c_dist(iam, permc_spec, A, perm_c, grid->comm); + + /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' + (a.k.a. column etree), depending on the choice of ColPerm. +diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h +index 27db0107..5e504fb5 100644 +--- a/SRC/superlu_defs.h ++++ b/SRC/superlu_defs.h +@@ -1064,10 +1064,9 @@ extern "C" { + extern void superlu_gridinit(MPI_Comm, int, int, gridinfo_t *); + extern void superlu_gridmap(MPI_Comm, int, int, int [], int, gridinfo_t *); + extern void superlu_gridexit(gridinfo_t *); +-extern void superlu_gridinit3d(MPI_Comm Bcomm, int nprow, int npcol, int npdep, +- gridinfo3d_t *grid) ; ++extern void superlu_gridinit3d(MPI_Comm, int, int, int, gridinfo3d_t *) ; + extern void superlu_gridmap3d(MPI_Comm, int, int, int, int [], gridinfo3d_t *); +-extern void superlu_gridexit3d(gridinfo3d_t *grid); ++extern void superlu_gridexit3d(gridinfo3d_t *); + + extern void set_default_options_dist(superlu_dist_options_t *); + extern void print_options_dist(superlu_dist_options_t *); +@@ -1082,17 +1081,17 @@ extern void sp_colorder (superlu_dist_options_t*, SuperMatrix*, int_t*, int_t* + SuperMatrix*); + extern int sp_symetree_dist(int_t *, int_t *, int_t *, int_t, int_t *); + extern int sp_coletree_dist (int_t *, int_t *, int_t *, int_t, int_t, int_t *); +-extern void get_perm_c_dist(int_t, int_t, SuperMatrix *, int_t *); ++extern void get_perm_c_dist(int_t, int_t, SuperMatrix *, int_t *, MPI_Comm); + extern void at_plus_a_dist(const int_t, const int_t, int_t *, int_t *, + int_t *, int_t **, int_t **); +-extern int genmmd_dist_(int_t *, int_t *, int_t *a, ++extern int genmmd_dist_(int_t *, int_t *, int_t *, + int_t *, int_t *, int_t *, int_t *, + int_t *, int_t *, int_t *, int_t *, int_t *); + extern void bcast_tree(void *, int, MPI_Datatype, int, int, + gridinfo_t *, int, int *); + extern int_t symbfact(superlu_dist_options_t *, int, SuperMatrix *, int_t *, + int_t *, Glu_persist_t *, Glu_freeable_t *); +-extern int_t 
symbfact_SubInit(superlu_dist_options_t *options, ++extern int_t symbfact_SubInit(superlu_dist_options_t *, + fact_t, void *, int_t, int_t, int_t, int_t, + Glu_persist_t *, Glu_freeable_t *); + extern int_t symbfact_SubXpand(int_t, int_t, int_t, MemType, int_t *, +@@ -1190,10 +1189,10 @@ extern int_t get_num_gpu_streams (void); + extern int getnGPUStreams(void); + extern int get_mpi_process_per_gpu (void); + /*to print out various statistics from GPU activities*/ +-extern void printGPUStats(int nsupers, SuperLUStat_t *stat, gridinfo3d_t*); ++extern void printGPUStats(int, SuperLUStat_t *, gridinfo3d_t *); + #endif + +-extern double estimate_cpu_time(int m, int n , int k); ++extern double estimate_cpu_time(int, int, int k); + + extern int get_thread_per_process(void); + extern int_t get_max_buffer_size (void); +@@ -1208,7 +1207,7 @@ extern int get_acc_offload(void); + extern void print_panel_seg_dist(int_t, int_t, int_t, int_t, int_t *, int_t *); + extern void check_repfnz_dist(int_t, int_t, int_t, int_t *); + extern int_t CheckZeroDiagonal(int_t, int_t *, int_t *, int_t *); +-extern int check_perm_dist(char *what, int_t n, int_t *perm); ++extern int check_perm_dist(char *, int_t, int_t *); + extern void PrintDouble5(char *, int_t, double *); + extern void PrintInt10(char *, int_t, int_t *); + extern void PrintInt32(char *, int, int *); diff --git a/extern/patch/superlu_dist/patch_parmetis.diff b/extern/patch/superlu_dist/patch_parmetis.diff index b6a1df19ec..c8ae4a6ed8 100644 --- a/extern/patch/superlu_dist/patch_parmetis.diff +++ b/extern/patch/superlu_dist/patch_parmetis.diff @@ -1,324 +1,14 @@ -diff --git a/EXAMPLE/pddrive2.c b/EXAMPLE/pddrive2.c -index 16af921e..a9d59ecf 100755 ---- a/EXAMPLE/pddrive2.c -+++ b/EXAMPLE/pddrive2.c -@@ -60,7 +60,7 @@ int main(int argc, char *argv[]) - double *b, *b1, *xtrue, *xtrue1; - int_t *colind, *colind1, *rowptr, *rowptr1; - int_t i, j, m, n, nnz_loc, m_loc; -- int nprow, npcol; -+ int nprow, npcol, colperm, rowperm, symbfact; - int iam, info, ldb, ldx, nrhs; - char **cpp, c, *postfix; - int ii, omp_mpi_level; -@@ -78,6 +78,9 @@ int main(int argc, char *argv[]) - nprow = 1; /* Default process rows. */ - npcol = 1; /* Default process columns. */ - nrhs = 1; /* Number of right-hand side. */ -+ colperm = -1; -+ rowperm = -1; -+ symbfact = -1; - - /* ------------------------------------------------------------ - INITIALIZE MPI ENVIRONMENT. 
-@@ -100,6 +103,12 @@ int main(int argc, char *argv[]) - break; - case 'c': npcol = atoi(*cpp); - break; -+ case 'p': rowperm = atoi(*cpp); -+ break; -+ case 'q': colperm = atoi(*cpp); -+ break; -+ case 's': symbfact = atoi(*cpp); -+ break; - } - } else { /* Last arg is considered a filename */ - if ( !(fp = fopen(*cpp, "r")) ) { -@@ -174,6 +183,10 @@ int main(int argc, char *argv[]) - */ - set_default_options_dist(&options); - -+ if (rowperm != -1) options.RowPerm = rowperm; -+ if (colperm != -1) options.ColPerm = colperm; -+ if (symbfact != -1) options.ParSymbFact = symbfact; -+ - if (!iam) { - print_options_dist(&options); - fflush(stdout); -diff --git a/EXAMPLE/pddrive3.c b/EXAMPLE/pddrive3.c -index e961fcaf..6e15ad44 100755 ---- a/EXAMPLE/pddrive3.c -+++ b/EXAMPLE/pddrive3.c -@@ -65,7 +65,7 @@ int main(int argc, char *argv[]) - double *b, *b1, *xtrue, *nzval, *nzval1; - int_t *colind, *colind1, *rowptr, *rowptr1; - int_t i, j, m, n, nnz_loc, m_loc, fst_row; -- int nprow, npcol; -+ int nprow, npcol, colperm, rowperm, symbfact; - int iam, info, ldb, ldx, nrhs; - char **cpp, c, *postfix; - int ii, omp_mpi_level; -@@ -75,6 +75,9 @@ int main(int argc, char *argv[]) - nprow = 1; /* Default process rows. */ - npcol = 1; /* Default process columns. */ - nrhs = 1; /* Number of right-hand side. */ -+ colperm = -1; -+ rowperm = -1; -+ symbfact = -1; - - /* ------------------------------------------------------------ - INITIALIZE MPI ENVIRONMENT. -@@ -97,6 +100,12 @@ int main(int argc, char *argv[]) - break; - case 'c': npcol = atoi(*cpp); - break; -+ case 'p': rowperm = atoi(*cpp); -+ break; -+ case 'q': colperm = atoi(*cpp); -+ break; -+ case 's': symbfact = atoi(*cpp); -+ break; - } - } else { /* Last arg is considered a filename */ - if ( !(fp = fopen(*cpp, "r")) ) { -@@ -189,6 +198,10 @@ int main(int argc, char *argv[]) - */ - set_default_options_dist(&options); - -+ if (rowperm != -1) options.RowPerm = rowperm; -+ if (colperm != -1) options.ColPerm = colperm; -+ if (symbfact != -1) options.ParSymbFact = symbfact; -+ - if (!iam) { - print_options_dist(&options); - fflush(stdout); -diff --git a/EXAMPLE/psdrive2.c b/EXAMPLE/psdrive2.c -index 7f3a732a..68cb8a0f 100755 ---- a/EXAMPLE/psdrive2.c -+++ b/EXAMPLE/psdrive2.c -@@ -60,7 +60,7 @@ int main(int argc, char *argv[]) - float *b, *b1, *xtrue, *xtrue1; - int_t *colind, *colind1, *rowptr, *rowptr1; - int_t i, j, m, n, nnz_loc, m_loc; -- int nprow, npcol; -+ int nprow, npcol, colperm, rowperm, symbfact; - int iam, info, ldb, ldx, nrhs; - char **cpp, c, *postfix; - int ii, omp_mpi_level; -@@ -78,6 +78,9 @@ int main(int argc, char *argv[]) - nprow = 1; /* Default process rows. */ - npcol = 1; /* Default process columns. */ - nrhs = 1; /* Number of right-hand side. */ -+ colperm = -1; -+ rowperm = -1; -+ symbfact = -1; - - /* ------------------------------------------------------------ - INITIALIZE MPI ENVIRONMENT. 
-@@ -100,6 +103,12 @@ int main(int argc, char *argv[]) - break; - case 'c': npcol = atoi(*cpp); - break; -+ case 'p': rowperm = atoi(*cpp); -+ break; -+ case 'q': colperm = atoi(*cpp); -+ break; -+ case 's': symbfact = atoi(*cpp); -+ break; - } - } else { /* Last arg is considered a filename */ - if ( !(fp = fopen(*cpp, "r")) ) { -@@ -175,6 +184,10 @@ int main(int argc, char *argv[]) - set_default_options_dist(&options); - options.IterRefine = SLU_SINGLE; - -+ if (rowperm != -1) options.RowPerm = rowperm; -+ if (colperm != -1) options.ColPerm = colperm; -+ if (symbfact != -1) options.ParSymbFact = symbfact; -+ - if (!iam) { - print_options_dist(&options); - fflush(stdout); -diff --git a/EXAMPLE/psdrive3.c b/EXAMPLE/psdrive3.c -index 6cb1da97..8f8ce2c4 100755 ---- a/EXAMPLE/psdrive3.c -+++ b/EXAMPLE/psdrive3.c -@@ -65,7 +65,7 @@ int main(int argc, char *argv[]) - float *b, *b1, *xtrue, *nzval, *nzval1; - int_t *colind, *colind1, *rowptr, *rowptr1; - int_t i, j, m, n, nnz_loc, m_loc, fst_row; -- int nprow, npcol; -+ int nprow, npcol, colperm, rowperm, symbfact; - int iam, info, ldb, ldx, nrhs; - char **cpp, c, *postfix; - int ii, omp_mpi_level; -@@ -75,6 +75,9 @@ int main(int argc, char *argv[]) - nprow = 1; /* Default process rows. */ - npcol = 1; /* Default process columns. */ - nrhs = 1; /* Number of right-hand side. */ -+ colperm = -1; -+ rowperm = -1; -+ symbfact = -1; - - /* ------------------------------------------------------------ - INITIALIZE MPI ENVIRONMENT. -@@ -97,6 +100,12 @@ int main(int argc, char *argv[]) - break; - case 'c': npcol = atoi(*cpp); - break; -+ case 'p': rowperm = atoi(*cpp); -+ break; -+ case 'q': colperm = atoi(*cpp); -+ break; -+ case 's': symbfact = atoi(*cpp); -+ break; - } - } else { /* Last arg is considered a filename */ - if ( !(fp = fopen(*cpp, "r")) ) { -@@ -190,6 +199,10 @@ int main(int argc, char *argv[]) - set_default_options_dist(&options); - options.IterRefine = SLU_SINGLE; - -+ if (rowperm != -1) options.RowPerm = rowperm; -+ if (colperm != -1) options.ColPerm = colperm; -+ if (symbfact != -1) options.ParSymbFact = symbfact; -+ - if (!iam) { - print_options_dist(&options); - fflush(stdout); -diff --git a/EXAMPLE/pzdrive2.c b/EXAMPLE/pzdrive2.c -index 8e0ec3eb..8d4294c0 100755 ---- a/EXAMPLE/pzdrive2.c -+++ b/EXAMPLE/pzdrive2.c -@@ -59,7 +59,7 @@ int main(int argc, char *argv[]) - doublecomplex *b, *b1, *xtrue, *xtrue1; - int_t *colind, *colind1, *rowptr, *rowptr1; - int_t i, j, m, n, nnz_loc, m_loc; -- int nprow, npcol; -+ int nprow, npcol, colperm, rowperm, symbfact; - int iam, info, ldb, ldx, nrhs; - char **cpp, c, *postfix; - int ii, omp_mpi_level; -@@ -77,6 +77,9 @@ int main(int argc, char *argv[]) - nprow = 1; /* Default process rows. */ - npcol = 1; /* Default process columns. */ - nrhs = 1; /* Number of right-hand side. */ -+ colperm = -1; -+ rowperm = -1; -+ symbfact = -1; - - /* ------------------------------------------------------------ - INITIALIZE MPI ENVIRONMENT. 
-@@ -99,6 +102,12 @@ int main(int argc, char *argv[]) - break; - case 'c': npcol = atoi(*cpp); - break; -+ case 'p': rowperm = atoi(*cpp); -+ break; -+ case 'q': colperm = atoi(*cpp); -+ break; -+ case 's': symbfact = atoi(*cpp); -+ break; - } - } else { /* Last arg is considered a filename */ - if ( !(fp = fopen(*cpp, "r")) ) { -@@ -173,6 +182,10 @@ int main(int argc, char *argv[]) - */ - set_default_options_dist(&options); - -+ if (rowperm != -1) options.RowPerm = rowperm; -+ if (colperm != -1) options.ColPerm = colperm; -+ if (symbfact != -1) options.ParSymbFact = symbfact; -+ - if (!iam) { - print_options_dist(&options); - fflush(stdout); -diff --git a/EXAMPLE/pzdrive3.c b/EXAMPLE/pzdrive3.c -index 3a55b044..9277f4f8 100755 ---- a/EXAMPLE/pzdrive3.c -+++ b/EXAMPLE/pzdrive3.c -@@ -64,7 +64,7 @@ int main(int argc, char *argv[]) - doublecomplex *b, *b1, *xtrue, *nzval, *nzval1; - int_t *colind, *colind1, *rowptr, *rowptr1; - int_t i, j, m, n, nnz_loc, m_loc, fst_row; -- int nprow, npcol; -+ int nprow, npcol, colperm, rowperm, symbfact; - int iam, info, ldb, ldx, nrhs; - char **cpp, c, *postfix; - int ii, omp_mpi_level; -@@ -74,6 +74,9 @@ int main(int argc, char *argv[]) - nprow = 1; /* Default process rows. */ - npcol = 1; /* Default process columns. */ - nrhs = 1; /* Number of right-hand side. */ -+ colperm = -1; -+ rowperm = -1; -+ symbfact = -1; - - /* ------------------------------------------------------------ - INITIALIZE MPI ENVIRONMENT. -@@ -96,6 +99,12 @@ int main(int argc, char *argv[]) - break; - case 'c': npcol = atoi(*cpp); - break; -+ case 'p': rowperm = atoi(*cpp); -+ break; -+ case 'q': colperm = atoi(*cpp); -+ break; -+ case 's': symbfact = atoi(*cpp); -+ break; - } - } else { /* Last arg is considered a filename */ - if ( !(fp = fopen(*cpp, "r")) ) { -@@ -188,6 +197,10 @@ int main(int argc, char *argv[]) - */ - set_default_options_dist(&options); - -+ if (rowperm != -1) options.RowPerm = rowperm; -+ if (colperm != -1) options.ColPerm = colperm; -+ if (symbfact != -1) options.ParSymbFact = symbfact; -+ - if (!iam) { - print_options_dist(&options); - fflush(stdout); -diff --git a/SRC/pdgssvx.c b/SRC/pdgssvx.c -index d465710c..f51b4d5e 100644 ---- a/SRC/pdgssvx.c -+++ b/SRC/pdgssvx.c -@@ -1132,8 +1132,7 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A, - if (symb_comm != MPI_COMM_NULL) MPI_Comm_free (&symb_comm); - - /* Distribute entries of A into L & U data structures. */ -- //if (parSymbFact == NO || ???? Fact == SamePattern_SameRowPerm) { -- if ( parSymbFact == NO ) { -+ if ( parSymbFact == NO || Fact == SamePattern_SameRowPerm ) { - /* CASE OF SERIAL SYMBOLIC */ - /* Apply column permutation to the original distributed A */ - for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]]; -diff --git a/SRC/psgssvx.c b/SRC/psgssvx.c -index 6393b945..22a865cc 100644 ---- a/SRC/psgssvx.c -+++ b/SRC/psgssvx.c -@@ -1133,8 +1133,7 @@ psgssvx(superlu_dist_options_t *options, SuperMatrix *A, - if (symb_comm != MPI_COMM_NULL) MPI_Comm_free (&symb_comm); - - /* Distribute entries of A into L & U data structures. */ -- //if (parSymbFact == NO || ???? 
Fact == SamePattern_SameRowPerm) { -- if ( parSymbFact == NO ) { -+ if ( parSymbFact == NO || Fact == SamePattern_SameRowPerm ) { - /* CASE OF SERIAL SYMBOLIC */ - /* Apply column permutation to the original distributed A */ - for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]]; -diff --git a/SRC/pzgssvx.c b/SRC/pzgssvx.c -index 952f3164..072433b9 100644 ---- a/SRC/pzgssvx.c -+++ b/SRC/pzgssvx.c -@@ -1134,8 +1134,7 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A, - if (symb_comm != MPI_COMM_NULL) MPI_Comm_free (&symb_comm); - - /* Distribute entries of A into L & U data structures. */ -- //if (parSymbFact == NO || ???? Fact == SamePattern_SameRowPerm) { -- if ( parSymbFact == NO ) { -+ if ( parSymbFact == NO || Fact == SamePattern_SameRowPerm ) { - /* CASE OF SERIAL SYMBOLIC */ - /* Apply column permutation to the original distributed A */ - for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]]; +diff --git a/SRC/single/psgssvx_d2.c b/SRC/single/psgssvx_d2.c +index 8a6b10c2..a8334501 100755 +--- a/SRC/single/psgssvx_d2.c ++++ b/SRC/single/psgssvx_d2.c +@@ -1181,8 +1181,7 @@ psgssvx_d2(superlu_dist_options_t *options, SuperMatrix *A, + if (symb_comm != MPI_COMM_NULL) MPI_Comm_free (&symb_comm); + + /* Distribute entries of A into L & U data structures. */ +- //if (parSymbFact == NO || ???? Fact == SamePattern_SameRowPerm) { +- if ( parSymbFact == NO ) { ++ if ( parSymbFact == NO || Fact == SamePattern_SameRowPerm ) { + /* CASE OF SERIAL SYMBOLIC */ + /* Apply column permutation to the original distributed A */ + for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]]; diff --git a/myPalaceNotes.txt b/myPalaceNotes.txt new file mode 100644 index 0000000000..68613f3fa1 --- /dev/null +++ b/myPalaceNotes.txt @@ -0,0 +1,7 @@ +mfem code changes: + +datacollection.cpp +//std::string pvdname = col_path + "/" + GeneratePVDFileName(); +std::filesystem::path pathObj(GeneratePVDFileName()); +std::string pvdname = col_path + "/" + pathObj.filename().string(); + diff --git a/myRunningScripts.txt b/myRunningScripts.txt deleted file mode 100644 index 37f21f7a4a..0000000000 --- a/myRunningScripts.txt +++ /dev/null @@ -1,51 +0,0 @@ -palace_d _palace_examples/spheres/spheres.json - -Electrostatic: -palace _palace_examples/spheres/spheres.json, 20s for OpenMP - -Magnetostatic: -palace _palace_examples/rings/rings.json, 36s for OpenMP - -Eigenmode: -palace _palace_examples/cavity/cavity_impedance.json, 14s for OpenMP -palace _palace_examples/cavity/cavity_pec.json, 10s for OpenMP - -Driven: -palace _palace_examples/cpw/cpw_lumped_adaptive.json, 216s for OpenMP -palace _palace_examples/cpw/cpw_lumped_uniform.json, 170s for OpenMP -palace _palace_examples/cpw/cpw_wave_adaptive.json -palace _palace_examples/cpw/cpw_wave_uniform.json - -Transient: -palace _palace_examples/coaxial/coaxial_matched.json, 30s for OpenMP -palace _palace_examples/coaxial/coaxial_open.json, 29s for OpenMP -palace _palace_examples/coaxial/coaxial_short.json, 29s for OpenMP - - -=== CUDA building === -#ifndef __CUDACC__ -#define __CUDACC__ -#endif -#include -#include -#include -//#include - - - -=== CUDA libs === -MAGMA\magma.lib -MAGMA\magma_sparse.lib -$(CUDA_PATH)\lib\x64\cublas.lib -$(CUDA_PATH)\lib\x64\cudart.lib -$(CUDA_PATH)\lib\x64\cuda.lib -$(CUDA_PATH)\lib\x64\cusparse.lib -$(CUDA_PATH)\lib\x64\nvrtc.lib - - - - -=== Currently CUDA Windows building issues === -1. 
In MFEM, On Windows, the enclosing parent function ("AddWithMarkers_") for an extended __host__ __device__ lambda cannot have internal or no linkage. -2. - diff --git a/palace/CMakeLists.txt b/palace/CMakeLists.txt index 407224c300..3484833054 100644 --- a/palace/CMakeLists.txt +++ b/palace/CMakeLists.txt @@ -1,282 +1,375 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -# -# CMake configuration for the main Palace application -# - -# CMake 3.21 was released in Jul. 2021 (required for HIP support) -cmake_minimum_required(VERSION 3.21) - -# Prohibit in-source builds -if(CMAKE_SOURCE_DIR STREQUAL CMAKE_BINARY_DIR) - message(FATAL_ERROR "In-source builds are prohibited") -endif() - -# C++17 required for std::filesystem, among others -set(CMAKE_CXX_STANDARD_REQUIRED ON) -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CXX_EXTENSIONS OFF) - -# Initialize the project -project(palace LANGUAGES CXX VERSION 0.11.2) - -# Define build settings and defaults -set(PALACE_WITH_OPENMP OFF CACHE BOOL "Use OpenMP for shared-memory parallelism") - -set(PALACE_WITH_SLEPC ON CACHE BOOL "Build with SLEPc eigenvalue solver") -set(PALACE_WITH_ARPACK OFF CACHE BOOL "Build with ARPACK eigenvalue solver") - -set(ANALYZE_SOURCES_CLANG_TIDY OFF CACHE BOOL "Run static analysis checks using clang-tidy") -set(ANALYZE_SOURCES_CPPCHECK OFF CACHE BOOL "Run static analysis checks using cppcheck") - -# Help find third-party dependencies -set(MFEM_DIR "" CACHE STRING - "Path to MFEM build or installation directory (not required if already on CMAKE_PREFIX_PATH)" -) -set(LIBCEED_DIR "" CACHE STRING - "Path to libCEED build or installation directory (not required if already on CMAKE_PREFIX_PATH or PKG_CONFIG_PATH)" -) -set(PETSC_DIR "" CACHE STRING - "Path to PETSc build or installation directory (not required if already on CMAKE_PREFIX_PATH or PKG_CONFIG_PATH)" -) -set(SLEPC_DIR "" CACHE STRING - "Path to SLEPc build or installation directory (not required if already on CMAKE_PREFIX_PATH or PKG_CONFIG_PATH)" -) -set(ARPACK_DIR "" CACHE STRING - "Path to ARPACK build or installation directory (not required if already on CMAKE_PREFIX_PATH)" -) - -# Enable Fortran if required -if(PALACE_WITH_ARPACK) - enable_language(Fortran) -endif() - -# Set a default build type if none was provided -if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) - message(STATUS "Setting CMAKE_BUILD_TYPE to 'Release' as none was specified") - set(CMAKE_BUILD_TYPE "Release" CACHE STRING - "Specifies the build type ('Debug' or 'Release', for example)" FORCE - ) -endif() - -# Add extra CMake modules -list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") - -# Find MPI -find_package(MPI REQUIRED) - -# Find OpenMP -if(PALACE_WITH_OPENMP) - find_package(OpenMP REQUIRED) -endif() - -# Find nlohmann/json -find_package(nlohmann_json REQUIRED CONFIG) -message(STATUS "Found nlohmann/json: ${nlohmann_json_VERSION} in ${nlohmann_json_DIR}") - -# Find fmt -find_package(fmt REQUIRED CONFIG) -message(STATUS "Found fmt: ${fmt_VERSION} in ${fmt_DIR}") - -# Find Eigen -find_package(Eigen3 REQUIRED CONFIG) -message(STATUS "Found Eigen: ${Eigen3_VERSION} in ${Eigen3_DIR}") - -# Find MFEM (recent MFEM targets link to target Threads::Threads, for some reason) -if(NOT "${MFEM_DIR}" STREQUAL "") - set(MFEM_ROOT ${MFEM_DIR}) -endif() -find_package(Threads REQUIRED) -find_package(MFEM REQUIRED CONFIG) -message(STATUS "Found MFEM: ${MFEM_VERSION} in ${MFEM_DIR}") -if(NOT MFEM_USE_MPI) - message(FATAL_ERROR "Build 
requires MFEM with MPI support") -endif() -# if(MFEM_CXX_FLAGS) -# # Pull compiler flags from MFEM for OpenMP and optimizations -# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${MFEM_CXX_FLAGS}") -# endif() - -# Find libCEED -include(PkgConfigHelpers) -set(LIBCEED_TEST_DEPS) -if(PALACE_WITH_OPENMP) - list(APPEND LIBCEED_TEST_DEPS OpenMP::OpenMP_CXX) -endif() -find_libceed_pkgconfig("${LIBCEED_TEST_DEPS}" LIBCEED_TARGET) -if("${LIBCEED_TARGET}" STREQUAL "") - message(FATAL_ERROR "libCEED could not be found, be sure to set LIBCEED_DIR") -endif() - -# Find PETSc and SLEPc -if(PALACE_WITH_SLEPC) - set(PETSC_TEST_DEPS MPI::MPI_CXX) - if(PALACE_WITH_OPENMP) - list(APPEND PETSC_TEST_DEPS OpenMP::OpenMP_CXX) - endif() - find_petsc_pkgconfig("${PETSC_TEST_DEPS}" PETSC_TARGET) - if("${PETSC_TARGET}" STREQUAL "") - message(FATAL_ERROR "PETSc could not be found, be sure to set PETSC_DIR") - endif() - find_slepc_pkgconfig("${PETSC_TARGET};${PETSC_TEST_DEPS}" SLEPC_TARGET) - if("${SLEPC_TARGET}" STREQUAL "") - message(FATAL_ERROR "SLEPc could not be found, be sure to set SLEPC_DIR") - endif() -elseif(NOT PALACE_WITH_ARPACK) - message(FATAL_ERROR "Build requires at least one of ARPACK or SLEPc dependencies") -endif() - -# Find ARPACK -if(PALACE_WITH_ARPACK) - if(NOT "${ARPACK_DIR}" STREQUAL "") - set(arpackng_ROOT ${ARPACK_DIR}) - endif() - find_package(arpackng REQUIRED CONFIG) - message(STATUS "Found ARPACK: ${arpackng_VERSION} in ${arpackng_DIR}") -elseif(NOT PALACE_WITH_SLEPC) - message(FATAL_ERROR "Build requires at least one of ARPACK or SLEPc dependencies") -endif() - -# Optionally configure static analysis -include(StaticAnalysisHelpers) -if(ANALYZE_SOURCES_CLANG_TIDY) - configure_clang_tidy() -else() - message(STATUS "Static analysis with clang-tidy not requested") -endif() -if(ANALYZE_SOURCES_CPPCHECK) - configure_cppcheck() -else() - message(STATUS "Static analysis with cppcheck not requested") -endif() - -# Add library target -set(LIB_TARGET_NAME libpalace) -add_library(${LIB_TARGET_NAME} "") -target_include_directories(${LIB_TARGET_NAME} PUBLIC ${CMAKE_SOURCE_DIR}) - -# Add source files -add_subdirectory(drivers) -add_subdirectory(fem) -add_subdirectory(linalg) -add_subdirectory(models) -add_subdirectory(utils) - -# Add executable target -set(TARGET_NAME palace) -add_executable(${TARGET_NAME} ${CMAKE_SOURCE_DIR}/main.cpp) -target_link_libraries(${TARGET_NAME} PRIVATE ${LIB_TARGET_NAME}) - -# Add binary extension for build architecture -if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm") - set(TARGET_EXTENSION "arm64") -else() - set(TARGET_EXTENSION "x86_64") -endif() -set_target_properties(${TARGET_NAME} - PROPERTIES - OUTPUT_NAME "${TARGET_NAME}-${TARGET_EXTENSION}" - SUFFIX ".bin" -) -set_target_properties(${LIB_TARGET_NAME} - PROPERTIES OUTPUT_NAME "${TARGET_NAME}" -) - -# Add JIT source file path definition for libCEED -set_property( - SOURCE ${CMAKE_SOURCE_DIR}/main.cpp - APPEND PROPERTY COMPILE_DEFINITIONS "PALACE_LIBCEED_JIT_SOURCE;PALACE_LIBCEED_JIT_SOURCE_DIR=\"${CMAKE_INSTALL_PREFIX}/include/palace/\"" -) - -# Add Git revision information (forces reconfigure when Git status changes) -include(GetGitDescription) -git_describe(GIT_COMMIT_ID) -message(STATUS "Git string: ${GIT_COMMIT_ID}") -if(NOT GIT_COMMIT_ID MATCHES "NOTFOUND") - set_property( - SOURCE ${CMAKE_SOURCE_DIR}/main.cpp - APPEND PROPERTY COMPILE_DEFINITIONS "PALACE_GIT_COMMIT;PALACE_GIT_COMMIT_ID=\"${GIT_COMMIT_ID}\"" - ) -endif() - -# Check C++ compiler support for constexpr std::sqrt and std::filesystem 
-include(CheckCompilerFeatureSupport) -if(NOT DEFINED CONSTEXPR_SQRT_SUPPORT_CACHE) - check_constexpr_sqrt_support(CONSTEXPR_SQRT_SUPPORT) - set(CONSTEXPR_SQRT_SUPPORT_CACHE ${CONSTEXPR_SQRT_SUPPORT} CACHE INTERNAL "") -endif() -if(CONSTEXPR_SQRT_SUPPORT_CACHE) - target_compile_definitions(${LIB_TARGET_NAME} - PUBLIC PALACE_WITH_CONSTEXPR_SQRT - ) -endif() -if(NOT DEFINED STD_FS_LIBRARIES_CACHE) - check_std_fs_support(STD_FS_SUPPORT STD_FS_LIBRARIES) - if(NOT STD_FS_SUPPORT) - message(FATAL_ERROR "Could not compile a C++ program using std::filesystem") - endif() - set(STD_FS_LIBRARIES_CACHE ${STD_FS_LIBRARIES} CACHE INTERNAL "") -endif() -if(NOT "${STD_FS_LIBRARIES_CACHE}" STREQUAL "") - target_link_libraries(${LIB_TARGET_NAME} - PUBLIC ${STD_FS_LIBRARIES_CACHE} - ) -endif() - -# Link with third-party dependencies -if(PALACE_WITH_SLEPC) - target_link_libraries(${LIB_TARGET_NAME} - PUBLIC ${SLEPC_TARGET} ${PETSC_TARGET} - ) - target_compile_definitions(${LIB_TARGET_NAME} - PUBLIC PALACE_WITH_SLEPC - ) -endif() -if(PALACE_WITH_ARPACK) - target_link_libraries(${LIB_TARGET_NAME} - PUBLIC PARPACK::PARPACK ARPACK::ARPACK ${MPI_Fortran_LIBRARIES} - ) - target_compile_definitions(${LIB_TARGET_NAME} - PUBLIC PALACE_WITH_ARPACK - ) -endif() -if(PALACE_WITH_OPENMP) - target_link_libraries(${LIB_TARGET_NAME} - PUBLIC OpenMP::OpenMP_CXX - ) -endif() -target_link_libraries(${LIB_TARGET_NAME} - PUBLIC mfem ${LIBCEED_TARGET} nlohmann_json::nlohmann_json fmt::fmt Eigen3::Eigen MPI::MPI_CXX -) - -# Install target and helper scripts -install( - TARGETS ${TARGET_NAME} ${LIB_TARGET_NAME} - RUNTIME DESTINATION bin -) -install( - FILES ${CMAKE_SOURCE_DIR}/../scripts/palace ${CMAKE_SOURCE_DIR}/../scripts/validate-config - DESTINATION bin - PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE - WORLD_READ WORLD_EXECUTE -) -install( - DIRECTORY ${CMAKE_SOURCE_DIR}/../scripts/schema - DESTINATION bin - FILE_PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ -) -install( - DIRECTORY ${CMAKE_SOURCE_DIR}/fem/qfunctions - DESTINATION include/palace - FILE_PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ -) - -# Add tests (disabled by default) -add_subdirectory(../test/unit ${CMAKE_BINARY_DIR}/test/unit EXCLUDE_FROM_ALL) - -# Status messages for build settings -message(STATUS "CMake build type: ${CMAKE_BUILD_TYPE}") -message(STATUS "Building for architecture: ${CMAKE_SYSTEM_PROCESSOR}") -message(STATUS "Summary of extra compiler flags: ${CMAKE_CXX_FLAGS}") -message(STATUS "Installation directory: ${CMAKE_INSTALL_PREFIX}/bin") +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + +# +# CMake configuration for the main Palace application +# + +# CMake 3.21 was released in Jul. 
2021 (required for HIP support) +cmake_minimum_required(VERSION 3.21) + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# Prohibit in-source builds +if(CMAKE_SOURCE_DIR STREQUAL CMAKE_BINARY_DIR) + message(FATAL_ERROR "In-source builds are prohibited") +endif() + +# C++17 required for std::filesystem, among others +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_EXTENSIONS OFF) + +# Initialize the project +project(palace LANGUAGES CXX VERSION 0.15.0) + +# Define build settings and defaults +set(PALACE_WITH_OPENMP OFF CACHE BOOL "Use OpenMP for shared-memory parallelism") +set(PALACE_WITH_CUDA OFF CACHE BOOL "Use CUDA for NVIDIA GPU support") +set(PALACE_WITH_HIP OFF CACHE BOOL "Use HIP for AMD or NVIDIA GPU support") + +set(PALACE_WITH_SLEPC ON CACHE BOOL "Build with SLEPc eigenvalue solver") +set(PALACE_WITH_ARPACK OFF CACHE BOOL "Build with ARPACK eigenvalue solver") + +set(ANALYZE_SOURCES_CLANG_TIDY OFF CACHE BOOL "Run static analysis checks using clang-tidy") +set(ANALYZE_SOURCES_CPPCHECK OFF CACHE BOOL "Run static analysis checks using cppcheck") + +set(PALACE_BUILD_WITH_COVERAGE OFF CACHE BOOL "Compile Palace with coverage flags (source-based for LLVM; gcov for GCC)") + +# Help find third-party dependencies +set(MFEM_DIR "" CACHE STRING + "Path to MFEM build or installation directory (not required if already on CMAKE_PREFIX_PATH)" +) +set(LIBCEED_DIR "" CACHE STRING + "Path to libCEED build or installation directory (not required if already on CMAKE_PREFIX_PATH or PKG_CONFIG_PATH)" +) +set(PETSC_DIR "" CACHE STRING + "Path to PETSc build or installation directory (not required if already on CMAKE_PREFIX_PATH or PKG_CONFIG_PATH)" +) +set(SLEPC_DIR "" CACHE STRING + "Path to SLEPc build or installation directory (not required if already on CMAKE_PREFIX_PATH or PKG_CONFIG_PATH)" +) +set(ARPACK_DIR "" CACHE STRING + "Path to ARPACK build or installation directory (not required if already on CMAKE_PREFIX_PATH)" +) + +# Enable Fortran if required +if(PALACE_WITH_ARPACK) + enable_language(Fortran) +endif() + +# Enable CUDA/HIP if required +if(PALACE_WITH_CUDA AND PALACE_WITH_HIP) + message(FATAL_ERROR "PALACE_WITH_CUDA is not compatible with PALACE_WITH_HIP") +endif() +if(PALACE_WITH_CUDA) + enable_language(CUDA) + find_package(CUDAToolkit REQUIRED) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda") +elseif(PALACE_WITH_HIP) + enable_language(HIP) +endif() + +# Set a default build type if none was provided +if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) + message(STATUS "Setting CMAKE_BUILD_TYPE to 'Release' as none was specified") + set(CMAKE_BUILD_TYPE "Release" CACHE STRING + "Specifies the build type ('Debug' or 'Release', for example)" FORCE + ) +endif() + +# Centralize Error handling +add_library(common_warnings INTERFACE) +target_compile_options( + common_warnings + INTERFACE -Wall + -Wpedantic + -Wno-deprecated-declarations +) + +if(PALACE_BUILD_WITH_COVERAGE) + add_library(coverage_flags INTERFACE) + target_compile_options( + coverage_flags + INTERFACE + $<$:-fprofile-instr-generate> + $<$:-fcoverage-mapping> + $<$:--coverage> + $<$:-fprofile-abs-path> + ) + target_link_options( + coverage_flags + INTERFACE + $<$:-fprofile-instr-generate> + $<$:-fcoverage-mapping> + $<$:--coverage> + ) +endif() + +# Add extra CMake modules +list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") + +# Find MPI +find_package(MPI REQUIRED) + +# Find OpenMP +if(PALACE_WITH_OPENMP) + find_package(OpenMP REQUIRED) +endif() + +# Find LAPACK 
+find_package(LAPACK REQUIRED) + +# Find nlohmann/json +find_package(nlohmann_json REQUIRED CONFIG) +message(STATUS "Found nlohmann/json: ${nlohmann_json_VERSION} in ${nlohmann_json_DIR}") + +# Find fmt +find_package(fmt REQUIRED CONFIG) +message(STATUS "Found fmt: ${fmt_VERSION} in ${fmt_DIR}") + +# Find scn +find_package(scn REQUIRED CONFIG) +message(STATUS "Found scn: ${scn_VERSION} in ${scn_DIR}") + +# Find Eigen +find_package(Eigen3 REQUIRED CONFIG) +message(STATUS "Found Eigen: ${Eigen3_VERSION} in ${Eigen3_DIR}") + +# Find MFEM (recent MFEM targets link to target Threads::Threads, for some reason) +if(NOT "${MFEM_DIR}" STREQUAL "") + set(MFEM_ROOT ${MFEM_DIR}) +endif() +find_package(Threads REQUIRED) +find_package(MFEM REQUIRED CONFIG) +message(STATUS "Found MFEM: ${MFEM_VERSION} in ${MFEM_DIR}") +if(NOT MFEM_USE_MPI) + message(FATAL_ERROR "Build requires MFEM with MPI support") +endif() +# if(MFEM_CXX_FLAGS) +# # Pull compiler flags from MFEM for OpenMP and optimizations +# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${MFEM_CXX_FLAGS}") +# endif() + +# Find libCEED +include(PkgConfigHelpers) +set(LIBCEED_TEST_DEPS) +if(PALACE_WITH_OPENMP) + list(APPEND LIBCEED_TEST_DEPS OpenMP::OpenMP_CXX) +endif() +find_libceed_pkgconfig("${LIBCEED_TEST_DEPS}" LIBCEED_TARGET) +if("${LIBCEED_TARGET}" STREQUAL "") + message(FATAL_ERROR "libCEED could not be found, be sure to set LIBCEED_DIR") +endif() + +# Find PETSc and SLEPc +if(PALACE_WITH_SLEPC) + set(PETSC_TEST_DEPS MPI::MPI_CXX) + if(PALACE_WITH_OPENMP) + list(APPEND PETSC_TEST_DEPS OpenMP::OpenMP_CXX) + endif() + find_petsc_pkgconfig("${PETSC_TEST_DEPS}" PETSC_TARGET) + if("${PETSC_TARGET}" STREQUAL "") + message(FATAL_ERROR "PETSc could not be found, be sure to set PETSC_DIR") + endif() + find_slepc_pkgconfig("${PETSC_TARGET};${PETSC_TEST_DEPS}" SLEPC_TARGET) + if("${SLEPC_TARGET}" STREQUAL "") + message(FATAL_ERROR "SLEPc could not be found, be sure to set SLEPC_DIR") + endif() +elseif(NOT PALACE_WITH_ARPACK) + message(FATAL_ERROR "Build requires at least one of ARPACK or SLEPc dependencies") +endif() + +# Find ARPACK +if(PALACE_WITH_ARPACK) + if(NOT "${ARPACK_DIR}" STREQUAL "") + set(arpackng_ROOT ${ARPACK_DIR}) + endif() + find_package(arpackng REQUIRED CONFIG) + message(STATUS "Found ARPACK: ${arpackng_VERSION} in ${arpackng_DIR}") +elseif(NOT PALACE_WITH_SLEPC) + message(FATAL_ERROR "Build requires at least one of ARPACK or SLEPc dependencies") +endif() + +# Optionally configure static analysis +include(StaticAnalysisHelpers) +if(ANALYZE_SOURCES_CLANG_TIDY) + configure_clang_tidy() +else() + message(STATUS "Static analysis with clang-tidy not requested") +endif() +if(ANALYZE_SOURCES_CPPCHECK) + configure_cppcheck() +else() + message(STATUS "Static analysis with cppcheck not requested") +endif() + +# Add library target +set(LIB_TARGET_NAME libpalace) +add_library(${LIB_TARGET_NAME} "") +target_include_directories(${LIB_TARGET_NAME} PUBLIC ${CMAKE_SOURCE_DIR}) + +# Add source files +add_subdirectory(drivers) +add_subdirectory(fem) +add_subdirectory(linalg) +add_subdirectory(models) +add_subdirectory(utils) + +# Add executable target +set(TARGET_NAME palace) +add_executable(${TARGET_NAME} ${CMAKE_SOURCE_DIR}/main.cpp) +target_link_libraries(${TARGET_NAME} PRIVATE ${LIB_TARGET_NAME}) + +# Add binary extension for build architecture +if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm") + set(TARGET_EXTENSION "arm64") +else() + set(TARGET_EXTENSION "x86_64") +endif() +set_target_properties(${TARGET_NAME} + PROPERTIES + OUTPUT_NAME 
"${TARGET_NAME}-${TARGET_EXTENSION}" + SUFFIX ".bin" +) +set_target_properties(${LIB_TARGET_NAME} + PROPERTIES OUTPUT_NAME "${TARGET_NAME}" +) + +# Handle device source code +# +# TARGET_SOURCES_DEVICE defines a list of files that contain device code. +# We manually specify this list because we cannot use compilers like nvcc for +# everything, since they tend to be slower and do not support everything we need. +if(NOT "${TARGET_SOURCES_DEVICE}" STREQUAL "" AND (PALACE_WITH_CUDA OR PALACE_WITH_HIP)) + if(PALACE_WITH_CUDA) + set(LANGUAGE_PROPERTY CUDA) + elseif(PALACE_WITH_HIP) + set(LANGUAGE_PROPERTY HIP) + endif() + set(COMPILE_OPTIONS_PROPERTY "-Wno-pedantic") + if(PALACE_WITH_OPENMP) + set(COMPILE_OPTIONS_PROPERTY "${COMPILE_OPTIONS_PROPERTY} ${OpenMP_CXX_FLAGS}") + endif() + set_property( + SOURCE ${TARGET_SOURCES_DEVICE} + PROPERTY LANGUAGE ${LANGUAGE_PROPERTY} + ) + set_property( + SOURCE ${TARGET_SOURCES_DEVICE} + APPEND PROPERTY COMPILE_OPTIONS "${COMPILE_OPTIONS_PROPERTY}" + ) +endif() + +if(PALACE_WITH_CUDA) + if (BUILD_SHARED_LIBS) + target_link_libraries(${LIB_TARGET_NAME} PUBLIC CUDA::cudart) + else() + target_link_libraries(${LIB_TARGET_NAME} PUBLIC CUDA::cudart_static) + endif() +endif() + +# Add JIT source file path definition for libCEED +set_property( + SOURCE ${CMAKE_SOURCE_DIR}/main.cpp + APPEND PROPERTY COMPILE_DEFINITIONS "PALACE_LIBCEED_JIT_SOURCE;PALACE_LIBCEED_JIT_SOURCE_DIR=\"${CMAKE_INSTALL_PREFIX}/include/palace/\"" +) + +# Add Git revision information (forces reconfigure when Git status changes) +include(GetGitDescription) +git_describe(GIT_COMMIT_ID) +message(STATUS "Git string: ${GIT_COMMIT_ID}") +if(NOT GIT_COMMIT_ID MATCHES "NOTFOUND") + set_property( + SOURCE ${CMAKE_SOURCE_DIR}/main.cpp + APPEND PROPERTY COMPILE_DEFINITIONS "PALACE_GIT_COMMIT;PALACE_GIT_COMMIT_ID=\"${GIT_COMMIT_ID}\"" + ) +endif() + +# Check C++ compiler support for constexpr std::sqrt and std::filesystem +include(CheckCompilerFeatureSupport) +if(NOT DEFINED STD_FS_LIBRARIES_CACHE) + check_std_fs_support(STD_FS_SUPPORT STD_FS_LIBRARIES) + if(NOT STD_FS_SUPPORT) + message(FATAL_ERROR "Could not compile a C++ program using std::filesystem") + endif() + set(STD_FS_LIBRARIES_CACHE ${STD_FS_LIBRARIES} CACHE INTERNAL "") +endif() +if(NOT "${STD_FS_LIBRARIES_CACHE}" STREQUAL "") + target_link_libraries(${LIB_TARGET_NAME} + PUBLIC ${STD_FS_LIBRARIES_CACHE} + ) +endif() + +# Link with third-party dependencies +if(PALACE_WITH_SLEPC) + target_link_libraries(${LIB_TARGET_NAME} + PUBLIC ${SLEPC_TARGET} ${PETSC_TARGET} + ) + target_compile_definitions(${LIB_TARGET_NAME} + PUBLIC PALACE_WITH_SLEPC + ) +endif() +if(PALACE_WITH_ARPACK) + target_link_libraries(${LIB_TARGET_NAME} + PUBLIC PARPACK::PARPACK ARPACK::ARPACK ${MPI_Fortran_LIBRARIES} + ) + target_compile_definitions(${LIB_TARGET_NAME} + PUBLIC PALACE_WITH_ARPACK + ) +endif() +if(PALACE_WITH_OPENMP) + target_link_libraries(${LIB_TARGET_NAME} + PUBLIC OpenMP::OpenMP_CXX + ) +endif() +target_link_libraries(${LIB_TARGET_NAME} + PUBLIC mfem ${LIBCEED_TARGET} nlohmann_json::nlohmann_json fmt::fmt scn::scn + Eigen3::Eigen LAPACK::LAPACK MPI::MPI_CXX +) +target_link_libraries( + ${LIB_TARGET_NAME} + PRIVATE $:common_warnings>> + $<$:coverage_flags>) +if(PALACE_WITH_GSLIB) + target_compile_definitions(${LIB_TARGET_NAME} + PUBLIC PALACE_WITH_GSLIB + ) +endif() +if(PALACE_WITH_GPU_AWARE_MPI) + target_compile_definitions(${LIB_TARGET_NAME} + PUBLIC PALACE_WITH_GPU_AWARE_MPI + ) +endif() + + +# Install target and helper scripts +install( + TARGETS 
${TARGET_NAME} ${LIB_TARGET_NAME} + RUNTIME DESTINATION bin +) +install( + FILES ${CMAKE_SOURCE_DIR}/../scripts/palace ${CMAKE_SOURCE_DIR}/../scripts/validate-config + DESTINATION bin + PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE + WORLD_READ WORLD_EXECUTE +) +install( + DIRECTORY ${CMAKE_SOURCE_DIR}/../scripts/schema + DESTINATION bin + FILE_PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ +) +install( + DIRECTORY ${CMAKE_SOURCE_DIR}/fem/qfunctions + DESTINATION include/palace + FILE_PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ +) + +# Add tests (disabled by default) +add_subdirectory(../test/unit ${CMAKE_BINARY_DIR}/test/unit EXCLUDE_FROM_ALL) + +# Status messages for build settings +message(STATUS "CMake build type: ${CMAKE_BUILD_TYPE}") +message(STATUS "Building for architecture: ${CMAKE_SYSTEM_PROCESSOR}") +message(STATUS "Summary of extra compiler flags: ${CMAKE_CXX_FLAGS}") +message(STATUS "Installation directory: ${CMAKE_INSTALL_PREFIX}/bin") diff --git a/palace/buildPalace.bat b/palace/buildPalace.bat index 29464cb5af..7e0aa75c86 100644 --- a/palace/buildPalace.bat +++ b/palace/buildPalace.bat @@ -1,3 +1,3 @@ -call env_var.bat - +call env_var.bat + %MSVS_DEVENV% /run palace.sln \ No newline at end of file diff --git a/palace/cmake/CheckCompilerFeatureSupport.cmake b/palace/cmake/CheckCompilerFeatureSupport.cmake index 7e05b3428d..06ea3cb8c0 100644 --- a/palace/cmake/CheckCompilerFeatureSupport.cmake +++ b/palace/cmake/CheckCompilerFeatureSupport.cmake @@ -1,96 +1,66 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -# -# Check compiler support for various features by compiling and running a test program -# - -if(__check_compiler_feature_support) - return() -endif() -set(__check_compiler_feature_support YES) - -function(check_constexpr_sqrt_support _has_constexpr_sqrt) - set(CONSTEXPR_SQRT_TEST_DIR ${CMAKE_BINARY_DIR}/CMakeFiles/try_run) - set(CONSTEXPR_SQRT_TEST_CPP ${CONSTEXPR_SQRT_TEST_DIR}/constexpr_sqrt_test.cpp) - file(WRITE ${CONSTEXPR_SQRT_TEST_CPP} -"#include -int main() -{ - constexpr double two = 2.0; - constexpr double four = two*two; - constexpr double sqrtfour = std::sqrt(four); - return 0; -} -") - try_run( - CONSTEXPR_SQRT_TEST_EXITCODE - CONSTEXPR_SQRT_TEST_COMPILED - ${CONSTEXPR_SQRT_TEST_DIR} - ${CONSTEXPR_SQRT_TEST_CPP} - COMPILE_OUTPUT_VARIABLE CONSTEXPR_SQRT_TEST_COMPILE_OUTPUT - RUN_OUTPUT_VARIABLE CONSTEXPR_SQRT_TEST_OUTPUT - ) - if(CONSTEXPR_SQRT_TEST_COMPILED AND CONSTEXPR_SQRT_TEST_EXITCODE EQUAL 0) - message(STATUS "CXX compiler supports constexpr std::sqrt") - set(${_has_constexpr_sqrt} TRUE PARENT_SCOPE) - else() - message(STATUS "CXX compiler does not support constexpr std::sqrt") - set(${_has_constexpr_sqrt} FALSE PARENT_SCOPE) - endif() -endfunction() - -function(check_std_fs_support _has_std_fs_support _extra_fs_libraries) - set(STD_FS_TEST_DIR ${CMAKE_BINARY_DIR}/CMakeFiles/try_run) - set(STD_FS_TEST_CPP ${STD_FS_TEST_DIR}/std_fs_test.cpp) - file(WRITE ${STD_FS_TEST_CPP} -"#include -#if defined(__cpp_lib_filesystem) || \ - defined(__has_include) && __has_include() -#include -namespace fs = std::filesystem; -#else -#include -namespace fs = std::experimental::filesystem; -#endif -int main() -{ - std::cout << \"Current path is \" << fs::current_path() << '\\n'; - return 0; -} -") - try_run( - STD_FS_TEST_EXITCODE - STD_FS_TEST_COMPILED - ${STD_FS_TEST_DIR} - ${STD_FS_TEST_CPP} - COMPILE_OUTPUT_VARIABLE STD_FS_TEST_COMPILE_OUTPUT - 
RUN_OUTPUT_VARIABLE STD_FS_TEST_OUTPUT - ) - if(STD_FS_TEST_COMPILED AND STD_FS_TEST_EXITCODE EQUAL 0) - message(STATUS "CXX compiler supports std::filesystem") - set(${_has_std_fs_support} TRUE PARENT_SCOPE) - set(${_extra_fs_libraries} "" PARENT_SCOPE) - return() - endif() - - # Try with -lstdc++fs - try_run( - STD_FS_TEST_EXITCODE - STD_FS_TEST_COMPILED - ${STD_FS_TEST_DIR} - ${STD_FS_TEST_CPP} - CMAKE_FLAGS - "-DLINK_LIBRARIES=stdc++fs" - COMPILE_OUTPUT_VARIABLE STD_FS_TEST_COMPILE_OUTPUT - RUN_OUTPUT_VARIABLE STD_FS_TEST_OUTPUT - ) - if(STD_FS_TEST_COMPILED AND STD_FS_TEST_EXITCODE EQUAL 0) - message(STATUS "CXX compiler supports std::filesystem with -lstdc++fs") - set(${_has_std_fs_support} TRUE PARENT_SCOPE) - set(${_extra_fs_libraries} stdc++fs PARENT_SCOPE) - else() - set(${_has_std_fs_support} FALSE PARENT_SCOPE) - set(${_extra_fs_libraries} "" PARENT_SCOPE) - endif() -endfunction() +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + +# +# Check compiler support for various features by compiling and running a test program +# + +if(__check_compiler_feature_support) + return() +endif() +set(__check_compiler_feature_support YES) + +function(check_std_fs_support _has_std_fs_support _extra_fs_libraries) + set(STD_FS_TEST_DIR ${CMAKE_BINARY_DIR}/CMakeFiles/try_run) + set(STD_FS_TEST_CPP ${STD_FS_TEST_DIR}/std_fs_test.cpp) + file(WRITE ${STD_FS_TEST_CPP} +"#include +#if defined(__cpp_lib_filesystem) || \ + defined(__has_include) && __has_include() +#include +namespace fs = std::filesystem; +#else +#include +namespace fs = std::experimental::filesystem; +#endif +int main() +{ + std::cout << \"Current path is \" << fs::current_path() << '\\n'; + return 0; +} +") + try_run( + STD_FS_TEST_EXITCODE + STD_FS_TEST_COMPILED + ${STD_FS_TEST_DIR} + ${STD_FS_TEST_CPP} + COMPILE_OUTPUT_VARIABLE STD_FS_TEST_COMPILE_OUTPUT + RUN_OUTPUT_VARIABLE STD_FS_TEST_OUTPUT + ) + if(STD_FS_TEST_COMPILED AND STD_FS_TEST_EXITCODE EQUAL 0) + message(STATUS "CXX compiler supports std::filesystem") + set(${_has_std_fs_support} TRUE PARENT_SCOPE) + set(${_extra_fs_libraries} "" PARENT_SCOPE) + return() + endif() + + # Try with -lstdc++fs + try_run( + STD_FS_TEST_EXITCODE + STD_FS_TEST_COMPILED + ${STD_FS_TEST_DIR} + ${STD_FS_TEST_CPP} + CMAKE_FLAGS + "-DLINK_LIBRARIES=stdc++fs" + COMPILE_OUTPUT_VARIABLE STD_FS_TEST_COMPILE_OUTPUT + RUN_OUTPUT_VARIABLE STD_FS_TEST_OUTPUT + ) + if(STD_FS_TEST_COMPILED AND STD_FS_TEST_EXITCODE EQUAL 0) + message(STATUS "CXX compiler supports std::filesystem with -lstdc++fs") + set(${_has_std_fs_support} TRUE PARENT_SCOPE) + set(${_extra_fs_libraries} stdc++fs PARENT_SCOPE) + else() + set(${_has_std_fs_support} FALSE PARENT_SCOPE) + set(${_extra_fs_libraries} "" PARENT_SCOPE) + endif() +endfunction() diff --git a/palace/cmake/GetGitDescription.cmake b/palace/cmake/GetGitDescription.cmake index 87e87f9ac2..261a8ca572 100644 --- a/palace/cmake/GetGitDescription.cmake +++ b/palace/cmake/GetGitDescription.cmake @@ -1,100 +1,100 @@ -# Copyright 2009-2013, Iowa State University -# Copyright 2013-2020, Ryan Pavlik -# Copyright 2013-2020, Contributors -# SPDX-License-Identifier: BSL-1.0 -# Distributed under the Boost Software License, Version 1.0 -# See copy at http://www.boost.org/LICENSE_1_0.txt -# SPDX-License-Identifier: BSL-1.0 - -# -# Returns the refspec and sha hash of the current head revision of the results of git -# describe on the source tree. 
These functions force a re-configure on each git commit so -# that you can trust the values of the variables in your build system. -# -# Original author: 2009-2020 Ryan Pavlik -# - -if(__get_git_description) - return() -endif() -set(__get_git_description YES) - -set(CURRENT_LIST_DIR ${CMAKE_CURRENT_LIST_DIR}) - -function(git_find_closest_git_dir _start_dir _git_dir_var) - set(cur_dir "${_start_dir}") - set(git_dir "${_start_dir}/.git") - while(NOT EXISTS "${git_dir}") - # .git dir not found, search parent directories - set(git_previous_parent "${cur_dir}") - get_filename_component(cur_dir ${cur_dir} DIRECTORY) - if(cur_dir STREQUAL git_previous_parent) - # We have reached the root directory, we are not in git - set(${_git_dir_var} "" PARENT_SCOPE) - return() - endif() - set(git_dir "${cur_dir}/.git") - endwhile() - set(${_git_dir_var} "${git_dir}" PARENT_SCOPE) -endfunction() - -function(get_git_head_revision _refspecvar _hashvar) - git_find_closest_git_dir("${CMAKE_CURRENT_SOURCE_DIR}" GIT_DIR) - if("${GIT_DIR}" STREQUAL "" OR NOT IS_DIRECTORY ${GIT_DIR}) - set(${_refspecvar} "GITDIR-NOTFOUND" PARENT_SCOPE) - set(${_hashvar} "GITDIR-NOTFOUND" PARENT_SCOPE) - return() - endif() - - set(GIT_DATA "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/git-data") - if(NOT EXISTS "${GIT_DATA}") - file(MAKE_DIRECTORY "${GIT_DATA}") - endif() - set(HEAD_FILE "${GIT_DATA}/HEAD") - set(HEAD_SOURCE_FILE "${GIT_DIR}/HEAD") - if(NOT EXISTS "${HEAD_SOURCE_FILE}") - set(${_refspecvar} "GITHEAD-NOTFOUND" PARENT_SCOPE) - set(${_hashvar} "GITHEAD-NOTFOUND" PARENT_SCOPE) - return() - endif() - - configure_file("${HEAD_SOURCE_FILE}" "${HEAD_FILE}" COPYONLY) - configure_file("${CURRENT_LIST_DIR}/GetGitDescription.cmake.in" - "${GIT_DATA}/GetGitRefHash.cmake" @ONLY) - include("${GIT_DATA}/GetGitRefHash.cmake") - set(${_refspecvar} "${HEAD_REF}" PARENT_SCOPE) - set(${_hashvar} "${HEAD_HASH}" PARENT_SCOPE) -endfunction() - -function(git_describe _var) - if(NOT GIT_FOUND) - find_package(Git QUIET) - endif() - if(NOT GIT_FOUND) - set(${_var} "GIT-NOTFOUND" PARENT_SCOPE) - return() - endif() - - # Force rerun only if Git status has changed - get_git_head_revision(refspec hash) - if(NOT hash) - set(${_var} "HEAD-HASH-NOTFOUND" PARENT_SCOPE) - return() - endif() - - # message(STATUS "Git head revision: ${refspec} ${hash}") - # message(STATUS "Arguments to execute_process: ${ARGN}") - - execute_process( - COMMAND "${GIT_EXECUTABLE}" describe --tags --always --dirty ${ARGN} - WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" - RESULT_VARIABLE res - OUTPUT_VARIABLE out - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE - ) - if(NOT res EQUAL 0) - set(${_var} "${out}-${res}-NOTFOUND" PARENT_SCOPE) - else() - set(${_var} "${out}" PARENT_SCOPE) - endif() -endfunction() +# Copyright 2009-2013, Iowa State University +# Copyright 2013-2020, Ryan Pavlik +# Copyright 2013-2020, Contributors +# SPDX-License-Identifier: BSL-1.0 +# Distributed under the Boost Software License, Version 1.0 +# See copy at http://www.boost.org/LICENSE_1_0.txt +# SPDX-License-Identifier: BSL-1.0 + +# +# Returns the refspec and sha hash of the current head revision of the results of git +# describe on the source tree. These functions force a re-configure on each git commit so +# that you can trust the values of the variables in your build system. 
+# +# Original author: 2009-2020 Ryan Pavlik +# + +if(__get_git_description) + return() +endif() +set(__get_git_description YES) + +set(CURRENT_LIST_DIR ${CMAKE_CURRENT_LIST_DIR}) + +function(git_find_closest_git_dir _start_dir _git_dir_var) + set(cur_dir "${_start_dir}") + set(git_dir "${_start_dir}/.git") + while(NOT EXISTS "${git_dir}") + # .git dir not found, search parent directories + set(git_previous_parent "${cur_dir}") + get_filename_component(cur_dir ${cur_dir} DIRECTORY) + if(cur_dir STREQUAL git_previous_parent) + # We have reached the root directory, we are not in git + set(${_git_dir_var} "" PARENT_SCOPE) + return() + endif() + set(git_dir "${cur_dir}/.git") + endwhile() + set(${_git_dir_var} "${git_dir}" PARENT_SCOPE) +endfunction() + +function(get_git_head_revision _refspecvar _hashvar) + git_find_closest_git_dir("${CMAKE_CURRENT_SOURCE_DIR}" GIT_DIR) + if("${GIT_DIR}" STREQUAL "" OR NOT IS_DIRECTORY ${GIT_DIR}) + set(${_refspecvar} "GITDIR-NOTFOUND" PARENT_SCOPE) + set(${_hashvar} "GITDIR-NOTFOUND" PARENT_SCOPE) + return() + endif() + + set(GIT_DATA "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/git-data") + if(NOT EXISTS "${GIT_DATA}") + file(MAKE_DIRECTORY "${GIT_DATA}") + endif() + set(HEAD_FILE "${GIT_DATA}/HEAD") + set(HEAD_SOURCE_FILE "${GIT_DIR}/HEAD") + if(NOT EXISTS "${HEAD_SOURCE_FILE}") + set(${_refspecvar} "GITHEAD-NOTFOUND" PARENT_SCOPE) + set(${_hashvar} "GITHEAD-NOTFOUND" PARENT_SCOPE) + return() + endif() + + configure_file("${HEAD_SOURCE_FILE}" "${HEAD_FILE}" COPYONLY) + configure_file("${CURRENT_LIST_DIR}/GetGitDescription.cmake.in" + "${GIT_DATA}/GetGitRefHash.cmake" @ONLY) + include("${GIT_DATA}/GetGitRefHash.cmake") + set(${_refspecvar} "${HEAD_REF}" PARENT_SCOPE) + set(${_hashvar} "${HEAD_HASH}" PARENT_SCOPE) +endfunction() + +function(git_describe _var) + if(NOT GIT_FOUND) + find_package(Git QUIET) + endif() + if(NOT GIT_FOUND) + set(${_var} "GIT-NOTFOUND" PARENT_SCOPE) + return() + endif() + + # Force rerun only if Git status has changed + get_git_head_revision(refspec hash) + if(NOT hash) + set(${_var} "HEAD-HASH-NOTFOUND" PARENT_SCOPE) + return() + endif() + + # message(STATUS "Git head revision: ${refspec} ${hash}") + # message(STATUS "Arguments to execute_process: ${ARGN}") + + execute_process( + COMMAND "${GIT_EXECUTABLE}" describe --tags --always --dirty ${ARGN} + WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" + RESULT_VARIABLE res + OUTPUT_VARIABLE out + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if(NOT res EQUAL 0) + set(${_var} "${res}-NOTFOUND" PARENT_SCOPE) + else() + set(${_var} "${out}" PARENT_SCOPE) + endif() +endfunction() diff --git a/palace/cmake/GetGitDescription.cmake.in b/palace/cmake/GetGitDescription.cmake.in index e8c0229481..bea01e2027 100644 --- a/palace/cmake/GetGitDescription.cmake.in +++ b/palace/cmake/GetGitDescription.cmake.in @@ -1,36 +1,36 @@ -# Copyright 2009-2012, Iowa State University -# Copyright 2011-2015, Contributors -# Distributed under the Boost Software License, Version 1.0 -# See copy at http://www.boost.org/LICENSE_1_0.txt -# SPDX-License-Identifier: BSL-1.0 - -# -# Internal file for GetGitDescription.cmake -# -# Original author: -# 2009-2010 Ryan Pavlik -# - -set(HEAD_HASH) - -file(READ "@HEAD_FILE@" HEAD_CONTENTS LIMIT 1024) - -string(STRIP "${HEAD_CONTENTS}" HEAD_CONTENTS) -if(HEAD_CONTENTS MATCHES "ref") # named branch - string(REPLACE "ref: " "" HEAD_REF "${HEAD_CONTENTS}") - if(EXISTS "@GIT_DIR@/${HEAD_REF}") - configure_file("@GIT_DIR@/${HEAD_REF}" "@GIT_DATA@/head-ref" COPYONLY) - else() - 
configure_file("@GIT_DIR@/packed-refs" "@GIT_DATA@/packed-refs" COPYONLY) - file(READ "@GIT_DATA@/packed-refs" PACKED_REFS) - if(PACKED_REFS MATCHES "([0-9a-z]*) ${HEAD_REF}") - set(HEAD_HASH "${CMAKE_MATCH_1}") - endif() - endif() -else() # detached HEAD - configure_file("@GIT_DIR@/HEAD" "@GIT_DATA@/head-ref" COPYONLY) -endif() -if(NOT HEAD_HASH) - file(READ "@GIT_DATA@/head-ref" HEAD_HASH LIMIT 1024) - string(STRIP "${HEAD_HASH}" HEAD_HASH) -endif() +# Copyright 2009-2012, Iowa State University +# Copyright 2011-2015, Contributors +# Distributed under the Boost Software License, Version 1.0 +# See copy at http://www.boost.org/LICENSE_1_0.txt +# SPDX-License-Identifier: BSL-1.0 + +# +# Internal file for GetGitDescription.cmake +# +# Original author: +# 2009-2010 Ryan Pavlik +# + +set(HEAD_HASH) + +file(READ "@HEAD_FILE@" HEAD_CONTENTS LIMIT 1024) + +string(STRIP "${HEAD_CONTENTS}" HEAD_CONTENTS) +if(HEAD_CONTENTS MATCHES "ref") # named branch + string(REPLACE "ref: " "" HEAD_REF "${HEAD_CONTENTS}") + if(EXISTS "@GIT_DIR@/${HEAD_REF}") + configure_file("@GIT_DIR@/${HEAD_REF}" "@GIT_DATA@/head-ref" COPYONLY) + else() + configure_file("@GIT_DIR@/packed-refs" "@GIT_DATA@/packed-refs" COPYONLY) + file(READ "@GIT_DATA@/packed-refs" PACKED_REFS) + if(PACKED_REFS MATCHES "([0-9a-z]*) ${HEAD_REF}") + set(HEAD_HASH "${CMAKE_MATCH_1}") + endif() + endif() +else() # detached HEAD + configure_file("@GIT_DIR@/HEAD" "@GIT_DATA@/head-ref" COPYONLY) +endif() +if(NOT HEAD_HASH) + file(READ "@GIT_DATA@/head-ref" HEAD_HASH LIMIT 1024) + string(STRIP "${HEAD_HASH}" HEAD_HASH) +endif() diff --git a/palace/cmake/PkgConfigHelpers.cmake b/palace/cmake/PkgConfigHelpers.cmake index b78f2b0847..f862321303 100644 --- a/palace/cmake/PkgConfigHelpers.cmake +++ b/palace/cmake/PkgConfigHelpers.cmake @@ -1,265 +1,262 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -# -# Helper functions to import and test PETSc and SLEPc targets using pkg-config -# - -if(__pkg_config_helpers) - return() -endif() -set(__pkg_config_helpers YES) - -function(set_if_empty _variable _arg) - if("${${_variable}}" STREQUAL "") - set(${_variable} ${_arg} PARENT_SCOPE) - endif() -endfunction() - -set(_LIBCEED_DIR ${LIBCEED_DIR}) -set(_PETSC_DIR ${PETSC_DIR}) -set(_SLEPC_DIR ${SLEPC_DIR}) -set_if_empty(_LIBCEED_DIR "$ENV{LIBCEED_DIR}") -set_if_empty(_PETSC_DIR "$ENV{PETSC_DIR}") -set_if_empty(_SLEPC_DIR "$ENV{SLEPC_DIR}") -set_if_empty(_SLEPC_DIR "${_PETSC_DIR}") -if(NOT "${_LIBCEED_DIR}" STREQUAL "") - set(ENV{PKG_CONFIG_PATH} "${_LIBCEED_DIR}/lib/pkgconfig:$ENV{PKG_CONFIG_PATH}") - set(ENV{PKG_CONFIG_PATH} "${_LIBCEED_DIR}/lib64/pkgconfig:$ENV{PKG_CONFIG_PATH}") -endif() -if(NOT "${_PETSC_DIR}" STREQUAL "") - set(ENV{PKG_CONFIG_PATH} "${_PETSC_DIR}/lib/pkgconfig:$ENV{PKG_CONFIG_PATH}") - set(ENV{PKG_CONFIG_PATH} "${_PETSC_DIR}/lib64/pkgconfig:$ENV{PKG_CONFIG_PATH}") -endif() -if(NOT "${_SLEPC_DIR}" STREQUAL "" AND NOT _SLEPC_DIR STREQUAL _PETSC_DIR) - set(ENV{PKG_CONFIG_PATH} "${_SLEPC_DIR}/lib/pkgconfig:$ENV{PKG_CONFIG_PATH}") - set(ENV{PKG_CONFIG_PATH} "${_SLEPC_DIR}/lib64/pkgconfig:$ENV{PKG_CONFIG_PATH}") -endif() -find_package(PkgConfig REQUIRED) - -function(check_libceed_build _libceed_target _libceed_test_success) - set(LIBCEED_LIB_TEST_DIR ${CMAKE_BINARY_DIR}/CMakeFiles/try_run) - set(LIBCEED_LIB_TEST_CPP ${LIBCEED_LIB_TEST_DIR}/libceed_test_lib.cpp) - file(WRITE ${LIBCEED_LIB_TEST_CPP} -"#include -#include -int main() -{ - Ceed ceed; - CeedCall(CeedInit(\"/cpu/self\", &ceed)); - CeedCall(CeedDestroy(&ceed)); - return 0; -} -") - try_compile( - LIBCEED_TEST_COMPILED - ${LIBCEED_LIB_TEST_DIR} - ${LIBCEED_LIB_TEST_CPP} - LINK_LIBRARIES ${_libceed_target} - OUTPUT_VARIABLE LIBCEED_TEST_COMPILE_OUTPUT - ) - # message(STATUS "LIBCEED_TEST_COMPILE_OUTPUT: ${LIBCEED_TEST_COMPILE_OUTPUT}") - if(LIBCEED_TEST_COMPILED) - # message(STATUS "libCEED test program - Successful") - set(${_libceed_test_success} TRUE PARENT_SCOPE) - else() - # message(STATUS "libCEED test program - Failed") - set(${_libceed_test_success} FALSE PARENT_SCOPE) - endif() -endfunction() - -function(find_libceed_pkgconfig _libceed_deps _libceed_target) - pkg_check_modules(libCEED IMPORTED_TARGET GLOBAL ceed) - if(NOT libCEED_FOUND) - set(${_libceed_target} "" PARENT_SCOPE) - return() - endif() - message(STATUS "Found libCEED: ${libCEED_VERSION}") - check_libceed_build("PkgConfig::libCEED;${_libceed_deps}" LIBCEED_TEST_SUCCESS) - if(LIBCEED_TEST_SUCCESS) - message(STATUS "libCEED test program - Successful") - set(${_libceed_target} PkgConfig::libCEED PARENT_SCOPE) - return() - endif() - - # Try with --static libraries - message(STATUS "libCEED test program - Failed") - set(PKG_CONFIG_EXECUTABLE_BACKUP ${PKG_CONFIG_EXECUTABLE}) - list(APPEND PKG_CONFIG_EXECUTABLE "--static") - pkg_check_modules(libCEED_STATIC QUIET IMPORTED_TARGET GLOBAL ceed) - set(PKG_CONFIG_EXECUTABLE ${PKG_CONFIG_EXECUTABLE_BACKUP}) - if(NOT libCEED_STATIC_FOUND) - set(${_libceed_target} "" PARENT_SCOPE) - return() - endif() - check_libceed_build("PkgConfig::libCEED_STATIC;${_libceed_deps}" LIBCEED_TEST_SUCCESS) - if(LIBCEED_TEST_SUCCESS) - message(STATUS "libCEED test program with static linkage - Success") - set(${_libceed_target} PkgConfig::libCEED_STATIC PARENT_SCOPE) - return() - endif() - - # Not able to build a libCEED test program - message(STATUS "libCEED test program with static linkage - 
Failed") - set(${_libceed_target} "" PARENT_SCOPE) -endfunction() - -function(check_petsc_build _petsc_target _petsc_test_success) - set(PETSC_LIB_TEST_DIR ${CMAKE_BINARY_DIR}/CMakeFiles/try_run) - set(PETSC_LIB_TEST_CPP ${PETSC_LIB_TEST_DIR}/petsc_test_lib.cpp) - file(WRITE ${PETSC_LIB_TEST_CPP} -"#include -int main() -{ - TS ts; - int argc = 0; - char **argv = NULL; - PetscCall(PetscInitialize(&argc, &argv, PETSC_NULLPTR, PETSC_NULLPTR)); - PetscCall(TSCreate(PETSC_COMM_WORLD, &ts)); - PetscCall(TSSetFromOptions(ts)); - PetscCall(TSDestroy(&ts)); - PetscCall(PetscFinalize()); - return 0; -} -") - try_run( - PETSC_TEST_EXITCODE - PETSC_TEST_COMPILED - ${PETSC_LIB_TEST_DIR} - ${PETSC_LIB_TEST_CPP} - LINK_LIBRARIES ${_petsc_target} - COMPILE_OUTPUT_VARIABLE PETSC_TEST_COMPILE_OUTPUT - RUN_OUTPUT_VARIABLE PETSC_TEST_OUTPUT - ) - # message(STATUS "PETSC_TEST_COMPILE_OUTPUT: ${PETSC_TEST_COMPILE_OUTPUT}") - # message(STATUS "PETSC_TEST_OUTPUT: ${PETSC_TEST_OUTPUT}") - if(PETSC_TEST_COMPILED AND PETSC_TEST_EXITCODE EQUAL 0) - # message(STATUS "PETSc test program - Successful") - set(${_petsc_test_success} TRUE PARENT_SCOPE) - else() - # message(STATUS "PETSc test program - Failed") - set(${_petsc_test_success} FALSE PARENT_SCOPE) - endif() -endfunction() - -function(find_petsc_pkgconfig _petsc_deps _petsc_target) - pkg_check_modules(PETSc IMPORTED_TARGET GLOBAL PETSc) - if(NOT PETSc_FOUND) - pkg_check_modules(PETSc IMPORTED_TARGET GLOBAL petsc) - endif() - if(NOT PETSc_FOUND) - set(${_petsc_target} "" PARENT_SCOPE) - return() - endif() - message(STATUS "Found PETSc: ${PETSc_VERSION}") - check_petsc_build("PkgConfig::PETSc;${_petsc_deps}" PETSC_TEST_SUCCESS) - if(PETSC_TEST_SUCCESS) - message(STATUS "PETSc test program - Successful") - set(${_petsc_target} PkgConfig::PETSc PARENT_SCOPE) - return() - endif() - - # Try with --static libraries - message(STATUS "PETSc test program - Failed") - set(PKG_CONFIG_EXECUTABLE_BACKUP ${PKG_CONFIG_EXECUTABLE}) - list(APPEND PKG_CONFIG_EXECUTABLE "--static") - pkg_check_modules(PETSc_STATIC QUIET IMPORTED_TARGET GLOBAL PETSc) - if(NOT PETSc_STATIC_FOUND) - pkg_check_modules(PETSc_STATIC QUIET IMPORTED_TARGET GLOBAL petsc) - endif() - set(PKG_CONFIG_EXECUTABLE ${PKG_CONFIG_EXECUTABLE_BACKUP}) - if(NOT PETSc_STATIC_FOUND) - set(${_petsc_target} "" PARENT_SCOPE) - return() - endif() - check_petsc_build("PkgConfig::PETSc_STATIC;${_petsc_deps}" PETSC_TEST_SUCCESS) - if(PETSC_TEST_SUCCESS) - message(STATUS "PETSc test program with static linkage - Success") - set(${_petsc_target} PkgConfig::PETSc_STATIC PARENT_SCOPE) - return() - endif() - - # Not able to build a PETSc test program - message(STATUS "PETSc test program with static linkage - Failed") - set(${_petsc_target} "" PARENT_SCOPE) -endfunction() - -function(check_slepc_build _slepc_target _slepc_test_success) - set(SLEPC_LIB_TEST_DIR ${CMAKE_BINARY_DIR}/CMakeFiles/try_run) - set(SLEPC_LIB_TEST_CPP ${SLEPC_LIB_TEST_DIR}/slepc_test_lib.cpp) - file(WRITE ${SLEPC_LIB_TEST_CPP} -"#include -#include -int main() -{ - EPS eps; - int argc = 0; - char **argv = NULL; - PetscCall(SlepcInitialize(&argc, &argv, PETSC_NULLPTR, PETSC_NULLPTR)); - PetscCall(EPSCreate(PETSC_COMM_SELF, &eps)); - PetscCall(EPSDestroy(&eps)); - PetscCall(SlepcFinalize()); - return 0; -} -") - try_run( - SLEPC_TEST_EXITCODE - SLEPC_TEST_COMPILED - ${SLEPC_LIB_TEST_DIR} - ${SLEPC_LIB_TEST_CPP} - LINK_LIBRARIES ${_slepc_target} - COMPILE_OUTPUT_VARIABLE SLEPC_TEST_COMPILE_OUTPUT - RUN_OUTPUT_VARIABLE SLEPC_TEST_OUTPUT - ) - # message(STATUS 
"SLEPC_TEST_COMPILE_OUTPUT: ${SLEPC_TEST_COMPILE_OUTPUT}") - # message(STATUS "SLEPC_TEST_OUTPUT: ${SLEPC_TEST_OUTPUT}") - if(SLEPC_TEST_COMPILED AND SLEPC_TEST_EXITCODE EQUAL 0) - # message(STATUS "SLEPc test program - Successful") - set(${_slepc_test_success} TRUE PARENT_SCOPE) - else() - # message(STATUS "SLEPc test program - Failed") - set(${_slepc_test_success} FALSE PARENT_SCOPE) - endif() -endfunction() - -function(find_slepc_pkgconfig _slepc_deps _slepc_target) - pkg_check_modules(SLEPc IMPORTED_TARGET GLOBAL SLEPc) - if(NOT SLEPc_FOUND) - pkg_check_modules(SLEPc IMPORTED_TARGET GLOBAL slepc) - endif() - if(NOT SLEPc_FOUND) - set(${_slepc_target} "" PARENT_SCOPE) - return() - endif() - message(STATUS "Found SLEPc: ${SLEPc_VERSION}") - check_slepc_build("PkgConfig::SLEPc;${_slepc_deps}" SLEPC_TEST_SUCCESS) - if(SLEPC_TEST_SUCCESS) - message(STATUS "SLEPc test program - Success") - set(${_slepc_target} PkgConfig::SLEPc PARENT_SCOPE) - return() - endif() - - # Try with --static libraries - message(STATUS "SLEPc test program - Failed") - set(PKG_CONFIG_EXECUTABLE_BACKUP ${PKG_CONFIG_EXECUTABLE}) - list(APPEND PKG_CONFIG_EXECUTABLE "--static") - pkg_check_modules(SLEPc_STATIC QUIET IMPORTED_TARGET GLOBAL SLEPc) - if(NOT SLEPc_STATIC_FOUND) - pkg_check_modules(SLEPc_STATIC QUIET IMPORTED_TARGET GLOBAL slepc) - endif() - set(PKG_CONFIG_EXECUTABLE ${PKG_CONFIG_EXECUTABLE_BACKUP}) - if(NOT SLEPc_STATIC_FOUND) - set(${_slepc_target} "" PARENT_SCOPE) - return() - endif() - check_slepc_build("PkgConfig::SLEPc_STATIC;${_slepc_deps}" SLEPC_TEST_SUCCESS) - if(SLEPC_TEST_SUCCESS) - message(STATUS "SLEPc test program with static linkage - Successful") - set(${_slepc_target} PkgConfig::SLEPc_STATIC PARENT_SCOPE) - return() - endif() - - # Not able to build a SLEPc test program - message(STATUS "SLEPc test program with static linkage - Failed") - set(${_slepc_target} "" PARENT_SCOPE) -endfunction() +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +# +# Helper functions to import and test PETSc and SLEPc targets using pkg-config +# + +if(__pkg_config_helpers) + return() +endif() +set(__pkg_config_helpers YES) + +function(set_if_empty _variable _arg) + if("${${_variable}}" STREQUAL "") + set(${_variable} ${_arg} PARENT_SCOPE) + endif() +endfunction() + +set(_LIBCEED_DIR ${LIBCEED_DIR}) +set(_PETSC_DIR ${PETSC_DIR}) +set(_SLEPC_DIR ${SLEPC_DIR}) +set_if_empty(_LIBCEED_DIR "$ENV{LIBCEED_DIR}") +set_if_empty(_PETSC_DIR "$ENV{PETSC_DIR}") +set_if_empty(_SLEPC_DIR "$ENV{SLEPC_DIR}") +set_if_empty(_SLEPC_DIR "${_PETSC_DIR}") +if(NOT "${_LIBCEED_DIR}" STREQUAL "") + set(ENV{PKG_CONFIG_PATH} "${_LIBCEED_DIR}/lib/pkgconfig:$ENV{PKG_CONFIG_PATH}") + set(ENV{PKG_CONFIG_PATH} "${_LIBCEED_DIR}/lib64/pkgconfig:$ENV{PKG_CONFIG_PATH}") +endif() +if(NOT "${_PETSC_DIR}" STREQUAL "") + set(ENV{PKG_CONFIG_PATH} "${_PETSC_DIR}/lib/pkgconfig:$ENV{PKG_CONFIG_PATH}") + set(ENV{PKG_CONFIG_PATH} "${_PETSC_DIR}/lib64/pkgconfig:$ENV{PKG_CONFIG_PATH}") +endif() +if(NOT "${_SLEPC_DIR}" STREQUAL "" AND NOT _SLEPC_DIR STREQUAL _PETSC_DIR) + set(ENV{PKG_CONFIG_PATH} "${_SLEPC_DIR}/lib/pkgconfig:$ENV{PKG_CONFIG_PATH}") + set(ENV{PKG_CONFIG_PATH} "${_SLEPC_DIR}/lib64/pkgconfig:$ENV{PKG_CONFIG_PATH}") +endif() +find_package(PkgConfig REQUIRED) + +function(check_libceed_build _libceed_target _libceed_test_success) + set(LIBCEED_LIB_TEST_DIR ${CMAKE_BINARY_DIR}/CMakeFiles/try_run) + set(LIBCEED_LIB_TEST_CPP ${LIBCEED_LIB_TEST_DIR}/libceed_test_lib.cpp) + file(WRITE ${LIBCEED_LIB_TEST_CPP} +"#include +#include +int main() +{ + Ceed ceed; + CeedCall(CeedInit(\"/cpu/self\", &ceed)); + CeedCall(CeedDestroy(&ceed)); + return 0; +} +") + try_compile( + LIBCEED_TEST_COMPILED + ${LIBCEED_LIB_TEST_DIR} + ${LIBCEED_LIB_TEST_CPP} + LINK_LIBRARIES ${_libceed_target} + OUTPUT_VARIABLE LIBCEED_TEST_COMPILE_OUTPUT + ) + if(LIBCEED_TEST_COMPILED) + # message(STATUS "libCEED test program - Successful") + set(${_libceed_test_success} TRUE PARENT_SCOPE) + else() + message(STATUS "libCEED test program - Failed") + message(STATUS "LIBCEED_TEST_COMPILE_OUTPUT: ${LIBCEED_TEST_COMPILE_OUTPUT}") + set(${_libceed_test_success} FALSE PARENT_SCOPE) + endif() +endfunction() + +function(find_libceed_pkgconfig _libceed_deps _libceed_target) + pkg_check_modules(libCEED IMPORTED_TARGET GLOBAL ceed) + if(NOT libCEED_FOUND) + set(${_libceed_target} "" PARENT_SCOPE) + return() + endif() + message(STATUS "Found libCEED: ${libCEED_VERSION}") + check_libceed_build("PkgConfig::libCEED;${_libceed_deps}" LIBCEED_TEST_SUCCESS) + if(LIBCEED_TEST_SUCCESS) + message(STATUS "libCEED test program - Successful") + set(${_libceed_target} PkgConfig::libCEED PARENT_SCOPE) + return() + endif() + + # Try with --static libraries + message(STATUS "libCEED test program - Failed") + set(PKG_CONFIG_EXECUTABLE_BACKUP ${PKG_CONFIG_EXECUTABLE}) + list(APPEND PKG_CONFIG_EXECUTABLE "--static") + pkg_check_modules(libCEED_STATIC QUIET IMPORTED_TARGET GLOBAL ceed) + set(PKG_CONFIG_EXECUTABLE ${PKG_CONFIG_EXECUTABLE_BACKUP}) + if(NOT libCEED_STATIC_FOUND) + set(${_libceed_target} "" PARENT_SCOPE) + return() + endif() + check_libceed_build("PkgConfig::libCEED_STATIC;${_libceed_deps}" LIBCEED_TEST_SUCCESS) + if(LIBCEED_TEST_SUCCESS) + message(STATUS "libCEED test program with static linkage - Success") + set(${_libceed_target} PkgConfig::libCEED_STATIC PARENT_SCOPE) + return() + endif() + + # Not able to build a libCEED test program + message(STATUS "libCEED test program with static linkage - 
Failed") + set(${_libceed_target} "" PARENT_SCOPE) +endfunction() + +function(check_petsc_build _petsc_target _petsc_test_success) + set(PETSC_LIB_TEST_DIR ${CMAKE_BINARY_DIR}/CMakeFiles/try_run) + set(PETSC_LIB_TEST_CPP ${PETSC_LIB_TEST_DIR}/petsc_test_lib.cpp) + file(WRITE ${PETSC_LIB_TEST_CPP} +"#include +int main() +{ + TS ts; + int argc = 0; + char **argv = NULL; + PetscCall(PetscInitialize(&argc, &argv, PETSC_NULLPTR, PETSC_NULLPTR)); + PetscCall(TSCreate(PETSC_COMM_WORLD, &ts)); + PetscCall(TSSetFromOptions(ts)); + PetscCall(TSDestroy(&ts)); + PetscCall(PetscFinalize()); + return 0; +} +") + try_run( + PETSC_TEST_EXITCODE + PETSC_TEST_COMPILED + ${PETSC_LIB_TEST_DIR} + ${PETSC_LIB_TEST_CPP} + CMAKE_FLAGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + LINK_LIBRARIES ${_petsc_target} + COMPILE_OUTPUT_VARIABLE PETSC_TEST_COMPILE_OUTPUT + RUN_OUTPUT_VARIABLE PETSC_TEST_OUTPUT + ) + if(PETSC_TEST_COMPILED AND PETSC_TEST_EXITCODE EQUAL 0) + set(${_petsc_test_success} TRUE PARENT_SCOPE) + else() + set(${_petsc_test_success} FALSE PARENT_SCOPE) + endif() +endfunction() + +function(find_petsc_pkgconfig _petsc_deps _petsc_target) + pkg_check_modules(PETSc IMPORTED_TARGET GLOBAL PETSc) + if(NOT PETSc_FOUND) + pkg_check_modules(PETSc IMPORTED_TARGET GLOBAL petsc) + endif() + if(NOT PETSc_FOUND) + set(${_petsc_target} "" PARENT_SCOPE) + return() + endif() + message(STATUS "Found PETSc: ${PETSc_VERSION}") + check_petsc_build("PkgConfig::PETSc;${_petsc_deps}" PETSC_TEST_SUCCESS) + if(PETSC_TEST_SUCCESS) + message(STATUS "PETSc test program - Successful") + set(${_petsc_target} PkgConfig::PETSc PARENT_SCOPE) + return() + endif() + + # Try with --static libraries + message(STATUS "PETSc test program - Failed") + set(PKG_CONFIG_EXECUTABLE_BACKUP ${PKG_CONFIG_EXECUTABLE}) + list(APPEND PKG_CONFIG_EXECUTABLE "--static") + pkg_check_modules(PETSc_STATIC QUIET IMPORTED_TARGET GLOBAL PETSc) + if(NOT PETSc_STATIC_FOUND) + pkg_check_modules(PETSc_STATIC QUIET IMPORTED_TARGET GLOBAL petsc) + endif() + set(PKG_CONFIG_EXECUTABLE ${PKG_CONFIG_EXECUTABLE_BACKUP}) + if(NOT PETSc_STATIC_FOUND) + set(${_petsc_target} "" PARENT_SCOPE) + return() + endif() + check_petsc_build("PkgConfig::PETSc_STATIC;${_petsc_deps}" PETSC_TEST_SUCCESS) + if(PETSC_TEST_SUCCESS) + message(STATUS "PETSc test program with static linkage - Success") + set(${_petsc_target} PkgConfig::PETSc_STATIC PARENT_SCOPE) + return() + endif() + + # Not able to build a PETSc test program + message(STATUS "PETSc test program with static linkage - Failed") + set(${_petsc_target} "" PARENT_SCOPE) +endfunction() + +function(check_slepc_build _slepc_target _slepc_test_success) + set(SLEPC_LIB_TEST_DIR ${CMAKE_BINARY_DIR}/CMakeFiles/try_run) + set(SLEPC_LIB_TEST_CPP ${SLEPC_LIB_TEST_DIR}/slepc_test_lib.cpp) + file(WRITE ${SLEPC_LIB_TEST_CPP} +"#include +#include +int main() +{ + EPS eps; + int argc = 0; + char **argv = NULL; + PetscCall(SlepcInitialize(&argc, &argv, PETSC_NULLPTR, PETSC_NULLPTR)); + PetscCall(EPSCreate(PETSC_COMM_SELF, &eps)); + PetscCall(EPSDestroy(&eps)); + PetscCall(SlepcFinalize()); + return 0; +} +") + try_run( + SLEPC_TEST_EXITCODE + SLEPC_TEST_COMPILED + ${SLEPC_LIB_TEST_DIR} + ${SLEPC_LIB_TEST_CPP} + LINK_LIBRARIES ${_slepc_target} + COMPILE_OUTPUT_VARIABLE SLEPC_TEST_COMPILE_OUTPUT + RUN_OUTPUT_VARIABLE SLEPC_TEST_OUTPUT + ) + # message(STATUS "SLEPC_TEST_COMPILE_OUTPUT: ${SLEPC_TEST_COMPILE_OUTPUT}") + # message(STATUS "SLEPC_TEST_OUTPUT: ${SLEPC_TEST_OUTPUT}") + if(SLEPC_TEST_COMPILED AND 
SLEPC_TEST_EXITCODE EQUAL 0) + # message(STATUS "SLEPc test program - Successful") + set(${_slepc_test_success} TRUE PARENT_SCOPE) + else() + # message(STATUS "SLEPc test program - Failed") + set(${_slepc_test_success} FALSE PARENT_SCOPE) + endif() +endfunction() + +function(find_slepc_pkgconfig _slepc_deps _slepc_target) + pkg_check_modules(SLEPc IMPORTED_TARGET GLOBAL SLEPc) + if(NOT SLEPc_FOUND) + pkg_check_modules(SLEPc IMPORTED_TARGET GLOBAL slepc) + endif() + if(NOT SLEPc_FOUND) + set(${_slepc_target} "" PARENT_SCOPE) + return() + endif() + message(STATUS "Found SLEPc: ${SLEPc_VERSION}") + check_slepc_build("PkgConfig::SLEPc;${_slepc_deps}" SLEPC_TEST_SUCCESS) + if(SLEPC_TEST_SUCCESS) + message(STATUS "SLEPc test program - Success") + set(${_slepc_target} PkgConfig::SLEPc PARENT_SCOPE) + return() + endif() + + # Try with --static libraries + message(STATUS "SLEPc test program - Failed") + set(PKG_CONFIG_EXECUTABLE_BACKUP ${PKG_CONFIG_EXECUTABLE}) + list(APPEND PKG_CONFIG_EXECUTABLE "--static") + pkg_check_modules(SLEPc_STATIC QUIET IMPORTED_TARGET GLOBAL SLEPc) + if(NOT SLEPc_STATIC_FOUND) + pkg_check_modules(SLEPc_STATIC QUIET IMPORTED_TARGET GLOBAL slepc) + endif() + set(PKG_CONFIG_EXECUTABLE ${PKG_CONFIG_EXECUTABLE_BACKUP}) + if(NOT SLEPc_STATIC_FOUND) + set(${_slepc_target} "" PARENT_SCOPE) + return() + endif() + check_slepc_build("PkgConfig::SLEPc_STATIC;${_slepc_deps}" SLEPC_TEST_SUCCESS) + if(SLEPC_TEST_SUCCESS) + message(STATUS "SLEPc test program with static linkage - Successful") + set(${_slepc_target} PkgConfig::SLEPc_STATIC PARENT_SCOPE) + return() + endif() + + # Not able to build a SLEPc test program + message(STATUS "SLEPc test program with static linkage - Failed") + set(${_slepc_target} "" PARENT_SCOPE) +endfunction() diff --git a/palace/cmake/StaticAnalysisHelpers.cmake b/palace/cmake/StaticAnalysisHelpers.cmake index 61f0f1da0e..cd94d76246 100644 --- a/palace/cmake/StaticAnalysisHelpers.cmake +++ b/palace/cmake/StaticAnalysisHelpers.cmake @@ -1,90 +1,90 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -# -# Helper functions configure static source code analysis with clang-tidy or cppcheck -# - -if(__static_analysis_helpers) - return() -endif() -set(__static_analysis_helpers YES) - -function(configure_clang_tidy) - find_program(CLANG_TIDY_EXE - NAMES clang-tidy - ) - if(CLANG_TIDY_EXE) - # If not explicitly specified, clang-tidy will recurse parent folders and find closest - # .clang-tidy file (for explicit path specification, add "--config-file= - # ${CMAKE_SOURCE_DIR}/../.clang-tidy") - message(STATUS "Found clang-tidy for static analysis: ${CLANG_TIDY_EXE}") - set(CLANG_TIDY_COMMAND "${CLANG_TIDY_EXE}") - - # Try to extract MPI compiler wrapper include paths from compile command line if not - # found already (clang-tidy will error about not finding mpi.h otherwise) - if(MPI_FOUND) - if(NOT MPI_CXX_INCLUDE_DIRS) - execute_process( - COMMAND ${MPI_CXX_COMPILER} -show - OUTPUT_VARIABLE MPI_COMPILE_CMDLINE OUTPUT_STRIP_TRAILING_WHITESPACE - ERROR_VARIABLE MPI_COMPILE_CMDLINE ERROR_STRIP_TRAILING_WHITESPACE - RESULT_VARIABLE MPI_COMPILER_RETURN - ) - if(MPI_COMPILER_RETURN EQUAL 0) - string(REGEX - MATCHALL "(^| )-I([^\" ]+|\"[^\"]+\")" - MPI_ALL_INCLUDE_PATHS - "${MPI_COMPILE_CMDLINE}" - ) - foreach(IPATH IN LISTS MPI_ALL_INCLUDE_PATHS) - string(REGEX REPLACE "^ ?-I" "" IPATH ${IPATH}) - string(REGEX REPLACE "//" "/" IPATH ${IPATH}) - list(APPEND MPI_CXX_INCLUDE_DIRS ${IPATH}) - endforeach() - endif() - endif() - if(MPI_CXX_INCLUDE_DIRS) - set(CLANG_TIDY_EXTRA_ARG) - foreach(INCLUDE_DIR IN LISTS MPI_CXX_INCLUDE_DIRS) - set(CLANG_TIDY_EXTRA_ARG "${CLANG_TIDY_EXTRA_ARG} -I${INCLUDE_DIR}") - endforeach() - string(STRIP "${CLANG_TIDY_EXTRA_ARG}" CLANG_TIDY_EXTRA_ARG) - list(APPEND CLANG_TIDY_COMMAND - "-extra-arg=${CLANG_TIDY_EXTRA_ARG}" - ) - endif() - endif() - set(CMAKE_CXX_CLANG_TIDY "${CLANG_TIDY_COMMAND}" CACHE STRING "" FORCE) - else() - message(WARNING "Static analysis with clang-tidy requested, but skipped because the \ -executable clang-tidy was not found") - endif() -endfunction() - -function(configure_cppcheck) - find_program(CPPCHECK_EXE - NAMES cppcheck - ) - if(CPPCHECK_EXE) - message(STATUS "Found cppcheck for static analysis: ${CPPCHECK_EXE}") - file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/CMakeFiles/cppcheck) - execute_process( - COMMAND ${CMAKE_COMMAND} -E echo "*:${CMAKE_BINARY_DIR}/_deps/*" - OUTPUT_FILE ${CMAKE_BINARY_DIR}/CMakeFiles/cppcheck/suppressions.txt - ) - set(CPPCHECK_COMMAND - "${CPPCHECK_EXE}" - "--quiet" - "--force" - "--template=gcc" - "--std=c++17" - "--enable=warning,style,performance,portability" - "--suppressions-list=${CMAKE_BINARY_DIR}/CMakeFiles/cppcheck/suppressions.txt" - ) - set(CMAKE_CXX_CPPCHECK "${CPPCHECK_COMMAND}" CACHE STRING "" FORCE) - else() - message(WARNING "Static analysis with cppcheck requested, but skipped because the \ -executable cppcheck was not found") - endif() -endfunction() +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +# +# Helper functions configure static source code analysis with clang-tidy or cppcheck +# + +if(__static_analysis_helpers) + return() +endif() +set(__static_analysis_helpers YES) + +function(configure_clang_tidy) + find_program(CLANG_TIDY_EXE + NAMES clang-tidy + ) + if(CLANG_TIDY_EXE) + # If not explicitly specified, clang-tidy will recurse parent folders and find closest + # .clang-tidy file (for explicit path specification, add "--config-file= + # ${CMAKE_SOURCE_DIR}/../.clang-tidy") + message(STATUS "Found clang-tidy for static analysis: ${CLANG_TIDY_EXE}") + set(CLANG_TIDY_COMMAND "${CLANG_TIDY_EXE}") + + # Try to extract MPI compiler wrapper include paths from compile command line if not + # found already (clang-tidy will error about not finding mpi.h otherwise) + if(MPI_FOUND) + if(NOT MPI_CXX_INCLUDE_DIRS) + execute_process( + COMMAND ${MPI_CXX_COMPILER} -show + OUTPUT_VARIABLE MPI_COMPILE_CMDLINE OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_VARIABLE MPI_COMPILE_CMDLINE ERROR_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE MPI_COMPILER_RETURN + ) + if(MPI_COMPILER_RETURN EQUAL 0) + string(REGEX + MATCHALL "(^| )-I([^\" ]+|\"[^\"]+\")" + MPI_ALL_INCLUDE_PATHS + "${MPI_COMPILE_CMDLINE}" + ) + foreach(IPATH IN LISTS MPI_ALL_INCLUDE_PATHS) + string(REGEX REPLACE "^ ?-I" "" IPATH ${IPATH}) + string(REGEX REPLACE "//" "/" IPATH ${IPATH}) + list(APPEND MPI_CXX_INCLUDE_DIRS ${IPATH}) + endforeach() + endif() + endif() + if(MPI_CXX_INCLUDE_DIRS) + set(CLANG_TIDY_EXTRA_ARG) + foreach(INCLUDE_DIR IN LISTS MPI_CXX_INCLUDE_DIRS) + set(CLANG_TIDY_EXTRA_ARG "${CLANG_TIDY_EXTRA_ARG} -I${INCLUDE_DIR}") + endforeach() + string(STRIP "${CLANG_TIDY_EXTRA_ARG}" CLANG_TIDY_EXTRA_ARG) + list(APPEND CLANG_TIDY_COMMAND + "-extra-arg=${CLANG_TIDY_EXTRA_ARG}" + ) + endif() + endif() + set(CMAKE_CXX_CLANG_TIDY "${CLANG_TIDY_COMMAND}" CACHE STRING "" FORCE) + else() + message(WARNING "Static analysis with clang-tidy requested, but skipped because the \ +executable clang-tidy was not found") + endif() +endfunction() + +function(configure_cppcheck) + find_program(CPPCHECK_EXE + NAMES cppcheck + ) + if(CPPCHECK_EXE) + message(STATUS "Found cppcheck for static analysis: ${CPPCHECK_EXE}") + file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/CMakeFiles/cppcheck) + execute_process( + COMMAND ${CMAKE_COMMAND} -E echo "*:${CMAKE_BINARY_DIR}/_deps/*" + OUTPUT_FILE ${CMAKE_BINARY_DIR}/CMakeFiles/cppcheck/suppressions.txt + ) + set(CPPCHECK_COMMAND + "${CPPCHECK_EXE}" + "--quiet" + "--force" + "--template=gcc" + "--std=c++17" + "--enable=warning,style,performance,portability" + "--suppressions-list=${CMAKE_BINARY_DIR}/CMakeFiles/cppcheck/suppressions.txt" + ) + set(CMAKE_CXX_CPPCHECK "${CPPCHECK_COMMAND}" CACHE STRING "" FORCE) + else() + message(WARNING "Static analysis with cppcheck requested, but skipped because the \ +executable cppcheck was not found") + endif() +endfunction() diff --git a/palace/drivers/CMakeLists.txt b/palace/drivers/CMakeLists.txt index 6489d56650..4e38e3600c 100644 --- a/palace/drivers/CMakeLists.txt +++ b/palace/drivers/CMakeLists.txt @@ -1,16 +1,16 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -# -# Add source files and subdirectories. 
-# - -target_sources(${LIB_TARGET_NAME} - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/basesolver.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/drivensolver.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/eigensolver.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/electrostaticsolver.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/magnetostaticsolver.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/transientsolver.cpp -) +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + +# +# Add source files and subdirectories. +# + +target_sources(${LIB_TARGET_NAME} + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/basesolver.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/drivensolver.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/eigensolver.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/electrostaticsolver.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/magnetostaticsolver.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/transientsolver.cpp +) diff --git a/palace/drivers/basesolver.cpp b/palace/drivers/basesolver.cpp index c459137cd2..43294e007a 100644 --- a/palace/drivers/basesolver.cpp +++ b/palace/drivers/basesolver.cpp @@ -1,768 +1,303 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "basesolver.hpp" - -#include -#include -#include -#include -#include -#include "drivers/transientsolver.hpp" -#include "fem/errorindicator.hpp" -#include "fem/fespace.hpp" -#include "linalg/ksp.hpp" -#include "models/domainpostoperator.hpp" -#include "models/postoperator.hpp" -#include "models/surfacepostoperator.hpp" -#include "utils/communication.hpp" -#include "utils/dorfler.hpp" -#include "utils/filesystem.hpp" -#include "utils/geodata.hpp" -#include "utils/iodata.hpp" -#include "utils/timer.hpp" - -namespace palace -{ - -using json = nlohmann::json; - -namespace -{ - -std::string GetPostDir(const std::string &output) -{ - return (output.length() > 0 && output.back() != '/') ? output + '/' : output; -} - -std::string GetIterationPostDir(const std::string &output, int step, int width) -{ - return fmt::format("{}adapt{:0{}d}/", output, step, width); -} - -void SaveIteration(MPI_Comm comm, const std::string &output, int step, int width) -{ - namespace fs = std::filesystem; - BlockTimer bt(Timer::IO); - Mpi::Barrier(comm); // Wait for all processes to write postprocessing files - if (Mpi::Root(comm)) - { - // Create a subfolder for the results of this adaptation. - const std::string step_output = GetIterationPostDir(output, step, width); - if (!fs::exists(step_output)) - { - fs::create_directories(step_output); - } - constexpr auto options = - fs::copy_options::recursive | fs::copy_options::overwrite_existing; - for (const auto &f : fs::directory_iterator(output)) - { - if (f.path().filename().string().rfind("adapt") == 0) - { - continue; - } - fs::copy(f, step_output + f.path().filename().string(), options); - } - } - Mpi::Barrier(comm); -} - -json LoadMetadata(const std::string &post_dir) -{ - std::string path = post_dir + "palace.json"; - std::ifstream fi(path); - if (!fi.is_open()) - { - MFEM_ABORT("Unable to open metadata file \"" << path << "\"!"); - } - return json::parse(fi); -} - -void WriteMetadata(const std::string &post_dir, const json &meta) -{ - std::string path = post_dir + "palace.json"; - std::ofstream fo(path); - if (!fo.is_open()) - { - MFEM_ABORT("Unable to open metadata file \"" << path << "\"!"); - } - fo << meta.dump(2) << '\n'; -} - -// Returns an array of indices corresponding to marked elements. 
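// The threshold is produced by utils::ComputeDorflerThreshold in the AMR loop below,
// which picks the cutoff value so that the marked subset accounts for roughly the
// requested fraction (θ, the refinement update_fraction) of the total estimated error.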
-mfem::Array MarkedElements(const Vector &e, double threshold) -{ - mfem::Array ind; - ind.Reserve(e.Size()); - for (int i = 0; i < e.Size(); i++) - { - if (e[i] >= threshold) - { - ind.Append(i); - } - } - return ind; -} - -} // namespace - -BaseSolver::BaseSolver(const IoData &iodata, bool root, int size, int num_thread, - const char *git_tag) - : iodata(iodata), post_dir(GetPostDir(iodata.problem.output)), root(root), table(8, 9, 9) -{ - // Create directory for output. - if (root && !std::filesystem::exists(post_dir)) - { - std::filesystem::create_directories(post_dir); - } - - // Initialize simulation metadata for this simulation. - if (root && post_dir.length() > 0) - { - json meta; - if (git_tag) - { - meta["GitTag"] = std::string(git_tag); - } - if (size > 0) - { - meta["Problem"]["MPISize"] = size; - } - if (num_thread > 0) - { - meta["Problem"]["OpenMPThreads"] = num_thread; - } - WriteMetadata(post_dir, meta); - } -} - -void BaseSolver::SolveEstimateMarkRefine( - std::vector> &mesh) const -{ - const auto &refinement = iodata.model.refinement; - const bool use_amr = [&]() - { - if (dynamic_cast(this) != nullptr) - { - Mpi::Warning("AMR is not currently supported for transient simulations!\n"); - return false; - } - return refinement.max_it > 0; - }(); - if (use_amr && mesh.size() > 1) - { - Mpi::Print("\nFlattening mesh sequence:\n AMR will start from the final mesh in " - "the sequence of a priori refinements\n"); - mesh.erase(mesh.begin(), mesh.end() - 1); - constexpr bool refine = true, fix_orientation = true; - mesh.back()->Finalize(refine, fix_orientation); - } - - // Perform initial solve and estimation. - MPI_Comm comm = mesh.back()->GetComm(); - auto [indicators, ntdof] = Solve(mesh); - double err = indicators.Norml2(comm); - - // Collection of all tests that might exhaust resources. - auto ExhaustedResources = [&refinement](auto it, auto ntdof) - { - bool ret = false; - // Run out of iterations. - ret |= (it >= refinement.max_it); - // Run out of DOFs if a limit was set. - ret |= (refinement.max_size >= 1 && ntdof > refinement.max_size); - return ret; - }; - - // Main AMR loop. - int it = 0; - while (!ExhaustedResources(it, ntdof) && err >= refinement.tol) - { - BlockTimer bt(Timer::ADAPTATION); - Mpi::Print("\nAdaptive mesh refinement (AMR) iteration {:d}:\n" - " Indicator norm = {:.3e}, size = {:d}\n" - " Maximum iterations = {:d}, tol. = {:.3e}{}\n", - ++it, err, ntdof, refinement.max_it, refinement.tol, - (refinement.max_size > 0 - ? ", maximum size = " + std::to_string(refinement.max_size) - : "")); - - // Optionally save off the previous solution. - if (refinement.save_adapt_iterations) - { - SaveIteration(comm, post_dir, it, - 1 + static_cast(std::log10(refinement.max_it))); - } - - // Mark. - const auto [threshold, marked_error] = utils::ComputeDorflerThreshold( - comm, indicators.Local(), refinement.update_fraction); - const auto marked_elements = MarkedElements(indicators.Local(), threshold); - const auto [glob_marked_elements, glob_elements] = - linalg::GlobalSize2(comm, marked_elements, indicators.Local()); - Mpi::Print( - " Marked {:d}/{:d} elements for refinement ({:.2f}% of the error, θ = {:.2f})\n", - glob_marked_elements, glob_elements, 100 * marked_error, - refinement.update_fraction); - - // Refine. 
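// In MFEM, a second argument of -1 to GeneralRefinement() selects the refinement type
// (conforming vs. nonconforming) automatically, and max_nc_levels caps the level of
// hanging-node nonconformity (0 leaves it unlimited).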
- const auto initial_elem_count = mesh.back()->GetGlobalNE(); - mesh.back()->GeneralRefinement(marked_elements, -1, refinement.max_nc_levels); - const auto final_elem_count = mesh.back()->GetGlobalNE(); - Mpi::Print(" Mesh refinement added {:d} elements (initial: {}, final: {})\n", - final_elem_count - initial_elem_count, initial_elem_count, final_elem_count); - - // Optionally rebalance and write the adapted mesh to file. - const auto ratio_pre = - mesh::RebalanceMesh(iodata, mesh.back(), refinement.maximum_imbalance); - if (ratio_pre > refinement.maximum_imbalance) - { - int min_elem, max_elem; - min_elem = max_elem = mesh.back()->GetNE(); - Mpi::GlobalMin(1, &min_elem, comm); - Mpi::GlobalMax(1, &max_elem, comm); - const auto ratio_post = double(max_elem) / min_elem; - Mpi::Print(" Rebalanced mesh: Ratio {:.3f} exceeded maximum allowed value {:.3f} " - "(new ratio = {:.3f})\n", - ratio_pre, refinement.maximum_imbalance, ratio_post); - } - - // Solve + estimate. - Mpi::Print("\nProceeding with solve/estimate iteration {}...\n", 1 + it); - std::tie(indicators, ntdof) = Solve(mesh); - err = indicators.Norml2(comm); - } - Mpi::Print("\nCompleted {:d} iteration{} of adaptive mesh refinement (AMR):\n" - " Indicator norm = {:.3e}, size = {:d}\n" - " Maximum iterations = {:d}, tol. = {:.3e}{}\n", - it, (it == 1 ? "" : "s"), err, ntdof, refinement.max_it, refinement.tol, - (refinement.max_size > 0 - ? ", maximum size = " + std::to_string(refinement.max_size) - : "")); -} - -void BaseSolver::SaveMetadata(const FiniteElementSpaceHierarchy &fespaces) const -{ - if (post_dir.length() == 0) - { - return; - } - const auto &fespace = fespaces.GetFinestFESpace(); - HYPRE_BigInt ne = fespace.GetParMesh()->GetNE(); - Mpi::GlobalSum(1, &ne, fespace.GetComm()); - std::vector ndofs(fespaces.GetNumLevels()); - for (std::size_t l = 0; l < fespaces.GetNumLevels(); l++) - { - ndofs[l] = fespaces.GetFESpaceAtLevel(l).GlobalTrueVSize(); - } - if (root) - { - json meta = LoadMetadata(post_dir); - meta["Problem"]["MeshElements"] = ne; - meta["Problem"]["DegreesOfFreedom"] = ndofs.back(); - meta["Problem"]["MultigridDegreesOfFreedom"] = ndofs; - WriteMetadata(post_dir, meta); - } -} - -template -void BaseSolver::SaveMetadata(const SolverType &ksp) const -{ - if (post_dir.length() == 0) - { - return; - } - if (root) - { - json meta = LoadMetadata(post_dir); - meta["LinearSolver"]["TotalSolves"] = ksp.NumTotalMult(); - meta["LinearSolver"]["TotalIts"] = ksp.NumTotalMultIterations(); - WriteMetadata(post_dir, meta); - } -} - -void BaseSolver::SaveMetadata(const Timer &timer) const -{ - if (post_dir.length() == 0) - { - return; - } - if (root) - { - json meta = LoadMetadata(post_dir); - for (int i = Timer::INIT; i < Timer::NUMTIMINGS; i++) - { - auto key = Timer::descriptions[i]; - key.erase(std::remove_if(key.begin(), key.end(), isspace), key.end()); - meta["ElapsedTime"]["Durations"][key] = timer.Data((Timer::Index)i); - meta["ElapsedTime"]["Counts"][key] = timer.Counts((Timer::Index)i); - } - WriteMetadata(post_dir, meta); - } -} - -namespace -{ - -struct EpsData -{ - const int idx; // Domain or interface index - const double pl; // Participation ratio - const double Ql; // Quality factor -}; - -struct CapData -{ - const int idx; // Surface index - const double Cij; // Capacitance (integrated charge) -}; - -struct IndData -{ - const int idx; // Surface index - const double Mij; // Inductance (integrated flux) -}; - -struct ProbeData -{ - const int idx; // Probe index - const std::complex Fx, Fy, Fz; // Field values at 
probe location -}; - -} // namespace - -void BaseSolver::PostprocessDomains(const PostOperator &postop, const std::string &name, - int step, double time, double E_elec, double E_mag, - double E_cap, double E_ind) const -{ - // If domains have been specified for postprocessing, compute the corresponding values - // and write out to disk. - if (post_dir.length() == 0) - { - return; - } - - // Write the field and lumped element energies. - if (root) - { - std::string path = post_dir + "domain-E.csv"; - auto output = OutputFile(path, (step > 0)); - if (step == 0) - { - // clang-format off - output.print("{:>{}s},{:>{}s},{:>{}s},{:>{}s},{:>{}s}\n", - name, table.w1, - "E_elec (J)", table.w, - "E_mag (J)", table.w, - "E_cap (J)", table.w, - "E_ind (J)", table.w); - // clang-format on - } - // clang-format off - output.print("{:{}.{}e},{:+{}.{}e},{:+{}.{}e},{:+{}.{}e},{:+{}.{}e}\n", - time, table.w1, table.p1, - iodata.DimensionalizeValue(IoData::ValueType::ENERGY, E_elec), - table.w, table.p, - iodata.DimensionalizeValue(IoData::ValueType::ENERGY, E_mag), - table.w, table.p, - iodata.DimensionalizeValue(IoData::ValueType::ENERGY, E_cap), - table.w, table.p, - iodata.DimensionalizeValue(IoData::ValueType::ENERGY, E_ind), - table.w, table.p); - // clang-format on - } - - // Write the Q-factors due to bulk dielectric loss. - std::vector eps_data; - eps_data.reserve(postop.GetDomainPostOp().GetEps().size()); - for (const auto &[idx, data] : postop.GetDomainPostOp().GetEps()) - { - const double pl = postop.GetBulkParticipation(idx, E_elec + E_cap); - const double Ql = postop.GetBulkQualityFactor(idx, E_elec + E_cap); - eps_data.push_back({idx, pl, Ql}); - } - if (root && !eps_data.empty()) - { - std::string path = post_dir + "domain-Q.csv"; - auto output = OutputFile(path, (step > 0)); - if (step == 0) - { - output.print("{:>{}s},", name, table.w1); - for (const auto &data : eps_data) - { - // clang-format off - output.print("{:>{}s},{:>{}s}{}", - "p_bulk[" + std::to_string(data.idx) + "]", table.w, - "Q_bulk[" + std::to_string(data.idx) + "]", table.w, - (data.idx == eps_data.back().idx) ? "" : ","); - // clang-format on - } - output.print("\n"); - } - output.print("{:{}.{}e},", time, table.w1, table.p1); - for (const auto &data : eps_data) - { - // clang-format off - output.print("{:+{}.{}e},{:+{}.{}e}{}", - data.pl, table.w, table.p, - data.Ql, table.w, table.p, - (data.idx == eps_data.back().idx) ? "" : ","); - // clang-format on - } - output.print("\n"); - } -} - -void BaseSolver::PostprocessSurfaces(const PostOperator &postop, const std::string &name, - int step, double time, double E_elec, double E_mag, - double Vinc, double Iinc) const -{ - // If surfaces have been specified for postprocessing, compute the corresponding values - // and write out to disk. This output uses the complex magnitude of the computed charge - // and flux for frequency domain simulations. For capacitance/inductance, use the - // excitation voltage or current across all sources and excited ports. The passed in - // E_elec is the sum of the E-field and lumped capacitor energies, and E_mag is the same - // for the B-field and lumped inductors. - if (post_dir.length() == 0) - { - return; - } - - // Write the Q-factors due to interface dielectric loss. 
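// For each interface, the participation ratio p and the interface loss tangent tan(δ)
// combine as Q = 1 / (tan(δ) * p); when either factor vanishes the interface contributes
// no loss and Q is reported as infinite.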
- std::vector eps_data; - eps_data.reserve(postop.GetSurfacePostOp().GetEps().size()); - for (const auto &[idx, data] : postop.GetSurfacePostOp().GetEps()) - { - const double pl = postop.GetInterfaceParticipation(idx, E_elec); - const double tandelta = postop.GetSurfacePostOp().GetInterfaceLossTangent(idx); - const double Ql = - (pl == 0.0 || tandelta == 0.0) ? mfem::infinity() : 1.0 / (tandelta * pl); - eps_data.push_back({idx, pl, Ql}); - } - if (root && !eps_data.empty()) - { - std::string path = post_dir + "surface-Q.csv"; - auto output = OutputFile(path, (step > 0)); - if (step == 0) - { - output.print("{:>{}s},", name, table.w1); - for (const auto &data : eps_data) - { - // clang-format off - output.print("{:>{}s},{:>{}s}{}", - "p_surf[" + std::to_string(data.idx) + "]", table.w, - "Q_surf[" + std::to_string(data.idx) + "]", table.w, - (data.idx == eps_data.back().idx) ? "" : ","); - // clang-format on - } - output.print("\n"); - } - output.print("{:{}.{}e},", time, table.w1, table.p1); - for (const auto &data : eps_data) - { - // clang-format off - output.print("{:+{}.{}e},{:+{}.{}e}{}", - data.pl, table.w, table.p, - data.Ql, table.w, table.p, - (data.idx == eps_data.back().idx) ? "" : ","); - // clang-format on - } - output.print("\n"); - } - - // Write the surface capacitance (integrated charge). - std::vector cap_data; - cap_data.reserve(postop.GetSurfacePostOp().GetCap().size()); - for (const auto &[idx, data] : postop.GetSurfacePostOp().GetCap()) - { - const double Cij = (std::abs(Vinc) > 0.0) ? postop.GetSurfaceCharge(idx) / Vinc : 0.0; - cap_data.push_back( - {idx, iodata.DimensionalizeValue(IoData::ValueType::CAPACITANCE, Cij)}); - } - if (root && !cap_data.empty()) - { - std::string path = post_dir + "surface-C.csv"; - auto output = OutputFile(path, (step > 0)); - if (step == 0) - { - output.print("{:>{}s},", name, table.w1); - for (const auto &data : cap_data) - { - // clang-format off - output.print("{:>{}s}{}", - "C[" + std::to_string(data.idx) + "] (F)", table.w, - (data.idx == cap_data.back().idx) ? "" : ","); - // clang-format on - } - output.print("\n"); - } - output.print("{:{}.{}e},", time, table.w1, table.p1); - for (const auto &data : cap_data) - { - // clang-format off - output.print("{:+{}.{}e}{}", - data.Cij, table.w, table.p, - (data.idx == cap_data.back().idx) ? "" : ","); - // clang-format on - } - output.print("\n"); - } - - // Write the surface inductance (integrated flux). - std::vector ind_data; - ind_data.reserve(postop.GetSurfacePostOp().GetInd().size()); - for (const auto &[idx, data] : postop.GetSurfacePostOp().GetInd()) - { - const double Mij = (std::abs(Iinc) > 0.0) ? postop.GetSurfaceFlux(idx) / Iinc : 0.0; - ind_data.push_back( - {idx, iodata.DimensionalizeValue(IoData::ValueType::INDUCTANCE, Mij)}); - } - if (root && !ind_data.empty()) - { - std::string path = post_dir + "surface-M.csv"; - auto output = OutputFile(path, (step > 0)); - if (step == 0) - { - output.print("{:>{}s},", name, table.w1); - for (const auto &data : ind_data) - { - // clang-format off - output.print("{:>{}s}{}", - "M[" + std::to_string(data.idx) + "] (H)", table.w, - (data.idx == ind_data.back().idx) ? "" : ","); - // clang-format on - } - output.print("\n"); - } - output.print("{:{}.{}e},", time, table.w1, table.p1); - for (const auto &data : ind_data) - { - // clang-format off - output.print("{:+{}.{}e}{}", - data.Mij, table.w, table.p, - (data.idx == ind_data.back().idx) ? 
"" : ","); - // clang-format on - } - output.print("\n"); - } -} - -void BaseSolver::PostprocessProbes(const PostOperator &postop, const std::string &name, - int step, double time) const -{ -#if defined(MFEM_USE_GSLIB) - // If probe locations have been specified for postprocessing, compute the corresponding - // field values and write out to disk. - if (post_dir.length() == 0) - { - return; - } - - // Write the computed field values at probe locations. - if (postop.GetProbes().size() == 0) - { - return; - } - const bool has_imaginary = postop.HasImaginary(); - for (int f = 0; f < 2; f++) - { - // Probe data is ordered as [Fx1, Fy1, Fz1, Fx2, Fy2, Fz2, ...]. - if (f == 0 && !postop.HasE()) - { - continue; - } - if (f == 1 && !postop.HasB()) - { - continue; - } - std::vector probe_data; - probe_data.reserve(postop.GetProbes().size()); - const std::vector> vF = - (f == 0) ? postop.ProbeEField() : postop.ProbeBField(); - const int dim = vF.size() / postop.GetProbes().size(); - int i = 0; - for (const auto &idx : postop.GetProbes()) - { - probe_data.push_back( - {idx, vF[i * dim], vF[i * dim + 1], (dim == 3) ? vF[i * dim + 2] : 0.0}); - i++; - } - const std::string F = (f == 0) ? "E" : "B"; - const std::string unit = (f == 0) ? "(V/m)" : "(Wb/m²)"; - const auto type = (f == 0) ? IoData::ValueType::FIELD_E : IoData::ValueType::FIELD_B; - if (root && !probe_data.empty()) - { - std::string path = post_dir + "probe-" + F + ".csv"; - auto output = OutputFile(path, (step > 0)); - if (step == 0) - { - output.print("{:>{}s},", name, table.w1); - if (has_imaginary) - { - for (const auto &data : probe_data) - { - // clang-format off - output.print("{:>{}s},{:>{}s},{:>{}s},{:>{}s}", - "Re{" + F + "_x[" + std::to_string(data.idx) + "]} " + unit, table.w, - "Im{" + F + "_x[" + std::to_string(data.idx) + "]} " + unit, table.w, - "Re{" + F + "_y[" + std::to_string(data.idx) + "]} " + unit, table.w, - "Im{" + F + "_y[" + std::to_string(data.idx) + "]} " + unit, table.w); - // clang-format on - if (dim == 3) - { - // clang-format off - output.print(",{:>{}s},{:>{}s}{}", - "Re{" + F + "_z[" + std::to_string(data.idx) + "]} " + unit, table.w, - "Im{" + F + "_z[" + std::to_string(data.idx) + "]} " + unit, table.w, - (data.idx == probe_data.back().idx) ? "" : ","); - // clang-format on - } - else - { - // clang-format off - output.print("{}", - (data.idx == probe_data.back().idx) ? "" : ","); - // clang-format on - } - } - } - else - { - for (const auto &data : probe_data) - { - // clang-format off - output.print("{:>{}s},{:>{}s}", - F + "_x[" + std::to_string(data.idx) + "] " + unit, table.w, - F + "_y[" + std::to_string(data.idx) + "] " + unit, table.w); - // clang-format on - if (dim == 3) - { - // clang-format off - output.print(",{:>{}s}{}", - F + "_z[" + std::to_string(data.idx) + "] " + unit, table.w, - (data.idx == probe_data.back().idx) ? "" : ","); - // clang-format on - } - else - { - // clang-format off - output.print("{}", - (data.idx == probe_data.back().idx) ? 
"" : ","); - // clang-format on - } - } - } - output.print("\n"); - } - output.print("{:{}.{}e},", time, table.w1, table.p1); - if (has_imaginary) - { - for (const auto &data : probe_data) - { - // clang-format off - output.print("{:+{}.{}e},{:+{}.{}e},{:+{}.{}e},{:+{}.{}e}", - iodata.DimensionalizeValue(type, data.Fx.real()), table.w, table.p, - iodata.DimensionalizeValue(type, data.Fx.imag()), table.w, table.p, - iodata.DimensionalizeValue(type, data.Fy.real()), table.w, table.p, - iodata.DimensionalizeValue(type, data.Fy.imag()), table.w, table.p); - // clang-format on - if (dim == 3) - { - // clang-format off - output.print(",{:+{}.{}e},{:+{}.{}e}{}", - iodata.DimensionalizeValue(type, data.Fz.real()), table.w, table.p, - iodata.DimensionalizeValue(type, data.Fz.imag()), table.w, table.p, - (data.idx == probe_data.back().idx) ? "" : ","); - // clang-format on - } - else - { - // clang-format off - output.print("{}", - (data.idx == probe_data.back().idx) ? "" : ","); - // clang-format on - } - } - } - else - { - for (const auto &data : probe_data) - { - // clang-format off - output.print("{:+{}.{}e},{:+{}.{}e}", - iodata.DimensionalizeValue(type, data.Fx.real()), table.w, table.p, - iodata.DimensionalizeValue(type, data.Fy.real()), table.w, table.p); - // clang-format on - if (dim == 3) - { - // clang-format off - output.print(",{:+{}.{}e}{}", - iodata.DimensionalizeValue(type, data.Fz.real()), table.w, table.p, - (data.idx == probe_data.back().idx) ? "" : ","); - // clang-format on - } - else - { - // clang-format off - output.print("{}", - (data.idx == probe_data.back().idx) ? "" : ","); - // clang-format on - } - } - } - output.print("\n"); - } - } -#endif -} - -void BaseSolver::PostprocessFields(const PostOperator &postop, int step, double time, - const ErrorIndicator *indicator) const -{ - // Save the computed fields in parallel in format for viewing with ParaView. - BlockTimer bt(Timer::IO); - if (post_dir.length() == 0) - { - Mpi::Warning(postop.GetComm(), - "No file specified under [\"Problem\"][\"Output\"]!\nSkipping saving of " - "fields to disk!\n"); - return; - } - postop.WriteFields(step, time, indicator); - Mpi::Barrier(postop.GetComm()); -} - -void BaseSolver::PostprocessErrorIndicator(const PostOperator &postop, - const ErrorIndicator &indicator) const -{ - // Write the indicator statistics. - if (post_dir.length() == 0) - { - return; - } - MPI_Comm comm = postop.GetComm(); - std::array data = {indicator.Norml2(comm), indicator.Min(comm), - indicator.Max(comm), indicator.Mean(comm)}; - if (root) - { - std::string path = post_dir + "error-indicators.csv"; - auto output = OutputFile(path, false); - // clang-format off - output.print("{:>{}s},{:>{}s},{:>{}s},{:>{}s}\n", - "Norm", table.w, - "Minimum", table.w, - "Maximum", table.w, - "Mean", table.w); - output.print("{:+{}.{}e},{:+{}.{}e},{:+{}.{}e},{:+{}.{}e}\n", - data[0], table.w, table.p, - data[1], table.w, table.p, - data[2], table.w, table.p, - data[3], table.w, table.p); - // clang-format on - } -} - -template void BaseSolver::SaveMetadata(const KspSolver &) const; -template void BaseSolver::SaveMetadata(const ComplexKspSolver &) const; - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#include "basesolver.hpp" + +#include +#include +#include +#include +#include +#include "drivers/transientsolver.hpp" +#include "fem/errorindicator.hpp" +#include "fem/fespace.hpp" +#include "fem/mesh.hpp" +#include "linalg/ksp.hpp" +#include "models/domainpostoperator.hpp" +#include "models/portexcitations.hpp" +#include "models/postoperator.hpp" +#include "models/surfacepostoperator.hpp" +#include "utils/communication.hpp" +#include "utils/dorfler.hpp" +#include "utils/filesystem.hpp" +#include "utils/geodata.hpp" +#include "utils/iodata.hpp" +#include "utils/timer.hpp" + +namespace palace +{ + +using json = nlohmann::json; + +namespace +{ + +void SaveIteration(MPI_Comm comm, const fs::path &output_dir, int step, int width) +{ + BlockTimer bt(Timer::IO); + Mpi::Barrier(comm); // Wait for all processes to write postprocessing files + if (Mpi::Root(comm)) + { + // Create a subfolder for the results of this adaptation. + auto step_output = output_dir / fmt::format("iteration{:0{}d}", step, width); + if (!fs::exists(step_output)) + { + fs::create_directories(step_output); + } + constexpr auto options = + fs::copy_options::recursive | fs::copy_options::overwrite_existing; + for (const auto &f : fs::directory_iterator(output_dir)) + { + if (f.path().filename().string().rfind("iteration") == 0) + { + continue; + } + fs::copy(f, step_output / f.path().filename(), options); + } + } + Mpi::Barrier(comm); +} + +json LoadMetadata(const fs::path &post_dir) +{ + std::string path = fs::path(post_dir / "palace.json").string(); + std::ifstream fi(path); + if (!fi.is_open()) + { + MFEM_ABORT("Unable to open metadata file \"" << path << "\"!"); + } + return json::parse(fi); +} + +void WriteMetadata(const fs::path &post_dir, const json &meta) +{ + std::string path = fs::path(post_dir / "palace.json").string(); + std::ofstream fo(path); + if (!fo.is_open()) + { + MFEM_ABORT("Unable to open metadata file \"" << path << "\"!"); + } + fo << meta.dump(2) << '\n'; +} + +// Returns an array of indices corresponding to marked elements. +mfem::Array MarkedElements(const Vector &e, double threshold) +{ + mfem::Array ind; + ind.Reserve(e.Size()); + for (int i = 0; i < e.Size(); i++) + { + if (e[i] >= threshold) + { + ind.Append(i); + } + } + return ind; +} + +} // namespace + +BaseSolver::BaseSolver(const IoData &iodata, bool root, int size, int num_thread, + const char *git_tag) + : iodata(iodata), post_dir(iodata.problem.output), root(root) +{ + // Initialize simulation metadata for this simulation. + if (root) + { + json meta; + if (git_tag) + { + meta["GitTag"] = std::string(git_tag); + } + if (size > 0) + { + meta["Problem"]["MPISize"] = size; + } + if (num_thread > 0) + { + meta["Problem"]["OpenMPThreads"] = num_thread; + } + WriteMetadata(post_dir, meta); + } +} + +void BaseSolver::SolveEstimateMarkRefine(std::vector> &mesh) const +{ + const auto &refinement = iodata.model.refinement; + const bool use_amr = [&]() + { + if (refinement.max_it > 0 && dynamic_cast(this) != nullptr) + { + Mpi::Warning("AMR is not currently supported for transient simulations!\n"); + return false; + } + return (refinement.max_it > 0); + }(); + if (use_amr && mesh.size() > 1) + { + Mpi::Print("\nFlattening mesh sequence:\n AMR will start from the final mesh in " + "the sequence of a priori refinements\n"); + mesh.erase(mesh.begin(), mesh.end() - 1); + } + MPI_Comm comm = mesh.back()->GetComm(); + + // Perform initial solve and estimation. 
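// Solve() returns the error indicators together with the global number of true dofs;
// the l2 norm of the indicators is the error measure tested against refinement.tol in
// the AMR loop below.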
+ auto [indicators, ntdof] = Solve(mesh); + double err = indicators.Norml2(comm); + + // Collection of all tests that might exhaust resources. + auto ExhaustedResources = [&refinement](auto it, auto ntdof) + { + bool ret = false; + // Run out of iterations. + ret |= (it >= refinement.max_it); + // Run out of DOFs if a limit was set. + ret |= (refinement.max_size > 0 && ntdof > refinement.max_size); + return ret; + }; + + // Main AMR loop. + int it = 0; + while (!ExhaustedResources(it, ntdof) && err >= refinement.tol) + { + // Print timing summary. + Mpi::Print("\nCumulative timing statistics:\n"); + BlockTimer::Print(comm); + SaveMetadata(BlockTimer::GlobalTimer()); + + BlockTimer bt(Timer::ADAPTATION); + Mpi::Print("\nAdaptive mesh refinement (AMR) iteration {:d}:\n" + " Indicator norm = {:.3e}, global unknowns = {:d}\n" + " Max. iterations = {:d}, tol. = {:.3e}{}\n", + ++it, err, ntdof, refinement.max_it, refinement.tol, + (refinement.max_size > 0 + ? ", max. size = " + std::to_string(refinement.max_size) + : "")); + + // Optionally save off the previous solution. + if (refinement.save_adapt_iterations) + { + SaveIteration(comm, post_dir, it, + 1 + static_cast(std::log10(refinement.max_it))); + } + + // Mark. + const auto marked_elements = [&comm, &refinement](const auto &indicators) + { + const auto [threshold, marked_error] = utils::ComputeDorflerThreshold( + comm, indicators.Local(), refinement.update_fraction); + const auto marked_elements = MarkedElements(indicators.Local(), threshold); + const auto [glob_marked_elements, glob_elements] = + linalg::GlobalSize2(comm, marked_elements, indicators.Local()); + Mpi::Print( + " Marked {:d}/{:d} elements for refinement ({:.2f}% of the error, θ = {:.2f})\n", + glob_marked_elements, glob_elements, 100 * marked_error, + refinement.update_fraction); + return marked_elements; + }(indicators); + + // Refine. + { + mfem::ParMesh &fine_mesh = *mesh.back(); + const auto initial_elem_count = fine_mesh.GetGlobalNE(); + fine_mesh.GeneralRefinement(marked_elements, -1, refinement.max_nc_levels); + const auto final_elem_count = fine_mesh.GetGlobalNE(); + Mpi::Print(" {} mesh refinement added {:d} elements (initial = {:d}, final = {:d})\n", + fine_mesh.Nonconforming() ? "Nonconforming" : "Conforming", + final_elem_count - initial_elem_count, initial_elem_count, + final_elem_count); + } + + // Optionally rebalance and write the adapted mesh to file. + { + const auto ratio_pre = mesh::RebalanceMesh(iodata, *mesh.back()); + if (ratio_pre > refinement.maximum_imbalance) + { + int min_elem, max_elem; + min_elem = max_elem = mesh.back()->GetNE(); + Mpi::GlobalMin(1, &min_elem, comm); + Mpi::GlobalMax(1, &max_elem, comm); + const auto ratio_post = double(max_elem) / min_elem; + Mpi::Print(" Rebalanced mesh: Ratio {:.3f} exceeded max. allowed value {:.3f} " + "(new ratio = {:.3f})\n", + ratio_pre, refinement.maximum_imbalance, ratio_post); + } + mesh.back()->Update(); + } + + // Solve + estimate. + Mpi::Print("\nProceeding with solve/estimate iteration {}...\n", it + 1); + std::tie(indicators, ntdof) = Solve(mesh); + err = indicators.Norml2(comm); + } + Mpi::Print("\nCompleted {:d} iteration{} of adaptive mesh refinement (AMR):\n" + " Indicator norm = {:.3e}, global unknowns = {:d}\n" + " Max. iterations = {:d}, tol. = {:.3e}{}\n", + it, (it == 1 ? "" : "s"), err, ntdof, refinement.max_it, refinement.tol, + (refinement.max_size > 0 + ? ", max. 
size = " + std::to_string(refinement.max_size) + : "")); +} + +void BaseSolver::SaveMetadata(const FiniteElementSpaceHierarchy &fespaces) const +{ + const auto &fespace = fespaces.GetFinestFESpace(); + HYPRE_BigInt ne = fespace.GetParMesh().GetNE(); + Mpi::GlobalSum(1, &ne, fespace.GetComm()); + std::vector ndofs(fespaces.GetNumLevels()); + for (std::size_t l = 0; l < fespaces.GetNumLevels(); l++) + { + ndofs[l] = fespaces.GetFESpaceAtLevel(l).GlobalTrueVSize(); + } + if (root) + { + json meta = LoadMetadata(post_dir); + meta["Problem"]["MeshElements"] = ne; + meta["Problem"]["DegreesOfFreedom"] = ndofs.back(); + meta["Problem"]["MultigridDegreesOfFreedom"] = ndofs; + WriteMetadata(post_dir, meta); + } +} + +template +void BaseSolver::SaveMetadata(const SolverType &ksp) const +{ + if (root) + { + json meta = LoadMetadata(post_dir); + meta["LinearSolver"]["TotalSolves"] = ksp.NumTotalMult(); + meta["LinearSolver"]["TotalIts"] = ksp.NumTotalMultIterations(); + WriteMetadata(post_dir, meta); + } +} + +void BaseSolver::SaveMetadata(const Timer &timer) const +{ + if (root) + { + json meta = LoadMetadata(post_dir); + for (int i = Timer::INIT; i < Timer::NUM_TIMINGS; i++) + { + auto key = Timer::descriptions[i]; + key.erase(std::remove_if(key.begin(), key.end(), isspace), key.end()); + meta["ElapsedTime"]["Durations"][key] = timer.Data((Timer::Index)i); + meta["ElapsedTime"]["Counts"][key] = timer.Counts((Timer::Index)i); + } + WriteMetadata(post_dir, meta); + } +} + +void BaseSolver::SaveMetadata(const PortExcitations &excitation_helper) const +{ + if (root) + { + nlohmann::json meta = LoadMetadata(post_dir); + meta["Excitations"] = excitation_helper; + WriteMetadata(post_dir, meta); + } +} + +template void BaseSolver::SaveMetadata(const KspSolver &) const; +template void BaseSolver::SaveMetadata(const ComplexKspSolver &) const; + +} // namespace palace diff --git a/palace/drivers/basesolver.hpp b/palace/drivers/basesolver.hpp index d6ca996e90..1426f33ee3 100644 --- a/palace/drivers/basesolver.hpp +++ b/palace/drivers/basesolver.hpp @@ -1,106 +1,60 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_DRIVERS_BASE_SOLVER_HPP -#define PALACE_DRIVERS_BASE_SOLVER_HPP - -#include -#include -#include -#include - -namespace mfem -{ - -class ParMesh; - -} // namespace mfem - -namespace palace -{ - -class ErrorIndicator; -class FiniteElementSpaceHierarchy; -class IoData; -class PostOperator; -class Timer; - -// -// Base driver class for all simulation types. -// -class BaseSolver -{ -protected: - // Reference to configuration file data (not owned). - const IoData &iodata; - - // Parameters for writing postprocessing outputs. - const std::string post_dir; - const bool root; - - // Table formatting for output files. - struct Table - { - int w; // Total column width = precision + spaces + 7 extra (signs/exponent) - int sp; // Table column spaces - int p; // Floating point precision for data - int w1; // First column width = precision + 7 extra - int p1; // Floating point precision for first column - Table(int sp, int p, int p1) : w(sp + p + 7), sp(sp), p(p), w1(p1 + 7), p1(p1) {} - }; - const Table table; - - // Helper method for creating/appending to output files. - fmt::ostream OutputFile(const std::string &path, bool append) const - { - return append ? 
fmt::output_file(path, fmt::file::WRONLY | fmt::file::APPEND) - : fmt::output_file(path, fmt::file::WRONLY | fmt::file::CREATE | - fmt::file::TRUNC); - } - - // Common domain postprocessing for all simulation types. - void PostprocessDomains(const PostOperator &postop, const std::string &name, int step, - double time, double E_elec, double E_mag, double E_cap, - double E_ind) const; - - // Common surface postprocessing for all simulation types. - void PostprocessSurfaces(const PostOperator &postop, const std::string &name, int step, - double time, double E_elec, double E_mag, double Vinc, - double Iinc) const; - - // Common probe postprocessing for all simulation types. - void PostprocessProbes(const PostOperator &postop, const std::string &name, int step, - double time) const; - - // Common field visualization postprocessing for all simulation types. - void PostprocessFields(const PostOperator &postop, int step, double time, - const ErrorIndicator *indicator = nullptr) const; - - // Common error indicator postprocessing for all simulation types. - void PostprocessErrorIndicator(const PostOperator &postop, - const ErrorIndicator &indicator) const; - - // Performs a solve using the mesh sequence, then reports error indicators and the number - // of global true dofs. - virtual std::pair - Solve(const std::vector> &mesh) const = 0; - -public: - BaseSolver(const IoData &iodata, bool root, int size = 0, int num_thread = 0, - const char *git_tag = nullptr); - virtual ~BaseSolver() = default; - - // Performs adaptive mesh refinement using the solve-estimate-mark-refine paradigm. - // Dispatches to the Solve method for the driver specific calculations. - void SolveEstimateMarkRefine(std::vector> &mesh) const; - - // These methods write different simulation metadata to a JSON file in post_dir. - void SaveMetadata(const FiniteElementSpaceHierarchy &fespaces) const; - template - void SaveMetadata(const SolverType &ksp) const; - void SaveMetadata(const Timer &timer) const; -}; - -} // namespace palace - -#endif // PALACE_DRIVERS_BASE_SOLVER_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_DRIVERS_BASE_SOLVER_HPP +#define PALACE_DRIVERS_BASE_SOLVER_HPP + +#include +#include +#include +#include "fem/errorindicator.hpp" +#include "utils/filesystem.hpp" + +namespace palace +{ + +class FiniteElementSpaceHierarchy; +class IoData; +class Mesh; +class Timer; +class PortExcitations; + +// +// Base driver class for all simulation types. +// +class BaseSolver +{ +protected: + // Reference to configuration file data (not owned). + // TODO(C++20): Update to reference wrapper of incomplete type. + const IoData &iodata; + + // Parameters for writing postprocessing outputs. + fs::path post_dir; + bool root; + + // Performs a solve using the mesh sequence, then reports error indicators and the number + // of global true dofs. + virtual std::pair + Solve(const std::vector> &mesh) const = 0; + +public: + BaseSolver(const IoData &iodata, bool root, int size = 0, int num_thread = 0, + const char *git_tag = nullptr); + virtual ~BaseSolver() = default; + + // Performs adaptive mesh refinement using the solve-estimate-mark-refine paradigm. + // Dispatches to the Solve method for the driver specific calculations. + void SolveEstimateMarkRefine(std::vector> &mesh) const; + + // These methods write different simulation metadata to a JSON file in post_dir. 
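// A minimal, standalone sketch of the read-modify-write cycle behind these SaveMetadata
// overloads (see LoadMetadata/WriteMetadata in basesolver.cpp above): each overload
// reloads post_dir/"palace.json", adds its own keys, and writes the file back, so
// repeated calls accumulate metadata. The key names are taken from the code above; the
// directory and values here are hypothetical.
#include <filesystem>
#include <fstream>
#include <nlohmann/json.hpp>

void AppendMetadataSketch(const std::filesystem::path &post_dir)
{
  using json = nlohmann::json;
  const auto path = post_dir / "palace.json";
  json meta;
  if (std::ifstream fi(path); fi.is_open())
  {
    meta = json::parse(fi);  // Keep keys written by earlier SaveMetadata calls.
  }
  meta["GitTag"] = "example-tag";  // Hypothetical value.
  meta["Problem"]["MPISize"] = 4;  // Hypothetical value.
  std::ofstream fo(path);
  fo << meta.dump(2) << '\n';  // Two-space indent, matching WriteMetadata.
}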
+ void SaveMetadata(const FiniteElementSpaceHierarchy &fespaces) const; + template + void SaveMetadata(const SolverType &ksp) const; + void SaveMetadata(const Timer &timer) const; + void SaveMetadata(const PortExcitations &excitation_helper) const; +}; + +} // namespace palace + +#endif // PALACE_DRIVERS_BASE_SOLVER_HPP diff --git a/palace/drivers/drivensolver.cpp b/palace/drivers/drivensolver.cpp index 2b70f56b62..36bddb3329 100644 --- a/palace/drivers/drivensolver.cpp +++ b/palace/drivers/drivensolver.cpp @@ -1,744 +1,421 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "drivensolver.hpp" - -#include -#include -#include "fem/errorindicator.hpp" -#include "linalg/errorestimator.hpp" -#include "linalg/ksp.hpp" -#include "linalg/operator.hpp" -#include "linalg/vector.hpp" -#include "models/lumpedportoperator.hpp" -#include "models/postoperator.hpp" -#include "models/romoperator.hpp" -#include "models/spaceoperator.hpp" -#include "models/surfacecurrentoperator.hpp" -#include "models/waveportoperator.hpp" -#include "utils/communication.hpp" -#include "utils/iodata.hpp" -#include "utils/prettyprint.hpp" -#include "utils/timer.hpp" - -namespace palace -{ - -using namespace std::complex_literals; - -std::pair -DrivenSolver::Solve(const std::vector> &mesh) const -{ - // Set up the spatial discretization and frequency sweep. - BlockTimer bt0(Timer::CONSTRUCT); - SpaceOperator spaceop(iodata, mesh); - int nstep = GetNumSteps(iodata.solver.driven.min_f, iodata.solver.driven.max_f, - iodata.solver.driven.delta_f); - int step0 = (iodata.solver.driven.rst > 0) ? iodata.solver.driven.rst - 1 : 0; - double delta_omega = iodata.solver.driven.delta_f; - double omega0 = iodata.solver.driven.min_f + step0 * delta_omega; - bool adaptive = (iodata.solver.driven.adaptive_tol > 0.0); - if (adaptive && nstep <= 2) - { - Mpi::Warning("Adaptive frequency sweep requires > 2 total frequency samples!\n" - "Reverting to uniform sweep!\n"); - adaptive = false; - } - SaveMetadata(spaceop.GetNDSpaces()); - - // Frequencies will be sampled uniformly in the frequency domain. Index sets are for - // computing things like S-parameters in postprocessing. - PostOperator postop(iodata, spaceop, "driven"); - { - Mpi::Print("\nComputing {}frequency response for:\n", adaptive ? "adaptive fast " : ""); - bool first = true; - for (const auto &[idx, data] : spaceop.GetLumpedPortOp()) - { - if (data.IsExcited()) - { - if (first) - { - Mpi::Print(" Lumped port excitation specified on port{}", - (spaceop.GetLumpedPortOp().Size() > 1) ? "s" : ""); - first = false; - } - Mpi::Print(" {:d}", idx); - } - } - int excitations = first; - first = true; - for (const auto &[idx, data] : spaceop.GetWavePortOp()) - { - if (data.IsExcited()) - { - if (first) - { - Mpi::Print(" Wave port excitation specified on port{}", - (spaceop.GetWavePortOp().Size() > 1) ? "s" : ""); - first = false; - } - Mpi::Print(" {:d}", idx); - } - } - excitations += first; - first = true; - for (const auto &[idx, data] : spaceop.GetSurfaceCurrentOp()) - { - if (first) - { - Mpi::Print(" Surface current excitation specified on port{}", - (spaceop.GetSurfaceCurrentOp().Size() > 1) ? "s" : ""); - first = false; - } - Mpi::Print(" {:d}", idx); - } - excitations += first; - MFEM_VERIFY(excitations > 0, "No excitation specified for driven simulation!"); - } - Mpi::Print("\n"); - - // Main frequency sweep loop. - return {adaptive ? 
SweepAdaptive(spaceop, postop, nstep, step0, omega0, delta_omega) - : SweepUniform(spaceop, postop, nstep, step0, omega0, delta_omega), - spaceop.GlobalTrueVSize()}; -} - -ErrorIndicator DrivenSolver::SweepUniform(SpaceOperator &spaceop, PostOperator &postop, - int nstep, int step0, double omega0, - double delta_omega) const -{ - // Construct the system matrices defining the linear operator. PEC boundaries are handled - // simply by setting diagonal entries of the system matrix for the corresponding dofs. - // Because the Dirichlet BC is always homogenous, no special elimination is required on - // the RHS. Assemble the linear system for the initial frequency (so we can call - // KspSolver::SetOperators). Compute everything at the first frequency step. - BlockTimer bt0(Timer::CONSTRUCT); - auto K = spaceop.GetStiffnessMatrix(Operator::DIAG_ONE); - auto C = spaceop.GetDampingMatrix(Operator::DIAG_ZERO); - auto M = spaceop.GetMassMatrix(Operator::DIAG_ZERO); - auto A2 = spaceop.GetExtraSystemMatrix(omega0, Operator::DIAG_ZERO); - const auto &Curl = spaceop.GetCurlMatrix(); - - // Set up the linear solver and set operators for the first frequency step. The - // preconditioner for the complex linear system is constructed from a real approximation - // to the complex system matrix. - auto A = spaceop.GetSystemMatrix(std::complex(1.0, 0.0), 1i * omega0, - std::complex(-omega0 * omega0, 0.0), K.get(), - C.get(), M.get(), A2.get()); - auto P = spaceop.GetPreconditionerMatrix(1.0, omega0, -omega0 * omega0, - omega0); - - ComplexKspSolver ksp(iodata, spaceop.GetNDSpaces(), &spaceop.GetH1Spaces()); - ksp.SetOperators(*A, *P); - - // Set up RHS vector for the incident field at port boundaries, and the vector for the - // first frequency step. - ComplexVector RHS(Curl.Width()), E(Curl.Width()), B(Curl.Height()); - E = 0.0; - B = 0.0; - - // Initialize structures for storing and reducing the results of error estimation. - CurlFluxErrorEstimator estimator( - spaceop.GetMaterialOp(), spaceop.GetNDSpaces(), iodata.solver.linear.estimator_tol, - iodata.solver.linear.estimator_max_it, 0, iodata.solver.pa_order_threshold); - ErrorIndicator indicator; - - // Main frequency sweep loop. - int step = step0; - double omega = omega0; - auto t0 = Timer::Now(); - while (step < nstep) - { - const double freq = iodata.DimensionalizeValue(IoData::ValueType::FREQUENCY, omega); - Mpi::Print("\nIt {:d}/{:d}: ω/2π = {:.3e} GHz (elapsed time = {:.2e} s)\n", step + 1, - nstep, freq, Timer::Duration(Timer::Now() - t0).count()); - - // Assemble the linear system. - if (step > step0) - { - // Update frequency-dependent excitation and operators. - A2 = spaceop.GetExtraSystemMatrix(omega, Operator::DIAG_ZERO); - A = spaceop.GetSystemMatrix(std::complex(1.0, 0.0), 1i * omega, - std::complex(-omega * omega, 0.0), K.get(), - C.get(), M.get(), A2.get()); - P = spaceop.GetPreconditionerMatrix(1.0, omega, -omega * omega, - omega); - ksp.SetOperators(*A, *P); - } - spaceop.GetExcitationVector(omega, RHS); - - // Solve the linear system. - BlockTimer bt1(Timer::SOLVE); - Mpi::Print("\n"); - ksp.Mult(RHS, E); - - // Compute B = -1/(iω) ∇ x E on the true dofs, and set the internal GridFunctions in - // PostOperator for all postprocessing operations. 
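// The factor -1/(iω) applied to ∇ x E in this step equals i/ω, so on the real and
// imaginary parts of the curl the scaling works out as in this minimal sketch (a plain
// std::complex stand-in for the distributed ComplexVector):
#include <complex>

std::complex<double> BFromCurlESketch(std::complex<double> curlE, double omega)
{
  // Equivalent to curlE * (-1.0 / (i * omega)) == curlE * (i / omega):
  return {-curlE.imag() / omega, curlE.real() / omega};
}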
- BlockTimer bt2(Timer::POSTPRO); - double E_elec = 0.0, E_mag = 0.0; - Curl.Mult(E.Real(), B.Real()); - Curl.Mult(E.Imag(), B.Imag()); - B *= -1.0 / (1i * omega); - postop.SetEGridFunction(E); - postop.SetBGridFunction(B); - postop.UpdatePorts(spaceop.GetLumpedPortOp(), spaceop.GetWavePortOp(), omega); - Mpi::Print(" Sol. ||E|| = {:.6e} (||RHS|| = {:.6e})\n", - linalg::Norml2(spaceop.GetComm(), E), - linalg::Norml2(spaceop.GetComm(), RHS)); - if (!iodata.solver.driven.only_port_post) - { - const double J = iodata.DimensionalizeValue(IoData::ValueType::ENERGY, 1.0); - E_elec = postop.GetEFieldEnergy(); - E_mag = postop.GetHFieldEnergy(); - Mpi::Print(" Field energy E ({:.3e} J) + H ({:.3e} J) = {:.3e} J\n", E_elec * J, - E_mag * J, (E_elec + E_mag) * J); - } - - // Calculate and record the error indicators. - Mpi::Print(" Updating solution error estimates\n"); - estimator.AddErrorIndicator(E, indicator); - - // Postprocess S-parameters and optionally write solution to disk. - Postprocess(postop, spaceop.GetLumpedPortOp(), spaceop.GetWavePortOp(), - spaceop.GetSurfaceCurrentOp(), step, omega, E_elec, E_mag, - !iodata.solver.driven.only_port_post, - (step == nstep - 1) ? &indicator : nullptr); - - // Increment frequency. - step++; - omega += delta_omega; - } - SaveMetadata(ksp); - return indicator; -} - -ErrorIndicator DrivenSolver::SweepAdaptive(SpaceOperator &spaceop, PostOperator &postop, - int nstep, int step0, double omega0, - double delta_omega) const -{ - // Configure default parameters if not specified. - BlockTimer bt0(Timer::CONSTRUCT); - double offline_tol = iodata.solver.driven.adaptive_tol; - int nmax = iodata.solver.driven.adaptive_nmax; - int ncand = iodata.solver.driven.adaptive_ncand; - MFEM_VERIFY(nmax <= 0 || nmax > 2, - "Adaptive frequency sweep must sample at least two frequency points!"); - if (nmax <= 0) - { - nmax = 20; // Default value - } - nmax = std::min(nmax, nstep - step0); // Maximum number sample points dictated by sweep - if (ncand > 0) - { - if (ncand > nstep - step0) - { - Mpi::Warning("Requested candidate points {:d} > number of total frequency sweep " - "samples {:d}!\n" - "Resetting to the smaller value!\n", - ncand, nstep - step0); - ncand = nstep - step0; - } - } - else - { - constexpr int inc = 5; - ncand = (nstep - step0 + inc - 1) / inc; // Default value, always >= 1 - } - - // Allocate negative curl matrix for postprocessing the B-field and vectors for the - // high-dimensional field solution. - const auto &Curl = spaceop.GetCurlMatrix(); - ComplexVector E(Curl.Width()), B(Curl.Height()); - E = 0.0; - B = 0.0; - - // Initialize structures for storing and reducing the results of error estimation. - CurlFluxErrorEstimator estimator( - spaceop.GetMaterialOp(), spaceop.GetNDSpaces(), iodata.solver.linear.estimator_tol, - iodata.solver.linear.estimator_max_it, 0, iodata.solver.pa_order_threshold); - ErrorIndicator indicator; - - // Configure the PROM operator which performs the parameter space sampling and basis - // construction during the offline phase as well as the PROM solution during the online - // phase. 
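// Schematic of the offline greedy sampling configured here, with hypothetical callbacks
// standing in for the PROM error estimate and the HDM solve/basis update; the real loop
// below additionally handles timing, printing, and error-indicator accumulation.
#include <functional>

int GreedyOfflineSketch(double tol, int max_size,
                        const std::function<double(double &)> &max_prom_error,
                        const std::function<void(double)> &add_hdm_sample,
                        int initial_size)
{
  int size = initial_size;  // Basis seeded with the frequency sweep endpoints.
  while (size < max_size)
  {
    double omega_star;
    if (max_prom_error(omega_star) < tol)
    {
      break;  // PROM is accurate enough over the remaining parameter points.
    }
    add_hdm_sample(omega_star);  // Solve the HDM at ω* and grow the reduced basis.
    size++;
  }
  return size;
}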
- auto t0 = Timer::Now(); - const double f0 = iodata.DimensionalizeValue(IoData::ValueType::FREQUENCY, 1.0); - Mpi::Print("\nBeginning PROM construction offline phase:\n" - " {:d} points for frequency sweep over [{:.3e}, {:.3e}] GHz\n", - nstep - step0, omega0 * f0, (omega0 + (nstep - step0 - 1) * delta_omega) * f0); - RomOperator prom(iodata, spaceop); - prom.Initialize(omega0, delta_omega, nstep - step0, nmax); - spaceop.GetWavePortOp().SetSuppressOutput(true); // Suppress wave port output for offline - - // Initialize the basis with samples from the top and bottom of the frequency - // range of interest. Each call for an HDM solution adds the frequency sample to P_S and - // removes it from P \ P_S. Timing for the HDM construction and solve is handled inside - // of the RomOperator. - BlockTimer bt1(Timer::CONSTRUCTPROM); - prom.SolveHDM(omega0, E); // Print matrix stats at first HDM solve - prom.AddHDMSample(omega0, E); - estimator.AddErrorIndicator(E, indicator); - prom.SolveHDM(omega0 + (nstep - step0 - 1) * delta_omega, E); - prom.AddHDMSample(omega0 + (nstep - step0 - 1) * delta_omega, E); - estimator.AddErrorIndicator(E, indicator); - - // Greedy procedure for basis construction (offline phase). Basis is initialized with - // solutions at frequency sweep endpoints. - int it = static_cast(prom.GetSampleFrequencies().size()), it0 = it; - double max_error; - while (true) - { - // Compute maximum error in parameter domain with current PROM. - double omega_star; - max_error = prom.ComputeMaxError(ncand, omega_star); - if (max_error < offline_tol || it == nmax) - { - break; - } - - // Sample HDM and add solution to basis. - Mpi::Print( - "\nGreedy iteration {:d} (n = {:d}): ω* = {:.3e} GHz ({:.3e}), error = {:.3e}\n", - it - it0 + 1, prom.GetReducedDimension(), omega_star * f0, omega_star, max_error); - prom.SolveHDM(omega_star, E); - prom.AddHDMSample(omega_star, E); - estimator.AddErrorIndicator(E, indicator); - it++; - } - Mpi::Print("\nAdaptive sampling{} {:d} frequency samples:\n" - " n = {:d}, error = {:.3e}, tol. = {:.3e}\n", - (it == nmax) ? " reached maximum" : " converged with", it, - prom.GetReducedDimension(), max_error, offline_tol); - utils::PrettyPrint(prom.GetSampleFrequencies(), f0, " Sampled frequencies (GHz):"); - Mpi::Print(" Total offline phase elapsed time: {:.2e} s\n", - Timer::Duration(Timer::Now() - t0).count()); // Timing on root - SaveMetadata(prom.GetLinearSolver()); - - // Main fast frequency sweep loop (online phase). - BlockTimer bt2(Timer::CONSTRUCT); - Mpi::Print("\nBeginning fast frequency sweep online phase\n"); - spaceop.GetWavePortOp().SetSuppressOutput(false); // Disable output suppression - int step = step0; - double omega = omega0; - while (step < nstep) - { - const double freq = iodata.DimensionalizeValue(IoData::ValueType::FREQUENCY, omega); - Mpi::Print("\nIt {:d}/{:d}: ω/2π = {:.3e} GHz (elapsed time = {:.2e} s)\n", step + 1, - nstep, freq, Timer::Duration(Timer::Now() - t0).count()); - - // Assemble the linear system. - prom.AssemblePROM(omega); - - // Solve the linear system. - BlockTimer bt3(Timer::SOLVEPROM); - Mpi::Print("\n"); - prom.SolvePROM(E); - - // Compute B = -1/(iω) ∇ x E on the true dofs, and set the internal GridFunctions in - // PostOperator for all postprocessing operations. 
- BlockTimer bt4(Timer::POSTPRO); - double E_elec = 0.0, E_mag = 0.0; - Curl.Mult(E.Real(), B.Real()); - Curl.Mult(E.Imag(), B.Imag()); - B *= -1.0 / (1i * omega); - postop.SetEGridFunction(E); - postop.SetBGridFunction(B); - postop.UpdatePorts(spaceop.GetLumpedPortOp(), spaceop.GetWavePortOp(), omega); - Mpi::Print(" Sol. ||E|| = {:.6e}\n", linalg::Norml2(spaceop.GetComm(), E)); - if (!iodata.solver.driven.only_port_post) - { - const double J = iodata.DimensionalizeValue(IoData::ValueType::ENERGY, 1.0); - E_elec = postop.GetEFieldEnergy(); - E_mag = postop.GetHFieldEnergy(); - Mpi::Print(" Field energy E ({:.3e} J) + H ({:.3e} J) = {:.3e} J\n", E_elec * J, - E_mag * J, (E_elec + E_mag) * J); - } - - // Postprocess S-parameters and optionally write solution to disk. - Postprocess(postop, spaceop.GetLumpedPortOp(), spaceop.GetWavePortOp(), - spaceop.GetSurfaceCurrentOp(), step, omega, E_elec, E_mag, - !iodata.solver.driven.only_port_post, - (step == nstep - 1) ? &indicator : nullptr); - - // Increment frequency. - step++; - omega += delta_omega; - } - return indicator; -} - -int DrivenSolver::GetNumSteps(double start, double end, double delta) const -{ - MFEM_VERIFY(delta != 0.0, "Zero frequency step is not allowed!"); - constexpr double delta_eps = 1.0e-9; // 9 digits of precision comparing endpoint - double dnfreq = std::abs(end - start) / std::abs(delta); - int nstep = 1 + static_cast(dnfreq); - double dfinal = start + nstep * delta; - return nstep + ((delta < 0.0 && dfinal - end > -delta_eps * end) || - (delta > 0.0 && dfinal - end < delta_eps * end)); -} - -void DrivenSolver::Postprocess(const PostOperator &postop, - const LumpedPortOperator &lumped_port_op, - const WavePortOperator &wave_port_op, - const SurfaceCurrentOperator &surf_j_op, int step, - double omega, double E_elec, double E_mag, bool full, - const ErrorIndicator *indicator) const -{ - // The internal GridFunctions for PostOperator have already been set from the E and B - // solutions in the main frequency sweep loop. 
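// A standalone copy of the endpoint logic in GetNumSteps above. As a hypothetical worked
// example, start = 1.0, end = 2.0, delta = 0.1 yields 11 uniformly spaced samples
// (1.0, 1.1, ..., 2.0); the delta_eps tolerance guards against the final endpoint being
// dropped when the computed step count truncates just below an integer due to round-off.
#include <cmath>

int NumStepsSketch(double start, double end, double delta)
{
  constexpr double delta_eps = 1.0e-9;  // 9 digits of precision comparing endpoint.
  double dnfreq = std::abs(end - start) / std::abs(delta);
  int nstep = 1 + static_cast<int>(dnfreq);
  double dfinal = start + nstep * delta;
  return nstep + ((delta < 0.0 && dfinal - end > -delta_eps * end) ||
                  (delta > 0.0 && dfinal - end < delta_eps * end));
}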
- double freq = iodata.DimensionalizeValue(IoData::ValueType::FREQUENCY, omega); - PostprocessCurrents(postop, surf_j_op, step, omega); - PostprocessPorts(postop, lumped_port_op, step, omega); - if (surf_j_op.Size() == 0) - { - PostprocessSParameters(postop, lumped_port_op, wave_port_op, step, omega); - } - if (full) - { - double E_cap = postop.GetLumpedCapacitorEnergy(lumped_port_op); - double E_ind = postop.GetLumpedInductorEnergy(lumped_port_op); - PostprocessDomains(postop, "f (GHz)", step, freq, E_elec, E_mag, E_cap, E_ind); - PostprocessSurfaces(postop, "f (GHz)", step, freq, E_elec + E_cap, E_mag + E_ind, 1.0, - 1.0); - PostprocessProbes(postop, "f (GHz)", step, freq); - } - if (iodata.solver.driven.delta_post > 0 && step % iodata.solver.driven.delta_post == 0) - { - Mpi::Print("\n"); - PostprocessFields(postop, step / iodata.solver.driven.delta_post, freq, indicator); - Mpi::Print(" Wrote fields to disk at step {:d}\n", step + 1); - } - if (indicator) - { - PostprocessErrorIndicator(postop, *indicator); - } -} - -namespace -{ - -struct CurrentData -{ - const int idx; // Current source index - const double Iinc; // Excitation current -}; - -struct PortVIData -{ - const int idx; // Lumped port index - const bool excitation; // Flag for excited ports - const double Vinc, Iinc; // Incident voltage, current - const std::complex Vi, Ii; // Port voltage, current -}; - -struct PortSData -{ - const int idx; // Port index - const std::complex Sij; // Scattering parameter -}; - -} // namespace - -void DrivenSolver::PostprocessCurrents(const PostOperator &postop, - const SurfaceCurrentOperator &surf_j_op, int step, - double omega) const -{ - // Postprocess the frequency domain surface current excitations. - if (post_dir.length() == 0) - { - return; - } - std::vector j_data; - j_data.reserve(surf_j_op.Size()); - for (const auto &[idx, data] : surf_j_op) - { - const double Iinc = data.GetExcitationCurrent(); - j_data.push_back({idx, iodata.DimensionalizeValue(IoData::ValueType::CURRENT, Iinc)}); - } - if (root && !j_data.empty()) - { - std::string path = post_dir + "surface-I.csv"; - auto output = OutputFile(path, (step > 0)); - if (step == 0) - { - output.print("{:>{}s},", "f (GHz)", table.w1); - for (const auto &data : j_data) - { - // clang-format off - output.print("{:>{}s}{}", - "Iinc[" + std::to_string(data.idx) + "] (A)", table.w, - (data.idx == j_data.back().idx) ? "" : ","); - // clang-format on - } - output.print("\n"); - } - // clang-format off - output.print("{:{}.{}e},", - iodata.DimensionalizeValue(IoData::ValueType::FREQUENCY, omega), - table.w1, table.p1); - // clang-format on - for (const auto &data : j_data) - { - // clang-format off - output.print("{:+{}.{}e}{}", - data.Iinc, table.w, table.p, - (data.idx == j_data.back().idx) ? "" : ","); - // clang-format on - } - output.print("\n"); - } -} - -void DrivenSolver::PostprocessPorts(const PostOperator &postop, - const LumpedPortOperator &lumped_port_op, int step, - double omega) const -{ - // Postprocess the frequency domain lumped port voltages and currents (complex magnitude - // = sqrt(2) * RMS). - if (post_dir.length() == 0) - { - return; - } - std::vector port_data; - port_data.reserve(lumped_port_op.Size()); - for (const auto &[idx, data] : lumped_port_op) - { - const double Vinc = data.GetExcitationVoltage(); - const double Iinc = (std::abs(Vinc) > 0.0) ? 
data.GetExcitationPower() / Vinc : 0.0; - const std::complex Vi = postop.GetPortVoltage(lumped_port_op, idx); - const std::complex Ii = postop.GetPortCurrent(lumped_port_op, idx); - port_data.push_back({idx, data.IsExcited(), - iodata.DimensionalizeValue(IoData::ValueType::VOLTAGE, Vinc), - iodata.DimensionalizeValue(IoData::ValueType::CURRENT, Iinc), - iodata.DimensionalizeValue(IoData::ValueType::VOLTAGE, Vi), - iodata.DimensionalizeValue(IoData::ValueType::CURRENT, Ii)}); - } - if (root && !port_data.empty()) - { - // Write the port voltages. - { - std::string path = post_dir + "port-V.csv"; - auto output = OutputFile(path, (step > 0)); - if (step == 0) - { - output.print("{:>{}s},", "f (GHz)", table.w1); - for (const auto &data : port_data) - { - if (data.excitation) - { - // clang-format off - output.print("{:>{}s},", - "V_inc[" + std::to_string(data.idx) + "] (V)", table.w); - // clang-format on - } - } - for (const auto &data : port_data) - { - // clang-format off - output.print("{:>{}s},{:>{}s}{}", - "Re{V[" + std::to_string(data.idx) + "]} (V)", table.w, - "Im{V[" + std::to_string(data.idx) + "]} (V)", table.w, - (data.idx == port_data.back().idx) ? "" : ","); - // clang-format on - } - output.print("\n"); - } - // clang-format off - output.print("{:{}.{}e},", - iodata.DimensionalizeValue(IoData::ValueType::FREQUENCY, omega), - table.w1, table.p1); - // clang-format on - for (const auto &data : port_data) - { - if (data.excitation) - { - // clang-format off - output.print("{:+{}.{}e},", - data.Vinc, table.w, table.p); - // clang-format on - } - } - for (const auto &data : port_data) - { - // clang-format off - output.print("{:+{}.{}e},{:+{}.{}e}{}", - data.Vi.real(), table.w, table.p, - data.Vi.imag(), table.w, table.p, - (data.idx == port_data.back().idx) ? "" : ","); - // clang-format on - } - output.print("\n"); - } - - // Write the port currents. - { - std::string path = post_dir + "port-I.csv"; - auto output = OutputFile(path, (step > 0)); - if (step == 0) - { - output.print("{:>{}s},", "f (GHz)", table.w1); - for (const auto &data : port_data) - { - if (data.excitation) - { - // clang-format off - output.print("{:>{}s},", - "I_inc[" + std::to_string(data.idx) + "] (A)", table.w); - // clang-format on - } - } - for (const auto &data : port_data) - { - // clang-format off - output.print("{:>{}s},{:>{}s}{}", - "Re{I[" + std::to_string(data.idx) + "]} (A)", table.w, - "Im{I[" + std::to_string(data.idx) + "]} (A)", table.w, - (data.idx == port_data.back().idx) ? "" : ","); - // clang-format on - } - output.print("\n"); - } - // clang-format off - output.print("{:{}.{}e},", - iodata.DimensionalizeValue(IoData::ValueType::FREQUENCY, omega), - table.w1, table.p1); - // clang-format on - for (const auto &data : port_data) - { - if (data.excitation) - { - // clang-format off - output.print("{:+{}.{}e},", - data.Iinc, table.w, table.p); - // clang-format on - } - } - for (const auto &data : port_data) - { - // clang-format off - output.print("{:+{}.{}e},{:+{}.{}e}{}", - data.Ii.real(), table.w, table.p, - data.Ii.imag(), table.w, table.p, - (data.idx == port_data.back().idx) ? "" : ","); - // clang-format on - } - output.print("\n"); - } - } -} - -void DrivenSolver::PostprocessSParameters(const PostOperator &postop, - const LumpedPortOperator &lumped_port_op, - const WavePortOperator &wave_port_op, int step, - double omega) const -{ - // Postprocess S-parameters. 
This computes a column of the S matrix corresponding to the - // excited port index specified in the configuration file, storing |S_ij| and arg - // (S_ij) in dB and degrees, respectively. S-parameter output is only available for a - // single lumped or wave port excitation. - bool src_lumped_port = false; - bool src_wave_port = false; - int source_idx = -1; - for (const auto &[idx, data] : lumped_port_op) - { - if (data.IsExcited()) - { - if (src_lumped_port || src_wave_port) - { - return; - } - src_lumped_port = true; - source_idx = idx; - } - } - for (const auto &[idx, data] : wave_port_op) - { - if (data.IsExcited()) - { - if (src_lumped_port || src_wave_port) - { - return; - } - src_wave_port = true; - source_idx = idx; - } - } - if (!src_lumped_port && !src_wave_port) - { - return; - } - std::vector port_data; - port_data.reserve(src_lumped_port ? lumped_port_op.Size() : wave_port_op.Size()); - if (src_lumped_port) - { - // Compute lumped port S-parameters. - for (const auto &[idx, data] : lumped_port_op) - { - const std::complex Sij = - postop.GetSParameter(lumped_port_op, idx, source_idx); - port_data.push_back({idx, Sij}); - } - } - else // src_wave_port - { - // Compute wave port S-parameters. - for (const auto &[idx, data] : wave_port_op) - { - const std::complex Sij = postop.GetSParameter(wave_port_op, idx, source_idx); - port_data.push_back({idx, Sij}); - } - } - - // Print table to stdout. - for (const auto &data : port_data) - { - std::string str = - "S[" + std::to_string(data.idx) + "][" + std::to_string(source_idx) + "]"; - // clang-format off - Mpi::Print(" {} = {:+.3e}{:+.3e}i, |{}| = {:+.3e}, arg({}) = {:+.3e}\n", - str, data.Sij.real(), data.Sij.imag(), - str, 20.0 * std::log10(std::abs(data.Sij)), - str, std::arg(data.Sij) * 180.0 / M_PI); - // clang-format on - } - - // Print table to file. - if (root && post_dir.length() > 0) - { - std::string path = post_dir + "port-S.csv"; - auto output = OutputFile(path, (step > 0)); - if (step == 0) - { - output.print("{:>{}s},", "f (GHz)", table.w1); - for (const auto &data : port_data) - { - std::string str = - "S[" + std::to_string(data.idx) + "][" + std::to_string(source_idx) + "]"; - // clang-format off - output.print("{:>{}s},{:>{}s}{}", - "|" + str + "| (dB)", table.w, - "arg(" + str + ") (deg.)", table.w, - (data.idx == port_data.back().idx) ? "" : ","); - // clang-format on - } - output.print("\n"); - } - // clang-format off - output.print("{:{}.{}e},", - iodata.DimensionalizeValue(IoData::ValueType::FREQUENCY, omega), - table.w1, table.p1); - for (const auto &data : port_data) - { - // clang-format off - output.print("{:+{}.{}e},{:+{}.{}e}{}", - 20.0 * std::log10(std::abs(data.Sij)), table.w, table.p, - std::arg(data.Sij) * 180.0 / M_PI, table.w, table.p, - (data.idx == port_data.back().idx) ? "" : ","); - // clang-format on - } - output.print("\n"); - } -} - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#include "drivensolver.hpp" + +#include +#include +#include "fem/errorindicator.hpp" +#include "fem/mesh.hpp" +#include "linalg/errorestimator.hpp" +#include "linalg/floquetcorrection.hpp" +#include "linalg/ksp.hpp" +#include "linalg/operator.hpp" +#include "linalg/vector.hpp" +#include "models/lumpedportoperator.hpp" +#include "models/portexcitations.hpp" +#include "models/postoperator.hpp" +#include "models/romoperator.hpp" +#include "models/spaceoperator.hpp" +#include "models/surfacecurrentoperator.hpp" +#include "models/waveportoperator.hpp" +#include "utils/communication.hpp" +#include "utils/iodata.hpp" +#include "utils/prettyprint.hpp" +#include "utils/timer.hpp" + +namespace palace +{ + +using namespace std::complex_literals; + +std::pair +DrivenSolver::Solve(const std::vector> &mesh) const +{ + // Set up the spatial discretization and frequency sweep. + BlockTimer bt0(Timer::CONSTRUCT); + SpaceOperator space_op(iodata, mesh); + const auto &port_excitations = space_op.GetPortExcitations(); + SaveMetadata(port_excitations); + + const auto &omega_sample = iodata.solver.driven.sample_f; + + bool adaptive = (iodata.solver.driven.adaptive_tol > 0.0); + if (adaptive && omega_sample.size() <= iodata.solver.driven.prom_indices.size()) + { + Mpi::Warning("Adaptive frequency sweep requires > {} total frequency samples!\n" + "Reverting to uniform sweep!\n", + iodata.solver.driven.prom_indices.size()); + adaptive = false; + } + SaveMetadata(space_op.GetNDSpaces()); + Mpi::Print("\nComputing {}frequency response for:\n{}", adaptive ? "adaptive fast " : "", + port_excitations.FmtLog()); + + auto restart = iodata.solver.driven.restart; + if (restart != 1) + { + int max_iter = omega_sample.size() * space_op.GetPortExcitations().Size(); + MFEM_VERIFY( + restart - 1 < max_iter, + fmt::format("\"Restart\" ({}) is greater than the number of total samples ({})!", + restart, max_iter)); + + Mpi::Print("\nRestarting from solve {}", iodata.solver.driven.restart); + } + + // Main frequency sweep loop. + return {adaptive ? SweepAdaptive(space_op) : SweepUniform(space_op), + space_op.GlobalTrueVSize()}; +} + +ErrorIndicator DrivenSolver::SweepUniform(SpaceOperator &space_op) const +{ + const auto &port_excitations = space_op.GetPortExcitations(); + const auto &omega_sample = iodata.solver.driven.sample_f; + + // Initialize postprocessing for measurement and printers. + // Initialize write directory with default path; will be changed for multi-excitations. + PostOperator post_op(iodata, space_op); + + // Construct the system matrices defining the linear operator. PEC boundaries are handled + // simply by setting diagonal entries of the system matrix for the corresponding dofs. + // Because the Dirichlet BC is always homogeneous, no special elimination is required on + // the RHS. Assemble the linear system for the initial frequency (so we can call + // KspSolver::SetOperators). Compute everything at the first frequency step. + auto K = space_op.GetStiffnessMatrix(Operator::DIAG_ONE); + auto C = space_op.GetDampingMatrix(Operator::DIAG_ZERO); + auto M = space_op.GetMassMatrix(Operator::DIAG_ZERO); + const auto &Curl = space_op.GetCurlMatrix(); + + // Set up the linear solver. + // The operators are constructed for each frequency step and used to initialize the ksp. + ComplexKspSolver ksp(iodata, space_op.GetNDSpaces(), &space_op.GetH1Spaces()); + + // Set up RHS vector for the incident field at port boundaries, and the vector for the + // first frequency step. 
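// The three complex scalars passed to GetSystemMatrix in the frequency loop below
// assemble A(ω) = K + iω C - ω² M + A2(ω), i.e. the second-order model evaluated at
// s = iω. A tiny sketch of just the coefficient computation (the operator sum itself is
// performed inside SpaceOperator, together with the frequency-dependent A2 term):
#include <complex>

struct DrivenCoefficientsSketch
{
  std::complex<double> a0, a1, a2;  // Multiply K, C, and M, respectively.
};

DrivenCoefficientsSketch CoefficientsAt(double omega)
{
  using namespace std::complex_literals;
  return {1.0 + 0.0i, 1i * omega, -omega * omega + 0.0i};
}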
+ ComplexVector RHS(Curl.Width()), E(Curl.Width()), B(Curl.Height()); + RHS.UseDevice(true); + E.UseDevice(true); + B.UseDevice(true); + E = 0.0; + B = 0.0; + + // Initialize structures for storing and reducing the results of error estimation. + TimeDependentFluxErrorEstimator estimator( + space_op.GetMaterialOp(), space_op.GetNDSpaces(), space_op.GetRTSpaces(), + iodata.solver.linear.estimator_tol, iodata.solver.linear.estimator_max_it, 0, + iodata.solver.linear.estimator_mg); + ErrorIndicator indicator; + + // If using Floquet BCs, a correction term (kp x E) needs to be added to the B field. + std::unique_ptr> floquet_corr; + if (space_op.GetMaterialOp().HasWaveVector()) + { + floquet_corr = std::make_unique>( + space_op.GetMaterialOp(), space_op.GetNDSpace(), space_op.GetRTSpace(), + iodata.solver.linear.tol, iodata.solver.linear.max_it, 0); + } + + // Main excitation and frequency loop. + auto t0 = Timer::Now(); + int excitation_counter = 0; + const int excitation_restart_counter = + ((iodata.solver.driven.restart - 1) / omega_sample.size()) + 1; + const int freq_restart_idx = (iodata.solver.driven.restart - 1) % omega_sample.size(); + for (const auto &[excitation_idx, excitation_spec] : port_excitations) + { + if (++excitation_counter < excitation_restart_counter) + { + continue; + } + if (port_excitations.Size() > 1) + { + Mpi::Print("\nSweeping excitation index {:d} ({:d}/{:d}):\n", excitation_idx, + excitation_counter, port_excitations.Size()); + } + // Switch paraview subfolders: one for each excitation, if nr_excitations > 1. + post_op.InitializeParaviewDataCollection(excitation_idx); + + // Frequency loop. + for (std::size_t omega_i = + ((excitation_counter == excitation_restart_counter) ? freq_restart_idx : 0); + omega_i < omega_sample.size(); omega_i++) + { + auto omega = omega_sample[omega_i]; + // Assemble frequency dependent matrices and initialize operators in linear + // solver. + auto A2 = space_op.GetExtraSystemMatrix(omega, Operator::DIAG_ZERO); + auto A = space_op.GetSystemMatrix(1.0 + 0.0i, 1i * omega, -omega * omega + 0.0i, + K.get(), C.get(), M.get(), A2.get()); + auto P = space_op.GetPreconditionerMatrix( + 1.0 + 0.0i, 1i * omega, -omega * omega + 0.0i, omega); + ksp.SetOperators(*A, *P); + + Mpi::Print( + "\nIt {:d}/{:d}: ω/2π = {:.3e} GHz (total elapsed time = {:.2e} s{})\n", + omega_i + 1, omega_sample.size(), + iodata.units.Dimensionalize(omega) / (2 * M_PI), + Timer::Duration(Timer::Now() - t0).count(), + (port_excitations.Size() > 1) + ? fmt::format(", solve {:d}/{:d}", + 1 + omega_i + (excitation_counter - 1) * omega_sample.size(), + omega_sample.size() * port_excitations.Size()) + : ""); + + // Solve linear system. + space_op.GetExcitationVector(excitation_idx, omega, RHS); + Mpi::Print("\n"); + ksp.Mult(RHS, E); + + // Start Post-processing. + BlockTimer bt0(Timer::POSTPRO); + Mpi::Print(" Sol. ||E|| = {:.6e} (||RHS|| = {:.6e})\n", + linalg::Norml2(space_op.GetComm(), E), + linalg::Norml2(space_op.GetComm(), RHS)); + + // Compute B = -1/(iω) ∇ x E on the true dofs. + Curl.Mult(E.Real(), B.Real()); + Curl.Mult(E.Imag(), B.Imag()); + B *= -1.0 / (1i * omega); + if (space_op.GetMaterialOp().HasWaveVector()) + { + // Calculate B field correction for Floquet BCs. + // B = -1/(iω) ∇ x E + 1/ω kp x E + floquet_corr->AddMult(E, B, 1.0 / omega); + } + + auto total_domain_energy = + post_op.MeasureAndPrintAll(excitation_idx, int(omega_i), E, B, omega); + + // Calculate and record the error indicators. 
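// How the 1-based "Restart" counter is split across the excitation and frequency loops
// above, as a standalone sketch with hypothetical numbers: with 10 frequency samples per
// excitation, restart = 14 resumes at the second excitation and frequency index 3 (the
// fourth sample), i.e. overall solve 14.
#include <cstddef>
#include <utility>

std::pair<int, std::size_t> RestartPositionSketch(int restart, std::size_t nfreq)
{
  const int excitation_restart_counter = ((restart - 1) / static_cast<int>(nfreq)) + 1;
  const std::size_t freq_restart_idx = (restart - 1) % nfreq;
  return {excitation_restart_counter, freq_restart_idx};
}

// Usage: RestartPositionSketch(14, 10) returns {2, 3}.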
+ Mpi::Print(" Updating solution error estimates\n"); + estimator.AddErrorIndicator(E, B, total_domain_energy, indicator); + } + + // Final postprocessing & printing. + BlockTimer bt0(Timer::POSTPRO); + SaveMetadata(ksp); + } + post_op.MeasureFinalize(indicator); + return indicator; +} + +ErrorIndicator DrivenSolver::SweepAdaptive(SpaceOperator &space_op) const +{ + const auto &port_excitations = space_op.GetPortExcitations(); + const auto &omega_sample = iodata.solver.driven.sample_f; + + // Initialize postprocessing for measurement and printers. + // Initialize write directory with default path; will be changed for multi-excitations. + PostOperator post_op(iodata, space_op); + + // Configure PROM parameters if not specified. + double offline_tol = iodata.solver.driven.adaptive_tol; + int convergence_memory = iodata.solver.driven.adaptive_memory; + int max_size_per_excitation = iodata.solver.driven.adaptive_max_size; + int nprom_indices = static_cast(iodata.solver.driven.prom_indices.size()); + MFEM_VERIFY(max_size_per_excitation <= 0 || max_size_per_excitation >= nprom_indices, + "Adaptive frequency sweep must sample at least " << nprom_indices + << " frequency points!"); + // Maximum size — no more than nr steps needed. + max_size_per_excitation = + std::min(max_size_per_excitation, static_cast(omega_sample.size())); + + // Allocate negative curl matrix for postprocessing the B-field and vectors for the + // high-dimensional field solution. + const auto &Curl = space_op.GetCurlMatrix(); + ComplexVector E(Curl.Width()), Eh(Curl.Width()), B(Curl.Height()); + E.UseDevice(true); + Eh.UseDevice(true); + B.UseDevice(true); + E = 0.0; + Eh = 0.0; + B = 0.0; + + // Initialize structures for storing and reducing the results of error estimation. + TimeDependentFluxErrorEstimator estimator( + space_op.GetMaterialOp(), space_op.GetNDSpaces(), space_op.GetRTSpaces(), + iodata.solver.linear.estimator_tol, iodata.solver.linear.estimator_max_it, 0, + iodata.solver.linear.estimator_mg); + ErrorIndicator indicator; + + // If using Floquet BCs, a correction term (kp x E) needs to be added to the B field. + std::unique_ptr> floquet_corr; + if (space_op.GetMaterialOp().HasWaveVector()) + { + floquet_corr = std::make_unique>( + space_op.GetMaterialOp(), space_op.GetNDSpace(), space_op.GetRTSpace(), + iodata.solver.linear.tol, iodata.solver.linear.max_it, 0); + } + + // Configure the PROM operator which performs the parameter space sampling and basis + // construction during the offline phase as well as the PROM solution during the online + // phase. + auto t0 = Timer::Now(); + const double unit_GHz = + iodata.units.Dimensionalize(1.0) / (2 * M_PI); + Mpi::Print("\nBeginning PROM construction offline phase:\n" + " {:d} points for frequency sweep over [{:.3e}, {:.3e}] GHz\n", + omega_sample.size(), omega_sample.front() * unit_GHz, + omega_sample.back() * unit_GHz); + RomOperator prom_op(iodata, space_op, max_size_per_excitation); + space_op.GetWavePortOp().SetSuppressOutput(true); + + // Initialize the basis with samples from the top and bottom of the frequency + // range of interest. Each call for an HDM solution adds the frequency sample to P_S and + // removes it from P \ P_S. Timing for the HDM construction and solve is handled inside + // of the RomOperator. + auto UpdatePROM = [&](int excitation_idx, double omega) + { + // Add the HDM solution to the PROM reduced basis. 
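// The greedy sampling loop further below terminates on a "convergence memory": the
// counter increments for every consecutive sample whose relative PROM error falls below
// the offline tolerance and resets to zero otherwise, and sampling stops once it reaches
// the configured memory (or the maximum basis size). A standalone sketch of that
// stopping rule, with a hypothetical callback for the sample-and-error step:
#include <functional>

int GreedyWithMemorySketch(double tol, int max_size, int convergence_memory,
                           const std::function<double()> &sample_and_get_error,
                           int size)
{
  int memory = 0;
  for (; size < max_size && memory < convergence_memory; size++)
  {
    const double rel_error = sample_and_get_error();  // HDM vs. PROM error at ω*.
    memory = (rel_error < tol) ? memory + 1 : 0;
  }
  return size;  // Final reduced basis size for this excitation.
}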
+ prom_op.UpdatePROM(E); + prom_op.UpdateMRI(excitation_idx, omega, E); + + // Compute B = -1/(iω) ∇ x E on the true dofs, and set the internal GridFunctions in + // PostOperator for energy postprocessing and error estimation. + BlockTimer bt0(Timer::POSTPRO); + Curl.Mult(E.Real(), B.Real()); + Curl.Mult(E.Imag(), B.Imag()); + B *= -1.0 / (1i * omega); + if (space_op.GetMaterialOp().HasWaveVector()) + { + // Calculate B field correction for Floquet BCs. + // B = -1/(iω) ∇ x E + 1/ω kp x E + floquet_corr->AddMult(E, B, 1.0 / omega); + } + + // Measure domain energies for the error indicator only. Don't exchange face_nbr_data, + // unless printing paraview fields. + auto total_domain_energy = post_op.MeasureDomainFieldEnergyOnly(E, B); + estimator.AddErrorIndicator(E, B, total_domain_energy, indicator); + }; + + // Loop excitations to add to PROM. + // + // Restart should not really be used for adaptive sweeps, but must work. Construct PROM in + // the same way same regardless of restart for consistency. Don't shift excitation start. + int excitation_counter = 0; + for (const auto &[excitation_idx, excitation_spec] : port_excitations) + { + if (port_excitations.Size() > 1) + { + Mpi::Print("\nAdding excitation index {:d} ({:d}/{:d}):\n", excitation_idx, + ++excitation_counter, port_excitations.Size()); + } + prom_op.SetExcitationIndex(excitation_idx); // Pre-compute RHS1 + + // Initialize PROM with explicit HDM samples, record the estimate but do not act on it. + std::vector max_errors; + for (auto i : iodata.solver.driven.prom_indices) + { + auto omega = omega_sample[i]; + prom_op.SolveHDM(excitation_idx, omega, E); + prom_op.SolvePROM(excitation_idx, omega, Eh); + linalg::AXPY(-1.0, E, Eh); + max_errors.push_back(linalg::Norml2(space_op.GetComm(), Eh) / + linalg::Norml2(space_op.GetComm(), E)); + UpdatePROM(excitation_idx, omega); + } + // The estimates associated to the end points are assumed inaccurate. + max_errors[0] = std::numeric_limits::infinity(); + max_errors[1] = std::numeric_limits::infinity(); + int memory = std::distance(max_errors.rbegin(), + std::find_if(max_errors.rbegin(), max_errors.rend(), + [=](auto x) { return x > offline_tol; })); + + // Greedy procedure for basis construction (offline phase). Basis is initialized with + // solutions at frequency sweep endpoints and explicit sample frequencies. + int it = static_cast(max_errors.size()); + for (int it0 = it; it < max_size_per_excitation && memory < convergence_memory; it++) + { + // Compute the location of the maximum error in parameter domain (bounded by the + // previous samples). + double omega_star = prom_op.FindMaxError(excitation_idx)[0]; + + // Sample HDM and add solution to basis. + prom_op.SolveHDM(excitation_idx, omega_star, E); + prom_op.SolvePROM(excitation_idx, omega_star, Eh); + linalg::AXPY(-1.0, E, Eh); + max_errors.push_back(linalg::Norml2(space_op.GetComm(), Eh) / + linalg::Norml2(space_op.GetComm(), E)); + memory = max_errors.back() < offline_tol ? memory + 1 : 0; + + Mpi::Print("\nGreedy iteration {:d} (n = {:d}): ω* = {:.3e} GHz ({:.3e}), error = " + "{:.3e}{}\n", + it - it0 + 1, prom_op.GetReducedDimension(), omega_star * unit_GHz, + omega_star, max_errors.back(), + (memory == 0) + ? "" + : fmt::format(", memory = {:d}/{:d}", memory, convergence_memory)); + UpdatePROM(excitation_idx, omega_star); + } + Mpi::Print("\nAdaptive sampling{} {:d} frequency samples:\n" + " n = {:d}, error = {:.3e}, tol = {:.3e}, memory = {:d}/{:d}\n", + (it == max_size_per_excitation) ? 
" reached maximum" : " converged with", it, + prom_op.GetReducedDimension(), max_errors.back(), offline_tol, memory, + convergence_memory); + utils::PrettyPrint(prom_op.GetSamplePoints(excitation_idx), unit_GHz, + " Sampled frequencies (GHz):"); + utils::PrettyPrint(max_errors, 1.0, " Sample errors:"); + } + + Mpi::Print(" Total offline phase elapsed time: {:.2e} s\n", + Timer::Duration(Timer::Now() - t0).count()); // Timing on root + + // XX TODO: Add output of eigenvalue estimates from the PROM system (and nonlinear EVP + // in the general case with wave ports, etc.?) + + // Main fast frequency sweep loop (online phase). + Mpi::Print("\nBeginning fast frequency sweep online phase\n"); + space_op.GetWavePortOp().SetSuppressOutput(false); // Disable output suppression + for (const auto &[excitation_idx, excitation_spec] : port_excitations) + { + if (port_excitations.Size() > 1) + { + Mpi::Print("\nSweeping excitation index {:d} ({:d}/{:d}):\n", excitation_idx, + excitation_counter, port_excitations.Size()); + } + // Switch paraview subfolders: one for each excitation, if nr_excitations > 1. + post_op.InitializeParaviewDataCollection(excitation_idx); + + // Frequency loop. + for (std::size_t omega_i = 0; omega_i < omega_sample.size(); omega_i++) + { + auto omega = omega_sample[omega_i]; + Mpi::Print("\nIt {:d}/{:d}: ω/2π = {:.3e} GHz (total elapsed time = {:.2e} s)\n", + omega_i + 1, omega_sample.size(), + iodata.units.Dimensionalize(omega) / + (2 * M_PI), + Timer::Duration(Timer::Now() - t0).count()); + + // Assemble and solve the PROM linear system. + prom_op.SolvePROM(excitation_idx, omega, E); + Mpi::Print("\n"); + + // Start Post-processing. + BlockTimer bt0(Timer::POSTPRO); + Mpi::Print(" Sol. ||E|| = {:.6e}\n", linalg::Norml2(space_op.GetComm(), E)); + + // Compute B = -1/(iω) ∇ x E on the true dofs. + Curl.Mult(E.Real(), B.Real()); + Curl.Mult(E.Imag(), B.Imag()); + B *= -1.0 / (1i * omega); + if (space_op.GetMaterialOp().HasWaveVector()) + { + // Calculate B field correction for Floquet BCs. + // B = -1/(iω) ∇ x E + 1/ω kp x E + floquet_corr->AddMult(E, B, 1.0 / omega); + } + post_op.MeasureAndPrintAll(excitation_idx, int(omega_i), E, B, omega); + } + + // Final postprocessing & printing: no change to indicator since these are in PROM. + BlockTimer bt0(Timer::POSTPRO); + SaveMetadata(prom_op.GetLinearSolver()); + } + post_op.MeasureFinalize(indicator); + return indicator; +} + +} // namespace palace diff --git a/palace/drivers/drivensolver.hpp b/palace/drivers/drivensolver.hpp index 2a92b9e995..c2452007fd 100644 --- a/palace/drivers/drivensolver.hpp +++ b/palace/drivers/drivensolver.hpp @@ -1,72 +1,40 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_DRIVERS_DRIVEN_SOLVER_HPP -#define PALACE_DRIVERS_DRIVEN_SOLVER_HPP - -#include -#include -#include "drivers/basesolver.hpp" - -namespace mfem -{ - -class ParMesh; - -} // namespace mfem - -namespace palace -{ - -class ErrorIndicator; -class IoData; -class LumpedPortOperator; -class PostOperator; -class SpaceOperator; -class SurfaceCurrentOperator; -class Timer; -class WavePortOperator; - -// -// Driver class for driven terminal simulations. 
-// -class DrivenSolver : public BaseSolver -{ -private: - int GetNumSteps(double start, double end, double delta) const; - - ErrorIndicator SweepUniform(SpaceOperator &spaceop, PostOperator &postop, int nstep, - int step0, double omega0, double delta_omega) const; - - ErrorIndicator SweepAdaptive(SpaceOperator &spaceop, PostOperator &postop, int nstep, - int step0, double omega0, double delta_omega) const; - - void Postprocess(const PostOperator &postop, const LumpedPortOperator &lumped_port_op, - const WavePortOperator &wave_port_op, - const SurfaceCurrentOperator &surf_j_op, int step, double omega, - double E_elec, double E_mag, bool full, - const ErrorIndicator *indicator) const; - - void PostprocessCurrents(const PostOperator &postop, - const SurfaceCurrentOperator &surf_j_op, int step, - double omega) const; - - void PostprocessPorts(const PostOperator &postop, - const LumpedPortOperator &lumped_port_op, int step, - double omega) const; - - void PostprocessSParameters(const PostOperator &postop, - const LumpedPortOperator &lumped_port_op, - const WavePortOperator &wave_port_op, int step, - double omega) const; - - std::pair - Solve(const std::vector> &mesh) const override; - -public: - using BaseSolver::BaseSolver; -}; - -} // namespace palace - -#endif // PALACE_DRIVERS_DRIVEN_SOLVER_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_DRIVERS_DRIVEN_SOLVER_HPP +#define PALACE_DRIVERS_DRIVEN_SOLVER_HPP + +#include +#include +#include "drivers/basesolver.hpp" +#include "utils/configfile.hpp" + +namespace palace +{ + +class ErrorIndicator; +class Mesh; +template +class PostOperator; +class SpaceOperator; + +// +// Driver class for driven terminal simulations. +// +class DrivenSolver : public BaseSolver +{ +private: + ErrorIndicator SweepUniform(SpaceOperator &space_op) const; + + ErrorIndicator SweepAdaptive(SpaceOperator &space_op) const; + + std::pair + Solve(const std::vector> &mesh) const override; + +public: + using BaseSolver::BaseSolver; +}; + +} // namespace palace + +#endif // PALACE_DRIVERS_DRIVEN_SOLVER_HPP diff --git a/palace/drivers/eigensolver.cpp b/palace/drivers/eigensolver.cpp index 55c3cc1a0a..5430c62825 100644 --- a/palace/drivers/eigensolver.cpp +++ b/palace/drivers/eigensolver.cpp @@ -1,626 +1,454 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "eigensolver.hpp" - -#include -#include "fem/errorindicator.hpp" -#include "linalg/arpack.hpp" -#include "linalg/divfree.hpp" -#include "linalg/errorestimator.hpp" -#include "linalg/ksp.hpp" -#include "linalg/operator.hpp" -#include "linalg/slepc.hpp" -#include "linalg/vector.hpp" -#include "models/lumpedportoperator.hpp" -#include "models/postoperator.hpp" -#include "models/spaceoperator.hpp" -#include "utils/communication.hpp" -#include "utils/iodata.hpp" -#include "utils/timer.hpp" - -namespace palace -{ - -using namespace std::complex_literals; - -std::pair -EigenSolver::Solve(const std::vector> &mesh) const -{ - // Construct and extract the system matrices defining the eigenvalue problem. The diagonal - // values for the mass matrix PEC dof shift the Dirichlet eigenvalues out of the - // computational range. The damping matrix may be nullptr. 
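// The two formulations referenced below: with damping, the quadratic problem
// (K + λ C + λ² M) u = 0 is solved for λ = iω, while without damping the linear
// generalized problem K u = μ M u is solved for μ = -λ² = ω². A minimal sketch of the
// conversion back to the complex frequency ω, mirroring the postprocessing loop over
// converged eigenvalues further down:
#include <complex>

std::complex<double> EigenvalueToOmegaSketch(std::complex<double> eig, bool has_damping)
{
  using namespace std::complex_literals;
  return has_damping ? eig / 1i         // Quadratic EVP returns λ = iω.
                     : std::sqrt(eig);  // Linear EVP returns μ = ω².
}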
- BlockTimer bt0(Timer::CONSTRUCT); - SpaceOperator spaceop(iodata, mesh); - auto K = spaceop.GetStiffnessMatrix(Operator::DIAG_ONE); - auto C = spaceop.GetDampingMatrix(Operator::DIAG_ZERO); - auto M = spaceop.GetMassMatrix(Operator::DIAG_ZERO); - const auto &Curl = spaceop.GetCurlMatrix(); - SaveMetadata(spaceop.GetNDSpaces()); - - // Configure objects for postprocessing. - PostOperator postop(iodata, spaceop, "eigenmode"); - ComplexVector E(Curl.Width()), B(Curl.Height()); - - // Define and configure the eigensolver to solve the eigenvalue problem: - // (K + λ C + λ² M) u = 0 or K u = -λ² M u - // with λ = iω. In general, the system matrices are complex and symmetric. - std::unique_ptr eigen; - config::EigenSolverData::Type type = iodata.solver.eigenmode.type; -#if defined(PALACE_WITH_ARPACK) && defined(PALACE_WITH_SLEPC) - if (type == config::EigenSolverData::Type::DEFAULT) - { - type = config::EigenSolverData::Type::SLEPC; - } -#elif defined(PALACE_WITH_ARPACK) - if (iodata.solver.eigenmode.type == config::EigenSolverData::Type::SLEPC) - { - Mpi::Warning("SLEPc eigensolver not available, using ARPACK!\n"); - } - else if (iodata.solver.eigenmode.type == config::EigenSolverData::Type::FEAST) - { - Mpi::Warning("FEAST eigensolver requires SLEPc, using ARPACK!\n"); - } - type = config::EigenSolverData::Type::ARPACK; -#elif defined(PALACE_WITH_SLEPC) - if (iodata.solver.eigenmode.type == config::EigenSolverData::Type::ARPACK) - { - Mpi::Warning("ARPACK eigensolver not available, using SLEPc!\n"); - } - type = config::EigenSolverData::Type::SLEPC; -#else -#error "Eigenmode solver requires building with ARPACK or SLEPc!" -#endif - if (type == config::EigenSolverData::Type::FEAST) - { - MFEM_ABORT("FEAST eigenvalue solver is currently not supported!"); - } - else if (type == config::EigenSolverData::Type::ARPACK) - { -#if defined(PALACE_WITH_ARPACK) - Mpi::Print("\nConfiguring ARPACK eigenvalue solver\n"); - if (C) - { - eigen = std::make_unique(spaceop.GetComm(), - iodata.problem.verbose); - } - else - { - eigen = std::make_unique(spaceop.GetComm(), - iodata.problem.verbose); - } -#endif - } - else // config::EigenSolverData::Type::SLEPC - { -#if defined(PALACE_WITH_SLEPC) - Mpi::Print("\nConfiguring SLEPc eigenvalue solver\n"); - std::unique_ptr slepc; - if (C) - { - if (!iodata.solver.eigenmode.pep_linear) - { - slepc = std::make_unique(spaceop.GetComm(), - iodata.problem.verbose); - slepc->SetType(slepc::SlepcEigenvalueSolver::Type::TOAR); - } - else - { - slepc = std::make_unique(spaceop.GetComm(), - iodata.problem.verbose); - slepc->SetType(slepc::SlepcEigenvalueSolver::Type::KRYLOVSCHUR); - } - } - else - { - slepc = std::make_unique(spaceop.GetComm(), - iodata.problem.verbose); - slepc->SetType(slepc::SlepcEigenvalueSolver::Type::KRYLOVSCHUR); - } - slepc->SetProblemType(slepc::SlepcEigenvalueSolver::ProblemType::GEN_NON_HERMITIAN); - slepc->SetOrthogonalization( - iodata.solver.linear.gs_orthog_type == config::LinearSolverData::OrthogType::MGS, - iodata.solver.linear.gs_orthog_type == config::LinearSolverData::OrthogType::CGS2); - eigen = std::move(slepc); -#endif - } - EigenvalueSolver::ScaleType scale = iodata.solver.eigenmode.scale - ? 
EigenvalueSolver::ScaleType::NORM_2 - : EigenvalueSolver::ScaleType::NONE; - if (C) - { - eigen->SetOperators(*K, *C, *M, scale); - } - else - { - eigen->SetOperators(*K, *M, scale); - } - eigen->SetNumModes(iodata.solver.eigenmode.n, iodata.solver.eigenmode.max_size); - eigen->SetTol(iodata.solver.eigenmode.tol); - eigen->SetMaxIter(iodata.solver.eigenmode.max_it); - Mpi::Print(" Scaling γ = {:.3e}, δ = {:.3e}\n", eigen->GetScalingGamma(), - eigen->GetScalingDelta()); - - // If desired, use an M-inner product for orthogonalizing the eigenvalue subspace. The - // constructed matrix just references the real SPD part of the mass matrix (no copy is - // performed). Boundary conditions don't need to be eliminated here. - std::unique_ptr KM; - if (iodata.solver.eigenmode.mass_orthog) - { - // Mpi::Print(" Basis uses M-inner product\n"); - // KM = spaceop.GetInnerProductMatrix(0.0, 1.0, nullptr, M.get()); - // eigen->SetBMat(*KM); - - Mpi::Print(" Basis uses (K + M)-inner product\n"); - KM = spaceop.GetInnerProductMatrix(1.0, 1.0, K.get(), M.get()); - eigen->SetBMat(*KM); - } - - // Construct a divergence-free projector so the eigenvalue solve is performed in the space - // orthogonal to the zero eigenvalues of the stiffness matrix. - std::unique_ptr divfree; - if (iodata.solver.linear.divfree_max_it > 0) - { - constexpr int divfree_verbose = 0; - divfree = std::make_unique( - spaceop.GetMaterialOp(), spaceop.GetNDSpace(), spaceop.GetH1Spaces(), - spaceop.GetAuxBdrTDofLists(), iodata.solver.linear.divfree_tol, - iodata.solver.linear.divfree_max_it, divfree_verbose, - iodata.solver.pa_order_threshold); - eigen->SetDivFreeProjector(*divfree); - } - - // Set up the initial space for the eigenvalue solve. Satisfies boundary conditions and is - // projected appropriately. - if (iodata.solver.eigenmode.init_v0) - { - ComplexVector v0; - if (iodata.solver.eigenmode.init_v0_const) - { - Mpi::Print(" Using constant starting vector\n"); - spaceop.GetConstantInitialVector(v0); - } - else - { - Mpi::Print(" Using random starting vector\n"); - spaceop.GetRandomInitialVector(v0); - } - if (divfree) - { - divfree->Mult(v0); - } - eigen->SetInitialSpace(v0); // Copies the vector - - // Debug - // const auto &Grad = spaceop.GetGradMatrix(); - // ComplexVector r0(Grad->Width()); - // Grad.MultTranspose(v0.Real(), r0.Real()); - // Grad.MultTranspose(v0.Imag(), r0.Imag()); - // r0.Print(); - } - - // Configure the shift-and-invert strategy is employed to solve for the eigenvalues - // closest to the specified target, σ. - const double target = iodata.solver.eigenmode.target; - const double f_target = iodata.DimensionalizeValue(IoData::ValueType::FREQUENCY, target); - Mpi::Print(" Shift-and-invert σ = {:.3e} GHz ({:.3e})\n", f_target, target); - if (C) - { - // Search for eigenvalues closest to λ = iσ. - eigen->SetShiftInvert(1i * target); - if (type == config::EigenSolverData::Type::ARPACK) - { - // ARPACK searches based on eigenvalues of the transformed problem. The eigenvalue - // 1 / (λ - σ) will be a large-magnitude negative imaginary number for an eigenvalue - // λ with frequency close to but not below the target σ. - eigen->SetWhichEigenpairs(EigenvalueSolver::WhichType::SMALLEST_IMAGINARY); - } - else - { - eigen->SetWhichEigenpairs(EigenvalueSolver::WhichType::TARGET_IMAGINARY); - } - } - else - { - // Linear EVP has eigenvalues μ = -λ² = ω². Search for eigenvalues closest to μ = σ². 
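// Why shift-and-invert targets the modes nearest the shift: under the spectral
// transformation an original eigenvalue μ maps to θ = 1/(μ - σ²), so the μ closest to
// the target σ² produces the largest-magnitude θ and is found first by the Krylov
// solver. A toy illustration with hypothetical values:
#include <cmath>

double ShiftInvertMagnitudeSketch(double mu, double sigma)
{
  return 1.0 / std::abs(mu - sigma * sigma);
}

// E.g. with σ² = 4: mu = 4.1 gives |θ| = 10, while mu = 9 gives |θ| = 0.2, so the mode
// at mu = 4.1 dominates the transformed spectrum.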
- eigen->SetShiftInvert(target * target); - if (type == config::EigenSolverData::Type::ARPACK) - { - // ARPACK searches based on eigenvalues of the transformed problem. 1 / (μ - σ²) - // will be a large-magnitude positive real number for an eigenvalue μ with frequency - // close to but below the target σ². - eigen->SetWhichEigenpairs(EigenvalueSolver::WhichType::LARGEST_REAL); - } - else - { - eigen->SetWhichEigenpairs(EigenvalueSolver::WhichType::TARGET_REAL); - } - } - - // Set up the linear solver required for solving systems involving the shifted operator - // (K - σ² M) or P(iσ) = (K + iσ C - σ² M) during the eigenvalue solve. The - // preconditioner for complex linear systems is constructed from a real approximation - // to the complex system matrix. - auto A = spaceop.GetSystemMatrix(std::complex(1.0, 0.0), 1i * target, - std::complex(-target * target, 0.0), K.get(), - C.get(), M.get()); - auto P = spaceop.GetPreconditionerMatrix(1.0, target, -target * target, - target); - - auto ksp = std::make_unique(iodata, spaceop.GetNDSpaces(), - &spaceop.GetH1Spaces()); - ksp->SetOperators(*A, *P); - eigen->SetLinearSolver(*ksp); - - // Eigenvalue problem solve. - BlockTimer bt1(Timer::SOLVE); - Mpi::Print("\n"); - int num_conv = eigen->Solve(); - { - std::complex lambda = (num_conv > 0) ? eigen->GetEigenvalue(0) : 0.0; - Mpi::Print(" Found {:d} converged eigenvalue{}{}\n\n", num_conv, - (num_conv > 1) ? "s" : "", - (num_conv > 0) - ? fmt::format(" (first = {:.3e}{:+.3e}i)", lambda.real(), lambda.imag()) - : ""); - } - SaveMetadata(*ksp); - - // Calculate and record the error indicators. - Mpi::Print("Computing solution error estimates\n\n"); - CurlFluxErrorEstimator estimator( - spaceop.GetMaterialOp(), spaceop.GetNDSpaces(), iodata.solver.linear.estimator_tol, - iodata.solver.linear.estimator_max_it, 0, iodata.solver.pa_order_threshold); - ErrorIndicator indicator; - for (int i = 0; i < iodata.solver.eigenmode.n; i++) - { - eigen->GetEigenvector(i, E); - estimator.AddErrorIndicator(E, indicator); - } - - // Postprocess the results. - BlockTimer bt2(Timer::POSTPRO); - for (int i = 0; i < num_conv; i++) - { - // Get the eigenvalue and relative error. - std::complex omega = eigen->GetEigenvalue(i); - double error_bkwd = eigen->GetError(i, EigenvalueSolver::ErrorType::BACKWARD); - double error_abs = eigen->GetError(i, EigenvalueSolver::ErrorType::ABSOLUTE); - if (!C) - { - // Linear EVP has eigenvalue μ = -λ² = ω². - omega = std::sqrt(omega); - } - else - { - // Quadratic EVP solves for eigenvalue λ = iω. - omega /= 1i; - } - - // Compute B = -1/(iω) ∇ x E on the true dofs, and set the internal GridFunctions in - // PostOperator for all postprocessing operations. - eigen->GetEigenvector(i, E); - Curl.Mult(E.Real(), B.Real()); - Curl.Mult(E.Imag(), B.Imag()); - B *= -1.0 / (1i * omega); - postop.SetEGridFunction(E); - postop.SetBGridFunction(B); - postop.UpdatePorts(spaceop.GetLumpedPortOp(), omega.real()); - - // Postprocess the mode. - Postprocess(postop, spaceop.GetLumpedPortOp(), i, omega, error_bkwd, error_abs, - num_conv, (i == 0) ? &indicator : nullptr); - } - return {indicator, spaceop.GlobalTrueVSize()}; -} - -void EigenSolver::Postprocess(const PostOperator &postop, - const LumpedPortOperator &lumped_port_op, int i, - std::complex omega, double error_bkwd, - double error_abs, int num_conv, - const ErrorIndicator *indicator) const -{ - // The internal GridFunctions for PostOperator have already been set from the E and B - // solutions in the main loop over converged eigenvalues. 
Note: The energies output are - // nondimensional (they can be dimensionalized using the scaling μ₀ * H₀² * L₀³, which - // are the free space permeability, characteristic magnetic field strength, and - // characteristic length scale, respectively). - double E_elec = postop.GetEFieldEnergy(); - double E_mag = postop.GetHFieldEnergy(); - double E_cap = postop.GetLumpedCapacitorEnergy(lumped_port_op); - double E_ind = postop.GetLumpedInductorEnergy(lumped_port_op); - PostprocessEigen(i, omega, error_bkwd, error_abs, num_conv); - PostprocessPorts(postop, lumped_port_op, i); - PostprocessEPR(postop, lumped_port_op, i, omega, E_elec + E_cap); - PostprocessDomains(postop, "m", i, i + 1, E_elec, E_mag, E_cap, E_ind); - PostprocessSurfaces(postop, "m", i, i + 1, E_elec + E_cap, E_mag + E_ind, 1.0, 1.0); - PostprocessProbes(postop, "m", i, i + 1); - if (i < iodata.solver.eigenmode.n_post) - { - PostprocessFields(postop, i, i + 1, indicator); - Mpi::Print(" Wrote mode {:d} to disk\n", i + 1); - } - if (indicator) - { - PostprocessErrorIndicator(postop, *indicator); - } -} - -namespace -{ - -struct PortVIData -{ - const int idx; // Lumped port index - const std::complex Vi, Ii; // Port voltage, current -}; - -struct EprLData -{ - const int idx; // Lumped port index - const double pj; // Inductor energy-participation ratio -}; - -struct EprIOData -{ - const int idx; // Lumped port index - const double Ql; // Quality factor - const double Kl; // κ for loss rate -}; - -} // namespace - -void EigenSolver::PostprocessEigen(int i, std::complex omega, double error_bkwd, - double error_abs, int num_conv) const -{ - // Dimensionalize the result and print in a nice table of frequencies and Q-factors. Save - // to file if user has specified. - const std::complex f = { - iodata.DimensionalizeValue(IoData::ValueType::FREQUENCY, omega.real()), - iodata.DimensionalizeValue(IoData::ValueType::FREQUENCY, omega.imag())}; - const double Q = - (f.imag() == 0.0) ? mfem::infinity() : 0.5 * std::abs(f) / std::abs(f.imag()); - - // Print table to stdout. - { - const int int_width = 1 + static_cast(std::log10(num_conv)); - constexpr int p = 6; - constexpr int w = 6 + p + 7; // Column spaces + precision + extra for table - if (i == 0) - { - // clang-format off - Mpi::Print("{:>{}s}{:>{}s}{:>{}s}{:>{}s}{:>{}s}\n{}\n", - "m", int_width, - "Re{ω}/2π (GHz)", w, - "Im{ω}/2π (GHz)", w, - "Bkwd. Error", w, - "Abs. Error", w, - std::string(int_width + 4 * w, '=')); - // clang-format on - } - // clang-format off - Mpi::Print("{:{}d}{:+{}.{}e}{:+{}.{}e}{:+{}.{}e}{:+{}.{}e}\n", - i + 1, int_width, - f.real(), w, p, - f.imag(), w, p, - error_bkwd, w, p, - error_abs, w, p); - // clang-format on - } - - // Print table to file. 
- if (root && post_dir.length() > 0) - { - std::string path = post_dir + "eig.csv"; - auto output = OutputFile(path, (i > 0)); - if (i == 0) - { - // clang-format off - output.print("{:>{}s},{:>{}s},{:>{}s},{:>{}s},{:>{}s},{:>{}s}\n", - "m", table.w1, - "Re{f} (GHz)", table.w, - "Im{f} (GHz)", table.w, - "Q", table.w, - "Error (Bkwd.)", table.w, - "Error (Abs.)", table.w); - // clang-format on - } - // clang-format off - output.print("{:{}.{}e},{:+{}.{}e},{:+{}.{}e},{:+{}.{}e},{:+{}.{}e},{:+{}.{}e}\n", - static_cast(i + 1), table.w1, table.p1, - f.real(), table.w, table.p, - f.imag(), table.w, table.p, - Q, table.w, table.p, - error_bkwd, table.w, table.p, - error_abs, table.w, table.p); - // clang-format on - } -} - -void EigenSolver::PostprocessPorts(const PostOperator &postop, - const LumpedPortOperator &lumped_port_op, int i) const -{ - // Postprocess the frequency domain lumped port voltages and currents (complex magnitude - // = sqrt(2) * RMS). - if (post_dir.length() == 0) - { - return; - } - std::vector port_data; - port_data.reserve(lumped_port_op.Size()); - for (const auto &[idx, data] : lumped_port_op) - { - const std::complex Vi = postop.GetPortVoltage(lumped_port_op, idx); - const std::complex Ii = postop.GetPortCurrent(lumped_port_op, idx); - port_data.push_back({idx, iodata.DimensionalizeValue(IoData::ValueType::VOLTAGE, Vi), - iodata.DimensionalizeValue(IoData::ValueType::CURRENT, Ii)}); - } - if (root && !port_data.empty()) - { - // Write the port voltages. - { - std::string path = post_dir + "port-V.csv"; - auto output = OutputFile(path, (i > 0)); - if (i == 0) - { - output.print("{:>{}s},", "m", table.w1); - for (const auto &data : port_data) - { - // clang-format off - output.print("{:>{}s},{:>{}s}{}", - "Re{V[" + std::to_string(data.idx) + "]} (V)", table.w, - "Im{V[" + std::to_string(data.idx) + "]} (V)", table.w, - (data.idx == port_data.back().idx) ? "" : ","); - // clang-format on - } - output.print("\n"); - } - // clang-format off - output.print("{:{}.{}e},", - static_cast(i + 1), table.w1, table.p1); - // clang-format on - for (const auto &data : port_data) - { - // clang-format off - output.print("{:+{}.{}e},{:+{}.{}e}{}", - data.Vi.real(), table.w, table.p, - data.Vi.imag(), table.w, table.p, - (data.idx == port_data.back().idx) ? "" : ","); - // clang-format on - } - output.print("\n"); - } - - // Write the port currents. - { - std::string path = post_dir + "port-I.csv"; - auto output = OutputFile(path, (i > 0)); - if (i == 0) - { - output.print("{:>{}s},", "m", table.w1); - for (const auto &data : port_data) - { - // clang-format off - output.print("{:>{}s},{:>{}s}{}", - "Re{I[" + std::to_string(data.idx) + "]} (A)", table.w, - "Im{I[" + std::to_string(data.idx) + "]} (A)", table.w, - (data.idx == port_data.back().idx) ? "" : ","); - // clang-format on - } - output.print("\n"); - } - // clang-format off - output.print("{:{}.{}e},", - static_cast(i + 1), table.w1, table.p1); - // clang-format on - for (const auto &data : port_data) - { - // clang-format off - output.print("{:+{}.{}e},{:+{}.{}e}{}", - data.Ii.real(), table.w, table.p, - data.Ii.imag(), table.w, table.p, - (data.idx == port_data.back().idx) ? 
"" : ","); - // clang-format on - } - output.print("\n"); - } - } -} - -void EigenSolver::PostprocessEPR(const PostOperator &postop, - const LumpedPortOperator &lumped_port_op, int i, - std::complex omega, double Em) const -{ - // If ports have been specified in the model, compute the corresponding energy- - // participation ratios (EPR) and write out to disk. - if (post_dir.length() == 0) - { - return; - } - - // Write the mode EPR for lumped inductor elements. - std::vector epr_L_data; - epr_L_data.reserve(lumped_port_op.Size()); - for (const auto &[idx, data] : lumped_port_op) - { - if (std::abs(data.GetL()) > 0.0) - { - const double pj = postop.GetInductorParticipation(lumped_port_op, idx, Em); - epr_L_data.push_back({idx, pj}); - } - } - if (root && !epr_L_data.empty()) - { - std::string path = post_dir + "port-EPR.csv"; - auto output = OutputFile(path, (i > 0)); - if (i == 0) - { - output.print("{:>{}s},", "m", table.w1); - for (const auto &data : epr_L_data) - { - // clang-format off - output.print("{:>{}s}{}", - "p[" + std::to_string(data.idx) + "]", table.w, - (data.idx == epr_L_data.back().idx) ? "" : ","); - // clang-format on - } - output.print("\n"); - } - output.print("{:{}.{}e},", static_cast(i + 1), table.w1, table.p1); - for (const auto &data : epr_L_data) - { - // clang-format off - output.print("{:+{}.{}e}{}", - data.pj, table.w, table.p, - (data.idx == epr_L_data.back().idx) ? "" : ","); - // clang-format on - } - output.print("\n"); - } - - // Write the mode EPR for lumped resistor elements. - std::vector epr_IO_data; - epr_IO_data.reserve(lumped_port_op.Size()); - for (const auto &[idx, data] : lumped_port_op) - { - if (std::abs(data.GetR()) > 0.0) - { - const double Kl = postop.GetExternalKappa(lumped_port_op, idx, Em); - const double Ql = (Kl == 0.0) ? mfem::infinity() : omega.real() / std::abs(Kl); - epr_IO_data.push_back( - {idx, Ql, iodata.DimensionalizeValue(IoData::ValueType::FREQUENCY, Kl)}); - } - } - if (root && !epr_IO_data.empty()) - { - std::string path = post_dir + "port-Q.csv"; - auto output = OutputFile(path, (i > 0)); - if (i == 0) - { - output.print("{:>{}s},", "m", table.w1); - for (const auto &data : epr_IO_data) - { - // clang-format off - output.print("{:>{}s},{:>{}s}{}", - "Q_ext[" + std::to_string(data.idx) + "]", table.w, - "κ_ext[" + std::to_string(data.idx) + "] (GHz)", table.w, - (data.idx == epr_IO_data.back().idx) ? "" : ","); - // clang-format on - } - output.print("\n"); - } - output.print("{:{}.{}e},", static_cast(i + 1), table.w1, table.p1); - for (const auto &data : epr_IO_data) - { - // clang-format off - output.print("{:+{}.{}e},{:+{}.{}e}{}", - data.Ql, table.w, table.p, - data.Kl, table.w, table.p, - (data.idx == epr_IO_data.back().idx) ? "" : ","); - // clang-format on - } - output.print("\n"); - } -} - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#include "eigensolver.hpp" + +#include +#include +#include "fem/errorindicator.hpp" +#include "fem/mesh.hpp" +#include "linalg/arpack.hpp" +#include "linalg/divfree.hpp" +#include "linalg/errorestimator.hpp" +#include "linalg/floquetcorrection.hpp" +#include "linalg/ksp.hpp" +#include "linalg/nleps.hpp" +#include "linalg/operator.hpp" +#include "linalg/rap.hpp" +#include "linalg/slepc.hpp" +#include "linalg/vector.hpp" +#include "models/lumpedportoperator.hpp" +#include "models/postoperator.hpp" +#include "models/spaceoperator.hpp" +#include "utils/communication.hpp" +#include "utils/iodata.hpp" +#include "utils/timer.hpp" + +namespace palace +{ + +using namespace std::complex_literals; + +std::pair +EigenSolver::Solve(const std::vector> &mesh) const +{ + // Construct and extract the system matrices defining the eigenvalue problem. The diagonal + // values for the mass matrix PEC dof shift the Dirichlet eigenvalues out of the + // computational range. The damping matrix may be nullptr. + BlockTimer bt0(Timer::CONSTRUCT); + SpaceOperator space_op(iodata, mesh); + auto K = space_op.GetStiffnessMatrix(Operator::DIAG_ONE); + auto C = space_op.GetDampingMatrix(Operator::DIAG_ZERO); + auto M = space_op.GetMassMatrix(Operator::DIAG_ZERO); + + // Check if there are nonlinear terms and, if so, setup interpolation operator. + auto funcA2 = [&space_op](double omega) -> std::unique_ptr + { return space_op.GetExtraSystemMatrix(omega, Operator::DIAG_ZERO); }; + auto funcP = [&space_op](std::complex a0, std::complex a1, + std::complex a2, + double omega) -> std::unique_ptr + { return space_op.GetPreconditionerMatrix(a0, a1, a2, omega); }; + const double target = iodata.solver.eigenmode.target; + auto A2 = funcA2(target); + bool has_A2 = (A2 != nullptr); + + // Extend K, C, M operators with interpolated A2 operator. + // K' = K + A2_0, C' = C + A2_1, M' = M + A2_2 + std::unique_ptr Kp, Cp, Mp; + std::unique_ptr interp_op; + std::unique_ptr A2_0, A2_1, A2_2; + NonlinearEigenSolver nonlinear_type = iodata.solver.eigenmode.nonlinear_type; + if (has_A2 && nonlinear_type == NonlinearEigenSolver::HYBRID) + { + const double target_max = iodata.solver.eigenmode.target_upper; + interp_op = std::make_unique(funcA2, A2->Width()); + interp_op->Interpolate(1i * target, 1i * target_max); + A2_0 = interp_op->GetInterpolationOperator(0); + A2_1 = interp_op->GetInterpolationOperator(1); + A2_2 = interp_op->GetInterpolationOperator(2); + Kp = BuildParSumOperator({1.0 + 0i, 1.0 + 0i}, {K.get(), A2_0.get()}); + Cp = BuildParSumOperator({1.0 + 0i, 1.0 + 0i}, {C.get(), A2_1.get()}); + Mp = BuildParSumOperator({1.0 + 0i, 1.0 + 0i}, {M.get(), A2_2.get()}); + } + + const auto &Curl = space_op.GetCurlMatrix(); + SaveMetadata(space_op.GetNDSpaces()); + + // Configure objects for postprocessing. + PostOperator post_op(iodata, space_op); + ComplexVector E(Curl.Width()), B(Curl.Height()); + E.UseDevice(true); + B.UseDevice(true); + + // Define and configure the eigensolver to solve the eigenvalue problem: + // (K + λ C + λ² M) u = 0 or K u = -λ² M u + // with λ = iω. In general, the system matrices are complex and symmetric. 
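Written out, the eigenvalue problem described in the comment above (using only the relations already stated in the comments, with K, C, M, λ, and ω as defined there) is

\[
  \left(K + \lambda C + \lambda^{2} M\right) u = 0, \qquad \lambda = i\omega,
\]

which, when the damping matrix C and the frequency-dependent A2 block are absent, reduces to the generalized linear eigenvalue problem

\[
  K u = \mu M u, \qquad \mu = -\lambda^{2} = \omega^{2}.
\]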
+ std::unique_ptr eigen; + EigenSolverBackend type = iodata.solver.eigenmode.type; + +#if defined(PALACE_WITH_ARPACK) && defined(PALACE_WITH_SLEPC) + if (type == EigenSolverBackend::DEFAULT) + { + type = EigenSolverBackend::SLEPC; + } +#elif defined(PALACE_WITH_ARPACK) + if (type == EigenSolverBackend::SLEPC) + { + Mpi::Warning("SLEPc eigensolver not available, using ARPACK!\n"); + } + type = EigenSolverBackend::ARPACK; + if (nonlinear_type == NonlinearEigenSolver::SLP) + { + Mpi::Warning("SLP nonlinear eigensolver not available, using Hybrid!\n"); + } + nonlinear_type = NonlinearEigenSolver::HYBRID; +#elif defined(PALACE_WITH_SLEPC) + if (type == EigenSolverBackend::ARPACK) + { + Mpi::Warning("ARPACK eigensolver not available, using SLEPc!\n"); + } + type = EigenSolverBackend::SLEPC; +#else +#error "Eigenmode solver requires building with ARPACK or SLEPc!" +#endif + if (type == EigenSolverBackend::ARPACK) + { +#if defined(PALACE_WITH_ARPACK) + Mpi::Print("\nConfiguring ARPACK eigenvalue solver:\n"); + if (C || has_A2) + { + eigen = std::make_unique(space_op.GetComm(), + iodata.problem.verbose); + } + else + { + eigen = std::make_unique(space_op.GetComm(), + iodata.problem.verbose); + } +#endif + } + else // EigenSolverBackend::SLEPC + { +#if defined(PALACE_WITH_SLEPC) + Mpi::Print("\nConfiguring SLEPc eigenvalue solver:\n"); + std::unique_ptr slepc; + if (nonlinear_type == NonlinearEigenSolver::SLP) + { + slepc = std::make_unique(space_op.GetComm(), + iodata.problem.verbose); + slepc->SetType(slepc::SlepcEigenvalueSolver::Type::SLP); + slepc->SetProblemType(slepc::SlepcEigenvalueSolver::ProblemType::GENERAL); + } + else + { + if (C || has_A2) + { + if (!iodata.solver.eigenmode.pep_linear) + { + slepc = std::make_unique(space_op.GetComm(), + iodata.problem.verbose); + slepc->SetType(slepc::SlepcEigenvalueSolver::Type::TOAR); + } + else + { + slepc = std::make_unique(space_op.GetComm(), + iodata.problem.verbose); + slepc->SetType(slepc::SlepcEigenvalueSolver::Type::KRYLOVSCHUR); + } + } + else + { + slepc = std::make_unique(space_op.GetComm(), + iodata.problem.verbose); + slepc->SetType(slepc::SlepcEigenvalueSolver::Type::KRYLOVSCHUR); + } + slepc->SetProblemType(slepc::SlepcEigenvalueSolver::ProblemType::GEN_NON_HERMITIAN); + } + slepc->SetOrthogonalization(iodata.solver.linear.gs_orthog == Orthogonalization::MGS, + iodata.solver.linear.gs_orthog == Orthogonalization::CGS2); + eigen = std::move(slepc); +#endif + } + EigenvalueSolver::ScaleType scale = iodata.solver.eigenmode.scale + ? EigenvalueSolver::ScaleType::NORM_2 + : EigenvalueSolver::ScaleType::NONE; + if (nonlinear_type == NonlinearEigenSolver::SLP) + { + eigen->SetOperators(*K, *C, *M, EigenvalueSolver::ScaleType::NONE); + eigen->SetExtraSystemMatrix(funcA2); + eigen->SetPreconditionerUpdate(funcP); + } + else + { + if (has_A2) + { + eigen->SetOperators(*Kp, *Cp, *Mp, scale); + } + else if (C) + { + eigen->SetOperators(*K, *C, *M, scale); + } + else + { + eigen->SetOperators(*K, *M, scale); + } + } + eigen->SetNumModes(iodata.solver.eigenmode.n, iodata.solver.eigenmode.max_size); + const double tol = (has_A2 && nonlinear_type == NonlinearEigenSolver::HYBRID) + ? iodata.solver.eigenmode.linear_tol + : iodata.solver.eigenmode.tol; + eigen->SetTol(tol); + eigen->SetMaxIter(iodata.solver.eigenmode.max_it); + Mpi::Print(" Scaling γ = {:.3e}, δ = {:.3e}\n", eigen->GetScalingGamma(), + eigen->GetScalingDelta()); + + // If desired, use an M-inner product for orthogonalizing the eigenvalue subspace. 
The
+  // constructed matrix just references the real SPD part of the mass matrix (no copy is
+  // performed). Boundary conditions don't need to be eliminated here.
+  std::unique_ptr KM;
+  if (iodata.solver.eigenmode.mass_orthog)
+  {
+    Mpi::Print(" Basis uses M-inner product\n");
+    KM = space_op.GetInnerProductMatrix(0.0, 1.0, nullptr, M.get());
+    eigen->SetBMat(*KM);
+
+    // Mpi::Print(" Basis uses (K + M)-inner product\n");
+    // KM = space_op.GetInnerProductMatrix(1.0, 1.0, K.get(), M.get());
+    // eigen->SetBMat(*KM);
+  }
+
+  // Construct a divergence-free projector so the eigenvalue solve is performed in the space
+  // orthogonal to the zero eigenvalues of the stiffness matrix.
+  std::unique_ptr> divfree;
+  if (iodata.solver.linear.divfree_max_it > 0 &&
+      !space_op.GetMaterialOp().HasWaveVector() &&
+      !space_op.GetMaterialOp().HasLondonDepth())
+  {
+    Mpi::Print(" Configuring divergence-free projection\n");
+    constexpr int divfree_verbose = 0;
+    divfree = std::make_unique>(
+        space_op.GetMaterialOp(), space_op.GetNDSpace(), space_op.GetH1Spaces(),
+        space_op.GetAuxBdrTDofLists(), iodata.solver.linear.divfree_tol,
+        iodata.solver.linear.divfree_max_it, divfree_verbose);
+    eigen->SetDivFreeProjector(*divfree);
+  }
+
+  // If using Floquet BCs, a correction term (kp x E) needs to be added to the B field.
+  std::unique_ptr> floquet_corr;
+  if (space_op.GetMaterialOp().HasWaveVector())
+  {
+    floquet_corr = std::make_unique>(
+        space_op.GetMaterialOp(), space_op.GetNDSpace(), space_op.GetRTSpace(),
+        iodata.solver.linear.tol, iodata.solver.linear.max_it, 0);
+  }
+
+  // Set up the initial space for the eigenvalue solve. Satisfies boundary conditions and is
+  // projected appropriately.
+  if (iodata.solver.eigenmode.init_v0)
+  {
+    ComplexVector v0;
+    if (iodata.solver.eigenmode.init_v0_const)
+    {
+      Mpi::Print(" Using constant starting vector\n");
+      space_op.GetConstantInitialVector(v0);
+    }
+    else
+    {
+      Mpi::Print(" Using random starting vector\n");
+      space_op.GetRandomInitialVector(v0);
+    }
+    if (divfree)
+    {
+      divfree->Mult(v0);
+    }
+    eigen->SetInitialSpace(v0);  // Copies the vector
+
+    // Debug
+    // const auto &Grad = space_op.GetGradMatrix();
+    // ComplexVector r0(Grad->Width());
+    // r0.UseDevice(true);
+    // Grad.MultTranspose(v0.Real(), r0.Real());
+    // Grad.MultTranspose(v0.Imag(), r0.Imag());
+    // r0.Print();
+  }
+
+  // Configure the shift-and-invert strategy employed to solve for the eigenvalues
+  // closest to the specified target, σ.
+  {
+    const double f_target =
+        iodata.units.Dimensionalize(target) / (2 * M_PI);
+    Mpi::Print(" Shift-and-invert σ = {:.3e} GHz ({:.3e})\n", f_target, target);
+  }
+  if (C || has_A2 || nonlinear_type == NonlinearEigenSolver::SLP)
+  {
+    // Search for eigenvalues closest to λ = iσ.
+    eigen->SetShiftInvert(1i * target);
+    if (type == EigenSolverBackend::ARPACK)
+    {
+      // ARPACK searches based on eigenvalues of the transformed problem. The eigenvalue
+      // 1 / (λ - σ) will be a large-magnitude negative imaginary number for an eigenvalue
+      // λ with frequency close to but not below the target σ.
+      eigen->SetWhichEigenpairs(EigenvalueSolver::WhichType::SMALLEST_IMAGINARY);
+    }
+    else if (nonlinear_type == NonlinearEigenSolver::SLP)
+    {
+      eigen->SetWhichEigenpairs(EigenvalueSolver::WhichType::TARGET_MAGNITUDE);
+    }
+    else
+    {
+      eigen->SetWhichEigenpairs(EigenvalueSolver::WhichType::TARGET_IMAGINARY);
+    }
+  }
+  else
+  {
+    // Linear EVP has eigenvalues μ = -λ² = ω². Search for eigenvalues closest to μ = σ².
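To spell out the ARPACK reasoning in the quadratic branch above (a small check assuming the transformed eigenvalue has the form 1/(λ - s) with shift s = iσ, the value passed to SetShiftInvert; the exact convention is internal to the backend): for an eigenvalue λ = iω,

\[
  \frac{1}{\lambda - i\sigma} = \frac{1}{i(\omega - \sigma)} = \frac{-i}{\omega - \sigma},
\]

which is a large-magnitude negative imaginary number when ω is close to, but not below, the target σ, consistent with requesting WhichType::SMALLEST_IMAGINARY for ARPACK, while the SLEPc paths can target iσ directly.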
+ eigen->SetShiftInvert(target * target); + if (type == EigenSolverBackend::ARPACK) + { + // ARPACK searches based on eigenvalues of the transformed problem. 1 / (μ - σ²) + // will be a large-magnitude positive real number for an eigenvalue μ with frequency + // close to but below the target σ². + eigen->SetWhichEigenpairs(EigenvalueSolver::WhichType::LARGEST_REAL); + } + else + { + eigen->SetWhichEigenpairs(EigenvalueSolver::WhichType::TARGET_REAL); + } + } + + // Set up the linear solver required for solving systems involving the shifted operator + // (K - σ² M) or P(iσ) = (K + iσ C - σ² M) during the eigenvalue solve. The + // preconditioner for complex linear systems is constructed from a real approximation + // to the complex system matrix. + auto A = space_op.GetSystemMatrix(1.0 + 0.0i, 1i * target, -target * target + 0.0i, + K.get(), C.get(), M.get(), A2.get()); + auto P = space_op.GetPreconditionerMatrix( + 1.0 + 0.0i, 1i * target, -target * target + 0.0i, target); + auto ksp = std::make_unique(iodata, space_op.GetNDSpaces(), + &space_op.GetH1Spaces()); + ksp->SetOperators(*A, *P); + eigen->SetLinearSolver(*ksp); + + // Initialize structures for storing and reducing the results of error estimation. + TimeDependentFluxErrorEstimator estimator( + space_op.GetMaterialOp(), space_op.GetNDSpaces(), space_op.GetRTSpaces(), + iodata.solver.linear.estimator_tol, iodata.solver.linear.estimator_max_it, 0, + iodata.solver.linear.estimator_mg); + ErrorIndicator indicator; + + // Eigenvalue problem solve. + BlockTimer bt1(Timer::EPS); + Mpi::Print("\n"); + int num_conv = eigen->Solve(); + { + std::complex lambda = (num_conv > 0) ? eigen->GetEigenvalue(0) : 0.0; + Mpi::Print(" Found {:d} converged eigenvalue{}{}\n", num_conv, + (num_conv > 1) ? "s" : "", + (num_conv > 0) + ? fmt::format(" (first = {:.3e}{:+.3e}i)", lambda.real(), lambda.imag()) + : ""); + } + + if (has_A2 && nonlinear_type == NonlinearEigenSolver::HYBRID) + { + Mpi::Print("\n Refining eigenvalues with Quasi-Newton solver\n"); + auto qn = std::make_unique(space_op.GetComm(), std::move(eigen), + num_conv, iodata.problem.verbose, + iodata.solver.eigenmode.refine_nonlinear); + qn->SetTol(iodata.solver.eigenmode.tol); + qn->SetMaxIter(iodata.solver.eigenmode.max_it); + if (C) + { + qn->SetOperators(*K, *C, *M, EigenvalueSolver::ScaleType::NONE); + } + else + { + qn->SetOperators(*K, *M, EigenvalueSolver::ScaleType::NONE); + } + qn->SetExtraSystemMatrix(funcA2); + qn->SetPreconditionerUpdate(funcP); + qn->SetNumModes(iodata.solver.eigenmode.n, iodata.solver.eigenmode.max_size); + qn->SetPreconditionerLag(iodata.solver.eigenmode.preconditioner_lag, + iodata.solver.eigenmode.preconditioner_lag_tol); + qn->SetMaxRestart(iodata.solver.eigenmode.max_restart); + qn->SetLinearSolver(*ksp); + qn->SetShiftInvert(1i * target); + eigen = std::move(qn); + + // Suppress wave port output during nonlinear eigensolver iterations. + space_op.GetWavePortOp().SetSuppressOutput(true); + num_conv = eigen->Solve(); + space_op.GetWavePortOp().SetSuppressOutput(false); + } + + BlockTimer bt2(Timer::POSTPRO); + SaveMetadata(*ksp); + + // Calculate and record the error indicators, and postprocess the results. + Mpi::Print("\nComputing solution error estimates and performing postprocessing\n"); + if (!KM) + { + // Normalize the finalized eigenvectors with respect to mass matrix (unit electric field + // energy) even if they are not computed to be orthogonal with respect to it. 
+ KM = space_op.GetInnerProductMatrix(0.0, 1.0, nullptr, M.get()); + eigen->SetBMat(*KM); + eigen->RescaleEigenvectors(num_conv); + } + Mpi::Print("\n"); + + for (int i = 0; i < num_conv; i++) + { + // Get the eigenvalue and relative error. + std::complex omega = eigen->GetEigenvalue(i); + double error_bkwd = eigen->GetError(i, EigenvalueSolver::ErrorType::BACKWARD); + double error_abs = eigen->GetError(i, EigenvalueSolver::ErrorType::ABSOLUTE); + if (!C && !has_A2) + { + // Linear EVP has eigenvalue μ = -λ² = ω². + omega = std::sqrt(omega); + } + else + { + // Quadratic EVP solves for eigenvalue λ = iω. + omega /= 1i; + } + + // Compute B = -1/(iω) ∇ x E on the true dofs, and set the internal GridFunctions in + // PostOperator for all postprocessing operations. + eigen->GetEigenvector(i, E); + + linalg::NormalizePhase(space_op.GetComm(), E); + + Curl.Mult(E.Real(), B.Real()); + Curl.Mult(E.Imag(), B.Imag()); + B *= -1.0 / (1i * omega); + if (space_op.GetMaterialOp().HasWaveVector()) + { + // Calculate B field correction for Floquet BCs. + // B = -1/(iω) ∇ x E + 1/ω kp x E. + floquet_corr->AddMult(E, B, 1.0 / omega); + } + + auto total_domain_energy = + post_op.MeasureAndPrintAll(i, E, B, omega, error_abs, error_bkwd, num_conv); + + // Calculate and record the error indicators. + if (i < iodata.solver.eigenmode.n) + { + estimator.AddErrorIndicator(E, B, total_domain_energy, indicator); + } + + // Final write: Different condition than end of loop (i = num_conv - 1). + if (i == iodata.solver.eigenmode.n - 1) + { + post_op.MeasureFinalize(indicator); + } + } + MFEM_VERIFY(num_conv >= iodata.solver.eigenmode.n, "Eigenmode solve only found " + << num_conv << " modes when " + << iodata.solver.eigenmode.n + << " were requested!"); + return {indicator, space_op.GlobalTrueVSize()}; +} + +} // namespace palace diff --git a/palace/drivers/eigensolver.hpp b/palace/drivers/eigensolver.hpp index 45077717d0..0b1d762d54 100644 --- a/palace/drivers/eigensolver.hpp +++ b/palace/drivers/eigensolver.hpp @@ -1,56 +1,32 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_DRIVERS_EIGEN_SOLVER_HPP -#define PALACE_DRIVERS_EIGEN_SOLVER_HPP - -#include -#include -#include -#include "drivers/basesolver.hpp" - -namespace mfem -{ - -class ParMesh; - -} // namespace mfem - -namespace palace -{ - -class ErrorIndicator; -class IoData; -class LumpedPortOperator; -class PostOperator; -class Timer; - -// -// Driver class for eigenmode simulations. -// -class EigenSolver : public BaseSolver -{ -private: - void Postprocess(const PostOperator &postop, const LumpedPortOperator &lumped_port_op, - int i, std::complex omega, double error_bkwd, double error_abs, - int num_conv, const ErrorIndicator *indicator) const; - - void PostprocessEigen(int i, std::complex omega, double error_bkwd, - double error_abs, int num_conv) const; - - void PostprocessPorts(const PostOperator &postop, - const LumpedPortOperator &lumped_port_op, int i) const; - - void PostprocessEPR(const PostOperator &postop, const LumpedPortOperator &lumped_port_op, - int i, std::complex omega, double Em) const; - - std::pair - Solve(const std::vector> &mesh) const override; - -public: - using BaseSolver::BaseSolver; -}; - -} // namespace palace - -#endif // PALACE_DRIVERS_EIGEN_SOLVER_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_DRIVERS_EIGEN_SOLVER_HPP +#define PALACE_DRIVERS_EIGEN_SOLVER_HPP + +#include +#include +#include "drivers/basesolver.hpp" + +namespace palace +{ + +class ErrorIndicator; +class Mesh; + +// +// Driver class for eigenmode simulations. +// +class EigenSolver : public BaseSolver +{ +private: + std::pair + Solve(const std::vector> &mesh) const override; + +public: + using BaseSolver::BaseSolver; +}; + +} // namespace palace + +#endif // PALACE_DRIVERS_EIGEN_SOLVER_HPP diff --git a/palace/drivers/electrostaticsolver.cpp b/palace/drivers/electrostaticsolver.cpp index 903d32b33f..3fc1044dc7 100644 --- a/palace/drivers/electrostaticsolver.cpp +++ b/palace/drivers/electrostaticsolver.cpp @@ -1,222 +1,193 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "electrostaticsolver.hpp" - -#include -#include "fem/errorindicator.hpp" -#include "linalg/errorestimator.hpp" -#include "linalg/ksp.hpp" -#include "linalg/operator.hpp" -#include "models/laplaceoperator.hpp" -#include "models/postoperator.hpp" -#include "utils/communication.hpp" -#include "utils/iodata.hpp" -#include "utils/timer.hpp" - -namespace palace -{ - -std::pair -ElectrostaticSolver::Solve(const std::vector> &mesh) const -{ - // Construct the system matrix defining the linear operator. Dirichlet boundaries are - // handled eliminating the rows and columns of the system matrix for the corresponding - // dofs. The eliminated matrix is stored in order to construct the RHS vector for nonzero - // prescribed BC values. - BlockTimer bt0(Timer::CONSTRUCT); - LaplaceOperator laplaceop(iodata, mesh); - auto K = laplaceop.GetStiffnessMatrix(); - SaveMetadata(laplaceop.GetH1Spaces()); - - // Set up the linear solver. - KspSolver ksp(iodata, laplaceop.GetH1Spaces()); - ksp.SetOperators(*K, *K); - - // Terminal indices are the set of boundaries over which to compute the capacitance - // matrix. Terminal boundaries are aliases for ports. - PostOperator postop(iodata, laplaceop, "electrostatic"); - int nstep = static_cast(laplaceop.GetSources().size()); - MFEM_VERIFY(nstep > 0, "No terminal boundaries specified for electrostatic simulation!"); - - // Right-hand side term and solution vector storage. - Vector RHS(K->Height()); - std::vector V(nstep); - - // Main loop over terminal boundaries. - Mpi::Print("\nComputing electrostatic fields for {:d} terminal boundar{}\n", nstep, - (nstep > 1) ? "ies" : "y"); - int step = 0; - auto t0 = Timer::Now(); - for (const auto &[idx, data] : laplaceop.GetSources()) - { - Mpi::Print("\nIt {:d}/{:d}: Index = {:d} (elapsed time = {:.2e} s)\n", step + 1, nstep, - idx, Timer::Duration(Timer::Now() - t0).count()); - - // Form and solve the linear system for a prescribed nonzero voltage on the specified - // terminal. - Mpi::Print("\n"); - laplaceop.GetExcitationVector(idx, *K, V[step], RHS); - - BlockTimer bt1(Timer::SOLVE); - ksp.Mult(RHS, V[step]); - - BlockTimer bt2(Timer::POSTPRO); - Mpi::Print(" Sol. ||V|| = {:.6e} (||RHS|| = {:.6e})\n", - linalg::Norml2(laplaceop.GetComm(), V[step]), - linalg::Norml2(laplaceop.GetComm(), RHS)); - - // Next terminal. - step++; - } - - // Postprocess the capacitance matrix from the computed field solutions. 
- BlockTimer bt1(Timer::POSTPRO); - SaveMetadata(ksp); - return {Postprocess(laplaceop, postop, V), laplaceop.GlobalTrueVSize()}; -} - -ErrorIndicator ElectrostaticSolver::Postprocess(LaplaceOperator &laplaceop, - PostOperator &postop, - const std::vector &V) const -{ - // Postprocess the Maxwell capacitance matrix. See p. 97 of the COMSOL AC/DC Module manual - // for the associated formulas based on the electric field energy based on a unit voltage - // excitation for each terminal. Alternatively, we could compute the resulting terminal - // charges from the prescribed voltage to get C directly as: - // Q_i = ∫ ρ dV = ∫ ∇ ⋅ (ε E) dV = ∫ (ε E) ⋅ n dS - // and C_ij = Q_i/V_j. The energy formulation avoids having to locally integrate E = -∇V. - const auto &Grad = laplaceop.GetGradMatrix(); - const std::map> &terminal_sources = laplaceop.GetSources(); - int nstep = static_cast(terminal_sources.size()); - mfem::DenseMatrix C(nstep), Cm(nstep); - Vector E(Grad.Height()), Vij(Grad.Width()); - if (iodata.solver.electrostatic.n_post > 0) - { - Mpi::Print("\n"); - } - - // Calculate and record the error indicators. - Mpi::Print("Computing solution error estimates\n\n"); - GradFluxErrorEstimator estimator(laplaceop.GetMaterialOp(), laplaceop.GetH1Spaces(), - iodata.solver.linear.estimator_tol, - iodata.solver.linear.estimator_max_it, 0, - iodata.solver.pa_order_threshold); - ErrorIndicator indicator; - for (int i = 0; i < nstep; i++) - { - estimator.AddErrorIndicator(V[i], indicator); - } - - int i = 0; - for (const auto &[idx, data] : terminal_sources) - { - // Compute E = -∇V on the true dofs, and set the internal GridFunctions in PostOperator - // for all postprocessing operations. - E = 0.0; - Grad.AddMult(V[i], E, -1.0); - postop.SetEGridFunction(E); - postop.SetVGridFunction(V[i]); - double Ue = postop.GetEFieldEnergy(); - PostprocessDomains(postop, "i", i, idx, Ue, 0.0, 0.0, 0.0); - PostprocessSurfaces(postop, "i", i, idx, Ue, 0.0, 1.0, 0.0); - PostprocessProbes(postop, "i", i, idx); - if (i < iodata.solver.electrostatic.n_post) - { - PostprocessFields(postop, i, idx, (i == 0) ? &indicator : nullptr); - Mpi::Print("Wrote fields to disk for terminal {:d}\n", idx); - } - if (i == 0) - { - PostprocessErrorIndicator(postop, indicator); - } - - // Diagonal: C_ii = 2 U_e(V_i) / V_i². - C(i, i) = Cm(i, i) = 2.0 * Ue; - i++; - } - - // Off-diagonals: C_ij = U_e(V_i + V_j) / (V_i V_j) - 1/2 (V_i/V_j C_ii + V_j/V_i C_jj). - for (i = 0; i < C.Height(); i++) - { - for (int j = 0; j < C.Width(); j++) - { - if (j < i) - { - // Copy lower triangle from already computed upper triangle. - C(i, j) = C(j, i); - Cm(i, j) = Cm(j, i); - Cm(i, i) -= Cm(i, j); - } - else if (j > i) - { - linalg::AXPBYPCZ(1.0, V[i], 1.0, V[j], 0.0, Vij); - E = 0.0; - Grad.AddMult(Vij, E, -1.0); - postop.SetEGridFunction(E); - double Ue = postop.GetEFieldEnergy(); - C(i, j) = Ue - 0.5 * (C(i, i) + C(j, j)); - Cm(i, j) = -C(i, j); - Cm(i, i) -= Cm(i, j); - } - } - } - mfem::DenseMatrix Cinv(C); - Cinv.Invert(); // In-place, uses LAPACK (when available) and should be cheap - PostprocessTerminals(terminal_sources, C, Cinv, Cm); - return indicator; -} - -void ElectrostaticSolver::PostprocessTerminals( - const std::map> &terminal_sources, const mfem::DenseMatrix &C, - const mfem::DenseMatrix &Cinv, const mfem::DenseMatrix &Cm) const -{ - // Only root writes to disk (every process has full matrices). - if (!root || post_dir.length() == 0) - { - return; - } - - // Write capactance matrix data. 
- auto PrintMatrix = [&terminal_sources, this](const std::string &file, - const std::string &name, - const std::string &unit, - const mfem::DenseMatrix &mat, double scale) - { - std::string path = post_dir + file; - auto output = OutputFile(path, false); - output.print("{:>{}s},", "i", table.w1); - for (const auto &[idx2, data2] : terminal_sources) - { - // clang-format off - output.print("{:>{}s}{}", - name + "[i][" + std::to_string(idx2) + "] " + unit, table.w, - (idx2 == terminal_sources.rbegin()->first) ? "" : ","); - // clang-format on - } - output.print("\n"); - int i = 0; - for (const auto &[idx, data] : terminal_sources) - { - int j = 0; - output.print("{:{}.{}e},", static_cast(idx), table.w1, table.p1); - for (const auto &[idx2, data2] : terminal_sources) - { - // clang-format off - output.print("{:+{}.{}e}{}", - mat(i, j) * scale, table.w, table.p, - (idx2 == terminal_sources.rbegin()->first) ? "" : ","); - // clang-format on - j++; - } - output.print("\n"); - i++; - } - }; - const double F = iodata.DimensionalizeValue(IoData::ValueType::CAPACITANCE, 1.0); - PrintMatrix("terminal-C.csv", "C", "(F)", C, F); - PrintMatrix("terminal-Cinv.csv", "C⁻¹", "(1/F)", Cinv, 1.0 / F); - PrintMatrix("terminal-Cm.csv", "C_m", "(F)", Cm, F); -} - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "electrostaticsolver.hpp" + +#include +#include "fem/errorindicator.hpp" +#include "fem/mesh.hpp" +#include "linalg/errorestimator.hpp" +#include "linalg/ksp.hpp" +#include "linalg/operator.hpp" +#include "models/laplaceoperator.hpp" +#include "models/postoperator.hpp" +#include "utils/communication.hpp" +#include "utils/iodata.hpp" +#include "utils/timer.hpp" + +namespace palace +{ + +std::pair +ElectrostaticSolver::Solve(const std::vector> &mesh) const +{ + // Construct the system matrix defining the linear operator. Dirichlet boundaries are + // handled eliminating the rows and columns of the system matrix for the corresponding + // dofs. The eliminated matrix is stored in order to construct the RHS vector for nonzero + // prescribed BC values. + BlockTimer bt0(Timer::CONSTRUCT); + LaplaceOperator laplace_op(iodata, mesh); + auto K = laplace_op.GetStiffnessMatrix(); + const auto &Grad = laplace_op.GetGradMatrix(); + SaveMetadata(laplace_op.GetH1Spaces()); + + // Set up the linear solver. + KspSolver ksp(iodata, laplace_op.GetH1Spaces()); + ksp.SetOperators(*K, *K); + + // Terminal indices are the set of boundaries over which to compute the capacitance + // matrix. Terminal boundaries are aliases for ports. + PostOperator post_op(iodata, laplace_op); + int n_step = static_cast(laplace_op.GetSources().size()); + MFEM_VERIFY(n_step > 0, "No terminal boundaries specified for electrostatic simulation!"); + + // Right-hand side term and solution vector storage. + Vector RHS(Grad.Width()), E(Grad.Height()); + std::vector V(n_step); + + // Initialize structures for storing and reducing the results of error estimation. + GradFluxErrorEstimator estimator( + laplace_op.GetMaterialOp(), laplace_op.GetNDSpace(), laplace_op.GetRTSpaces(), + iodata.solver.linear.estimator_tol, iodata.solver.linear.estimator_max_it, 0, + iodata.solver.linear.estimator_mg); + ErrorIndicator indicator; + + // Main loop over terminal boundaries. + Mpi::Print("\nComputing electrostatic fields for {:d} terminal {}\n", n_step, + (n_step > 1) ? 
"boundaries" : "boundary"); + int step = 0; + auto t0 = Timer::Now(); + for (const auto &[idx, data] : laplace_op.GetSources()) + { + Mpi::Print("\nIt {:d}/{:d}: Index = {:d} (elapsed time = {:.2e} s)\n", step + 1, n_step, + idx, Timer::Duration(Timer::Now() - t0).count()); + + // Form and solve the linear system for a prescribed nonzero voltage on the specified + // terminal. + Mpi::Print("\n"); + laplace_op.GetExcitationVector(idx, *K, V[step], RHS); + ksp.Mult(RHS, V[step]); + + // Start Post-processing. + BlockTimer bt2(Timer::POSTPRO); + Mpi::Print(" Sol. ||V|| = {:.6e} (||RHS|| = {:.6e})\n", + linalg::Norml2(laplace_op.GetComm(), V[step]), + linalg::Norml2(laplace_op.GetComm(), RHS)); + + // Compute E = -∇V on the true dofs. + E = 0.0; + Grad.AddMult(V[step], E, -1.0); + + // Measurement and printing. + auto total_domain_energy = post_op.MeasureAndPrintAll(step, V[step], E, idx); + + // Calculate and record the error indicators. + Mpi::Print(" Updating solution error estimates\n"); + estimator.AddErrorIndicator(E, total_domain_energy, indicator); + + // Next terminal. + step++; + } + + // Postprocess the capacitance matrix from the computed field solutions. + BlockTimer bt1(Timer::POSTPRO); + SaveMetadata(ksp); + PostprocessTerminals(post_op, laplace_op.GetSources(), V); + post_op.MeasureFinalize(indicator); + return {indicator, laplace_op.GlobalTrueVSize()}; +} + +void ElectrostaticSolver::PostprocessTerminals( + PostOperator &post_op, + const std::map> &terminal_sources, + const std::vector &V) const +{ + // Postprocess the Maxwell capacitance matrix. See p. 97 of the COMSOL AC/DC Module manual + // for the associated formulas based on the electric field energy based on a unit voltage + // excitation for each terminal. Alternatively, we could compute the resulting terminal + // charges from the prescribed voltage to get C directly as: + // Q_i = ∫ ρ dV = ∫ ∇ ⋅ (ε E) dV = ∫ (ε E) ⋅ n dS + // and C_ij = Q_i/V_j. The energy formulation avoids having to locally integrate E = -∇V. + mfem::DenseMatrix C(V.size()), Cm(V.size()); + for (int i = 0; i < C.Height(); i++) + { + // Diagonal: Cᵢᵢ = 2 Uₑ(Vᵢ) / Vᵢ² = (Vᵢᵀ K Vᵢ) / Vᵢ² (with ∀i, Vᵢ = 1) + auto &V_gf = post_op.GetVGridFunction().Real(); + auto &D_gf = post_op.GetDomainPostOp().D; + V_gf.SetFromTrueDofs(V[i]); + post_op.GetDomainPostOp().M_elec->Mult(V_gf, D_gf); + C(i, i) = Cm(i, i) = linalg::Dot(post_op.GetComm(), V_gf, D_gf); + + // Off-diagonals: Cᵢⱼ = Uₑ(Vᵢ + Vⱼ) / (Vᵢ Vⱼ) - 1/2 (Vᵢ/Vⱼ Cᵢᵢ + Vⱼ/Vᵢ Cⱼⱼ) + // = (Vⱼᵀ K Vᵢ) / (Vᵢ Vⱼ) + for (int j = i + 1; j < C.Width(); j++) + { + V_gf.SetFromTrueDofs(V[j]); + C(i, j) = linalg::Dot(post_op.GetComm(), V_gf, D_gf); + Cm(i, j) = -C(i, j); + Cm(i, i) -= Cm(i, j); + } + + // Copy lower triangle from already computed upper triangle. + for (int j = 0; j < i; j++) + { + C(i, j) = C(j, i); + Cm(i, j) = Cm(j, i); + Cm(i, i) -= Cm(i, j); + } + } + mfem::DenseMatrix Cinv(C); + Cinv.Invert(); // In-place, uses LAPACK (when available) and should be cheap + + // Only root writes to disk (every process has full matrices). + if (!root) + { + return; + } + using VT = Units::ValueType; + using fmt::format; + + // Write capacitance matrix data. 
+ auto PrintMatrix = [&terminal_sources, this](const std::string &file, + const std::string &name, + const std::string &unit, + const mfem::DenseMatrix &mat, double scale) + { + TableWithCSVFile output(fs::path(post_dir / file).string()); + output.table.insert(Column("i", "i", 0, 0, 2, "")); + int j = 0; + for (const auto &[idx2, data2] : terminal_sources) + { + output.table.insert(format("i2{}", idx2), format("{}[i][{}] {}", name, idx2, unit)); + // Use the fact that iterator over i and j is the same span. + output.table["i"] << idx2; + + auto &col = output.table[format("i2{}", idx2)]; + for (std::size_t i = 0; i < terminal_sources.size(); i++) + { + col << mat(i, j) * scale; + } + j++; + } + output.WriteFullTableTrunc(); + }; + const double F = iodata.units.Dimensionalize(1.0); + PrintMatrix("terminal-C.csv", "C", "(F)", C, F); + PrintMatrix("terminal-Cinv.csv", "C⁻¹", "(1/F)", Cinv, 1.0 / F); + PrintMatrix("terminal-Cm.csv", "C_m", "(F)", Cm, F); + + // Also write out a file with terminal voltage excitations. + { + TableWithCSVFile terminal_V(fs::path(post_dir / "terminal-V.csv").string()); + terminal_V.table.insert(Column("i", "i", 0, 0, 2, "")); + terminal_V.table.insert("Vinc", "V_inc[i] (V)"); + for (const auto &[idx, data] : terminal_sources) + { + terminal_V.table["i"] << double(idx); + terminal_V.table["Vinc"] << iodata.units.Dimensionalize(1.0); + } + terminal_V.WriteFullTableTrunc(); + } +} + +} // namespace palace diff --git a/palace/drivers/electrostaticsolver.hpp b/palace/drivers/electrostaticsolver.hpp index 2e503110b4..040e9d3676 100644 --- a/palace/drivers/electrostaticsolver.hpp +++ b/palace/drivers/electrostaticsolver.hpp @@ -1,54 +1,49 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_DRIVERS_ELECTROSTATIC_SOLVER_HPP -#define PALACE_DRIVERS_ELECTROSTATIC_SOLVER_HPP - -#include -#include -#include -#include "drivers/basesolver.hpp" -#include "linalg/vector.hpp" - -namespace mfem -{ - -template -class Array; -class DenseMatrix; -class ParMesh; - -} // namespace mfem - -namespace palace -{ - -class ErrorIndicator; -class IoData; -class LaplaceOperator; -class PostOperator; -class Timer; - -// -// Driver class for electrostatic simulations. -// -class ElectrostaticSolver : public BaseSolver -{ -private: - ErrorIndicator Postprocess(LaplaceOperator &laplaceop, PostOperator &postop, - const std::vector &V) const; - - void PostprocessTerminals(const std::map> &terminal_sources, - const mfem::DenseMatrix &C, const mfem::DenseMatrix &Cinv, - const mfem::DenseMatrix &Cm) const; - - std::pair - Solve(const std::vector> &mesh) const override; - -public: - using BaseSolver::BaseSolver; -}; - -} // namespace palace - -#endif // PALACE_DRIVERS_ELECTROSTATIC_SOLVER_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_DRIVERS_ELECTROSTATIC_SOLVER_HPP +#define PALACE_DRIVERS_ELECTROSTATIC_SOLVER_HPP + +#include +#include +#include +#include "drivers/basesolver.hpp" +#include "linalg/vector.hpp" +#include "utils/configfile.hpp" + +namespace mfem +{ + +template +class Array; + +} // namespace mfem + +namespace palace +{ + +class ErrorIndicator; +class Mesh; +template +class PostOperator; + +// +// Driver class for electrostatic simulations. 
+// +class ElectrostaticSolver : public BaseSolver +{ +private: + void PostprocessTerminals(PostOperator &post_op, + const std::map> &terminal_sources, + const std::vector &V) const; + + std::pair + Solve(const std::vector> &mesh) const override; + +public: + using BaseSolver::BaseSolver; +}; + +} // namespace palace + +#endif // PALACE_DRIVERS_ELECTROSTATIC_SOLVER_HPP diff --git a/palace/drivers/magnetostaticsolver.cpp b/palace/drivers/magnetostaticsolver.cpp index 795f443c7c..8e29654aca 100644 --- a/palace/drivers/magnetostaticsolver.cpp +++ b/palace/drivers/magnetostaticsolver.cpp @@ -1,229 +1,201 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "magnetostaticsolver.hpp" - -#include -#include "fem/errorindicator.hpp" -#include "linalg/errorestimator.hpp" -#include "linalg/ksp.hpp" -#include "linalg/operator.hpp" -#include "models/curlcurloperator.hpp" -#include "models/postoperator.hpp" -#include "models/surfacecurrentoperator.hpp" -#include "utils/communication.hpp" -#include "utils/iodata.hpp" -#include "utils/timer.hpp" - -namespace palace -{ - -std::pair -MagnetostaticSolver::Solve(const std::vector> &mesh) const -{ - // Construct the system matrix defining the linear operator. Dirichlet boundaries are - // handled eliminating the rows and columns of the system matrix for the corresponding - // dofs. - BlockTimer bt0(Timer::CONSTRUCT); - CurlCurlOperator curlcurlop(iodata, mesh); - auto K = curlcurlop.GetStiffnessMatrix(); - SaveMetadata(curlcurlop.GetNDSpaces()); - - // Set up the linear solver. - KspSolver ksp(iodata, curlcurlop.GetNDSpaces(), &curlcurlop.GetH1Spaces()); - ksp.SetOperators(*K, *K); - - // Terminal indices are the set of boundaries over which to compute the inductance matrix. - PostOperator postop(iodata, curlcurlop, "magnetostatic"); - int nstep = static_cast(curlcurlop.GetSurfaceCurrentOp().Size()); - MFEM_VERIFY(nstep > 0, - "No surface current boundaries specified for magnetostatic simulation!"); - - // Source term and solution vector storage. - Vector RHS(K->Height()); - std::vector A(nstep); - - // Main loop over current source boundaries. - Mpi::Print("\nComputing magnetostatic fields for {:d} source boundar{}\n", nstep, - (nstep > 1) ? "ies" : "y"); - int step = 0; - auto t0 = Timer::Now(); - for (const auto &[idx, data] : curlcurlop.GetSurfaceCurrentOp()) - { - Mpi::Print("\nIt {:d}/{:d}: Index = {:d} (elapsed time = {:.2e} s)\n", step + 1, nstep, - idx, Timer::Duration(Timer::Now() - t0).count()); - - // Form and solve the linear system for a prescribed current on the specified source. - Mpi::Print("\n"); - A[step].SetSize(RHS.Size()); - A[step] = 0.0; - curlcurlop.GetExcitationVector(idx, RHS); - - BlockTimer bt1(Timer::SOLVE); - ksp.Mult(RHS, A[step]); - - BlockTimer bt2(Timer::POSTPRO); - Mpi::Print(" Sol. ||A|| = {:.6e} (||RHS|| = {:.6e})\n", - linalg::Norml2(curlcurlop.GetComm(), A[step]), - linalg::Norml2(curlcurlop.GetComm(), RHS)); - - // Next source. - step++; - } - - // Postprocess the capacitance matrix from the computed field solutions. - BlockTimer bt1(Timer::POSTPRO); - SaveMetadata(ksp); - return {Postprocess(curlcurlop, postop, A), curlcurlop.GlobalTrueVSize()}; -} - -ErrorIndicator MagnetostaticSolver::Postprocess(CurlCurlOperator &curlcurlop, - PostOperator &postop, - const std::vector &A) const -{ - // Postprocess the Maxwell inductance matrix. See p. 
97 of the COMSOL AC/DC Module manual - // for the associated formulas based on the magnetic field energy based on a current - // excitation for each port. Alternatively, we could compute the resulting loop fluxes to - // get M directly as: - // Φ_i = ∫ B ⋅ n_j dS - // and M_ij = Φ_i/I_j. The energy formulation avoids having to locally integrate B = - // ∇ x A. - const auto &Curl = curlcurlop.GetCurlMatrix(); - const SurfaceCurrentOperator &surf_j_op = curlcurlop.GetSurfaceCurrentOp(); - int nstep = static_cast(surf_j_op.Size()); - mfem::DenseMatrix M(nstep), Mm(nstep); - Vector B(Curl.Height()), Aij(Curl.Width()); - Vector Iinc(nstep); - if (iodata.solver.magnetostatic.n_post > 0) - { - Mpi::Print("\n"); - } - - // Calculate and record the error indicators. - Mpi::Print("Computing solution error estimates\n\n"); - CurlFluxErrorEstimator estimator( - curlcurlop.GetMaterialOp(), curlcurlop.GetNDSpaces(), - iodata.solver.linear.estimator_tol, iodata.solver.linear.estimator_max_it, 0, - iodata.solver.pa_order_threshold); - ErrorIndicator indicator; - for (int i = 0; i < nstep; i++) - { - estimator.AddErrorIndicator(A[i], indicator); - } - - int i = 0; - for (const auto &[idx, data] : surf_j_op) - { - // Get the magnitude of the current excitations (unit J_s,inc, but circuit current I is - // the integral of J_s,inc over port). - Iinc(i) = data.GetExcitationCurrent(); - MFEM_VERIFY(Iinc(i) > 0.0, "Zero current excitation for magnetostatic solver!"); - - // Compute B = ∇ x A on the true dofs, and set the internal GridFunctions in - // PostOperator for all postprocessing operations. - Curl.Mult(A[i], B); - postop.SetBGridFunction(B); - postop.SetAGridFunction(A[i]); - double Um = postop.GetHFieldEnergy(); - PostprocessDomains(postop, "i", i, idx, 0.0, Um, 0.0, 0.0); - PostprocessSurfaces(postop, "i", i, idx, 0.0, Um, 0.0, Iinc(i)); - PostprocessProbes(postop, "i", i, idx); - if (i < iodata.solver.magnetostatic.n_post) - { - PostprocessFields(postop, i, idx, (i == 0) ? &indicator : nullptr); - Mpi::Print("Wrote fields to disk for terminal {:d}\n", idx); - } - if (i == 0) - { - PostprocessErrorIndicator(postop, indicator); - } - - // Diagonal: M_ii = 2 U_m(A_i) / I_i². - M(i, i) = Mm(i, i) = 2.0 * Um / (Iinc(i) * Iinc(i)); - i++; - } - - // Off-diagonals: M_ij = U_m(A_i + A_j) / (I_i I_j) - 1/2 (I_i/I_j M_ii + I_j/I_i M_jj). - for (i = 0; i < M.Height(); i++) - { - for (int j = 0; j < M.Width(); j++) - { - if (j < i) - { - // Copy lower triangle from already computed upper triangle. - M(i, j) = M(j, i); - Mm(i, j) = Mm(j, i); - Mm(i, i) -= Mm(i, j); - } - else if (j > i) - { - linalg::AXPBYPCZ(1.0, A[i], 1.0, A[j], 0.0, Aij); - Curl.Mult(Aij, B); - postop.SetBGridFunction(B); - double Um = postop.GetHFieldEnergy(); - M(i, j) = Um / (Iinc(i) * Iinc(j)) - - 0.5 * (M(i, i) * Iinc(i) / Iinc(j) + M(j, j) * Iinc(j) / Iinc(i)); - Mm(i, j) = -M(i, j); - Mm(i, i) -= Mm(i, j); - } - } - } - mfem::DenseMatrix Minv(M); - Minv.Invert(); // In-place, uses LAPACK (when available) and should be cheap - PostprocessTerminals(surf_j_op, M, Minv, Mm); - return indicator; -} - -void MagnetostaticSolver::PostprocessTerminals(const SurfaceCurrentOperator &surf_j_op, - const mfem::DenseMatrix &M, - const mfem::DenseMatrix &Minv, - const mfem::DenseMatrix &Mm) const -{ - // Only root writes to disk (every process has full matrices). - if (!root || post_dir.length() == 0) - { - return; - } - - // Write inductance matrix data. 
- auto PrintMatrix = [&surf_j_op, this](const std::string &file, const std::string &name, - const std::string &unit, - const mfem::DenseMatrix &mat, double scale) - { - std::string path = post_dir + file; - auto output = OutputFile(path, false); - output.print("{:>{}s},", "i", table.w1); - for (const auto &[idx2, data2] : surf_j_op) - { - // clang-format off - output.print("{:>{}s}{}", - name + "[i][" + std::to_string(idx2) + "] " + unit, table.w, - (idx2 == surf_j_op.rbegin()->first) ? "" : ","); - // clang-format on - } - output.print("\n"); - int i = 0; - for (const auto &[idx, data] : surf_j_op) - { - int j = 0; - output.print("{:{}.{}e},", static_cast(idx), table.w1, table.p1); - for (const auto &[idx2, data2] : surf_j_op) - { - // clang-format off - output.print("{:+{}.{}e}{}", - mat(i, j) * scale, table.w, table.p, - (idx2 == surf_j_op.rbegin()->first) ? "" : ","); - // clang-format on - j++; - } - output.print("\n"); - i++; - } - }; - const double H = iodata.DimensionalizeValue(IoData::ValueType::INDUCTANCE, 1.0); - PrintMatrix("terminal-M.csv", "M", "(H)", M, H); - PrintMatrix("terminal-Minv.csv", "M⁻¹", "(1/H)", Minv, 1.0 / H); - PrintMatrix("terminal-Mm.csv", "M_m", "(H)", Mm, H); -} - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "magnetostaticsolver.hpp" + +#include +#include "fem/errorindicator.hpp" +#include "fem/mesh.hpp" +#include "linalg/errorestimator.hpp" +#include "linalg/ksp.hpp" +#include "linalg/operator.hpp" +#include "models/curlcurloperator.hpp" +#include "models/postoperator.hpp" +#include "models/surfacecurrentoperator.hpp" +#include "utils/communication.hpp" +#include "utils/iodata.hpp" +#include "utils/timer.hpp" + +namespace palace +{ + +std::pair +MagnetostaticSolver::Solve(const std::vector> &mesh) const +{ + // Construct the system matrix defining the linear operator. Dirichlet boundaries are + // handled eliminating the rows and columns of the system matrix for the corresponding + // dofs. + BlockTimer bt0(Timer::CONSTRUCT); + CurlCurlOperator curlcurl_op(iodata, mesh); + auto K = curlcurl_op.GetStiffnessMatrix(); + const auto &Curl = curlcurl_op.GetCurlMatrix(); + SaveMetadata(curlcurl_op.GetNDSpaces()); + + // Set up the linear solver. + KspSolver ksp(iodata, curlcurl_op.GetNDSpaces(), &curlcurl_op.GetH1Spaces()); + ksp.SetOperators(*K, *K); + + // Terminal indices are the set of boundaries over which to compute the inductance matrix. + PostOperator post_op(iodata, curlcurl_op); + int n_step = static_cast(curlcurl_op.GetSurfaceCurrentOp().Size()); + MFEM_VERIFY(n_step > 0, + "No surface current boundaries specified for magnetostatic simulation!"); + + // Source term and solution vector storage. + Vector RHS(Curl.Width()), B(Curl.Height()); + std::vector A(n_step); + std::vector I_inc(n_step); + + // Initialize structures for storing and reducing the results of error estimation. + CurlFluxErrorEstimator estimator( + curlcurl_op.GetMaterialOp(), curlcurl_op.GetRTSpace(), curlcurl_op.GetNDSpaces(), + iodata.solver.linear.estimator_tol, iodata.solver.linear.estimator_max_it, 0, + iodata.solver.linear.estimator_mg); + ErrorIndicator indicator; + + // Main loop over current source boundaries. + Mpi::Print("\nComputing magnetostatic fields for {:d} source {}\n", n_step, + (n_step > 1) ? 
"boundaries" : "boundary"); + int step = 0; + auto t0 = Timer::Now(); + for (const auto &[idx, data] : curlcurl_op.GetSurfaceCurrentOp()) + { + Mpi::Print("\nIt {:d}/{:d}: Index = {:d} (elapsed time = {:.2e} s)\n", step + 1, n_step, + idx, Timer::Duration(Timer::Now() - t0).count()); + + // Form and solve the linear system for a prescribed current on the specified source. + Mpi::Print("\n"); + A[step].SetSize(RHS.Size()); + A[step].UseDevice(true); + A[step] = 0.0; + curlcurl_op.GetExcitationVector(idx, RHS); + ksp.Mult(RHS, A[step]); + + // Start Post-processing. + BlockTimer bt2(Timer::POSTPRO); + Mpi::Print(" Sol. ||A|| = {:.6e} (||RHS|| = {:.6e})\n", + linalg::Norml2(curlcurl_op.GetComm(), A[step]), + linalg::Norml2(curlcurl_op.GetComm(), RHS)); + + // Compute B = ∇ x A on the true dofs. + Curl.Mult(A[step], B); + + // Save excitation current for inductance matrix calculation. + I_inc[step] = data.GetExcitationCurrent(); + + // Measurement and printing. + auto total_domain_energy = post_op.MeasureAndPrintAll(step, A[step], B, idx); + + // Calculate and record the error indicators. + Mpi::Print(" Updating solution error estimates\n"); + estimator.AddErrorIndicator(B, total_domain_energy, indicator); + + // Next source. + step++; + } + + // Postprocess the inductance matrix from the computed field solutions. + BlockTimer bt1(Timer::POSTPRO); + SaveMetadata(ksp); + PostprocessTerminals(post_op, curlcurl_op.GetSurfaceCurrentOp(), A, I_inc); + post_op.MeasureFinalize(indicator); + return {indicator, curlcurl_op.GlobalTrueVSize()}; +} + +void MagnetostaticSolver::PostprocessTerminals( + PostOperator &post_op, + const SurfaceCurrentOperator &surf_j_op, const std::vector &A, + const std::vector &I_inc) const +{ + // Postprocess the Maxwell inductance matrix. See p. 97 of the COMSOL AC/DC Module manual + // for the associated formulas based on the magnetic field energy based on a current + // excitation for each port. Alternatively, we could compute the resulting loop fluxes to + // get M directly as: + // Φ_i = ∫ B ⋅ n_j dS + // and M_ij = Φ_i/I_j. The energy formulation avoids having to locally integrate B = + // ∇ x A. + mfem::DenseMatrix M(A.size()), Mm(A.size()); + for (int i = 0; i < M.Height(); i++) + { + // Diagonal: Mᵢᵢ = 2 Uₘ(Aᵢ) / Iᵢ² = (Aᵢᵀ K Aᵢ) / Iᵢ² + auto &A_gf = post_op.GetAGridFunction().Real(); + auto &H_gf = post_op.GetDomainPostOp().H; + A_gf.SetFromTrueDofs(A[i]); + post_op.GetDomainPostOp().M_mag->Mult(A_gf, H_gf); + M(i, i) = Mm(i, i) = + linalg::Dot(post_op.GetComm(), A_gf, H_gf) / (I_inc[i] * I_inc[i]); + + // Off-diagonals: Mᵢⱼ = Uₘ(Aᵢ + Aⱼ) / (Iᵢ Iⱼ) - 1/2 (Iᵢ/Iⱼ Mᵢᵢ + Iⱼ/Iᵢ Mⱼⱼ) + // = (Aⱼᵀ K Aᵢ) / (Iᵢ Iⱼ) + for (int j = i + 1; j < M.Width(); j++) + { + A_gf.SetFromTrueDofs(A[j]); + M(i, j) = linalg::Dot(post_op.GetComm(), A_gf, H_gf) / (I_inc[i] * I_inc[j]); + Mm(i, j) = -M(i, j); + Mm(i, i) -= Mm(i, j); + } + + // Copy lower triangle from already computed upper triangle. + for (int j = 0; j < i; j++) + { + M(i, j) = M(j, i); + Mm(i, j) = Mm(j, i); + Mm(i, i) -= Mm(i, j); + } + } + mfem::DenseMatrix Minv(M); + Minv.Invert(); // In-place, uses LAPACK (when available) and should be cheap + + // Only root writes to disk (every process has full matrices). + if (!root) + { + return; + } + using fmt::format; + + // Write inductance matrix data. 
+ auto PrintMatrix = [&surf_j_op, this](const std::string &file, const std::string &name, + const std::string &unit, + const mfem::DenseMatrix &mat, double scale) + { + TableWithCSVFile output(fs::path(post_dir / file).string()); + output.table.insert(Column("i", "i", 0, 0, 2, "")); + int j = 0; + for (const auto &[idx2, data2] : surf_j_op) + { + output.table.insert(format("i2{}", idx2), format("{}[i][{}] {}", name, idx2, unit)); + // Use the fact that iterator over i and j is the same span. + output.table["i"] << idx2; + + auto &col = output.table[format("i2{}", idx2)]; + for (std::size_t i = 0; i < surf_j_op.Size(); i++) + { + col << mat(i, j) * scale; + } + j++; + } + output.WriteFullTableTrunc(); + }; + const double H = iodata.units.GetScaleFactor(); + PrintMatrix("terminal-M.csv", "M", "(H)", M, H); + PrintMatrix("terminal-Minv.csv", "M⁻¹", "(1/H)", Minv, 1.0 / H); + PrintMatrix("terminal-Mm.csv", "M_m", "(H)", Mm, H); + + // Also write out a file with source current excitations. + { + TableWithCSVFile terminal_I(fs::path(post_dir / "terminal-I.csv").string()); + terminal_I.table.insert(Column("i", "i", 0, 0, 2, "")); + terminal_I.table.insert("Iinc", "I_inc[i] (A)"); + int i = 0; + for (const auto &[idx, data] : surf_j_op) + { + terminal_I.table["i"] << double(idx); + terminal_I.table["Iinc"] << iodata.units.Dimensionalize( + I_inc[i]); + i++; + } + terminal_I.WriteFullTableTrunc(); + } +} + +} // namespace palace diff --git a/palace/drivers/magnetostaticsolver.hpp b/palace/drivers/magnetostaticsolver.hpp index db6a08d8d3..2145306427 100644 --- a/palace/drivers/magnetostaticsolver.hpp +++ b/palace/drivers/magnetostaticsolver.hpp @@ -1,52 +1,42 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_DRIVERS_MAGNETOSTATIC_SOLVER_HPP -#define PALACE_DRIVERS_MAGNETOSTATIC_SOLVER_HPP - -#include -#include -#include "drivers/basesolver.hpp" -#include "linalg/vector.hpp" - -namespace mfem -{ - -class DenseMatrix; -class ParMesh; - -} // namespace mfem - -namespace palace -{ - -class CurlCurlOperator; -class ErrorIndicator; -class IoData; -class PostOperator; -class SurfaceCurrentOperator; -class Timer; - -// -// Driver class for magnetostatic simulations. -// -class MagnetostaticSolver : public BaseSolver -{ -private: - ErrorIndicator Postprocess(CurlCurlOperator &curlcurlop, PostOperator &postop, - const std::vector &A) const; - - void PostprocessTerminals(const SurfaceCurrentOperator &surf_j_op, - const mfem::DenseMatrix &M, const mfem::DenseMatrix &Minv, - const mfem::DenseMatrix &Mm) const; - - std::pair - Solve(const std::vector> &mesh) const override; - -public: - using BaseSolver::BaseSolver; -}; - -} // namespace palace - -#endif // PALACE_DRIVERS_MAGNETOSTATIC_SOLVER_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_DRIVERS_MAGNETOSTATIC_SOLVER_HPP +#define PALACE_DRIVERS_MAGNETOSTATIC_SOLVER_HPP + +#include +#include +#include "drivers/basesolver.hpp" +#include "linalg/vector.hpp" +#include "utils/configfile.hpp" + +namespace palace +{ + +class ErrorIndicator; +class Mesh; +template +class PostOperator; +class SurfaceCurrentOperator; + +// +// Driver class for magnetostatic simulations. 
+// +class MagnetostaticSolver : public BaseSolver +{ +private: + void PostprocessTerminals(PostOperator &post_op, + const SurfaceCurrentOperator &surf_j_op, + const std::vector &A, + const std::vector &I_inc) const; + + std::pair + Solve(const std::vector> &mesh) const override; + +public: + using BaseSolver::BaseSolver; +}; + +} // namespace palace + +#endif // PALACE_DRIVERS_MAGNETOSTATIC_SOLVER_HPP diff --git a/palace/drivers/transientsolver.cpp b/palace/drivers/transientsolver.cpp index a5792cfb8f..dbad617c34 100644 --- a/palace/drivers/transientsolver.cpp +++ b/palace/drivers/transientsolver.cpp @@ -1,486 +1,187 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "transientsolver.hpp" - -#include -#include "fem/errorindicator.hpp" -#include "linalg/errorestimator.hpp" -#include "linalg/vector.hpp" -#include "models/lumpedportoperator.hpp" -#include "models/postoperator.hpp" -#include "models/spaceoperator.hpp" -#include "models/surfacecurrentoperator.hpp" -#include "models/timeoperator.hpp" -#include "utils/communication.hpp" -#include "utils/excitations.hpp" -#include "utils/iodata.hpp" -#include "utils/timer.hpp" - -namespace palace -{ - -std::pair -TransientSolver::Solve(const std::vector> &mesh) const -{ - // Set up the spatial discretization and time integrators for the E and B fields. - BlockTimer bt0(Timer::CONSTRUCT); - std::function J_coef = GetTimeExcitation(false); - std::function dJdt_coef = GetTimeExcitation(true); - SpaceOperator spaceop(iodata, mesh); - TimeOperator timeop(iodata, spaceop, dJdt_coef); - double delta_t = iodata.solver.transient.delta_t; - if (timeop.isExplicit()) - { - // Stability limited time step. - const double dt_max = timeop.GetMaxTimeStep(); - const double dts_max = iodata.DimensionalizeValue(IoData::ValueType::TIME, dt_max); - Mpi::Print(" Maximum stable time step: {:.6e} ns\n", dts_max); - delta_t = std::min(delta_t, 0.95 * dt_max); - } - int nstep = GetNumSteps(0.0, iodata.solver.transient.max_t, delta_t); - SaveMetadata(spaceop.GetNDSpaces()); - - // Time stepping is uniform in the time domain. Index sets are for computing things like - // port voltages and currents in postprocessing. - PostOperator postop(iodata, spaceop, "transient"); - { - Mpi::Print("\nComputing transient response for:\n"); - bool first = true; - for (const auto &[idx, data] : spaceop.GetLumpedPortOp()) - { - if (data.IsExcited()) - { - if (first) - { - Mpi::Print(" Lumped port excitation specified on port{}", - (spaceop.GetLumpedPortOp().Size() > 1) ? "s" : ""); - first = false; - } - Mpi::Print(" {:d}", idx); - } - } - int excitations = first; - first = true; - for (const auto &[idx, data] : spaceop.GetSurfaceCurrentOp()) - { - if (first) - { - Mpi::Print(" Surface current excitation specified on port{}", - (spaceop.GetSurfaceCurrentOp().Size() > 1) ? "s" : ""); - first = false; - } - Mpi::Print(" {:d}", idx); - } - excitations += first; - MFEM_VERIFY(excitations > 0, "No excitation specified for transient simulation!"); - } - Mpi::Print("\n"); - - // Initialize structures for storing and reducing the results of error estimation. - CurlFluxErrorEstimator estimator( - spaceop.GetMaterialOp(), spaceop.GetNDSpaces(), iodata.solver.linear.estimator_tol, - iodata.solver.linear.estimator_max_it, 0, iodata.solver.pa_order_threshold); - ErrorIndicator indicator; - - // Main time integration loop. 
- int step = 0; - double t = -delta_t; - auto t0 = Timer::Now(); - while (step < nstep) - { - const double ts = iodata.DimensionalizeValue(IoData::ValueType::TIME, t + delta_t); - Mpi::Print("\nIt {:d}/{:d}: t = {:e} ns (elapsed time = {:.2e} s)\n", step, nstep - 1, - ts, Timer::Duration(Timer::Now() - t0).count()); - - // Single time step t -> t + dt. - BlockTimer bt1(Timer::SOLVE); - if (step == 0) - { - Mpi::Print("\n"); - t += delta_t; - timeop.Init(); // Initial conditions - } - else - { - timeop.Step(t, delta_t); // Advances t internally - } - - // Postprocess for the time step. - BlockTimer bt2(Timer::POSTPRO); - double E_elec = 0.0, E_mag = 0.0; - const Vector &E = timeop.GetE(); - const Vector &B = timeop.GetB(); - postop.SetEGridFunction(E); - postop.SetBGridFunction(B); - postop.UpdatePorts(spaceop.GetLumpedPortOp()); - Mpi::Print(" Sol. ||E|| = {:.6e}, ||B|| = {:.6e}\n", - linalg::Norml2(spaceop.GetComm(), E), linalg::Norml2(spaceop.GetComm(), B)); - if (!iodata.solver.transient.only_port_post) - { - const double J = iodata.DimensionalizeValue(IoData::ValueType::ENERGY, 1.0); - E_elec = postop.GetEFieldEnergy(); - E_mag = postop.GetHFieldEnergy(); - Mpi::Print(" Field energy E ({:.3e} J) + H ({:.3e} J) = {:.3e} J\n", E_elec * J, - E_mag * J, (E_elec + E_mag) * J); - } - - // Calculate and record the error indicators. - Mpi::Print(" Updating solution error estimates\n"); - estimator.AddErrorIndicator(E, indicator); - - // Postprocess port voltages/currents and optionally write solution to disk. - Postprocess(postop, spaceop.GetLumpedPortOp(), spaceop.GetSurfaceCurrentOp(), step, t, - J_coef(t), E_elec, E_mag, !iodata.solver.transient.only_port_post, - (step == nstep - 1) ? &indicator : nullptr); - - // Increment time step. - step++; - } - SaveMetadata(timeop.GetLinearSolver()); - return {indicator, spaceop.GlobalTrueVSize()}; -} - -std::function TransientSolver::GetTimeExcitation(bool dot) const -{ - using namespace excitations; - using F = std::function; - const config::TransientSolverData &data = iodata.solver.transient; - const config::TransientSolverData::ExcitationType &type = data.excitation; - if (type == config::TransientSolverData::ExcitationType::SINUSOIDAL || - type == config::TransientSolverData::ExcitationType::MOD_GAUSSIAN) - { - MFEM_VERIFY(data.pulse_f > 0.0, - "Excitation frequency is missing for transient simulation!"); - } - if (type == config::TransientSolverData::ExcitationType::GAUSSIAN || - type == config::TransientSolverData::ExcitationType::DIFF_GAUSSIAN || - type == config::TransientSolverData::ExcitationType::MOD_GAUSSIAN || - type == config::TransientSolverData::ExcitationType::SMOOTH_STEP) - { - MFEM_VERIFY(data.pulse_tau > 0.0, - "Excitation width is missing for transient simulation!"); - } - const double delay = - (type == config::TransientSolverData::ExcitationType::GAUSSIAN || - type == config::TransientSolverData::ExcitationType::DIFF_GAUSSIAN || - type == config::TransientSolverData::ExcitationType::MOD_GAUSSIAN) - ? 
4.5 * data.pulse_tau - : 0.0; - switch (type) - { - case config::TransientSolverData::ExcitationType::SINUSOIDAL: - if (dot) - { - return F{[=](double t) { return dpulse_sinusoidal(t, data.pulse_f, delay); }}; - } - else - { - return F{[=](double t) { return pulse_sinusoidal(t, data.pulse_f, delay); }}; - } - break; - case config::TransientSolverData::ExcitationType::GAUSSIAN: - if (dot) - { - return F{[=](double t) { return dpulse_gaussian(t, data.pulse_tau, delay); }}; - } - else - { - return F{[=](double t) { return pulse_gaussian(t, data.pulse_tau, delay); }}; - } - break; - case config::TransientSolverData::ExcitationType::DIFF_GAUSSIAN: - if (dot) - { - return F{[=](double t) { return dpulse_gaussian_diff(t, data.pulse_tau, delay); }}; - } - else - { - return F{[=](double t) { return pulse_gaussian_diff(t, data.pulse_tau, delay); }}; - } - break; - case config::TransientSolverData::ExcitationType::MOD_GAUSSIAN: - if (dot) - { - return F{[=](double t) - { return dpulse_gaussian_mod(t, data.pulse_f, data.pulse_tau, delay); }}; - } - else - { - return F{[=](double t) - { return pulse_gaussian_mod(t, data.pulse_f, data.pulse_tau, delay); }}; - } - break; - case config::TransientSolverData::ExcitationType::RAMP_STEP: - if (dot) - { - return F{[=](double t) { return dpulse_ramp(t, data.pulse_tau, delay); }}; - } - else - { - return F{[=](double t) { return pulse_ramp(t, data.pulse_tau, delay); }}; - } - break; - case config::TransientSolverData::ExcitationType::SMOOTH_STEP: - if (dot) - { - return F{[=](double t) { return dpulse_smootherstep(t, data.pulse_tau, delay); }}; - } - else - { - return F{[=](double t) { return pulse_smootherstep(t, data.pulse_tau, delay); }}; - } - break; - } - return F{}; -} - -int TransientSolver::GetNumSteps(double start, double end, double delta) const -{ - MFEM_VERIFY(delta > 0.0, "Zero time step is not allowed!"); - constexpr double delta_eps = 1.0e-9; // 9 digits of precision comparing endpoint - double dnfreq = std::abs(end - start) / std::abs(delta); - int nstep = 1 + static_cast(dnfreq); - double dfinal = start + nstep * delta; - return nstep + ((delta < 0.0 && dfinal - end > -delta_eps * end) || - (delta > 0.0 && dfinal - end < delta_eps * end)); -} - -void TransientSolver::Postprocess(const PostOperator &postop, - const LumpedPortOperator &lumped_port_op, - const SurfaceCurrentOperator &surf_j_op, int step, - double t, double J_coef, double E_elec, double E_mag, - bool full, const ErrorIndicator *indicator) const -{ - // The internal GridFunctions for PostOperator have already been set from the E and B - // solutions in the main time integration loop. 
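As a sanity check on the step-count arithmetic in GetNumSteps above, here is a worked instance with the same logic specialized to delta > 0 (the helper name and the numbers are illustrative only):

#include <cassert>
#include <cmath>

// Sketch only: endpoint-tolerant step count as in TransientSolver::GetNumSteps
// shown earlier in this hunk, restricted to delta > 0.
int NumSteps(double start, double end, double delta)
{
  constexpr double delta_eps = 1.0e-9;
  const double dnfreq = std::abs(end - start) / std::abs(delta);
  const int nstep = 1 + static_cast<int>(dnfreq);
  const double dfinal = start + nstep * delta;
  return nstep + (dfinal - end < delta_eps * end);
}

int main()
{
  // 0 to 30 ns in 10 ns steps: t = 0, 10, 20, 30, so 4 steps in total.
  assert(NumSteps(0.0, 30.0, 10.0) == 4);
  return 0;
}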
- const double ts = iodata.DimensionalizeValue(IoData::ValueType::TIME, t); - PostprocessCurrents(postop, surf_j_op, step, t, J_coef); - PostprocessPorts(postop, lumped_port_op, step, t, J_coef); - if (full) - { - double E_cap = postop.GetLumpedCapacitorEnergy(lumped_port_op); - double E_ind = postop.GetLumpedInductorEnergy(lumped_port_op); - PostprocessDomains(postop, "t (ns)", step, ts, E_elec, E_mag, E_cap, E_ind); - PostprocessSurfaces(postop, "t (ns)", step, ts, E_elec + E_cap, E_mag + E_ind, 1.0, - 1.0); - PostprocessProbes(postop, "t (ns)", step, ts); - } - if (iodata.solver.transient.delta_post > 0 && - step % iodata.solver.transient.delta_post == 0) - { - Mpi::Print("\n"); - PostprocessFields(postop, step / iodata.solver.transient.delta_post, ts, indicator); - Mpi::Print(" Wrote fields to disk at step {:d}\n", step); - } - if (indicator) - { - PostprocessErrorIndicator(postop, *indicator); - } -} - -namespace -{ - -struct CurrentData -{ - const int idx; // Current source index - const double Iinc; // Excitation current -}; - -struct PortData -{ - const int idx; // Port index - const bool excitation; // Flag for excited ports - const double Vinc, Iinc; // Incident voltage, current - const double Vi, Ii; // Port voltage, current -}; - -} // namespace - -void TransientSolver::PostprocessCurrents(const PostOperator &postop, - const SurfaceCurrentOperator &surf_j_op, int step, - double t, double J_coef) const -{ - // Postprocess the time domain surface current excitations. - if (post_dir.length() == 0) - { - return; - } - std::vector j_data; - j_data.reserve(surf_j_op.Size()); - for (const auto &[idx, data] : surf_j_op) - { - const double Iinc = data.GetExcitationCurrent() * J_coef; // Iinc(t) = g(t) Iinc - j_data.push_back({idx, iodata.DimensionalizeValue(IoData::ValueType::CURRENT, Iinc)}); - } - if (root && !j_data.empty()) - { - std::string path = post_dir + "surface-I.csv"; - auto output = OutputFile(path, (step > 0)); - if (step == 0) - { - output.print("{:>{}s},", "t (ns)", table.w1); - for (const auto &data : j_data) - { - // clang-format off - output.print("{:>{}s}{}", - "Iinc[" + std::to_string(data.idx) + "] (A)", table.w, - (data.idx == j_data.back().idx) ? "" : ","); - // clang-format on - } - output.print("\n"); - } - // clang-format off - output.print("{:{}.{}e},", - iodata.DimensionalizeValue(IoData::ValueType::TIME, t), - table.w1, table.p1); - // clang-format on - for (const auto &data : j_data) - { - // clang-format off - output.print("{:+{}.{}e}{}", - data.Iinc, table.w, table.p, - (data.idx == j_data.back().idx) ? "" : ","); - // clang-format on - } - output.print("\n"); - } -} - -void TransientSolver::PostprocessPorts(const PostOperator &postop, - const LumpedPortOperator &lumped_port_op, int step, - double t, double J_coef) const -{ - // Postprocess the time domain lumped port voltages and currents, which can then be used - // to compute S- or Z-parameters. - if (post_dir.length() == 0) - { - return; - } - std::vector port_data; - port_data.reserve(lumped_port_op.Size()); - for (const auto &[idx, data] : lumped_port_op) - { - const double Vinc = data.GetExcitationVoltage() * J_coef; // Vinc(t) = g(t) Vinc - const double Iinc = - (std::abs(Vinc) > 0.0) ? 
data.GetExcitationPower() * J_coef * J_coef / Vinc : 0.0; - const double Vi = postop.GetPortVoltage(lumped_port_op, idx).real(); - const double Ii = postop.GetPortCurrent(lumped_port_op, idx).real(); - port_data.push_back({idx, data.IsExcited(), - iodata.DimensionalizeValue(IoData::ValueType::VOLTAGE, Vinc), - iodata.DimensionalizeValue(IoData::ValueType::CURRENT, Iinc), - iodata.DimensionalizeValue(IoData::ValueType::VOLTAGE, Vi), - iodata.DimensionalizeValue(IoData::ValueType::CURRENT, Ii)}); - } - if (root && !port_data.empty()) - { - // Write the port voltages. - { - std::string path = post_dir + "port-V.csv"; - auto output = OutputFile(path, (step > 0)); - if (step == 0) - { - output.print("{:>{}s},", "t (ns)", table.w1); - for (const auto &data : port_data) - { - if (data.excitation) - { - // clang-format off - output.print("{:>{}s},", - "V_inc[" + std::to_string(data.idx) + "] (V)", table.w); - // clang-format on - } - } - for (const auto &data : port_data) - { - // clang-format off - output.print("{:>{}s}{}", - "V[" + std::to_string(data.idx) + "] (V)", table.w, - (data.idx == port_data.back().idx) ? "" : ","); - // clang-format on - } - output.print("\n"); - } - // clang-format off - output.print("{:{}.{}e},", - iodata.DimensionalizeValue(IoData::ValueType::TIME, t), - table.w1, table.p1); - // clang-format on - for (const auto &data : port_data) - { - if (data.excitation) - { - // clang-format off - output.print("{:+{}.{}e},", - data.Vinc, table.w, table.p); - // clang-format on - } - } - for (const auto &data : port_data) - { - // clang-format off - output.print("{:+{}.{}e}{}", - data.Vi, table.w, table.p, - (data.idx == port_data.back().idx) ? "" : ","); - // clang-format on - } - output.print("\n"); - } - - // Write the port currents. - { - std::string path = post_dir + "port-I.csv"; - auto output = OutputFile(path, (step > 0)); - if (step == 0) - { - output.print("{:>{}s},", "t (ns)", table.w1); - for (const auto &data : port_data) - { - if (data.excitation) - { - // clang-format off - output.print("{:>{}s},", - "I_inc[" + std::to_string(data.idx) + "] (A)", table.w); - // clang-format on - } - } - for (const auto &data : port_data) - { - // clang-format off - output.print("{:>{}s}{}", - "I[" + std::to_string(data.idx) + "] (A)", table.w, - (data.idx == port_data.back().idx) ? "" : ","); - // clang-format on - } - output.print("\n"); - } - // clang-format off - output.print("{:{}.{}e},", - iodata.DimensionalizeValue(IoData::ValueType::TIME, t), - table.w1, table.p1); - // clang-format on - for (const auto &data : port_data) - { - if (data.excitation) - { - // clang-format off - output.print("{:+{}.{}e},", - data.Iinc, table.w, table.p); - // clang-format on - } - } - for (const auto &data : port_data) - { - // clang-format off - output.print("{:+{}.{}e}{}", - data.Ii, table.w, table.p, - (data.idx == port_data.back().idx) ? "" : ","); - // clang-format on - } - output.print("\n"); - } - } -} - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0
+
+#include "transientsolver.hpp"
+
+#include 
+#include "fem/errorindicator.hpp"
+#include "fem/mesh.hpp"
+#include "linalg/errorestimator.hpp"
+#include "linalg/vector.hpp"
+#include "models/lumpedportoperator.hpp"
+#include "models/portexcitations.hpp"
+#include "models/postoperator.hpp"
+#include "models/spaceoperator.hpp"
+#include "models/surfacecurrentoperator.hpp"
+#include "models/timeoperator.hpp"
+#include "utils/communication.hpp"
+#include "utils/excitations.hpp"
+#include "utils/iodata.hpp"
+#include "utils/timer.hpp"
+
+namespace palace
+{
+
+std::pair<ErrorIndicator, long long int>
+TransientSolver::Solve(const std::vector<std::unique_ptr<Mesh>> &mesh) const
+{
+  // Set up the spatial discretization and time integrators for the E and B fields.
+  BlockTimer bt0(Timer::CONSTRUCT);
+  std::function<double(double)> J_coef = GetTimeExcitation(false);
+  std::function<double(double)> dJdt_coef = GetTimeExcitation(true);
+  SpaceOperator space_op(iodata, mesh);
+  TimeOperator time_op(iodata, space_op, dJdt_coef);
+
+  double delta_t = iodata.solver.transient.delta_t;
+  int n_step = config::GetNumSteps(0.0, iodata.solver.transient.max_t, delta_t);
+  SaveMetadata(space_op.GetNDSpaces());
+
+  // Time stepping is uniform in the time domain. Index sets are for computing things like
+  // port voltages and currents in postprocessing.
+  PostOperator post_op(iodata, space_op);
+
+  // The transient solver only supports a single excitation, which is checked in
+  // SpaceOperator.
+  Mpi::Print("\nComputing transient response for:\n{}",
+             space_op.GetPortExcitations().FmtLog());
+
+  // Initialize structures for storing and reducing the results of error estimation.
+  TimeDependentFluxErrorEstimator estimator(
+      space_op.GetMaterialOp(), space_op.GetNDSpaces(), space_op.GetRTSpaces(),
+      iodata.solver.linear.estimator_tol, iodata.solver.linear.estimator_max_it, 0,
+      iodata.solver.linear.estimator_mg);
+  ErrorIndicator indicator;
+
+  // Main time integration loop.
+  double t = -delta_t;
+  auto t0 = Timer::Now();
+  for (int step = 0; step < n_step; step++)
+  {
+    const double ts = iodata.units.Dimensionalize(t + delta_t);
+    Mpi::Print("\nIt {:d}/{:d}: t = {:e} ns (elapsed time = {:.2e} s)\n", step, n_step - 1,
+               ts, Timer::Duration(Timer::Now() - t0).count());
+
+    // Single time step t -> t + dt.
+    BlockTimer bt1(Timer::TS);
+    if (step == 0)
+    {
+      Mpi::Print("\n");
+      t += delta_t;
+      time_op.Init();  // Initial conditions
+    }
+    else
+    {
+      time_op.Step(t, delta_t);  // Advances t internally
+    }
+
+    // Postprocess for the time step.
+    BlockTimer bt2(Timer::POSTPRO);
+    const Vector &E = time_op.GetE();
+    const Vector &B = time_op.GetB();
+    Mpi::Print(" Sol. ||E|| = {:.6e}, ||B|| = {:.6e}\n",
+               linalg::Norml2(space_op.GetComm(), E),
+               linalg::Norml2(space_op.GetComm(), B));
+
+    auto total_domain_energy = post_op.MeasureAndPrintAll(step, E, B, t, J_coef(t));
+
+    // Calculate and record the error indicators.
+    Mpi::Print(" Updating solution error estimates\n");
+    estimator.AddErrorIndicator(E, B, total_domain_energy, indicator);
+  }
+  // Final postprocessing & printing.
+ BlockTimer bt1(Timer::POSTPRO); + time_op.PrintStats(); + SaveMetadata(time_op.GetLinearSolver()); + post_op.MeasureFinalize(indicator); + return {indicator, space_op.GlobalTrueVSize()}; +} + +std::function TransientSolver::GetTimeExcitation(bool dot) const +{ + using namespace excitations; + using F = std::function; + const config::TransientSolverData &data = iodata.solver.transient; + const Excitation &type = data.excitation; + if (type == Excitation::SINUSOIDAL || type == Excitation::MOD_GAUSSIAN) + { + MFEM_VERIFY(data.pulse_f > 0.0, + "Excitation frequency is missing for transient simulation!"); + } + if (type == Excitation::GAUSSIAN || type == Excitation::DIFF_GAUSSIAN || + type == Excitation::MOD_GAUSSIAN || type == Excitation::SMOOTH_STEP) + { + MFEM_VERIFY(data.pulse_tau > 0.0, + "Excitation width is missing for transient simulation!"); + } + const double delay = (type == Excitation::GAUSSIAN || type == Excitation::DIFF_GAUSSIAN || + type == Excitation::MOD_GAUSSIAN) + ? 4.5 * data.pulse_tau + : 0.0; + switch (type) + { + case Excitation::SINUSOIDAL: + if (dot) + { + return F{[=](double t) { return dpulse_sinusoidal(t, data.pulse_f, delay); }}; + } + else + { + return F{[=](double t) { return pulse_sinusoidal(t, data.pulse_f, delay); }}; + } + break; + case Excitation::GAUSSIAN: + if (dot) + { + return F{[=](double t) { return dpulse_gaussian(t, data.pulse_tau, delay); }}; + } + else + { + return F{[=](double t) { return pulse_gaussian(t, data.pulse_tau, delay); }}; + } + break; + case Excitation::DIFF_GAUSSIAN: + if (dot) + { + return F{[=](double t) { return dpulse_gaussian_diff(t, data.pulse_tau, delay); }}; + } + else + { + return F{[=](double t) { return pulse_gaussian_diff(t, data.pulse_tau, delay); }}; + } + break; + case Excitation::MOD_GAUSSIAN: + if (dot) + { + return F{[=](double t) + { return dpulse_gaussian_mod(t, data.pulse_f, data.pulse_tau, delay); }}; + } + else + { + return F{[=](double t) + { return pulse_gaussian_mod(t, data.pulse_f, data.pulse_tau, delay); }}; + } + break; + case Excitation::RAMP_STEP: + if (dot) + { + return F{[=](double t) { return dpulse_ramp(t, data.pulse_tau, delay); }}; + } + else + { + return F{[=](double t) { return pulse_ramp(t, data.pulse_tau, delay); }}; + } + break; + case Excitation::SMOOTH_STEP: + if (dot) + { + return F{[=](double t) { return dpulse_smootherstep(t, data.pulse_tau, delay); }}; + } + else + { + return F{[=](double t) { return pulse_smootherstep(t, data.pulse_tau, delay); }}; + } + break; + } + return F{}; +} + +} // namespace palace diff --git a/palace/drivers/transientsolver.hpp b/palace/drivers/transientsolver.hpp index 26a0e4b143..9dacbd1579 100644 --- a/palace/drivers/transientsolver.hpp +++ b/palace/drivers/transientsolver.hpp @@ -1,61 +1,35 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_DRIVERS_TRANSIENT_SOLVER_HPP -#define PALACE_DRIVERS_TRANSIENT_SOLVER_HPP - -#include -#include -#include -#include "drivers/basesolver.hpp" - -namespace mfem -{ - -class ParMesh; - -} // namespace mfem - -namespace palace -{ - -class ErrorIndicator; -class IoData; -class LumpedPortOperator; -class PostOperator; -class SurfaceCurrentOperator; -class Timer; - -// -// Driver class for time-dependent driven terminal simulations. 
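GetTimeExcitation above returns std::function closures over the pulse parameters. A minimal sketch of that factory pattern for a single pulse shape; the Gaussian normalization here is an assumption for illustration and not the exact definition from utils/excitations.hpp:

#include <cmath>
#include <functional>

// Sketch only: return g(t) or dg/dt for a Gaussian pulse centered at t = delay
// with width tau, demonstrating the closure-returning factory pattern used by
// TransientSolver::GetTimeExcitation. Not the Palace pulse definitions.
std::function<double(double)> MakeGaussianPulse(double tau, double delay, bool dot)
{
  auto g = [=](double t) { return std::exp(-0.5 * std::pow((t - delay) / tau, 2)); };
  if (dot)
  {
    return [=](double t) { return -(t - delay) / (tau * tau) * g(t); };
  }
  return g;
}

Returning closures keeps the time integrator agnostic to the pulse type selected in the configuration: it only ever evaluates a double(double) callable.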
-// -class TransientSolver : public BaseSolver -{ -private: - std::function GetTimeExcitation(bool dot) const; - - int GetNumSteps(double start, double end, double delta) const; - - void Postprocess(const PostOperator &postop, const LumpedPortOperator &lumped_port_op, - const SurfaceCurrentOperator &surf_j_op, int step, double t, - double J_coef, double E_elec, double E_mag, bool full, - const ErrorIndicator *indicator) const; - - void PostprocessCurrents(const PostOperator &postop, - const SurfaceCurrentOperator &surf_j_op, int step, double t, - double J_coef) const; - - void PostprocessPorts(const PostOperator &postop, - const LumpedPortOperator &lumped_port_op, int step, double t, - double J_coef) const; - - std::pair - Solve(const std::vector> &mesh) const override; - -public: - using BaseSolver::BaseSolver; -}; - -} // namespace palace - -#endif // PALACE_DRIVERS_TRANSIENT_SOLVER_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_DRIVERS_TRANSIENT_SOLVER_HPP +#define PALACE_DRIVERS_TRANSIENT_SOLVER_HPP + +#include +#include +#include +#include "drivers/basesolver.hpp" + +namespace palace +{ + +class ErrorIndicator; +class Mesh; + +// +// Driver class for time-dependent driven terminal simulations. +// +class TransientSolver : public BaseSolver +{ +private: + std::function GetTimeExcitation(bool dot) const; + + std::pair + Solve(const std::vector> &mesh) const override; + +public: + using BaseSolver::BaseSolver; +}; + +} // namespace palace + +#endif // PALACE_DRIVERS_TRANSIENT_SOLVER_HPP diff --git a/palace/env_var.bat b/palace/env_var.bat index 0964a30b0a..03297e2613 100644 --- a/palace/env_var.bat +++ b/palace/env_var.bat @@ -1,5 +1,5 @@ set WELSIM_LIBPACK=D:\WelSimLLC\CodeDV\libPack -set WELSIM_EXEC=D:\WelSimLLC\executable28 +set WELSIM_EXEC=D:\WelSimLLC\executable32 set BOOST_VC=143 set BOOST_VERSION=1_80 @@ -7,7 +7,9 @@ set VTK_VER=8.2 set ITK_VER=5.2 set QTDIR=C:\Qt\5.15.2\msvc2019_64 set QTBIN=%QTDIR%\bin -set INTEL_MKL=C:\Program Files (x86)\Intel\oneAPI\mkl\2022.2.0 +set INTEL_ONEAPI=C:\Program Files (x86)\Intel\oneAPI +set INTEL_MKL=%INTEL_ONEAPI%\mkl\latest +set INTEL_TBB=%INTEL_ONEAPI%\tbb\latest set PATH="C:\Program Files\Microsoft Visual Studio\2022\Community\Common7\IDE\";%QTBIN%;%PATH% diff --git a/palace/fem/CMakeLists.txt b/palace/fem/CMakeLists.txt index 714e5b8019..9efc0537f8 100644 --- a/palace/fem/CMakeLists.txt +++ b/palace/fem/CMakeLists.txt @@ -1,32 +1,45 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -# -# Add source files and subdirectories. 
-# - -target_sources(${LIB_TARGET_NAME} - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/bilinearform.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/coefficient.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/errorindicator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/fespace.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/integrator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/interpolator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/lumpedelement.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/integ/curlcurl.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/integ/curlcurlmass.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/integ/diffusion.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/integ/diffusionmass.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/integ/divdiv.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/integ/divdivmass.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/integ/grad.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/integ/mass.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/integ/mixedveccurl.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/integ/mixedvecgrad.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/integ/vecfemass.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/libceed/basis.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/libceed/operator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/libceed/restriction.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/libceed/utils.cpp -) +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + +# +# Add source files and subdirectories. +# + +target_sources(${LIB_TARGET_NAME} + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/bilinearform.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/coefficient.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/errorindicator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/fespace.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/gridfunction.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/integrator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/interpolator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/lumpedelement.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/mesh.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/integ/curlcurl.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/integ/curlcurlmass.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/integ/diffusion.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/integ/diffusionmass.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/integ/divdiv.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/integ/divdivmass.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/integ/grad.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/integ/mass.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/integ/mixedveccurl.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/integ/mixedvecgrad.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/integ/vecfemass.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libceed/basis.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libceed/ceed.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libceed/coefficient.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libceed/integrator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libceed/operator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libceed/restriction.cpp +) + +# Handle device source code +set(TARGET_SOURCES_DEVICE + ${TARGET_SOURCES_DEVICE} + ${CMAKE_CURRENT_SOURCE_DIR}/errorindicator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/mesh.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libceed/operator.cpp + PARENT_SCOPE +) diff --git a/palace/fem/bilinearform.cpp b/palace/fem/bilinearform.cpp index 3540807f0c..e1467688a9 100644 --- a/palace/fem/bilinearform.cpp +++ b/palace/fem/bilinearform.cpp @@ -1,253 +1,284 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 - -#include "bilinearform.hpp" - -#include -#include -#include "fem/fespace.hpp" -#include "fem/libceed/hash.hpp" -#include "fem/libceed/utils.hpp" -#include "utils/omp.hpp" - -namespace palace -{ - -namespace -{ - -using ceed::internal::FiniteElementKey; -using ceed::internal::FiniteElementPairHash; -using ceed::internal::FiniteElementPairKey; - -// Count the number of elements of each type in the local mesh. -std::unordered_map, FiniteElementPairHash> -GetElementIndices(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, bool use_bdr, int start, - int stop) -{ - std::unordered_map counts, offsets; - std::unordered_map, FiniteElementPairHash> - element_indices; - - // Count the number of elements of each type and order. - for (int i = start; i < stop; i++) - { - const mfem::FiniteElement &trial_fe = - use_bdr ? *trial_fespace.GetBE(i) : *trial_fespace.GetFE(i); - const mfem::FiniteElement &test_fe = - use_bdr ? *test_fespace.GetBE(i) : *test_fespace.GetFE(i); - FiniteElementPairKey key = - std::make_pair(FiniteElementKey(trial_fe), FiniteElementKey(test_fe)); - auto value = counts.find(key); - if (value == counts.end()) - { - counts[key] = 1; - } - else - { - value->second++; - } - } - - // Populate the indices arrays for each element type. - for (const auto &value : counts) - { - offsets[value.first] = 0; - element_indices[value.first] = std::vector(value.second); - } - for (int i = start; i < stop; i++) - { - const mfem::FiniteElement &trial_fe = - use_bdr ? *trial_fespace.GetBE(i) : *trial_fespace.GetFE(i); - const mfem::FiniteElement &test_fe = - use_bdr ? *test_fespace.GetBE(i) : *test_fespace.GetFE(i); - FiniteElementPairKey key = - std::make_pair(FiniteElementKey(trial_fe), FiniteElementKey(test_fe)); - int &offset = offsets[key]; - std::vector &indices = element_indices[key]; - indices[offset++] = i; - } - - return element_indices; -} - -} // namespace - -std::unique_ptr BilinearForm::Assemble() const -{ - MFEM_VERIFY(trial_fespace.GetParMesh() == test_fespace.GetParMesh(), - "Trial and test finite element spaces must correspond to the same mesh!"); - mfem::ParMesh &mesh = *trial_fespace.GetParMesh(); - { - // In the following, we copy the mesh FE space for the nodes as a - // palace::FiniteElementSpace and replace it in the nodal grid function. Unfortunately - // mfem::ParFiniteElementSpace does not have a move constructor to make this more - // efficient, but it's only done once for the lifetime of the mesh. - mesh.EnsureNodes(); - mfem::GridFunction *mesh_nodes = mesh.GetNodes(); - mfem::FiniteElementSpace *mesh_fespace = mesh_nodes->FESpace(); - MFEM_VERIFY(dynamic_cast(mesh_fespace), - "Unexpected non-parallel FiniteElementSpace for mesh nodes!"); - if (!dynamic_cast(mesh_fespace)) - { - // Ensure the FiniteElementCollection associated with the original nodes is not - // deleted. 
- auto *new_mesh_fespace = - new FiniteElementSpace(*static_cast(mesh_fespace)); - mfem::FiniteElementCollection *mesh_fec = mesh_nodes->OwnFEC(); - MFEM_VERIFY(mesh_fec, "Replacing the FiniteElementSpace for mesh nodes is only " - "possible when it owns its fec/fes members!"); - mesh_nodes->MakeOwner(nullptr); - mesh.SetNodalFESpace(new_mesh_fespace); - mfem::GridFunction *new_mesh_nodes = mesh.GetNodes(); - new_mesh_nodes->MakeOwner(mesh_fec); - delete mesh_fespace; - } - } - - std::unique_ptr op; - if (&trial_fespace == &test_fespace) - { - op = std::make_unique(test_fespace.GetVSize(), - trial_fespace.GetVSize()); - } - else - { - op = - std::make_unique(test_fespace.GetVSize(), trial_fespace.GetVSize()); - } - - // Assemble the libCEED operator in parallel, each thread builds a composite operator. - // This should work fine if some threads create an empty operator (no elements or bounday - // elements). - const std::size_t nt = ceed::internal::GetCeedObjects().size(); - PalacePragmaOmp(parallel for schedule(static)) - for (std::size_t i = 0; i < nt; i++) - { - Ceed ceed = ceed::internal::GetCeedObjects()[i]; - CeedOperator loc_op, loc_op_t; - PalaceCeedCall(ceed, CeedCompositeOperatorCreate(ceed, &loc_op)); - PalaceCeedCall(ceed, CeedCompositeOperatorCreate(ceed, &loc_op_t)); - - // Domain integrators first. - if (!domain_integs.empty()) - { - const int ne = mesh.GetNE(); - const int stride = (ne + nt - 1) / nt; - const int start = i * stride; - const int stop = std::min(start + stride, ne); - const bool use_bdr = false; - - const auto element_indices = - GetElementIndices(trial_fespace, test_fespace, use_bdr, start, stop); - - for (const auto &value : element_indices) - { - const std::vector &indices = value.second; - const int q_order = fem::GetDefaultIntegrationOrder( - trial_fespace, test_fespace, indices, use_bdr, q_extra_pk, q_extra_qk); - const mfem::IntegrationRule &ir = - mfem::IntRules.Get(mesh.GetElementGeometry(indices[0]), q_order); - - for (const auto &integ : domain_integs) - { - CeedOperator sub_op, sub_op_t; - integ->Assemble(trial_fespace, test_fespace, ir, indices, ceed, &sub_op, - &sub_op_t); - - PalaceCeedCall(ceed, CeedCompositeOperatorAddSub(loc_op, sub_op)); - PalaceCeedCall(ceed, CeedOperatorDestroy(&sub_op)); - if (sub_op_t) - { - PalaceCeedCall(ceed, CeedCompositeOperatorAddSub(loc_op_t, sub_op_t)); - PalaceCeedCall(ceed, CeedOperatorDestroy(&sub_op_t)); - } - } - } - } - - // Boundary integrators next. 
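The start/stop bounds in the assembly loop above split elements across threads with a ceiling divide. A small sketch of that partition arithmetic (hypothetical helper, not part of the patch):

#include <algorithm>
#include <utility>

// Sketch only: contiguous static partition of n items over nt threads, as in
// stride = (ne + nt - 1) / nt above. Thread i receives the half-open range
// [start, stop); trailing threads get an empty range when i * stride >= n.
std::pair<int, int> ThreadRange(int n, int nt, int i)
{
  const int stride = (n + nt - 1) / nt;  // Ceiling division
  const int start = std::min(i * stride, n);
  const int stop = std::min(start + stride, n);
  return {start, stop};
}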
- if (!boundary_integs.empty()) - { - const int nbe = mesh.GetNBE(); - const int stride = (nbe + nt - 1) / nt; - const int start = i * stride; - const int stop = std::min(start + stride, nbe); - const bool use_bdr = true; - - const auto element_indices = - GetElementIndices(trial_fespace, test_fespace, use_bdr, start, stop); - - for (const auto &value : element_indices) - { - const std::vector &indices = value.second; - const int q_order = fem::GetDefaultIntegrationOrder( - trial_fespace, test_fespace, indices, use_bdr, q_extra_pk, q_extra_qk); - const mfem::IntegrationRule &ir = - mfem::IntRules.Get(mesh.GetBdrElementGeometry(indices[0]), q_order); - - for (const auto &integ : boundary_integs) - { - CeedOperator sub_op, sub_op_t; - integ->AssembleBoundary(trial_fespace, test_fespace, ir, indices, ceed, &sub_op, - &sub_op_t); - - PalaceCeedCall(ceed, CeedCompositeOperatorAddSub(loc_op, sub_op)); - PalaceCeedCall(ceed, CeedOperatorDestroy(&sub_op)); - if (sub_op_t) - { - PalaceCeedCall(ceed, CeedCompositeOperatorAddSub(loc_op_t, sub_op_t)); - PalaceCeedCall(ceed, CeedOperatorDestroy(&sub_op_t)); - } - } - } - } - - PalaceCeedCall(ceed, CeedOperatorCheckReady(loc_op)); - PalaceCeedCall(ceed, CeedOperatorCheckReady(loc_op_t)); - op->AddOper(loc_op, loc_op_t); // Thread-safe - } - - return op; -} - -std::unique_ptr BilinearForm::FullAssemble(const ceed::Operator &op, - bool skip_zeros) -{ - return ceed::CeedOperatorFullAssemble(op, skip_zeros, false); -} - -std::unique_ptr DiscreteLinearOperator::Assemble() const -{ - // Construct dof multiplicity vector for scaling to account for dofs shared between - // elements (on host, then copy to device). - const auto &test_fespace = a.GetTestSpace(); - Vector test_multiplicity(test_fespace.GetVSize()); - test_multiplicity = 0.0; - mfem::Array dofs; - auto *h_mult = test_multiplicity.HostReadWrite(); - for (int i = 0; i < test_fespace.GetNE(); i++) - { - test_fespace.GetElementVDofs(i, dofs); - for (int j = 0; j < dofs.Size(); j++) - { - const int k = dofs[j]; - h_mult[(k >= 0) ? k : -1 - k] += 1.0; - } - } - test_multiplicity.UseDevice(true); - test_multiplicity.Reciprocal(); - - auto op = a.Assemble(); - op->SetDofMultiplicity(std::move(test_multiplicity)); - return op; -} - -std::unique_ptr -DiscreteLinearOperator::FullAssemble(const ceed::Operator &op, bool skip_zeros) -{ - return ceed::CeedOperatorFullAssemble(op, skip_zeros, true); -} - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "bilinearform.hpp" + +#include "fem/fespace.hpp" +#include "fem/libceed/basis.hpp" +#include "fem/libceed/ceed.hpp" +#include "fem/mesh.hpp" +#include "utils/omp.hpp" + +namespace palace +{ + +void BilinearForm::AssembleQuadratureData() +{ + for (auto &integ : domain_integs) + { + integ->AssembleQuadratureData(); + } + for (auto &integ : boundary_integs) + { + integ->AssembleQuadratureData(); + } +} + +std::unique_ptr +BilinearForm::PartialAssemble(const FiniteElementSpace &trial_fespace, + const FiniteElementSpace &test_fespace) const +{ + MFEM_VERIFY(&trial_fespace.GetMesh() == &test_fespace.GetMesh(), + "Trial and test finite element spaces must correspond to the same mesh!"); + const auto &mesh = trial_fespace.GetMesh(); + + // Initialize the operator. 
+ std::unique_ptr op; + if (&trial_fespace == &test_fespace) + { + op = std::make_unique(test_fespace.GetVSize(), + trial_fespace.GetVSize()); + } + else + { + op = + std::make_unique(test_fespace.GetVSize(), trial_fespace.GetVSize()); + } + + // Assemble the libCEED operator in parallel, each thread builds a composite operator. + // This should work fine if some threads create an empty operator (no elements or boundary + // elements). + PalacePragmaOmp(parallel if (ceed::internal::NumCeeds() > 1)) + { + Ceed ceed = ceed::internal::GetCeedObjects()[utils::GetThreadNum()]; + for (const auto &[geom, data] : mesh.GetCeedGeomFactorData(ceed)) + { + const auto trial_map_type = + trial_fespace.GetFEColl().GetMapType(mfem::Geometry::Dimension[geom]); + const auto test_map_type = + test_fespace.GetFEColl().GetMapType(mfem::Geometry::Dimension[geom]); + + if (mfem::Geometry::Dimension[geom] == mesh.Dimension() && !domain_integs.empty()) + { + // Assemble domain integrators on this element geometry type. + CeedElemRestriction trial_restr = + trial_fespace.GetCeedElemRestriction(ceed, geom, data.indices); + CeedElemRestriction test_restr = + test_fespace.GetCeedElemRestriction(ceed, geom, data.indices); + CeedBasis trial_basis = trial_fespace.GetCeedBasis(ceed, geom); + CeedBasis test_basis = test_fespace.GetCeedBasis(ceed, geom); + + for (const auto &integ : domain_integs) + { + CeedOperator sub_op; + integ->SetMapTypes(trial_map_type, test_map_type); + integ->Assemble(ceed, trial_restr, test_restr, trial_basis, test_basis, + data.geom_data, data.geom_data_restr, &sub_op); + op->AddSubOperator(sub_op); // Sub-operator owned by ceed::Operator + } + } + else if (mfem::Geometry::Dimension[geom] == mesh.Dimension() - 1 && + !boundary_integs.empty()) + { + // Assemble boundary integrators on this element geometry type. + CeedElemRestriction trial_restr = + trial_fespace.GetCeedElemRestriction(ceed, geom, data.indices); + CeedElemRestriction test_restr = + test_fespace.GetCeedElemRestriction(ceed, geom, data.indices); + CeedBasis trial_basis = trial_fespace.GetCeedBasis(ceed, geom); + CeedBasis test_basis = test_fespace.GetCeedBasis(ceed, geom); + + for (const auto &integ : boundary_integs) + { + CeedOperator sub_op; + integ->SetMapTypes(trial_map_type, test_map_type); + integ->Assemble(ceed, trial_restr, test_restr, trial_basis, test_basis, + data.geom_data, data.geom_data_restr, &sub_op); + op->AddSubOperator(sub_op); // Sub-operator owned by ceed::Operator + } + } + } + } + + // Finalize the operator (call CeedOperatorCheckReady). + op->Finalize(); + + return op; +} + +std::unique_ptr BilinearForm::FullAssemble(const ceed::Operator &op, + bool skip_zeros, bool set) +{ + return ceed::CeedOperatorFullAssemble(op, skip_zeros, set); +} + +namespace +{ + +bool UseFullAssembly(const FiniteElementSpace &trial_fespace, + const FiniteElementSpace &test_fespace, int pa_order_threshold) +{ + // Returns order such that the minimum for all element types is 1. MFEM's + // RT_FECollection actually already returns order + 1 for GetOrder() for historical + // reasons. + const auto &trial_fec = trial_fespace.GetFEColl(); + const auto &test_fec = test_fespace.GetFEColl(); + int max_order = std::max( + dynamic_cast(&trial_fec) ? trial_fec.GetOrder() + 1 + : trial_fec.GetOrder(), + dynamic_cast(&test_fec) ? 
test_fec.GetOrder() + 1 + : test_fec.GetOrder()); + return (max_order < pa_order_threshold); +} + +bool UseFullAssembly(const FiniteElementSpace &fespace, int pa_order_threshold) +{ + return UseFullAssembly(fespace, fespace, pa_order_threshold); +} + +} // namespace + +std::unique_ptr BilinearForm::Assemble(bool skip_zeros) const +{ + if (UseFullAssembly(trial_fespace, test_fespace, pa_order_threshold)) + { + return FullAssemble(skip_zeros); + } + else + { + return PartialAssemble(); + } +} + +std::vector> +BilinearForm::Assemble(const FiniteElementSpaceHierarchy &fespaces, bool skip_zeros, + std::size_t l0) const +{ + // Only available for square operators (same test and trial spaces). + MFEM_VERIFY(&trial_fespace == &test_fespace && + &fespaces.GetFinestFESpace() == &trial_fespace, + "Assembly on a FiniteElementSpaceHierarchy should have the same BilinearForm " + "spaces and fine space of the hierarchy!"); + + // First partially assemble all of the operators. + MFEM_VERIFY(l0 < fespaces.GetNumLevels(), + "No levels available for operator coarsening (l0 = " << l0 << ")!"); + std::vector> pa_ops; + pa_ops.reserve(fespaces.GetNumLevels() - l0); + for (std::size_t l = l0; l < fespaces.GetNumLevels(); l++) + { + if (l > l0 && &fespaces.GetFESpaceAtLevel(l).GetMesh() == + &fespaces.GetFESpaceAtLevel(l - 1).GetMesh()) + { + pa_ops.push_back( + ceed::CeedOperatorCoarsen(*pa_ops.back(), fespaces.GetFESpaceAtLevel(l))); + } + else + { + pa_ops.push_back( + PartialAssemble(fespaces.GetFESpaceAtLevel(l), fespaces.GetFESpaceAtLevel(l))); + } + } + + // Construct the final operators using full or partial assemble as needed. We do not + // force the coarse-level operator to be fully assembled always, it will be only assembled + // as needed for parallel assembly. + std::vector> ops; + ops.reserve(fespaces.GetNumLevels() - l0); + for (std::size_t l = l0; l < fespaces.GetNumLevels(); l++) + { + if (UseFullAssembly(fespaces.GetFESpaceAtLevel(l), pa_order_threshold)) + { + ops.push_back(FullAssemble(*pa_ops[l - l0], skip_zeros)); + } + else + { + ops.push_back(std::move(pa_ops[l - l0])); + } + } + + return ops; +} + +std::unique_ptr DiscreteLinearOperator::PartialAssemble() const +{ + MFEM_VERIFY(&trial_fespace.GetMesh() == &test_fespace.GetMesh(), + "Trial and test finite element spaces must correspond to the same mesh!"); + const auto &mesh = trial_fespace.GetMesh(); + + // Initialize the operator. + auto op = + std::make_unique(test_fespace.GetVSize(), trial_fespace.GetVSize()); + + // Assemble the libCEED operator in parallel, each thread builds a composite operator. + // This should work fine if some threads create an empty operator (no elements or boundary + // elements). + PalacePragmaOmp(parallel if (ceed::internal::NumCeeds() > 1)) + { + Ceed ceed = ceed::internal::GetCeedObjects()[utils::GetThreadNum()]; + for (const auto &[geom, data] : mesh.GetCeedGeomFactorData(ceed)) + { + if (mfem::Geometry::Dimension[geom] == mesh.Dimension() && !domain_interps.empty()) + { + // Assemble domain interpolators on this element geometry type. + CeedElemRestriction trial_restr = + trial_fespace.GetInterpCeedElemRestriction(ceed, geom, data.indices); + CeedElemRestriction test_restr = + test_fespace.GetInterpRangeCeedElemRestriction(ceed, geom, data.indices); + + // Construct the interpolator basis. 
+ CeedBasis interp_basis; + const mfem::FiniteElement &trial_fe = + *trial_fespace.GetFEColl().FiniteElementForGeometry(geom); + const mfem::FiniteElement &test_fe = + *test_fespace.GetFEColl().FiniteElementForGeometry(geom); + const int trial_vdim = trial_fespace.GetVDim(); + const int test_vdim = test_fespace.GetVDim(); + ceed::InitInterpolatorBasis(trial_fe, test_fe, trial_vdim, test_vdim, ceed, + &interp_basis); + + for (const auto &interp : domain_interps) + { + CeedOperator sub_op, sub_op_t; + interp->Assemble(ceed, trial_restr, test_restr, interp_basis, &sub_op, &sub_op_t); + op->AddSubOperator(sub_op, sub_op_t); // Sub-operator owned by ceed::Operator + } + + // Basis is owned by the operator. + PalaceCeedCall(ceed, CeedBasisDestroy(&interp_basis)); + } + } + } + + // Finalize the operator (call CeedOperatorCheckReady). + op->Finalize(); + + // Construct dof multiplicity vector for scaling to account for dofs shared between + // elements (on host, then copy to device). + Vector test_multiplicity(test_fespace.GetVSize()); + test_multiplicity = 0.0; + auto *h_mult = test_multiplicity.HostReadWrite(); + PalacePragmaOmp(parallel) + { + mfem::Array dofs; + mfem::DofTransformation dof_trans; + PalacePragmaOmp(for schedule(static)) + for (int i = 0; i < test_fespace.GetMesh().GetNE(); i++) + { + test_fespace.Get().GetElementVDofs(i, dofs, dof_trans); + for (int j = 0; j < dofs.Size(); j++) + { + const int k = dofs[j]; + PalacePragmaOmp(atomic update) + h_mult[(k >= 0) ? k : -1 - k] += 1.0; + } + } + } + test_multiplicity.UseDevice(true); + test_multiplicity.Reciprocal(); + op->SetDofMultiplicity(std::move(test_multiplicity)); + + return op; +} + +} // namespace palace diff --git a/palace/fem/bilinearform.hpp b/palace/fem/bilinearform.hpp index ef2d4860bd..32ad57bc92 100644 --- a/palace/fem/bilinearform.hpp +++ b/palace/fem/bilinearform.hpp @@ -1,156 +1,136 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_FEM_BILINEARFORM_HPP -#define PALACE_FEM_BILINEARFORM_HPP - -#include -#include -#include -#include "fem/integrator.hpp" -#include "fem/libceed/operator.hpp" - -namespace palace -{ - -// -// This class implements bilinear and mixed bilinear forms based on integrators assembled -// using the libCEED library. Assembly in the form of a partially assembled operator or -// fully assembled sparse matrix is available. -// -class BilinearForm -{ -protected: - // Domain and range finite element spaces. - const mfem::ParFiniteElementSpace &trial_fespace, &test_fespace; - - // List of domain and boundary integrators making up the bilinear form. - std::vector> domain_integs, boundary_integs; - - // Integration order for quadrature rules is calculated as p_trial + p_test + w + q_extra, - // where p_test and p_trial are the test and trial space basis function orders and w is - // the geometry order. 
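A worked instance of the quadrature-order rule stated in the comment above (the helper and the numbers are illustrative only):

// Sketch only: the default integration order described above. For p_trial = 2,
// p_test = 2, geometry order w = 2 (curved mesh), and q_extra = 0:
//   q = p_trial + p_test + w + q_extra = 2 + 2 + 2 + 0 = 6.
constexpr int DefaultIntegrationOrder(int p_trial, int p_test, int w, int q_extra)
{
  return p_trial + p_test + w + q_extra;
}
static_assert(DefaultIntegrationOrder(2, 2, 2, 0) == 6, "Worked example above");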
- int q_extra_pk, q_extra_qk; - -public: - BilinearForm(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, int q_extra_pk, - int q_extra_qk) - : trial_fespace(trial_fespace), test_fespace(test_fespace), q_extra_pk(q_extra_pk), - q_extra_qk(q_extra_qk) - { - } - BilinearForm(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, int q_extra = 0) - : BilinearForm(trial_fespace, test_fespace, q_extra, q_extra) - { - } - BilinearForm(const mfem::ParFiniteElementSpace &fespace, int q_extra_pk, int q_extra_qk) - : BilinearForm(fespace, fespace, q_extra_pk, q_extra_qk) - { - } - BilinearForm(const mfem::ParFiniteElementSpace &fespace, int q_extra = 0) - : BilinearForm(fespace, fespace, q_extra, q_extra) - { - } - - const auto &GetTrialSpace() const { return trial_fespace; } - const auto &GetTestSpace() const { return test_fespace; } - - // MFEM's RT_FECollection actually returns order + 1 for GetOrder() for historical - // reasons. - auto GetMaxElementOrder() const - { - const auto &trial_fec = *trial_fespace.FEColl(); - const auto &test_fec = *test_fespace.FEColl(); - return std::max( - dynamic_cast(&trial_fec) ? trial_fec.GetOrder() - 1 - : trial_fec.GetOrder(), - dynamic_cast(&test_fec) ? test_fec.GetOrder() - 1 - : test_fec.GetOrder()); - } - - template - void AddDomainIntegrator(U &&...args) - { - domain_integs.push_back(std::make_unique(std::forward(args)...)); - } - - template - void AddBoundaryIntegrator(U &&...args) - { - boundary_integs.push_back(std::make_unique(std::forward(args)...)); - } - - std::unique_ptr Assemble(int pa_order_threshold, bool skip_zeros) const - { - if (GetMaxElementOrder() >= pa_order_threshold) - { - return Assemble(); - } - else - { - return FullAssemble(skip_zeros); - } - } - - std::unique_ptr FullAssemble(bool skip_zeros) const - { - return FullAssemble(*Assemble(), skip_zeros); - } - - std::unique_ptr Assemble() const; - - static std::unique_ptr FullAssemble(const ceed::Operator &op, - bool skip_zeros); -}; - -// Discrete linear operators map primal vectors to primal vectors for interpolation between -// spaces. -class DiscreteLinearOperator -{ -private: - BilinearForm a; - -public: - DiscreteLinearOperator(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace) - : a(trial_fespace, test_fespace) - { - } - - const auto &GetTrialSpace() const { return a.GetTrialSpace(); } - const auto &GetTestSpace() const { return a.GetTestSpace(); } - - template - void AddDomainInterpolator(U &&...args) - { - a.AddDomainIntegrator(std::forward(args)...); - } - - std::unique_ptr Assemble(int pa_order_threshold, bool skip_zeros) const - { - if (a.GetMaxElementOrder() >= pa_order_threshold) - { - return Assemble(); - } - else - { - return FullAssemble(skip_zeros); - } - } - - std::unique_ptr FullAssemble(bool skip_zeros) const - { - return FullAssemble(*Assemble(), skip_zeros); - } - - std::unique_ptr Assemble() const; - - static std::unique_ptr FullAssemble(const ceed::Operator &op, - bool skip_zeros); -}; - -} // namespace palace - -#endif // PALACE_FEM_BILINEARFORM_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_FEM_BILINEARFORM_HPP +#define PALACE_FEM_BILINEARFORM_HPP + +#include +#include +#include +#include "fem/integrator.hpp" +#include "fem/libceed/operator.hpp" +#include "linalg/hypre.hpp" + +namespace palace +{ + +class FiniteElementSpace; +class FiniteElementSpaceHierarchy; + +// +// This class implements bilinear and mixed bilinear forms based on integrators assembled +// using the libCEED library. Assembly in the form of a partially assembled operator or +// fully assembled sparse matrix is available. +// +class BilinearForm +{ +protected: + // Domain and range finite element spaces. + const FiniteElementSpace &trial_fespace, &test_fespace; + + // List of domain and boundary integrators making up the bilinear form. + std::vector> domain_integs, boundary_integs; + + std::unique_ptr + PartialAssemble(const FiniteElementSpace &trial_fespace, + const FiniteElementSpace &test_fespace) const; + +public: + // Order above which to use partial assembly vs. full. + inline static int pa_order_threshold = 1; + +public: + BilinearForm(const FiniteElementSpace &trial_fespace, + const FiniteElementSpace &test_fespace) + : trial_fespace(trial_fespace), test_fespace(test_fespace) + { + } + BilinearForm(const FiniteElementSpace &fespace) : BilinearForm(fespace, fespace) {} + + const auto &GetTrialSpace() const { return trial_fespace; } + const auto &GetTestSpace() const { return test_fespace; } + + template + void AddDomainIntegrator(U &&...args) + { + domain_integs.push_back(std::make_unique(std::forward(args)...)); + } + + template + void AddBoundaryIntegrator(U &&...args) + { + boundary_integs.push_back(std::make_unique(std::forward(args)...)); + } + + void AssembleQuadratureData(); + + std::unique_ptr PartialAssemble() const + { + return PartialAssemble(GetTrialSpace(), GetTestSpace()); + } + + std::unique_ptr FullAssemble(bool skip_zeros) const + { + return FullAssemble(*PartialAssemble(), skip_zeros, false); + } + + static std::unique_ptr FullAssemble(const ceed::Operator &op, + bool skip_zeros) + { + return FullAssemble(op, skip_zeros, false); + } + + static std::unique_ptr FullAssemble(const ceed::Operator &op, + bool skip_zeros, bool set); + + std::unique_ptr Assemble(bool skip_zeros) const; + + std::vector> + Assemble(const FiniteElementSpaceHierarchy &fespaces, bool skip_zeros, + std::size_t l0 = 0) const; +}; + +// Discrete linear operators map primal vectors to primal vectors for interpolation between +// spaces. +class DiscreteLinearOperator +{ +private: + // Domain and range finite element spaces. + const FiniteElementSpace &trial_fespace, &test_fespace; + + // List of domain interpolators making up the discrete linear operator. 
+ std::vector> domain_interps; + +public: + DiscreteLinearOperator(const FiniteElementSpace &trial_fespace, + const FiniteElementSpace &test_fespace) + : trial_fespace(trial_fespace), test_fespace(test_fespace) + { + } + + const auto &GetTrialSpace() const { return trial_fespace; } + const auto &GetTestSpace() const { return test_fespace; } + + template + void AddDomainInterpolator(U &&...args) + { + domain_interps.push_back(std::make_unique(std::forward(args)...)); + } + + std::unique_ptr PartialAssemble() const; + + std::unique_ptr FullAssemble(bool skip_zeros) const + { + return BilinearForm::FullAssemble(*PartialAssemble(), skip_zeros, true); + } + + static std::unique_ptr FullAssemble(const ceed::Operator &op, + bool skip_zeros) + { + return BilinearForm::FullAssemble(op, skip_zeros, true); + } +}; + +} // namespace palace + +#endif // PALACE_FEM_BILINEARFORM_HPP diff --git a/palace/fem/coefficient.cpp b/palace/fem/coefficient.cpp index c83c88244a..7d78bedcde 100644 --- a/palace/fem/coefficient.cpp +++ b/palace/fem/coefficient.cpp @@ -1,64 +1,49 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "coefficient.hpp" - -namespace palace -{ - -void BdrGridFunctionCoefficient::GetElementTransformations(mfem::ElementTransformation &T, - const mfem::IntegrationPoint &ip, - mfem::ElementTransformation *&T1, - mfem::ElementTransformation *&T2, - mfem::Vector *C1) -{ - // Return transformations for elements attached to boundary element T. T1 always exists - // but T2 may not if the element is truly a single-sided boundary. - MFEM_ASSERT(T.ElementType == mfem::ElementTransformation::BDR_ELEMENT, - "Unexpected element type in BdrGridFunctionCoefficient!"); - MFEM_ASSERT(&mesh == T.mesh, "Invalid mesh for BdrGridFunctionCoefficient!"); - int i, o; - int iel1, iel2, info1, info2; - mesh.GetBdrElementFace(T.ElementNo, &i, &o); - mesh.GetFaceElements(i, &iel1, &iel2); - mesh.GetFaceInfos(i, &info1, &info2); - - // Master faces can never be boundary elements, thus only need to check for the state of - // info2 and el2, and do not need to access the ncface numbering. See mfem::Mesh::FaceInfo - // for details. - mfem::FaceElementTransformations *FET; - if (info2 >= 0 && iel2 < 0) - { - // Face is shared with another subdomain. - const int &ishared = local_to_shared.at(i); - FET = mesh.GetSharedFaceTransformations(ishared); - } - else - { - // Face is either internal to the subdomain, or a true one-sided boundary. - FET = mesh.GetFaceElementTransformations(i); - } - - // Boundary elements and boundary faces may have different orientations so adjust the - // integration point if necessary. See mfem::GridFunction::GetValue and GetVectorValue. - mfem::IntegrationPoint fip = - mfem::Mesh::TransformBdrElementToFace(FET->GetGeometryType(), o, ip); - FET->SetAllIntPoints(&fip); - T1 = &FET->GetElement1Transformation(); - T2 = (info2 >= 0) ? &FET->GetElement2Transformation() : nullptr; - - // If desired, get vector pointing from center of boundary element into element 1 for - // orientations. - if (C1) - { - mfem::Vector CF(T.GetSpaceDim()); - mfem::ElementTransformation &TF = *mesh.GetFaceTransformation(i); - TF.Transform(mfem::Geometries.GetCenter(mesh.GetFaceGeometry(i)), CF); - - C1->SetSize(T.GetSpaceDim()); - T1->Transform(mfem::Geometries.GetCenter(T1->GetGeometryType()), *C1); - *C1 -= CF; // Points into element 1 from the face - } -} - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#include "coefficient.hpp" + +namespace palace +{ + +bool BdrGridFunctionCoefficient::GetBdrElementNeighborTransformations( + int i, const mfem::ParMesh &mesh, mfem::FaceElementTransformations &FET, + mfem::IsoparametricTransformation &T1, mfem::IsoparametricTransformation &T2, + const mfem::IntegrationPoint *ip) +{ + // Return transformations for elements attached to the given boundary element. FET.Elem1 + // always exists but FET.Elem2 may not if the element is truly a single-sided boundary. + int f, o; + int iel1, iel2, info1, info2; + mesh.GetBdrElementFace(i, &f, &o); + mesh.GetFaceElements(f, &iel1, &iel2); + mesh.GetFaceInfos(f, &info1, &info2); + + // Master faces can never be boundary elements, thus only need to check for the state of + // info2 and el2, and do not need to access the ncface numbering. See mfem::Mesh::FaceInfo + // for details. + if (info2 >= 0 && iel2 < 0) + { + // Face is shared with another subdomain. + mesh.GetSharedFaceTransformationsByLocalIndex(f, FET, T1, T2); + } + else + { + // Face is either internal to the subdomain, or a true one-sided boundary. + mesh.GetFaceElementTransformations(f, FET, T1, T2); + } + + // Boundary elements and boundary faces may have different orientations so adjust the + // integration point if necessary. See mfem::GridFunction::GetValue and GetVectorValue. + if (ip) + { + mfem::IntegrationPoint fip = + mfem::Mesh::TransformBdrElementToFace(FET.GetGeometryType(), o, *ip); + FET.SetAllIntPoints(&fip); + } + + // Return whether or not the boundary element and face share the same orientations. + return (o % 2 == 0); +} + +} // namespace palace diff --git a/palace/fem/coefficient.hpp b/palace/fem/coefficient.hpp index d268c24380..4336f9a8e5 100644 --- a/palace/fem/coefficient.hpp +++ b/palace/fem/coefficient.hpp @@ -1,932 +1,902 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_FEM_COEFFICIENT_HPP -#define PALACE_FEM_COEFFICIENT_HPP - -#include -#include -#include -#include -#include -#include -#include "models/materialoperator.hpp" - -namespace palace -{ - -// -// Derived coefficients which compute single values on internal boundaries where a possibly -// discontinuous function is given as an input grid function. These are all cheap to -// construct by design. All methods assume the provided grid function is ready for parallel -// comm on shared faces after a call to ExchangeFaceNbrData. -// - -enum class MaterialPropertyType -{ - INV_PERMEABILITY, - PERMITTIVITY_REAL, - PERMITTIVITY_IMAG, - PERMITTIVITY_ABS, - CONDUCTIVITY, - INV_LONDON_DEPTH, - INV_Z0, - INV_PERMEABILITY_C0 -}; - -enum class MeshElementType -{ - ELEMENT, - BDR_ELEMENT, - SUBMESH, - BDR_SUBMESH -}; - -// Returns the property value of the material for the given index. Two separate classes for -// domain element access and boundary element access, which returns the material property of -// the neighboring domain element. 
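// Illustrative sketch (not part of this patch): calling the static helper implemented above
// from postprocessing code. The returned flag records whether the boundary element and its
// face share the same orientation and is forwarded to GetNormal() so that the normal points
// into FET.Elem1, matching the convention used by the coefficients in fem/coefficient.hpp.
#include <mfem.hpp>
#include "fem/coefficient.hpp"

void SampleBdrNeighbor(const mfem::ParMesh &mesh, mfem::ElementTransformation &T_bdr,
                       const mfem::IntegrationPoint &ip)
{
  // Assumes ParMesh::ExchangeFaceNbrData() has already been called on the mesh.
  mfem::FaceElementTransformations FET;
  mfem::IsoparametricTransformation T1, T2;
  const bool ori = palace::BdrGridFunctionCoefficient::GetBdrElementNeighborTransformations(
      T_bdr.ElementNo, mesh, FET, T1, T2, &ip);

  // FET.Elem1 is always valid; FET.Elem2 is non-null only when the face has a neighboring
  // element on the other side (an interior or shared face).
  mfem::Vector normal(mesh.SpaceDimension());
  T_bdr.SetIntPoint(&ip);
  palace::BdrGridFunctionCoefficient::GetNormal(T_bdr, normal, ori);
}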
-template -class MaterialPropertyCoefficient : public mfem::MatrixCoefficient -{ -private: - const MaterialOperator &mat_op; - const double coef; - - static int GetAttribute(mfem::ElementTransformation &T) - { - if constexpr (ElemType == MeshElementType::SUBMESH || - ElemType == MeshElementType::BDR_SUBMESH) - { - MFEM_ASSERT( - T.ElementType == mfem::ElementTransformation::ELEMENT, - "Invalid usage of MaterialPropertyCoefficient for given MeshElementType!"); - const mfem::ParSubMesh &submesh = *static_cast(T.mesh); - const mfem::ParMesh &mesh = *submesh.GetParent(); - if constexpr (ElemType == MeshElementType::SUBMESH) - { - MFEM_ASSERT( - submesh.GetFrom() == mfem::SubMesh::From::Domain, - "Invalid usage of MaterialPropertyCoefficient for given MeshElementType!"); - return mesh.GetAttribute(submesh.GetParentElementIDMap()[T.ElementNo]); - } - else if constexpr (ElemType == MeshElementType::BDR_SUBMESH) - { - MFEM_ASSERT( - submesh.GetFrom() == mfem::SubMesh::From::Boundary, - "Invalid usage of MaterialPropertyCoefficient for given MeshElementType!"); - int i, o, iel1, iel2; - mesh.GetBdrElementFace(submesh.GetParentElementIDMap()[T.ElementNo], &i, &o); - mesh.GetFaceElements(i, &iel1, &iel2); -#ifdef MFEM_DEBUG - int info1, info2, nc; - mesh.GetFaceInfos(i, &info1, &info2, &nc); - MFEM_VERIFY(nc == -1 && iel2 < 0 && info2 < 0, - "MaterialPropertyCoefficient should only be used for exterior " - "(single-sided) boundaries!"); -#endif - return mesh.GetAttribute(iel1); - } - } - else if constexpr (ElemType == MeshElementType::ELEMENT) - { - MFEM_ASSERT( - T.ElementType == mfem::ElementTransformation::ELEMENT, - "Invalid usage of MaterialPropertyCoefficient for given MeshElementType!"); - return T.Attribute; - } - else if constexpr (ElemType == MeshElementType::BDR_ELEMENT) - { - MFEM_ASSERT( - T.ElementType == mfem::ElementTransformation::BDR_ELEMENT, - "Invalid usage of MaterialPropertyCoefficient for given MeshElementType!"); - int i, o, iel1, iel2; - const mfem::Mesh &mesh = *T.mesh; - mesh.GetBdrElementFace(T.ElementNo, &i, &o); - mesh.GetFaceElements(i, &iel1, &iel2); -#ifdef MFEM_DEBUG - int info1, info2, nc; - mesh.GetFaceInfos(i, &info1, &info2, &nc); - MFEM_VERIFY(nc == -1 && iel2 < 0 && info2 < 0, - "MaterialPropertyCoefficient should only be used for exterior " - "(single-sided) boundaries!"); -#endif - return mesh.GetAttribute(iel1); - } - MFEM_ABORT("Unsupported element type in MaterialPropertyCoefficient!"); - return 0; - } - -public: - MaterialPropertyCoefficient(const MaterialOperator &op, double c = 1.0) - : mfem::MatrixCoefficient(op.SpaceDimension()), mat_op(op), coef(c) - { - } - - void Eval(mfem::DenseMatrix &K, mfem::ElementTransformation &T, - const mfem::IntegrationPoint &ip) override - { - const int attr = GetAttribute(T); - if constexpr (MatType == MaterialPropertyType::INV_PERMEABILITY) - { - K = mat_op.GetInvPermeability(attr); - } - else if constexpr (MatType == MaterialPropertyType::PERMITTIVITY_REAL) - { - K = mat_op.GetPermittivityReal(attr); - } - else if constexpr (MatType == MaterialPropertyType::PERMITTIVITY_IMAG) - { - K = mat_op.GetPermittivityImag(attr); - } - else if constexpr (MatType == MaterialPropertyType::PERMITTIVITY_ABS) - { - K = mat_op.GetPermittivityAbs(attr); - } - else if constexpr (MatType == MaterialPropertyType::CONDUCTIVITY) - { - K = mat_op.GetConductivity(attr); - } - else if constexpr (MatType == MaterialPropertyType::INV_LONDON_DEPTH) - { - K = mat_op.GetInvLondonDepth(attr); - } - else if constexpr (MatType == 
MaterialPropertyType::INV_Z0) - { - K = mat_op.GetInvImpedance(attr); - } - else if constexpr (MatType == MaterialPropertyType::INV_PERMEABILITY_C0) - { - K.SetSize(height, width); - Mult(mat_op.GetInvPermeability(attr), mat_op.GetLightSpeed(attr), K); - } - else - { - MFEM_ABORT("MaterialPropertyCoefficient::Eval() is not implemented for this " - "material property type!"); - } - K *= coef; - } -}; - -// Base class for coefficients which need to evaluate a GridFunction in a domain element -// attached to a boundary element, or both domain elements on either side for internal -// boundaries. -class BdrGridFunctionCoefficient -{ -protected: - mfem::ParMesh &mesh; - const std::map &local_to_shared; - - void GetElementTransformations(mfem::ElementTransformation &T, - const mfem::IntegrationPoint &ip, - mfem::ElementTransformation *&T1, - mfem::ElementTransformation *&T2, - mfem::Vector *C1 = nullptr); - -public: - BdrGridFunctionCoefficient(mfem::ParMesh &mesh, const std::map &local_to_shared) - : mesh(mesh), local_to_shared(local_to_shared) - { - } - - // Return normal vector to the boundary element at an integration point (it is assumed - // that the element transformation has already been configured at the integration point of - // interest). - static void GetNormal(mfem::ElementTransformation &T, mfem::Vector &normal) - { - normal.SetSize(T.GetSpaceDim()); - mfem::CalcOrtho(T.Jacobian(), normal); - normal /= normal.Norml2(); - } -}; - -// Computes surface current J_s = n x H on boundaries from B as a vector grid function -// where n is an inward normal (computes -n x H for outward normal n). For a two-sided -// internal boundary, the contributions from both sides add. -class BdrCurrentVectorCoefficient : public mfem::VectorCoefficient, - public BdrGridFunctionCoefficient -{ -private: - const mfem::ParGridFunction &B; - const MaterialOperator &mat_op; - mfem::Vector C1, W, VU, VL, nor; - -public: - BdrCurrentVectorCoefficient(const mfem::ParGridFunction &gf, const MaterialOperator &op) - : mfem::VectorCoefficient(gf.ParFESpace()->GetParMesh()->SpaceDimension()), - BdrGridFunctionCoefficient(*gf.ParFESpace()->GetParMesh(), - op.GetLocalToSharedFaceMap()), - B(gf), mat_op(op), C1(gf.VectorDim()), W(gf.VectorDim()), VU(gf.VectorDim()), - VL(gf.VectorDim()), nor(gf.VectorDim()) - { - } - - void Eval(mfem::Vector &V, mfem::ElementTransformation &T, - const mfem::IntegrationPoint &ip) override - { - // Get neighboring elements. - MFEM_ASSERT(vdim == 3, "BdrJVectorCoefficient expects a mesh in 3D space!"); - mfem::ElementTransformation *T1, *T2; - GetElementTransformations(T, ip, T1, T2, &C1); - - // For interior faces, compute J_s = -n x H = -n x μ⁻¹(B1 - B2), where B1 (B2) is B in - // el1 (el2) and n points out from el1. - B.GetVectorValue(*T1, T1->GetIntPoint(), W); - mat_op.GetInvPermeability(T1->Attribute).Mult(W, VU); - if (T2) - { - // Double-sided, not a true boundary. - B.GetVectorValue(*T2, T2->GetIntPoint(), W); - mat_op.GetInvPermeability(T2->Attribute).Mult(W, VL); - VU -= VL; - } - - // Orient with normal pointing into el1. - GetNormal(T, nor); - V.SetSize(vdim); - if (C1 * nor < 0.0) - { - V[0] = -nor[1] * VU[2] + nor[2] * VU[1]; - V[1] = -nor[2] * VU[0] + nor[0] * VU[2]; - V[2] = -nor[0] * VU[1] + nor[1] * VU[0]; - } - else - { - V[0] = nor[1] * VU[2] - nor[2] * VU[1]; - V[1] = nor[2] * VU[0] - nor[0] * VU[2]; - V[2] = nor[0] * VU[1] - nor[1] * VU[0]; - } - } -}; - -// Computes a single-valued surface charge ρ_s = D ⋅ n on boundaries from E given as a -// vector grid function. 
For a two-sided internal boundary, the contributions from both -// sides add. -class BdrChargeCoefficient : public mfem::Coefficient, public BdrGridFunctionCoefficient -{ -private: - const mfem::ParGridFunction &E; - const MaterialOperator &mat_op; - mfem::Vector C1, W, VU, VL, nor; - -public: - BdrChargeCoefficient(const mfem::ParGridFunction &gf, const MaterialOperator &op) - : mfem::Coefficient(), BdrGridFunctionCoefficient(*gf.ParFESpace()->GetParMesh(), - op.GetLocalToSharedFaceMap()), - E(gf), mat_op(op), C1(gf.VectorDim()), W(gf.VectorDim()), VU(gf.VectorDim()), - VL(gf.VectorDim()), nor(gf.VectorDim()) - { - } - - double Eval(mfem::ElementTransformation &T, const mfem::IntegrationPoint &ip) override - { - // Get neighboring elements. - mfem::ElementTransformation *T1, *T2; - GetElementTransformations(T, ip, T1, T2, &C1); - - // For interior faces, compute D ⋅ n = ε (E1 - E2) ⋅ n, where E1 (E2) is E in el1 (el2) - // to get a single-valued function. - E.GetVectorValue(*T1, T1->GetIntPoint(), W); - mat_op.GetPermittivityReal(T1->Attribute).Mult(W, VU); - if (T2) - { - E.GetVectorValue(*T2, T2->GetIntPoint(), W); - mat_op.GetPermittivityReal(T2->Attribute).Mult(W, VL); - VU -= VL; - } - - // Orient with normal pointing into el1. - GetNormal(T, nor); - return (C1 * nor < 0.0) ? -(VU * nor) : VU * nor; - } -}; - -// Computes the flux Φ_s = B ⋅ n on interior boundary elements using the user specified -// normal direction. Manually implements InnerProductCoefficient and -// VectorGridFunctionCoefficient to allow for evaluating the flux on internal boundaries. -class BdrFluxCoefficient : public mfem::Coefficient, public BdrGridFunctionCoefficient -{ -private: - const mfem::ParGridFunction &B; - const mfem::Vector dir; - mfem::Vector V, VL, nor; - -public: - BdrFluxCoefficient(const mfem::ParGridFunction &gf, mfem::Vector d, - const std::map &local_to_shared) - : mfem::Coefficient(), - BdrGridFunctionCoefficient(*gf.ParFESpace()->GetParMesh(), local_to_shared), B(gf), - dir(std::move(d)), V(gf.VectorDim()), VL(gf.VectorDim()), nor(gf.VectorDim()) - { - } - - double Eval(mfem::ElementTransformation &T, const mfem::IntegrationPoint &ip) override - { - // Get neighboring elements. - mfem::ElementTransformation *T1, *T2; - GetElementTransformations(T, ip, T1, T2); - - // For interior faces, compute the average value. Since this is only used for - // continuous (normal or tangential) values, we don't care that we average out the - // discontinuous (tangential or normal) parts. - B.GetVectorValue(*T1, T1->GetIntPoint(), V); - if (T2) - { - B.GetVectorValue(*T2, T2->GetIntPoint(), VL); - V += VL; - V *= 0.5; - } - - // Orient sign with the global direction. - GetNormal(T, nor); - return (dir * nor < 0.0) ? -(V * nor) : V * nor; - } -}; - -enum class DielectricInterfaceType -{ - DEFAULT, - MA, - MS, - SA -}; - -// Computes a single-valued α Eᵀ E on boundaries from E given as a vector grid function. -// Uses the neighbor element on a user specified side to compute a single-sided value for -// potentially discontinuous solutions for an interior boundary element. The four cases -// correspond to a generic interface vs. specializations for metal-air, metal-substrate, -// and subtrate-air interfaces following: -// J. Wenner et al., Surface loss simulations of superconducting coplanar waveguide -// resonators, Appl. Phys. Lett. (2011). 
-template -class DielectricInterfaceCoefficient : public mfem::Coefficient, - public BdrGridFunctionCoefficient -{ -private: - const mfem::ParGridFunction &E; - const MaterialOperator &mat_op; - const double ts, epsilon; - const mfem::Vector side; - mfem::Vector C1, V, nor; - - int Initialize(mfem::ElementTransformation &T, const mfem::IntegrationPoint &ip, - mfem::Vector &V) - { - // Get neighboring elements. - mfem::ElementTransformation *T1, *T2; - GetElementTransformations(T, ip, T1, T2, &C1); - - // Get the single-sided solution. - if (!T2) - { - // Ignore side, solution is single-valued. - E.GetVectorValue(*T1, T1->GetIntPoint(), V); - return T1->Attribute; - } - if (!side.Size()) - { - // With no side specified, try to take the solution from the element which corresponds - // to the vacuum domain, or at least the one with the higher speed of light. - if (mat_op.GetLightSpeedMin(T2->Attribute) > mat_op.GetLightSpeedMax(T1->Attribute)) - { - E.GetVectorValue(*T2, T2->GetIntPoint(), V); - return T2->Attribute; - } - E.GetVectorValue(*T1, T1->GetIntPoint(), V); - return T1->Attribute; - } - if (C1 * side < 0.0) - { - // Get solution in el2. - E.GetVectorValue(*T2, T2->GetIntPoint(), V); - return T2->Attribute; - } - // Get solution in el1. - E.GetVectorValue(*T1, T1->GetIntPoint(), V); - return T1->Attribute; - } - -public: - DielectricInterfaceCoefficient(const mfem::ParGridFunction &gf, - const MaterialOperator &op, double ti, double ei, - mfem::Vector s) - : mfem::Coefficient(), BdrGridFunctionCoefficient(*gf.ParFESpace()->GetParMesh(), - op.GetLocalToSharedFaceMap()), - E(gf), mat_op(op), ts(ti), epsilon(ei), side(std::move(s)), C1(gf.VectorDim()), - V(gf.VectorDim()), nor(gf.VectorDim()) - { - } - - double Eval(mfem::ElementTransformation &T, const mfem::IntegrationPoint &ip) override - { - MFEM_ABORT("DielectricInterfaceCoefficient::Eval() is not implemented for this " - "interface type!"); - return 0.0; - } -}; - -template <> -inline double DielectricInterfaceCoefficient::Eval( - mfem::ElementTransformation &T, const mfem::IntegrationPoint &ip) -{ - // Get single-sided solution and neighboring element attribute. - Initialize(T, ip, V); - GetNormal(T, nor); - - // Metal-air interface: 0.5 * t / ε_MA * |E_n|² . - double Vn = V * nor; - return 0.5 * ts / epsilon * (Vn * Vn); -} - -template <> -inline double DielectricInterfaceCoefficient::Eval( - mfem::ElementTransformation &T, const mfem::IntegrationPoint &ip) -{ - // Get single-sided solution and neighboring element attribute. - int attr = Initialize(T, ip, V); - GetNormal(T, nor); - - // Metal-substrate interface: 0.5 * t * (ε_S)² / ε_MS * |E_n|² . - const double Vn = V * nor; - const double epsilon_S = mat_op.GetPermittivityReal(attr).InnerProduct(nor, nor); - return 0.5 * ts * std::pow(epsilon_S, 2) / epsilon * (Vn * Vn); -} - -template <> -inline double DielectricInterfaceCoefficient::Eval( - mfem::ElementTransformation &T, const mfem::IntegrationPoint &ip) -{ - // Get single-sided solution and neighboring element attribute. - Initialize(T, ip, V); - GetNormal(T, nor); - - // Substrate-air interface: 0.5 * t * (ε_SA * |E_t|² + 1 / ε_MS * |E_n|²) . - double Vn = V * nor; - V.Add(-Vn, nor); - return 0.5 * ts * (epsilon * (V * V) + (Vn * Vn) / epsilon); -} - -template <> -inline double DielectricInterfaceCoefficient::Eval( - mfem::ElementTransformation &T, const mfem::IntegrationPoint &ip) -{ - // Get single-sided solution and neighboring element attribute. 
- Initialize(T, ip, V); - - // No specific interface, use full field evaluation: 0.5 * t * ε * |E|² . - return 0.5 * ts * epsilon * (V * V); -} - -enum class EnergyDensityType -{ - ELECTRIC, - MAGNETIC -}; - -// Returns the local energy density evaluated as 1/2 Dᴴ E or 1/2 Bᴴ H for real-valued -// material coefficients. For internal boundary elements, the solution is taken on the side -// of the element with the larger-valued material property (permittivity or permeability). -template -class EnergyDensityCoefficient : public mfem::Coefficient, public BdrGridFunctionCoefficient -{ -private: - const GridFunctionType &U; - const MaterialOperator &mat_op; - mfem::Vector V; - - double GetLocalEnergyDensity(mfem::ElementTransformation &T, - const mfem::IntegrationPoint &ip, int attr); - -public: - EnergyDensityCoefficient(const GridFunctionType &gf, const MaterialOperator &op) - : mfem::Coefficient(), BdrGridFunctionCoefficient(*gf.ParFESpace()->GetParMesh(), - op.GetLocalToSharedFaceMap()), - U(gf), mat_op(op), V(gf.ParFESpace()->GetParMesh()->SpaceDimension()) - { - } - - double Eval(mfem::ElementTransformation &T, const mfem::IntegrationPoint &ip) override - { - if (T.ElementType == mfem::ElementTransformation::ELEMENT) - { - return GetLocalEnergyDensity(T, ip, T.Attribute); - } - if (T.ElementType == mfem::ElementTransformation::BDR_ELEMENT) - { - // Get neighboring elements. - mfem::ElementTransformation *T1, *T2; - GetElementTransformations(T, ip, T1, T2); - - // For interior faces, compute the value on the side where the material property is - // larger (typically should choose the non-vacuum side). - if (T2 && - mat_op.GetLightSpeedMax(T2->Attribute) < mat_op.GetLightSpeedMin(T1->Attribute)) - { - return GetLocalEnergyDensity(*T2, T2->GetIntPoint(), T2->Attribute); - } - else - { - return GetLocalEnergyDensity(*T1, T1->GetIntPoint(), T1->Attribute); - } - } - MFEM_ABORT("Unsupported element type in EnergyDensityCoefficient!"); - return 0.0; - } -}; - -template <> -inline double -EnergyDensityCoefficient:: - GetLocalEnergyDensity(mfem::ElementTransformation &T, const mfem::IntegrationPoint &ip, - int attr) -{ - // Only the real part of the permittivity contributes to the energy (imaginary part - // cancels out in the inner product due to symmetry). - U.real().GetVectorValue(T, ip, V); - double res = mat_op.GetPermittivityReal(attr).InnerProduct(V, V); - U.imag().GetVectorValue(T, ip, V); - res += mat_op.GetPermittivityReal(attr).InnerProduct(V, V); - return 0.5 * res; -} - -template <> -inline double EnergyDensityCoefficient:: - GetLocalEnergyDensity(mfem::ElementTransformation &T, const mfem::IntegrationPoint &ip, - int attr) -{ - U.GetVectorValue(T, ip, V); - return 0.5 * mat_op.GetPermittivityReal(attr).InnerProduct(V, V); -} - -template <> -inline double -EnergyDensityCoefficient:: - GetLocalEnergyDensity(mfem::ElementTransformation &T, const mfem::IntegrationPoint &ip, - int attr) -{ - U.real().GetVectorValue(T, ip, V); - double res = mat_op.GetInvPermeability(attr).InnerProduct(V, V); - U.imag().GetVectorValue(T, ip, V); - res += mat_op.GetInvPermeability(attr).InnerProduct(V, V); - return 0.5 * res; -} - -template <> -inline double EnergyDensityCoefficient:: - GetLocalEnergyDensity(mfem::ElementTransformation &T, const mfem::IntegrationPoint &ip, - int attr) -{ - U.GetVectorValue(T, ip, V); - return 0.5 * mat_op.GetInvPermeability(attr).InnerProduct(V, V); -} - -// Returns the local field evaluated on a boundary element. 
For internal boundary elements, -// the solution is taken on the side of the element with the larger-valued material -// property (permittivity or permeability). -class BdrFieldVectorCoefficient : public mfem::VectorCoefficient, - public BdrGridFunctionCoefficient -{ -private: - const mfem::ParGridFunction &U; - const MaterialOperator &mat_op; - -public: - BdrFieldVectorCoefficient(const mfem::ParGridFunction &gf, const MaterialOperator &op) - : mfem::VectorCoefficient(gf.ParFESpace()->GetParMesh()->SpaceDimension()), - BdrGridFunctionCoefficient(*gf.ParFESpace()->GetParMesh(), - op.GetLocalToSharedFaceMap()), - U(gf), mat_op(op) - { - } - - void Eval(mfem::Vector &V, mfem::ElementTransformation &T, - const mfem::IntegrationPoint &ip) override - { - // Get neighboring elements. - mfem::ElementTransformation *T1, *T2; - GetElementTransformations(T, ip, T1, T2); - - // For interior faces, compute the value on the side where the material property is - // larger (typically should choose the non-vacuum side). - if (T2 && - mat_op.GetLightSpeedMax(T2->Attribute) < mat_op.GetLightSpeedMin(T1->Attribute)) - { - U.GetVectorValue(*T2, T2->GetIntPoint(), V); - } - else - { - U.GetVectorValue(*T1, T1->GetIntPoint(), V); - } - } -}; - -class BdrFieldCoefficient : public mfem::Coefficient, public BdrGridFunctionCoefficient -{ -private: - const mfem::ParGridFunction &U; - const MaterialOperator &mat_op; - -public: - BdrFieldCoefficient(const mfem::ParGridFunction &gf, const MaterialOperator &op) - : mfem::Coefficient(), BdrGridFunctionCoefficient(*gf.ParFESpace()->GetParMesh(), - op.GetLocalToSharedFaceMap()), - U(gf), mat_op(op) - { - } - - double Eval(mfem::ElementTransformation &T, const mfem::IntegrationPoint &ip) override - { - // Get neighboring elements. - mfem::ElementTransformation *T1, *T2; - GetElementTransformations(T, ip, T1, T2); - - // For interior faces, compute the value on the side where the material property is - // larger (typically should choose the non-vacuum side). - if (T2 && - mat_op.GetLightSpeedMax(T2->Attribute) < mat_op.GetLightSpeedMin(T1->Attribute)) - { - return U.GetValue(*T2, T2->GetIntPoint()); - } - else - { - return U.GetValue(*T1, T1->GetIntPoint()); - } - } -}; - -// Wraps a mfem::MatrixCoefficient to compute a scalar coefficient as nᵀ M n. Only works -// for square matrix coefficients of size equal to the spatial dimension. 
-class NormalProjectedCoefficient : public mfem::Coefficient -{ - std::unique_ptr c; - mfem::DenseMatrix K; - mfem::Vector nor; - -public: - NormalProjectedCoefficient(std::unique_ptr &&coef) - : mfem::Coefficient(), c(std::move(coef)), K(c->GetHeight(), c->GetWidth()), - nor(c->GetHeight()) - { - } - - double Eval(mfem::ElementTransformation &T, const mfem::IntegrationPoint &ip) override - { - c->Eval(K, T, ip); - BdrGridFunctionCoefficient::GetNormal(T, nor); - return K.InnerProduct(nor, nor); - } -}; - -class VectorWrappedCoefficient : public mfem::VectorCoefficient -{ -private: - std::unique_ptr c; - -public: - VectorWrappedCoefficient(int d, std::unique_ptr &&coef) - : mfem::VectorCoefficient(d), c(std::move(coef)) - { - } - - void SetTime(double t) override - { - mfem::VectorCoefficient::SetTime(t); - c->SetTime(t); - } - - void Eval(mfem::Vector &V, mfem::ElementTransformation &T, - const mfem::IntegrationPoint &ip) override - { - V.SetSize(vdim); - V = c->Eval(T, ip); - } -}; - -class MatrixWrappedCoefficient : public mfem::MatrixCoefficient -{ -private: - std::unique_ptr c; - -public: - MatrixWrappedCoefficient(int d, std::unique_ptr &&coef) - : mfem::MatrixCoefficient(d), c(std::move(coef)) - { - } - - void SetTime(double t) override - { - mfem::MatrixCoefficient::SetTime(t); - c->SetTime(t); - } - - void Eval(mfem::DenseMatrix &K, mfem::ElementTransformation &T, - const mfem::IntegrationPoint &ip) override - { - K.Diag(c->Eval(T, ip), height); - } -}; - -class SumCoefficient : public mfem::Coefficient -{ -private: - std::vector, const mfem::Array *>> c; - - void AddCoefficient(std::unique_ptr &&coef, - const mfem::Array *marker) - { - c.emplace_back(std::move(coef), marker); - } - -public: - SumCoefficient() : mfem::Coefficient() {} - - bool empty() const { return c.empty(); } - - void AddCoefficient(std::unique_ptr &&coef) - { - AddCoefficient(std::move(coef), nullptr); - } - - void AddCoefficient(std::unique_ptr &&coef, - const mfem::Array &marker) - { - AddCoefficient(std::move(coef), &marker); - } - - void SetTime(double t) override - { - mfem::Coefficient::SetTime(t); - for (auto &[coef, marker] : c) - { - coef->SetTime(t); - } - } - - double Eval(mfem::ElementTransformation &T, const mfem::IntegrationPoint &ip) override - { - double val = 0.0; - for (auto &[coef, marker] : c) - { - if (!marker || (*marker)[T.Attribute - 1]) - { - val += coef->Eval(T, ip); - } - } - return val; - } -}; - -class SumVectorCoefficient : public mfem::VectorCoefficient -{ -private: - std::vector, const mfem::Array *>> - c; - - void AddCoefficient(std::unique_ptr &&coef, - const mfem::Array *marker) - { - MFEM_VERIFY(coef->GetVDim() == vdim, - "Invalid VectorCoefficient dimensions for SumVectorCoefficient!"); - c.emplace_back(std::move(coef), marker); - } - - void AddCoefficient(std::unique_ptr &&coef, - const mfem::Array *marker) - { - c.emplace_back(std::make_unique(vdim, std::move(coef)), - marker); - } - -public: - SumVectorCoefficient(int d) : mfem::VectorCoefficient(d) {} - - bool empty() const { return c.empty(); } - - void AddCoefficient(std::unique_ptr &&coef) - { - AddCoefficient(std::move(coef), nullptr); - } - - void AddCoefficient(std::unique_ptr &&coef, - const mfem::Array &marker) - { - AddCoefficient(std::move(coef), &marker); - } - - void AddCoefficient(std::unique_ptr &&coef) - { - AddCoefficient(std::move(coef), nullptr); - } - - void AddCoefficient(std::unique_ptr &&coef, - const mfem::Array &marker) - { - AddCoefficient(std::move(coef), &marker); - } - - void SetTime(double 
t) override - { - mfem::VectorCoefficient::SetTime(t); - for (auto &[coef, marker] : c) - { - coef->SetTime(t); - } - } - - void Eval(mfem::Vector &V, mfem::ElementTransformation &T, - const mfem::IntegrationPoint &ip) override - { - mfem::Vector U(vdim); - V.SetSize(vdim); - V = 0.0; - for (auto &[coef, marker] : c) - { - if (!marker || (*marker)[T.Attribute - 1]) - { - coef->Eval(U, T, ip); - V += U; - } - } - } -}; - -class SumMatrixCoefficient : public mfem::MatrixCoefficient -{ -private: - std::vector, const mfem::Array *>> - c; - - void AddCoefficient(std::unique_ptr &&coef, - const mfem::Array *marker) - { - MFEM_VERIFY(coef->GetHeight() == height && coef->GetWidth() == width, - "Invalid MatrixCoefficient dimensions for SumMatrixCoefficient!"); - c.emplace_back(std::move(coef), marker); - } - - void AddCoefficient(std::unique_ptr &&coef, - const mfem::Array *marker) - { - MFEM_VERIFY(width == height, "MatrixWrappedCoefficient can only be constructed for " - "square MatrixCoefficient objects!"); - c.emplace_back(std::make_unique(height, std::move(coef)), - marker); - } - -public: - SumMatrixCoefficient(int d) : mfem::MatrixCoefficient(d) {} - SumMatrixCoefficient(int h, int w) : mfem::MatrixCoefficient(h, w) {} - - bool empty() const { return c.empty(); } - - void AddCoefficient(std::unique_ptr &&coef) - { - AddCoefficient(std::move(coef), nullptr); - } - - void AddCoefficient(std::unique_ptr &&coef, - const mfem::Array &marker) - { - AddCoefficient(std::move(coef), &marker); - } - - void AddCoefficient(std::unique_ptr &&coef) - { - AddCoefficient(std::move(coef), nullptr); - } - - void AddCoefficient(std::unique_ptr &&coef, - const mfem::Array &marker) - { - AddCoefficient(std::move(coef), &marker); - } - - void SetTime(double t) override - { - mfem::MatrixCoefficient::SetTime(t); - for (auto &[coef, marker] : c) - { - coef->SetTime(t); - } - } - - void Eval(mfem::DenseMatrix &K, mfem::ElementTransformation &T, - const mfem::IntegrationPoint &ip) override - { - mfem::DenseMatrix M(height, width); - K.SetSize(height, width); - K = 0.0; - for (auto &[coef, marker] : c) - { - if (!marker || (*marker)[T.Attribute - 1]) - { - coef->Eval(M, T, ip); - K += M; - } - } - } -}; - -} // namespace palace - -#endif // PALACE_FEM_COEFFICIENT_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_FEM_COEFFICIENT_HPP +#define PALACE_FEM_COEFFICIENT_HPP + +#include +#include +#include +#include +#include +#include "fem/gridfunction.hpp" +#include "linalg/vector.hpp" +#include "models/materialoperator.hpp" +#include "utils/geodata.hpp" +#include "utils/labels.hpp" + +// XX TODO: Add bulk element Eval() overrides to speed up postprocessing (also needed in +// mfem::DataCollection classes. + +namespace palace +{ + +// +// Derived coefficients which compute single values on internal boundaries where a possibly +// discontinuous function is given as an input grid function. These are all cheap to +// construct by design. All methods assume the provided grid function is ready for parallel +// comm on shared faces after a call to ExchangeFaceNbrData. +// + +// Base class for coefficients which need to evaluate a GridFunction in a domain element +// attached to a boundary element, or both domain elements on either side for internal +// boundaries, with optional scaling factor. 
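// Illustrative sketch (not part of this patch) of the derived-class pattern the base class
// declared next supports: a hypothetical coefficient returning the jump of a scalar field
// across an interior boundary, using the protected FET/T1/T2 members populated by
// GetBdrElementNeighborTransformations().
class BdrFieldJumpCoefficient : public mfem::Coefficient,
                                public palace::BdrGridFunctionCoefficient
{
private:
  const mfem::ParGridFunction &U;

public:
  BdrFieldJumpCoefficient(const mfem::ParGridFunction &U)
    : mfem::Coefficient(), palace::BdrGridFunctionCoefficient(*U.ParFESpace()->GetParMesh()),
      U(U)
  {
  }

  double Eval(mfem::ElementTransformation &T, const mfem::IntegrationPoint &ip) override
  {
    GetBdrElementNeighborTransformations(T.ElementNo, ip);
    const double u1 = U.GetValue(*FET.Elem1, FET.Elem1->GetIntPoint());
    // Zero jump for a true one-sided boundary, difference of the two traces otherwise.
    return FET.Elem2 ? u1 - U.GetValue(*FET.Elem2, FET.Elem2->GetIntPoint()) : 0.0;
  }
};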
+class BdrGridFunctionCoefficient +{ +protected: + // XX TODO: For thread-safety (multiple threads evaluating a coefficient simultaneously), + // the FET, FET.Elem1, and FET.Elem2 objects cannot be shared. + const mfem::ParMesh &mesh; + mfem::FaceElementTransformations FET; + mfem::IsoparametricTransformation T1, T2; + const double scaling; // scaling factor used for unit conversions + + bool GetBdrElementNeighborTransformations(int i, const mfem::IntegrationPoint &ip) + { + // Get the element transformations neighboring the element, and optionally set the + // integration point too. + return GetBdrElementNeighborTransformations(i, mesh, FET, T1, T2, &ip); + } + +public: + BdrGridFunctionCoefficient(const mfem::ParMesh &mesh, double scaling = 1.0) + : mesh(mesh), scaling(scaling) + { + } + + // For a boundary element, return the element transformation objects for the neighboring + // domain elements. FET.Elem2 may be nullptr if the boundary is a true one-sided boundary, + // but if it is shared with another subdomain then it will be populated. Expects + // ParMesh::ExchangeFaceNbrData has been called already. + static bool GetBdrElementNeighborTransformations( + int i, const mfem::ParMesh &mesh, mfem::FaceElementTransformations &FET, + mfem::IsoparametricTransformation &T1, mfem::IsoparametricTransformation &T2, + const mfem::IntegrationPoint *ip = nullptr); + + // Return normal vector to the boundary element at an integration point. For a face + // element, the normal points out of the element (from element 1 into element 2, if it + // exists). This convention can be flipped with the optional parameter. It is assumed + // that the element transformation has already been configured at the integration point + // of interest. + static void GetNormal(mfem::ElementTransformation &T, mfem::Vector &normal, + bool invert = false) + { + MFEM_ASSERT(normal.Size() == T.GetSpaceDim(), + "Size mismatch for normal vector (space dimension = " << T.GetSpaceDim() + << ")!"); + mfem::CalcOrtho(T.Jacobian(), normal); + normal /= invert ? -normal.Norml2() : normal.Norml2(); + } +}; + +// Computes surface current Jₛ = n x H = n x μ⁻¹ B on boundaries from B as a vector grid +// function where n is an inward normal (computes -n x H for outward normal n). For a +// two-sided internal boundary, the contributions from both sides add. +class BdrSurfaceCurrentVectorCoefficient : public mfem::VectorCoefficient, + public BdrGridFunctionCoefficient +{ +private: + const mfem::ParGridFunction &B; + const MaterialOperator &mat_op; + +public: + BdrSurfaceCurrentVectorCoefficient(const mfem::ParGridFunction &B, + const MaterialOperator &mat_op, double scaling = 1.0) + : mfem::VectorCoefficient(B.VectorDim()), + BdrGridFunctionCoefficient(*B.ParFESpace()->GetParMesh(), scaling), B(B), + mat_op(mat_op) + { + } + + using mfem::VectorCoefficient::Eval; + void Eval(mfem::Vector &V, mfem::ElementTransformation &T, + const mfem::IntegrationPoint &ip) override + { + // Get neighboring elements. + MFEM_ASSERT(T.ElementType == mfem::ElementTransformation::BDR_ELEMENT, + "Unexpected element type in BdrSurfaceCurrentVectorCoefficient!"); + bool ori = GetBdrElementNeighborTransformations(T.ElementNo, ip); + + // For interior faces, compute Jₛ = n x H = n x μ⁻¹ (B1 - B2), where B1 (B2) is B in + // element 1 (element 2) and n points into element 1. 
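    // For reference, a minimal sketch (an assumption, not the actual linalg implementation)
    // of the 3D cross product that the linalg::Cross3() helper used below is taken to
    // compute; the real helper also accepts an optional flag to accumulate into V, seen
    // later as linalg::Cross3(W1, W2, V, true):
    //
    //   inline void Cross3Sketch(const mfem::Vector &A, const mfem::Vector &B, mfem::Vector &V)
    //   {
    //     V.SetSize(3);
    //     V[0] = A[1] * B[2] - A[2] * B[1];
    //     V[1] = A[2] * B[0] - A[0] * B[2];
    //     V[2] = A[0] * B[1] - A[1] * B[0];
    //   }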
+ double W_data[3], VU_data[3]; + mfem::Vector W(W_data, vdim), VU(VU_data, vdim); + B.GetVectorValue(*FET.Elem1, FET.Elem1->GetIntPoint(), W); + mat_op.GetInvPermeability(FET.Elem1->Attribute).Mult(W, VU); + if (FET.Elem2) + { + // Double-sided, not a true boundary. Add result with opposite normal. + double VL_data[3]; + mfem::Vector VL(VL_data, vdim); + B.GetVectorValue(*FET.Elem2, FET.Elem2->GetIntPoint(), W); + mat_op.GetInvPermeability(FET.Elem2->Attribute).Mult(W, VL); + VU -= VL; + } + + // Orient with normal pointing into element 1. + double normal_data[3]; + mfem::Vector normal(normal_data, vdim); + GetNormal(T, normal, ori); + V.SetSize(vdim); + linalg::Cross3(normal, VU, V); + + V *= scaling; + } +}; + +// Computes the flux Φₛ = F ⋅ n with F = B or ε E on interior boundary elements using B or +// E given as a vector grid function. For a two-sided internal boundary, the contributions +// from both sides can either add or be averaged. +template +class BdrSurfaceFluxCoefficient : public mfem::Coefficient, + public BdrGridFunctionCoefficient +{ +private: + const mfem::ParGridFunction *E, *B; + const MaterialOperator &mat_op; + bool two_sided; + const mfem::Vector &x0; + void GetLocalFlux(mfem::ElementTransformation &T, mfem::Vector &V) const; + +public: + BdrSurfaceFluxCoefficient(const mfem::ParGridFunction *E, const mfem::ParGridFunction *B, + const MaterialOperator &mat_op, bool two_sided, + const mfem::Vector &x0, double scaling = 1.0) + : mfem::Coefficient(), + BdrGridFunctionCoefficient( + E ? *E->ParFESpace()->GetParMesh() : *B->ParFESpace()->GetParMesh(), scaling), + E(E), B(B), mat_op(mat_op), two_sided(two_sided), x0(x0) + { + MFEM_VERIFY((E || (Type != SurfaceFlux::ELECTRIC && Type != SurfaceFlux::POWER)) && + (B || (Type != SurfaceFlux::MAGNETIC && Type != SurfaceFlux::POWER)), + "Missing E or B field grid function for surface flux coefficient!"); + } + + double Eval(mfem::ElementTransformation &T, const mfem::IntegrationPoint &ip) override + { + // Get neighboring elements. + MFEM_ASSERT(T.ElementType == mfem::ElementTransformation::BDR_ELEMENT, + "Unexpected element type in BdrSurfaceFluxCoefficient!"); + bool ori = GetBdrElementNeighborTransformations(T.ElementNo, ip); + + // For interior faces, compute either F ⋅ n as the average or by adding the + // contributions from opposite sides with opposite normals. + const int vdim = T.GetSpaceDim(); + double VU_data[3]; + mfem::Vector VU(VU_data, vdim); + GetLocalFlux(*FET.Elem1, VU); + if (FET.Elem2) + { + // Double-sided, not a true boundary. + double VL_data[3]; + mfem::Vector VL(VL_data, vdim); + GetLocalFlux(*FET.Elem2, VL); + if (two_sided) + { + // Add result with opposite normal. This only happens when crack_bdr_elements = + // false (two_sided = true doesn't make sense for an internal boundary without an + // associated BC). + VU -= VL; + } + else + { + // Take the average of the values on both sides. + add(0.5, VU, VL, VU); + } + } + + // Dot with normal direction and assign appropriate sign. The normal is oriented to + // point into element 1. + double normal_data[3]; + mfem::Vector normal(normal_data, vdim); + GetNormal(T, normal, ori); + double flux = VU * normal; + if (two_sided) + { + return flux; + } + else + { + // Orient outward from the surface with the given center. + double x_data[3]; + mfem::Vector x(x_data, vdim); + T.Transform(ip, x); + x -= x0; + return (x * normal < 0.0) ? 
-flux : flux; + } + } +}; + +template <> +inline void BdrSurfaceFluxCoefficient::GetLocalFlux( + mfem::ElementTransformation &T, mfem::Vector &V) const +{ + // Flux D. + double W_data[3]; + mfem::Vector W(W_data, T.GetSpaceDim()); + E->GetVectorValue(T, T.GetIntPoint(), W); + mat_op.GetPermittivityReal(T.Attribute).Mult(W, V); + V *= scaling; +} + +template <> +inline void BdrSurfaceFluxCoefficient::GetLocalFlux( + mfem::ElementTransformation &T, mfem::Vector &V) const +{ + // Flux B. + B->GetVectorValue(T, T.GetIntPoint(), V); + V *= scaling; +} + +template <> +inline void +BdrSurfaceFluxCoefficient::GetLocalFlux(mfem::ElementTransformation &T, + mfem::Vector &V) const +{ + // Flux E x H = E x μ⁻¹ B. + double W1_data[3], W2_data[3]; + mfem::Vector W1(W1_data, T.GetSpaceDim()), W2(W2_data, T.GetSpaceDim()); + B->GetVectorValue(T, T.GetIntPoint(), W1); + mat_op.GetInvPermeability(T.Attribute).Mult(W1, W2); + E->GetVectorValue(T, T.GetIntPoint(), W1); + V.SetSize(W1.Size()); + linalg::Cross3(W1, W2, V); + V *= scaling; +} + +// Computes a single-valued α Eᵀ E on boundaries from E given as a vector grid function. +// Uses the neighbor element on a user specified side to compute a single-sided value for +// potentially discontinuous solutions for an interior boundary element. The four cases +// correspond to a generic interface vs. specializations for metal-air, metal-substrate, +// and substrate-air interfaces following: +// J. Wenner et al., Surface loss simulations of superconducting coplanar waveguide +// resonators, Appl. Phys. Lett. (2011). +template +class InterfaceDielectricCoefficient : public mfem::Coefficient, + public BdrGridFunctionCoefficient +{ +private: + const GridFunction &E; + const MaterialOperator &mat_op; + const double t_i, epsilon_i; + + void Initialize(mfem::ElementTransformation &T, const mfem::IntegrationPoint &ip, + mfem::Vector *normal) + { + // Get neighboring elements and the normal vector, oriented to point into element 1. + MFEM_ASSERT(T.ElementType == mfem::ElementTransformation::BDR_ELEMENT, + "Unexpected element type in InterfaceDielectricCoefficient!"); + bool ori = GetBdrElementNeighborTransformations(T.ElementNo, ip); + if (normal) + { + GetNormal(T, *normal, ori); + } + } + + int GetLocalVectorValue(const mfem::ParGridFunction &U, mfem::Vector &V, + bool vacuum_side) const + { + constexpr double threshold = 1.0 - 1.0e-6; + const bool use_elem1 = + ((vacuum_side && mat_op.GetLightSpeedMax(FET.Elem1->Attribute) >= threshold) || + (!vacuum_side && mat_op.GetLightSpeedMax(FET.Elem1->Attribute) < threshold)); + const bool use_elem2 = + (FET.Elem2 && + ((vacuum_side && mat_op.GetLightSpeedMax(FET.Elem2->Attribute) >= threshold) || + (!vacuum_side && mat_op.GetLightSpeedMax(FET.Elem2->Attribute) < threshold))); + if (use_elem1) + { + U.GetVectorValue(*FET.Elem1, FET.Elem1->GetIntPoint(), V); + if (use_elem2) + { + // Double-sided, not a true boundary. Just average the solution from both sides. 
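// Illustrative sketch (not part of this patch), referring back to BdrSurfaceFluxCoefficient
// above: computing Φ = ∫ B ⋅ n dS over a set of marked boundary attributes by pairing the
// coefficient with a standard mfem boundary linear form tested against the constant 1 on a
// nodal H1 space. The enumeration value SurfaceFlux::MAGNETIC is taken from the checks in
// the class above and is assumed to live in the palace namespace (utils/labels.hpp).
#include <mfem.hpp>
#include "fem/coefficient.hpp"

double ComputeMagneticFlux(const mfem::ParGridFunction &B,
                           const palace::MaterialOperator &mat_op, const mfem::Vector &x0,
                           mfem::Array<int> &bdr_marker,
                           mfem::ParFiniteElementSpace &h1_fespace)
{
  palace::BdrSurfaceFluxCoefficient<palace::SurfaceFlux::MAGNETIC> flux(
      nullptr, &B, mat_op, /*two_sided=*/false, x0);
  mfem::ParLinearForm lf(&h1_fespace);
  lf.AddBoundaryIntegrator(new mfem::BoundaryLFIntegrator(flux), bdr_marker);
  lf.Assemble();
  mfem::ParGridFunction ones(&h1_fespace);
  ones = 1.0;
  return lf(ones);  // local (per-rank) contribution; sum over MPI ranks for the total
}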
+ double W_data[3]; + mfem::Vector W(W_data, V.Size()); + U.GetVectorValue(*FET.Elem2, FET.Elem2->GetIntPoint(), W); + add(0.5, V, W, V); + } + return FET.Elem1->Attribute; + } + else if (use_elem2) + { + U.GetVectorValue(*FET.Elem2, FET.Elem2->GetIntPoint(), V); + return FET.Elem2->Attribute; + } + else + { + return 0; + } + } + +public: + InterfaceDielectricCoefficient(const GridFunction &E, const MaterialOperator &mat_op, + double t_i, double epsilon_i) + : mfem::Coefficient(), BdrGridFunctionCoefficient(*E.ParFESpace()->GetParMesh()), E(E), + mat_op(mat_op), t_i(t_i), epsilon_i(epsilon_i) + { + } + + double Eval(mfem::ElementTransformation &T, const mfem::IntegrationPoint &ip) override; +}; + +template <> +inline double InterfaceDielectricCoefficient::Eval( + mfem::ElementTransformation &T, const mfem::IntegrationPoint &ip) +{ + // Get single-sided solution. Don't use lightspeed detection for differentiating side. + auto GetLocalVectorValueDefault = [this](const mfem::ParGridFunction &U, mfem::Vector &V) + { + U.GetVectorValue(*FET.Elem1, FET.Elem1->GetIntPoint(), V); + if (FET.Elem2) + { + // Double-sided, not a true boundary. Just average the field solution from both sides. + double W_data[3]; + mfem::Vector W(W_data, V.Size()); + U.GetVectorValue(*FET.Elem2, FET.Elem2->GetIntPoint(), W); + add(0.5, V, W, V); + } + }; + double V_data[3]; + mfem::Vector V(V_data, T.GetSpaceDim()); + Initialize(T, ip, nullptr); + GetLocalVectorValueDefault(E.Real(), V); + double V2 = V * V; + if (E.HasImag()) + { + GetLocalVectorValueDefault(E.Imag(), V); + V2 += V * V; + } + + // No specific interface, use full field evaluation: 0.5 * t * ε * |E|² . + return 0.5 * t_i * epsilon_i * V2; +} + +template <> +inline double InterfaceDielectricCoefficient::Eval( + mfem::ElementTransformation &T, const mfem::IntegrationPoint &ip) +{ + // Get single-sided solution on air (vacuum) side and neighboring element attribute. + double V_data[3], normal_data[3]; + mfem::Vector V(V_data, T.GetSpaceDim()), normal(normal_data, T.GetSpaceDim()); + Initialize(T, ip, &normal); + int attr = GetLocalVectorValue(E.Real(), V, true); + if (attr <= 0) + { + return 0.0; + } + double Vn = V * normal; + double Vn2 = Vn * Vn; + if (E.HasImag()) + { + GetLocalVectorValue(E.Imag(), V, true); + Vn = V * normal; + Vn2 += Vn * Vn; + } + + // Metal-air interface: 0.5 * t / ε_MA * |E_n|² . + return 0.5 * (t_i / epsilon_i) * Vn2; +} + +template <> +inline double InterfaceDielectricCoefficient::Eval( + mfem::ElementTransformation &T, const mfem::IntegrationPoint &ip) +{ + // Get single-sided solution on substrate side and neighboring element attribute. + double V_data[3], W_data[3], normal_data[3]; + mfem::Vector V(V_data, T.GetSpaceDim()), W(W_data, T.GetSpaceDim()), + normal(normal_data, T.GetSpaceDim()); + Initialize(T, ip, &normal); + int attr = GetLocalVectorValue(E.Real(), V, false); + if (attr <= 0) + { + return 0.0; + } + mat_op.GetPermittivityReal(attr).Mult(V, W); + double Vn = W * normal; + double Vn2 = Vn * Vn; + if (E.HasImag()) + { + GetLocalVectorValue(E.Imag(), V, false); + mat_op.GetPermittivityReal(attr).Mult(V, W); + Vn = W * normal; + Vn2 += Vn * Vn; + } + + // Metal-substrate interface: 0.5 * t / ε_MS * |(ε_S E)_n|² . + return 0.5 * (t_i / epsilon_i) * Vn2; +} + +template <> +inline double InterfaceDielectricCoefficient::Eval( + mfem::ElementTransformation &T, const mfem::IntegrationPoint &ip) +{ + // Get single-sided solution on air side and neighboring element attribute. 
+ double V_data[3], normal_data[3]; + mfem::Vector V(V_data, T.GetSpaceDim()), normal(normal_data, T.GetSpaceDim()); + Initialize(T, ip, &normal); + int attr = GetLocalVectorValue(E.Real(), V, true); + if (attr <= 0) + { + return 0.0; + } + double Vn = V * normal; + V.Add(-Vn, normal); + double Vn2 = Vn * Vn; + double Vt2 = V * V; + if (E.HasImag()) + { + GetLocalVectorValue(E.Imag(), V, true); + Vn = V * normal; + V.Add(-Vn, normal); + Vn2 += Vn * Vn; + Vt2 += V * V; + } + + // Substrate-air interface: 0.5 * t * (ε_SA * |E_t|² + 1 / ε_SA * |E_n|²) . + return 0.5 * t_i * ((epsilon_i * Vt2) + (Vn2 / epsilon_i)); +} + +// Helper for EnergyDensityCoefficient. +enum class EnergyDensityType +{ + ELECTRIC, + MAGNETIC +}; + +// Returns the local energy density evaluated as 1/2 Dᴴ E or 1/2 Hᴴ B for real-valued +// material coefficients. For internal boundary elements, the solution is averaged across +// the interface. +template +class EnergyDensityCoefficient : public mfem::Coefficient, public BdrGridFunctionCoefficient +{ +private: + const GridFunction &U; + const MaterialOperator &mat_op; + double GetLocalEnergyDensity(mfem::ElementTransformation &T) const; + +public: + EnergyDensityCoefficient(const GridFunction &U, const MaterialOperator &mat_op, + double scaling = 1.0) + : mfem::Coefficient(), + BdrGridFunctionCoefficient(*U.ParFESpace()->GetParMesh(), scaling), U(U), + mat_op(mat_op) + { + } + + double Eval(mfem::ElementTransformation &T, const mfem::IntegrationPoint &ip) override + { + if (T.ElementType == mfem::ElementTransformation::ELEMENT) + { + return GetLocalEnergyDensity(T); + } + else if (T.ElementType == mfem::ElementTransformation::BDR_ELEMENT) + { + // Get neighboring elements. + GetBdrElementNeighborTransformations(T.ElementNo, ip); + + // For interior faces, compute the average value. + if (FET.Elem2) + { + return 0.5 * + (GetLocalEnergyDensity(*FET.Elem1) + GetLocalEnergyDensity(*FET.Elem2)); + } + else + { + return GetLocalEnergyDensity(*FET.Elem1); + } + } + MFEM_ABORT("Unsupported element type in EnergyDensityCoefficient!"); + return 0.0; + } +}; + +template <> +inline double EnergyDensityCoefficient::GetLocalEnergyDensity( + mfem::ElementTransformation &T) const +{ + // Only the real part of the permittivity contributes to the energy (imaginary part + // cancels out in the inner product due to symmetry). + double V_data[3]; + mfem::Vector V(V_data, T.GetSpaceDim()); + U.Real().GetVectorValue(T, T.GetIntPoint(), V); + double dot = mat_op.GetPermittivityReal(T.Attribute).InnerProduct(V, V); + if (U.HasImag()) + { + U.Imag().GetVectorValue(T, T.GetIntPoint(), V); + dot += mat_op.GetPermittivityReal(T.Attribute).InnerProduct(V, V); + } + return 0.5 * dot * scaling; +} + +template <> +inline double EnergyDensityCoefficient::GetLocalEnergyDensity( + mfem::ElementTransformation &T) const +{ + double V_data[3]; + mfem::Vector V(V_data, T.GetSpaceDim()); + U.Real().GetVectorValue(T, T.GetIntPoint(), V); + double dot = mat_op.GetInvPermeability(T.Attribute).InnerProduct(V, V); + if (U.HasImag()) + { + U.Imag().GetVectorValue(T, T.GetIntPoint(), V); + dot += mat_op.GetInvPermeability(T.Attribute).InnerProduct(V, V); + } + return 0.5 * dot * scaling; +} + +// Compute time-averaged Poynting vector Re{E x H⋆}, without the typical factor of 1/2. For +// internal boundary elements, the solution is taken as the average. 
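// The real part of the complex Poynting product used below expands as
//   Re{E × H⋆} = Re{(Eᵣ + i Eᵢ) × (Hᵣ - i Hᵢ)} = Eᵣ × Hᵣ + Eᵢ × Hᵢ ,  with H = μ⁻¹ B,
// which is why GetLocalPower() accumulates two real cross products (the second through the
// add flag of linalg::Cross3) rather than performing any complex arithmetic.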
+class PoyntingVectorCoefficient : public mfem::VectorCoefficient, + public BdrGridFunctionCoefficient +{ +private: + const GridFunction &E, &B; + const MaterialOperator &mat_op; + + void GetLocalPower(mfem::ElementTransformation &T, mfem::Vector &V) const + { + double W1_data[3], W2_data[3]; + mfem::Vector W1(W1_data, T.GetSpaceDim()), W2(W2_data, T.GetSpaceDim()); + B.Real().GetVectorValue(T, T.GetIntPoint(), W1); + mat_op.GetInvPermeability(T.Attribute).Mult(W1, W2); + E.Real().GetVectorValue(T, T.GetIntPoint(), W1); + V.SetSize(vdim); + linalg::Cross3(W1, W2, V); + if (E.HasImag()) + { + B.Imag().GetVectorValue(T, T.GetIntPoint(), W1); + mat_op.GetInvPermeability(T.Attribute).Mult(W1, W2); + E.Imag().GetVectorValue(T, T.GetIntPoint(), W1); + linalg::Cross3(W1, W2, V, true); + } + V *= scaling; + } + +public: + PoyntingVectorCoefficient(const GridFunction &E, const GridFunction &B, + const MaterialOperator &mat_op, double scaling = 1.0) + : mfem::VectorCoefficient(E.VectorDim()), + BdrGridFunctionCoefficient(*E.ParFESpace()->GetParMesh(), scaling), E(E), B(B), + mat_op(mat_op) + { + } + + using mfem::VectorCoefficient::Eval; + void Eval(mfem::Vector &V, mfem::ElementTransformation &T, + const mfem::IntegrationPoint &ip) override + { + if (T.ElementType == mfem::ElementTransformation::ELEMENT) + { + GetLocalPower(T, V); + return; + } + else if (T.ElementType == mfem::ElementTransformation::BDR_ELEMENT) + { + // Get neighboring elements. + GetBdrElementNeighborTransformations(T.ElementNo, ip); + + // For interior faces, compute the value on the desired side. + GetLocalPower(*FET.Elem1, V); + if (FET.Elem2) + { + double W_data[3]; + mfem::Vector W(W_data, V.Size()); + GetLocalPower(*FET.Elem2, W); + add(0.5, V, W, V); + } + return; + } + MFEM_ABORT("Unsupported element type in PoyntingVectorCoefficient!"); + } +}; + +// Returns the local vector field evaluated on a boundary element. For internal boundary +// elements the solution is the average. +class BdrFieldVectorCoefficient : public mfem::VectorCoefficient, + public BdrGridFunctionCoefficient +{ +private: + const mfem::ParGridFunction &U; + +public: + BdrFieldVectorCoefficient(const mfem::ParGridFunction &U) + : mfem::VectorCoefficient(U.VectorDim()), + BdrGridFunctionCoefficient(*U.ParFESpace()->GetParMesh()), U(U) + { + } + + using mfem::VectorCoefficient::Eval; + void Eval(mfem::Vector &V, mfem::ElementTransformation &T, + const mfem::IntegrationPoint &ip) override + { + // Get neighboring elements. + MFEM_ASSERT(T.ElementType == mfem::ElementTransformation::BDR_ELEMENT, + "Unexpected element type in BdrFieldVectorCoefficient!"); + GetBdrElementNeighborTransformations(T.ElementNo, ip); + + // For interior faces, compute the average. + U.GetVectorValue(*FET.Elem1, FET.Elem1->GetIntPoint(), V); + if (FET.Elem2) + { + double W_data[3]; + mfem::Vector W(W_data, V.Size()); + U.GetVectorValue(*FET.Elem2, FET.Elem2->GetIntPoint(), W); + add(0.5, V, W, V); + } + } +}; + +// Returns the local scalar field evaluated on a boundary element. For internal boundary +// elements the solution is the average. +class BdrFieldCoefficient : public mfem::Coefficient, public BdrGridFunctionCoefficient +{ +private: + const mfem::ParGridFunction &U; + +public: + BdrFieldCoefficient(const mfem::ParGridFunction &U) + : mfem::Coefficient(), BdrGridFunctionCoefficient(*U.ParFESpace()->GetParMesh()), U(U) + { + } + + double Eval(mfem::ElementTransformation &T, const mfem::IntegrationPoint &ip) override + { + // Get neighboring elements. 
+ MFEM_ASSERT(T.ElementType == mfem::ElementTransformation::BDR_ELEMENT, + "Unexpected element type in BdrFieldCoefficient!"); + GetBdrElementNeighborTransformations(T.ElementNo, ip); + + // For interior faces, compute the average. + if (FET.Elem2) + { + return 0.5 * (U.GetValue(*FET.Elem1, FET.Elem1->GetIntPoint()), + U.GetValue(*FET.Elem2, FET.Elem2->GetIntPoint())); + } + else + { + return U.GetValue(*FET.Elem1, FET.Elem1->GetIntPoint()); + } + } +}; + +// +// More helpful coefficient types. Wrapper coefficients allow additions of scalar and vector +// or matrix coefficients. Restricted coefficients only compute the coefficient if for the +// given list of attributes. Sum coefficients own a list of coefficients to add. +// + +class VectorWrappedCoefficient : public mfem::VectorCoefficient +{ +private: + std::unique_ptr coeff; + +public: + VectorWrappedCoefficient(int dim, std::unique_ptr &&coeff) + : mfem::VectorCoefficient(dim), coeff(std::move(coeff)) + { + } + + using mfem::VectorCoefficient::Eval; + void Eval(mfem::Vector &V, mfem::ElementTransformation &T, + const mfem::IntegrationPoint &ip) override + { + V.SetSize(vdim); + V = coeff->Eval(T, ip); + } +}; + +class MatrixWrappedCoefficient : public mfem::MatrixCoefficient +{ +private: + std::unique_ptr coeff; + +public: + MatrixWrappedCoefficient(int dim, std::unique_ptr &&coeff) + : mfem::MatrixCoefficient(dim), coeff(std::move(coeff)) + { + } + + void Eval(mfem::DenseMatrix &K, mfem::ElementTransformation &T, + const mfem::IntegrationPoint &ip) override + { + K.Diag(coeff->Eval(T, ip), height); + } +}; + +template +class RestrictedCoefficient : public Coefficient +{ +private: + mfem::Array attr_marker; + +public: + template + RestrictedCoefficient(const mfem::Array &attr_list, T &&...args) + : Coefficient(std::forward(args)...), + attr_marker(mesh::AttrToMarker(attr_list.Size() ? attr_list.Max() : 0, attr_list)) + { + } + + double Eval(mfem::ElementTransformation &T, const mfem::IntegrationPoint &ip) override + { + return (T.Attribute > attr_marker.Size() || !attr_marker[T.Attribute - 1]) + ? 0.0 + : Coefficient::Eval(T, ip); + } +}; + +template +class RestrictedVectorCoefficient : public Coefficient +{ +private: + mfem::Array attr_marker; + +public: + template + RestrictedVectorCoefficient(const mfem::Array &attr_list, T &&...args) + : Coefficient(std::forward(args)...), + attr_marker(mesh::AttrToMarker(attr_list.Size() ? attr_list.Max() : 0, attr_list)) + { + } + + void Eval(mfem::Vector &V, mfem::ElementTransformation &T, + const mfem::IntegrationPoint &ip) override + { + if (T.Attribute > attr_marker.Size() || !attr_marker[T.Attribute - 1]) + { + V.SetSize(this->vdim); + V = 0.0; + } + else + { + Coefficient::Eval(V, T, ip); + } + } +}; + +template +class RestrictedMatrixCoefficient : public Coefficient +{ +private: + mfem::Array attr_marker; + +public: + template + RestrictedMatrixCoefficient(const mfem::Array &attr_list, T &&...args) + : Coefficient(std::forward(args)...), + attr_marker(mesh::AttrToMarker(attr_list.Size() ? 
attr_list.Max() : 0, attr_list)) + { + } + + void Eval(mfem::DenseMatrix &K, mfem::ElementTransformation &T, + const mfem::IntegrationPoint &ip) override + { + if (T.Attribute > attr_marker.Size() || !attr_marker[T.Attribute - 1]) + { + K.SetSize(this->height, this->width); + K = 0.0; + } + else + { + Coefficient::Eval(K, T, ip); + } + } +}; + +class SumCoefficient : public mfem::Coefficient +{ +private: + std::vector, double>> c; + +public: + SumCoefficient() : mfem::Coefficient() {} + + bool empty() const { return c.empty(); } + + void AddCoefficient(std::unique_ptr &&coeff, double a = 1.0) + { + c.emplace_back(std::move(coeff), a); + } + + double Eval(mfem::ElementTransformation &T, const mfem::IntegrationPoint &ip) override + { + double val = 0.0; + for (auto &[coeff, a] : c) + { + val += a * coeff->Eval(T, ip); + } + return val; + } +}; + +class SumVectorCoefficient : public mfem::VectorCoefficient +{ +private: + std::vector, double>> c; + +public: + SumVectorCoefficient(int d) : mfem::VectorCoefficient(d) {} + + bool empty() const { return c.empty(); } + + void AddCoefficient(std::unique_ptr &&coeff, double a = 1.0) + { + MFEM_VERIFY(coeff->GetVDim() == vdim, + "Invalid VectorCoefficient dimensions for SumVectorCoefficient!"); + c.emplace_back(std::move(coeff), a); + } + + void AddCoefficient(std::unique_ptr &&coeff, double a = 1.0) + { + c.emplace_back(std::make_unique(vdim, std::move(coeff)), a); + } + + using mfem::VectorCoefficient::Eval; + void Eval(mfem::Vector &V, mfem::ElementTransformation &T, + const mfem::IntegrationPoint &ip) override + { + double U_data[3]; + mfem::Vector U(U_data, vdim); + V.SetSize(vdim); + V = 0.0; + for (auto &[coeff, a] : c) + { + coeff->Eval(U, T, ip); + V.Add(a, U); + } + } +}; + +class SumMatrixCoefficient : public mfem::MatrixCoefficient +{ +private: + std::vector, double>> c; + +public: + SumMatrixCoefficient(int d) : mfem::MatrixCoefficient(d) {} + SumMatrixCoefficient(int h, int w) : mfem::MatrixCoefficient(h, w) {} + + bool empty() const { return c.empty(); } + + void AddCoefficient(std::unique_ptr &&coeff, double a) + { + MFEM_VERIFY(coeff->GetHeight() == height && coeff->GetWidth() == width, + "Invalid MatrixCoefficient dimensions for SumMatrixCoefficient!"); + c.emplace_back(std::move(coeff), a); + } + + void AddCoefficient(std::unique_ptr &&coeff, double a) + { + MFEM_VERIFY(width == height, "MatrixWrappedCoefficient can only be constructed for " + "square MatrixCoefficient objects!"); + c.emplace_back(std::make_unique(height, std::move(coeff)), a); + } + + void Eval(mfem::DenseMatrix &K, mfem::ElementTransformation &T, + const mfem::IntegrationPoint &ip) override + { + double M_data[9]; + mfem::DenseMatrix M(M_data, height, width); + K.SetSize(height, width); + K = 0.0; + for (auto &[coeff, a] : c) + { + coeff->Eval(M, T, ip); + K.Add(a, M); + } + } +}; + +} // namespace palace + +#endif // PALACE_FEM_COEFFICIENT_HPP diff --git a/palace/fem/errorindicator.cpp b/palace/fem/errorindicator.cpp index 1980e3ca60..dc5d821d5d 100644 --- a/palace/fem/errorindicator.cpp +++ b/palace/fem/errorindicator.cpp @@ -1,50 +1,49 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
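// Illustrative sketch (not part of this patch), referring back to the restricted and sum
// coefficient helpers added in fem/coefficient.hpp above: restrict a boundary field
// coefficient to a list of boundary attributes and accumulate it into a weighted sum.
// RestrictedCoefficient<BdrFieldCoefficient> forwards its trailing constructor arguments to
// the wrapped coefficient and evaluates to zero outside attr_list.
#include <memory>
#include "fem/coefficient.hpp"

void AddRestrictedBdrField(palace::SumCoefficient &sum, const mfem::ParGridFunction &U,
                           const mfem::Array<int> &attr_list, double weight)
{
  sum.AddCoefficient(
      std::make_unique<palace::RestrictedCoefficient<palace::BdrFieldCoefficient>>(attr_list,
                                                                                   U),
      weight);
}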
-// SPDX-License-Identifier: Apache-2.0 - -#include "errorindicator.hpp" - -#include - -namespace palace -{ - -void ErrorIndicator::AddIndicator(const ErrorIndicator &indicator) -{ - if (n == 0) - { - local = indicator.local; - n = indicator.n; - return; - } - - // The average local indicator is used rather than the indicator for the maximum - // error to drive the adaptation, to account for a local error that might be marginally - // important to many solves, rather than only large in one solve. - MFEM_ASSERT(local.Size() == indicator.local.Size(), - "Unexpected size mismatch for ErrorIndicator::AddIndicator!"); - - // The local indicators must be squared before combining, so that the global error - // calculation is valid: - // E = √(1/N ∑ₙ ∑ₖ ηₖₙ²) - // from which it follows that: - // E² = 1/N ∑ₙ ∑ₖ ηₖₙ² - // = 1/N ∑ₙ Eₙ² - // Namely the average of the global error indicators included in the reduction. - // Squaring both sides means the summation can be rearranged, and then the local error - // indicators become: - // eₖ = √(1/N ∑ₙ ηₖₙ²) - const int N = local.Size(); - const auto *DIL = indicator.local.Read(); - auto *DL = local.ReadWrite(); - mfem::forall(N, - [=] MFEM_HOST_DEVICE(int i) - { - DL[i] = std::sqrt((DL[i] * DL[i] * n + DIL[i] * DIL[i] * indicator.n) / - (n + indicator.n)); - }); - - // More samples have been added, update for the running average. - n += indicator.n; -} - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "errorindicator.hpp" + +#include + +namespace palace +{ + +void ErrorIndicator::AddIndicator(const Vector &indicator) +{ + if (n == 0) + { + local = indicator; + n = 1; + return; + } + + // The average local indicator is used rather than the indicator for the maximum + // error to drive the adaptation, to account for a local error that might be marginally + // important to many solves, rather than only large in one solve. + MFEM_ASSERT(local.Size() == indicator.Size(), + "Unexpected size mismatch for ErrorIndicator::AddIndicator!"); + + // The local indicators must be squared before combining, so that the global error + // calculation is valid: + // E = √(1/N ∑ₙ ∑ₖ ηₖₙ²) + // from which it follows that: + // E² = 1/N ∑ₙ ∑ₖ ηₖₙ² + // = 1/N ∑ₙ Eₙ² + // Namely the average of the global error indicators included in the reduction. + // Squaring both sides means the summation can be rearranged, and then the local error + // indicators become: + // eₖ = √(1/N ∑ₙ ηₖₙ²) + const bool use_dev = local.UseDevice() || indicator.UseDevice(); + const int N = local.Size(); + const int Dn = n; + const auto *DI = indicator.Read(); + auto *DL = local.ReadWrite(); + mfem::forall_switch( + use_dev, N, [=] MFEM_HOST_DEVICE(int i) + { DL[i] = std::sqrt((DL[i] * DL[i] * Dn + DI[i] * DI[i]) / (Dn + 1)); }); + + // More samples have been added, update for the running average. + n += 1; +} + +} // namespace palace diff --git a/palace/fem/errorindicator.hpp b/palace/fem/errorindicator.hpp index ae12132617..f51e841d2a 100644 --- a/palace/fem/errorindicator.hpp +++ b/palace/fem/errorindicator.hpp @@ -1,69 +1,80 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
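// Worked example of the running combination in AddIndicator() above: starting from a single
// stored indicator with ηₖ₁ = 3 (n = 1) and adding a second solve with ηₖ₂ = 4 gives
//   eₖ = √((3² · 1 + 4²) / (1 + 1)) = √(25 / 2) ≈ 3.54 ,
// i.e. the root-mean-square of the per-solve indicators, and repeated calls reproduce
// eₖ = √(1/N ∑ₙ ηₖₙ²) independent of the order in which the indicators are added.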
-// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_FEM_ERROR_INDICATORS_HPP -#define PALACE_FEM_ERROR_INDICATORS_HPP - -#include -#include -#include "linalg/vector.hpp" -#include "utils/communication.hpp" - -namespace palace -{ - -// -// Storage for error estimation results from a simulation which involves one or more solves, -// required in the AMR loop. -// -class ErrorIndicator -{ -protected: - // Elemental localized error indicators. Used for marking elements for - // refinement and coarsening. - Vector local; - - // Number of samples. - int n; - -public: - ErrorIndicator(Vector &&local) : local(std::move(local)), n(1) {} - ErrorIndicator() : n(0) {} - - // Add an indicator to the running total. - void AddIndicator(const ErrorIndicator &indicator); - - // Return the local error indicator. - const auto &Local() const { return local; } - - // Return the global error indicator. - auto Norml2(MPI_Comm comm) const { return linalg::Norml2(comm, local); } - - // Return the largest local error indicator. - auto Max(MPI_Comm comm) const - { - auto max = local.Max(); - Mpi::GlobalMax(1, &max, comm); - return max; - } - - // Return the smallest local error indicator. - auto Min(MPI_Comm comm) const - { - auto min = local.Min(); - Mpi::GlobalMin(1, &min, comm); - return min; - } - - // Return the mean local error indicator. - auto Mean(MPI_Comm comm) const - { - auto sum = local.Sum(); - Mpi::GlobalSum(1, &sum, comm); - return sum / linalg::GlobalSize(comm, local); - } -}; - -} // namespace palace - -#endif // PALACE_FEM_ERROR_INDICATORS_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_FEM_ERROR_INDICATORS_HPP +#define PALACE_FEM_ERROR_INDICATORS_HPP + +#include +#include +#include "linalg/vector.hpp" +#include "utils/communication.hpp" + +namespace palace +{ + +// +// Storage for error estimation results from a simulation which involves one or more solves, +// required in the AMR loop. +// +class ErrorIndicator +{ +protected: + // Elemental localized error indicators. Used for marking elements for + // refinement and coarsening. + Vector local; + + // Number of samples. + int n; + +public: + ErrorIndicator(Vector &&local) : local(std::move(local)), n(1) + { + this->local.UseDevice(true); + } + ErrorIndicator() : n(0) { local.UseDevice(true); } + + // Add an indicator to the running total. + void AddIndicator(const Vector &indicator); + + // Return the local error indicator. + const auto &Local() const { return local; } + + // Return the global error indicator. + auto Norml2(MPI_Comm comm) const { return linalg::Norml2(comm, local); } + + // Return the largest local error indicator. + auto Max(MPI_Comm comm) const + { + auto max = local.Max(); + Mpi::GlobalMax(1, &max, comm); + return max; + } + + // Return the smallest local error indicator. + auto Min(MPI_Comm comm) const + { + auto min = local.Min(); + Mpi::GlobalMin(1, &min, comm); + return min; + } + + // Return the mean local error indicator. 
+ auto Mean(MPI_Comm comm) const { return linalg::Mean(comm, local); } + + struct SummaryStatistics + { + double norm; + double min; + double max; + double mean; + }; + + SummaryStatistics GetSummaryStatistics(MPI_Comm comm) const + { + return {Norml2(comm), Min(comm), Max(comm), Mean(comm)}; + } +}; + +} // namespace palace + +#endif // PALACE_FEM_ERROR_INDICATORS_HPP diff --git a/palace/fem/fespace.cpp b/palace/fem/fespace.cpp index 98cd0a44d6..377f6b58ea 100644 --- a/palace/fem/fespace.cpp +++ b/palace/fem/fespace.cpp @@ -1,121 +1,248 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "fespace.hpp" - -#include "fem/bilinearform.hpp" -#include "fem/integrator.hpp" -#include "linalg/rap.hpp" -#include "utils/omp.hpp" - -namespace palace -{ - -std::size_t FiniteElementSpace::global_id = 0; - -std::size_t FiniteElementSpace::GetId() const -{ - PalacePragmaOmp(critical(GetId)) - { - if (!init || GetSequence() != prev_sequence) - { - id = global_id++; - prev_sequence = GetSequence(); - init = true; - } - } - return id; -} - -const Operator &AuxiliaryFiniteElementSpace::BuildDiscreteInterpolator() const -{ - // G is always partially assembled. - const int dim = GetParMesh()->Dimension(); - const auto aux_map_type = FEColl()->GetMapType(dim); - const auto primal_map_type = primal_fespace.FEColl()->GetMapType(dim); - if (aux_map_type == mfem::FiniteElement::VALUE && - primal_map_type == mfem::FiniteElement::H_CURL) - { - // Discrete gradient interpolator - DiscreteLinearOperator interp(*this, primal_fespace); - interp.AddDomainInterpolator(); - G = std::make_unique(interp.Assemble(), *this, primal_fespace, true); - } - else if (primal_map_type == mfem::FiniteElement::VALUE && - aux_map_type == mfem::FiniteElement::H_CURL) - { - // Discrete gradient interpolator (spaces reversed) - DiscreteLinearOperator interp(primal_fespace, *this); - interp.AddDomainInterpolator(); - G = std::make_unique(interp.Assemble(), primal_fespace, *this, true); - } - else if (aux_map_type == mfem::FiniteElement::H_CURL && - primal_map_type == mfem::FiniteElement::H_DIV) - { - // Discrete curl interpolator - DiscreteLinearOperator interp(*this, primal_fespace); - interp.AddDomainInterpolator(); - G = std::make_unique(interp.Assemble(), *this, primal_fespace, true); - } - else if (primal_map_type == mfem::FiniteElement::H_CURL && - aux_map_type == mfem::FiniteElement::H_DIV) - { - // Discrete curl interpolator (spaces reversed) - DiscreteLinearOperator interp(primal_fespace, *this); - interp.AddDomainInterpolator(); - G = std::make_unique(interp.Assemble(), primal_fespace, *this, true); - } - else if (aux_map_type == mfem::FiniteElement::H_DIV && - primal_map_type == mfem::FiniteElement::INTEGRAL) - { - // Discrete divergence interpolator - DiscreteLinearOperator interp(*this, primal_fespace); - interp.AddDomainInterpolator(); - G = std::make_unique(interp.Assemble(), *this, primal_fespace, true); - } - else if (primal_map_type == mfem::FiniteElement::H_DIV && - aux_map_type == mfem::FiniteElement::INTEGRAL) - { - // Discrete divergence interpolator (spaces reversed) - DiscreteLinearOperator interp(primal_fespace, *this); - interp.AddDomainInterpolator(); - G = std::make_unique(interp.Assemble(), primal_fespace, *this, true); - } - else - { - MFEM_ABORT("Unsupported trial/test FE spaces for AuxiliaryFiniteElementSpace discrete " - "interpolator!"); - } - - return *G; -} - -template -const Operator & 
-BaseFiniteElementSpaceHierarchy::BuildProlongationAtLevel(std::size_t l) const -{ - // P is always partially assembled. - MFEM_VERIFY(l >= 0 && l < GetNumLevels() - 1, - "Can only construct a finite element space prolongation with more than one " - "space in the hierarchy!"); - if (fespaces[l]->GetParMesh() != fespaces[l + 1]->GetParMesh()) - { - P[l] = std::make_unique( - std::make_unique(*fespaces[l], *fespaces[l + 1]), - *fespaces[l], *fespaces[l + 1], true); - } - else - { - DiscreteLinearOperator p(*fespaces[l], *fespaces[l + 1]); - p.AddDomainInterpolator(); - P[l] = - std::make_unique(p.Assemble(), *fespaces[l], *fespaces[l + 1], true); - } - - return *P[l]; -} - -template class BaseFiniteElementSpaceHierarchy; -template class BaseFiniteElementSpaceHierarchy; - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "fespace.hpp" + +#include "fem/bilinearform.hpp" +#include "fem/integrator.hpp" +#include "fem/libceed/basis.hpp" +#include "fem/libceed/restriction.hpp" +#include "linalg/rap.hpp" + +namespace palace +{ + +CeedBasis FiniteElementSpace::GetCeedBasis(Ceed ceed, mfem::Geometry::Type geom) const +{ + auto it = basis.find(ceed); + MFEM_ASSERT(it != basis.end(), "Unknown Ceed context in GetCeedBasis!"); + auto &basis_map = it->second; + auto basis_it = basis_map.find(geom); + if (basis_it != basis_map.end()) + { + return basis_it->second; + } + return basis_map.emplace(geom, BuildCeedBasis(*this, ceed, geom)).first->second; +} + +CeedElemRestriction +FiniteElementSpace::GetCeedElemRestriction(Ceed ceed, mfem::Geometry::Type geom, + const std::vector &indices) const +{ + auto it = restr.find(ceed); + MFEM_ASSERT(it != restr.end(), "Unknown Ceed context in GetCeedElemRestriction!"); + auto &restr_map = it->second; + auto restr_it = restr_map.find(geom); + if (restr_it != restr_map.end()) + { + return restr_it->second; + } + return restr_map.emplace(geom, BuildCeedElemRestriction(*this, ceed, geom, indices)) + .first->second; +} + +CeedElemRestriction +FiniteElementSpace::GetInterpCeedElemRestriction(Ceed ceed, mfem::Geometry::Type geom, + const std::vector &indices) const +{ + const mfem::FiniteElement &fe = *GetFEColl().FiniteElementForGeometry(geom); + if (!HasUniqueInterpRestriction(fe)) + { + return GetCeedElemRestriction(ceed, geom, indices); + } + auto it = interp_restr.find(ceed); + MFEM_ASSERT(it != interp_restr.end(), + "Unknown Ceed context in GetInterpCeedElemRestriction!"); + auto &restr_map = it->second; + auto restr_it = restr_map.find(geom); + if (restr_it != restr_map.end()) + { + return restr_it->second; + } + return restr_map + .emplace(geom, BuildCeedElemRestriction(*this, ceed, geom, indices, true, false)) + .first->second; +} + +CeedElemRestriction +FiniteElementSpace::GetInterpRangeCeedElemRestriction(Ceed ceed, mfem::Geometry::Type geom, + const std::vector &indices) const +{ + const mfem::FiniteElement &fe = *GetFEColl().FiniteElementForGeometry(geom); + if (!HasUniqueInterpRangeRestriction(fe)) + { + return GetInterpCeedElemRestriction(ceed, geom, indices); + } + auto it = interp_range_restr.find(ceed); + MFEM_ASSERT(it != interp_range_restr.end(), + "Unknown Ceed context in GetInterpRangeCeedElemRestriction!"); + auto &restr_map = it->second; + auto restr_it = restr_map.find(geom); + if (restr_it != restr_map.end()) + { + return restr_it->second; + } + return restr_map + .emplace(geom, BuildCeedElemRestriction(*this, ceed, geom, indices, true, true)) + 
.first->second; +} + +void FiniteElementSpace::ResetCeedObjects() +{ + for (auto &[ceed, basis_map] : basis) + { + for (auto &[key, val] : basis_map) + { + PalaceCeedCall(ceed, CeedBasisDestroy(&val)); + } + } + for (auto &[ceed, restr_map] : restr) + { + for (auto &[key, val] : restr_map) + { + PalaceCeedCall(ceed, CeedElemRestrictionDestroy(&val)); + } + } + for (auto &[ceed, restr_map] : interp_restr) + { + for (auto &[key, val] : restr_map) + { + PalaceCeedCall(ceed, CeedElemRestrictionDestroy(&val)); + } + } + for (auto &[ceed, restr_map] : interp_range_restr) + { + for (auto &[key, val] : restr_map) + { + PalaceCeedCall(ceed, CeedElemRestrictionDestroy(&val)); + } + } + basis.clear(); + restr.clear(); + interp_restr.clear(); + interp_range_restr.clear(); + for (std::size_t i = 0; i < ceed::internal::GetCeedObjects().size(); i++) + { + Ceed ceed = ceed::internal::GetCeedObjects()[i]; + basis.emplace(ceed, ceed::GeometryObjectMap()); + restr.emplace(ceed, ceed::GeometryObjectMap()); + interp_restr.emplace(ceed, ceed::GeometryObjectMap()); + interp_range_restr.emplace(ceed, ceed::GeometryObjectMap()); + } +} + +CeedBasis FiniteElementSpace::BuildCeedBasis(const mfem::FiniteElementSpace &fespace, + Ceed ceed, mfem::Geometry::Type geom) +{ + // Find the appropriate integration rule for the element. + mfem::IsoparametricTransformation T; + const mfem::FiniteElement *fe_nodal = + fespace.GetMesh()->GetNodalFESpace()->FEColl()->FiniteElementForGeometry(geom); + if (!fe_nodal) + { + fe_nodal = + fespace.GetMesh()->GetNodalFESpace()->FEColl()->TraceFiniteElementForGeometry(geom); + } + T.SetFE(fe_nodal); + const int q_order = fem::DefaultIntegrationOrder::Get(T); + const mfem::IntegrationRule &ir = mfem::IntRules.Get(geom, q_order); + + // Build the libCEED basis. + CeedBasis val; + const mfem::FiniteElement *fe = fespace.FEColl()->FiniteElementForGeometry(geom); + if (!fe) + { + fe = fespace.FEColl()->TraceFiniteElementForGeometry(geom); + } + const int vdim = fespace.GetVDim(); + ceed::InitBasis(*fe, ir, vdim, ceed, &val); + return val; +} + +CeedElemRestriction FiniteElementSpace::BuildCeedElemRestriction( + const mfem::FiniteElementSpace &fespace, Ceed ceed, mfem::Geometry::Type geom, + const std::vector &indices, bool is_interp, bool is_interp_range) +{ + // Construct the libCEED element restriction for this element type. + CeedElemRestriction val; + const bool use_bdr = (mfem::Geometry::Dimension[geom] != fespace.GetMesh()->Dimension()); + ceed::InitRestriction(fespace, indices, use_bdr, is_interp, is_interp_range, ceed, &val); + return val; +} + +const Operator &FiniteElementSpace::BuildDiscreteInterpolator() const +{ + // Allow finite element spaces to be swapped in their order (intended as deriv(aux) -> + // primal). G is always partially assembled. + const int dim = Dimension(); + const bool swap = + (aux_fespace->GetFEColl().GetMapType(dim) == GetFEColl().GetDerivMapType(dim)); + MFEM_VERIFY(!swap, "Incorrect order for primal/auxiliary (test/trial) spaces in discrete " + "interpolator construction!"); + MFEM_VERIFY( + GetFEColl().GetMapType(dim) == aux_fespace->GetFEColl().GetDerivMapType(dim), + "Unsupported trial/test FE spaces for FiniteElementSpace discrete interpolator!"); + const FiniteElementSpace &trial_fespace = !swap ? *aux_fespace : *this; + const FiniteElementSpace &test_fespace = !swap ? 
*this : *aux_fespace; + const auto aux_map_type = trial_fespace.GetFEColl().GetMapType(dim); + const auto primal_map_type = test_fespace.GetFEColl().GetMapType(dim); + if (aux_map_type == mfem::FiniteElement::VALUE && + primal_map_type == mfem::FiniteElement::H_CURL) + { + // Discrete gradient interpolator. + DiscreteLinearOperator interp(trial_fespace, test_fespace); + interp.AddDomainInterpolator(); + G = std::make_unique(interp.PartialAssemble(), trial_fespace, test_fespace, + true); + } + else if (aux_map_type == mfem::FiniteElement::H_CURL && + primal_map_type == mfem::FiniteElement::H_DIV) + { + // Discrete curl interpolator. + DiscreteLinearOperator interp(trial_fespace, test_fespace); + interp.AddDomainInterpolator(); + G = std::make_unique(interp.PartialAssemble(), trial_fespace, test_fespace, + true); + } + else if (aux_map_type == mfem::FiniteElement::H_DIV && + primal_map_type == mfem::FiniteElement::INTEGRAL) + { + // Discrete divergence interpolator. + DiscreteLinearOperator interp(trial_fespace, test_fespace); + interp.AddDomainInterpolator(); + G = std::make_unique(interp.PartialAssemble(), trial_fespace, test_fespace, + true); + } + else + { + MFEM_ABORT( + "Unsupported trial/test FE spaces for FiniteElementSpace discrete interpolator!"); + } + + return *G; +} + +const Operator &FiniteElementSpaceHierarchy::BuildProlongationAtLevel(std::size_t l) const +{ + // P is always partially assembled. + MFEM_VERIFY(l + 1 < GetNumLevels(), + "Can only construct a finite element space prolongation with more than one " + "space in the hierarchy!"); + if (&fespaces[l]->GetMesh() != &fespaces[l + 1]->GetMesh()) + { + P[l] = std::make_unique( + std::make_unique(*fespaces[l], *fespaces[l + 1]), + *fespaces[l], *fespaces[l + 1], true); + } + else + { + DiscreteLinearOperator p(*fespaces[l], *fespaces[l + 1]); + p.AddDomainInterpolator(); + P[l] = std::make_unique(p.PartialAssemble(), *fespaces[l], + *fespaces[l + 1], true); + } + + return *P[l]; +} + +} // namespace palace diff --git a/palace/fem/fespace.hpp b/palace/fem/fespace.hpp index 05d495e899..575ebe3018 100644 --- a/palace/fem/fespace.hpp +++ b/palace/fem/fespace.hpp @@ -1,182 +1,290 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_FEM_FESPACE_HPP -#define PALACE_FEM_FESPACE_HPP - -#include -#include -#include -#include "linalg/operator.hpp" - -namespace palace -{ - -// -// Wrapper for MFEM's ParFiniteElementSpace class, where the finite element space object -// is constructed with a unique ID associated with it. This is useful for defining equality -// operations between spaces (either different spaces on the same mesh, or the same space -// type on different meshes). -// -class FiniteElementSpace : public mfem::ParFiniteElementSpace -{ -private: - static std::size_t global_id; - mutable std::size_t id; - mutable long int prev_sequence; - mutable bool init = false; - -public: - using mfem::ParFiniteElementSpace::ParFiniteElementSpace; - FiniteElementSpace(const mfem::ParFiniteElementSpace &fespace) - : mfem::ParFiniteElementSpace(fespace) - { - } - - // Get the ID associated with the instance of this class. If the underlying sequence has - // changed (due to a mesh update, for example), regenerate the ID. - std::size_t GetId() const; -}; - -// -// An AuxiliaryFiniteElement space is a FiniteElementSpace which allows for lazy -// construction of the interpolation operator (discrete gradient or curl) from the primal -// space to this one. 
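For orientation alongside the reworked interpolator construction above, a hedged sketch of requesting the discrete gradient through the new FiniteElementSpace::GetDiscreteInterpolator API (the mesh object, polynomial order, and include paths are assumptions for illustration):

#include <mfem.hpp>
#include "fem/fespace.hpp"
#include "fem/mesh.hpp"

void BuildDiscreteGradientExample(palace::Mesh &mesh, int order)
{
  const int dim = mesh.Get().Dimension();
  mfem::H1_FECollection h1_fec(order, dim);  // Auxiliary space (VALUE map type).
  mfem::ND_FECollection nd_fec(order, dim);  // Primal space (H_CURL map type).
  palace::FiniteElementSpace h1_fespace(mesh, &h1_fec);
  palace::FiniteElementSpace nd_fespace(mesh, &nd_fec);
  // The primal space builds and caches the discrete gradient from the auxiliary space
  // on first request.
  const auto &grad = nd_fespace.GetDiscreteInterpolator(h1_fespace);
  (void)grad;
}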
-// -class AuxiliaryFiniteElementSpace : public FiniteElementSpace -{ -private: - const FiniteElementSpace &primal_fespace; - mutable std::unique_ptr G; - - const Operator &BuildDiscreteInterpolator() const; - -public: - template - AuxiliaryFiniteElementSpace(const FiniteElementSpace &primal_fespace, T &&...args) - : FiniteElementSpace(std::forward(args)...), primal_fespace(primal_fespace) - { - } - - // Return the discrete gradient or discrete curl matrix interpolating from the auxiliary - // to the primal space, constructing it on the fly as necessary. - const Operator &GetDiscreteInterpolator() const - { - return G ? *G : BuildDiscreteInterpolator(); - } -}; - -// -// A collection of FiniteElementSpace objects constructed on the same mesh with the ability -// to construct the prolongation operators between them as needed. -// -template -class BaseFiniteElementSpaceHierarchy -{ - static_assert(std::is_base_of::value, - "A space hierarchy can only be constructed of FiniteElementSpace objects!"); - -protected: - std::vector> fespaces; - mutable std::vector> P; - - const Operator &BuildProlongationAtLevel(std::size_t l) const; - -public: - BaseFiniteElementSpaceHierarchy() = default; - BaseFiniteElementSpaceHierarchy(std::unique_ptr &&fespace) - { - AddLevel(std::move(fespace)); - } - - auto GetNumLevels() const { return fespaces.size(); } - - void AddLevel(std::unique_ptr &&fespace) - { - fespaces.push_back(std::move(fespace)); - P.push_back(nullptr); - } - - FESpace &GetFESpaceAtLevel(std::size_t l) - { - MFEM_ASSERT(l >= 0 && l < GetNumLevels(), - "Out of bounds request for finite element space at level " << l << "!"); - return *fespaces[l]; - } - const FESpace &GetFESpaceAtLevel(std::size_t l) const - { - MFEM_ASSERT(l >= 0 && l < GetNumLevels(), - "Out of bounds request for finite element space at level " << l << "!"); - return *fespaces[l]; - } - - FESpace &GetFinestFESpace() - { - MFEM_ASSERT(GetNumLevels() > 0, - "Out of bounds request for finite element space at level 0!"); - return *fespaces.back(); - } - const FESpace &GetFinestFESpace() const - { - MFEM_ASSERT(GetNumLevels() > 0, - "Out of bounds request for finite element space at level 0!"); - return *fespaces.back(); - } - - const Operator &GetProlongationAtLevel(std::size_t l) const - { - MFEM_ASSERT(l >= 0 && l < GetNumLevels() - 1, - "Out of bounds request for finite element space prolongation at level " - << l << "!"); - return P[l] ? *P[l] : BuildProlongationAtLevel(l); - } - - std::vector GetProlongationOperators() const - { - std::vector P_(GetNumLevels() - 1); - for (std::size_t l = 0; l < P_.size(); l++) - { - P_[l] = &GetProlongationAtLevel(l); - } - return P_; - } -}; - -class FiniteElementSpaceHierarchy - : public BaseFiniteElementSpaceHierarchy -{ -public: - using BaseFiniteElementSpaceHierarchy< - FiniteElementSpace>::BaseFiniteElementSpaceHierarchy; -}; - -// -// A special type of FiniteElementSpaceHierarchy where all members are auxiliary finite -// element spaces. 
-// -class AuxiliaryFiniteElementSpaceHierarchy - : public BaseFiniteElementSpaceHierarchy -{ -public: - using BaseFiniteElementSpaceHierarchy< - AuxiliaryFiniteElementSpace>::BaseFiniteElementSpaceHierarchy; - - const Operator &GetDiscreteInterpolatorAtLevel(std::size_t l) const - { - return GetFESpaceAtLevel(l).GetDiscreteInterpolator(); - } - - std::vector GetDiscreteInterpolators() const - { - std::vector G_(GetNumLevels()); - for (std::size_t l = 0; l < G_.size(); l++) - { - G_[l] = &GetDiscreteInterpolatorAtLevel(l); - } - return G_; - } -}; - -} // namespace palace - -#endif // PALACE_FEM_FESPACE_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_FEM_FESPACE_HPP +#define PALACE_FEM_FESPACE_HPP + +#include +#include +#include +#include "fem/libceed/ceed.hpp" +#include "fem/mesh.hpp" +#include "linalg/operator.hpp" +#include "linalg/vector.hpp" + +namespace palace +{ + +// +// Wrapper for MFEM's ParFiniteElementSpace class, with extensions for Palace. +// +class FiniteElementSpace +{ +private: + // Underlying MFEM object. + mfem::ParFiniteElementSpace fespace; + + // Reference to the underlying mesh object (not owned). + Mesh &mesh; + + // Members for constructing libCEED operators. + mutable ceed::CeedObjectMap basis; + mutable ceed::CeedObjectMap restr, interp_restr, interp_range_restr; + + // Temporary storage for operator applications. + mutable ComplexVector tx, lx, ly; + + // Members for discrete interpolators from an auxiliary space to a primal space. + mutable const FiniteElementSpace *aux_fespace; + mutable std::unique_ptr G; + + bool HasUniqueInterpRestriction(const mfem::FiniteElement &fe) const + { + // For interpolation operators and tensor-product elements, we need native (not + // lexicographic) ordering. + const mfem::TensorBasisElement *tfe = + dynamic_cast(&fe); + return (tfe && tfe->GetDofMap().Size() > 0 && + fe.GetRangeType() != mfem::FiniteElement::VECTOR); + } + + bool HasUniqueInterpRangeRestriction(const mfem::FiniteElement &fe) const + { + // The range restriction for interpolation operators needs to use a special + // DofTransformation (not equal to the transpose of the domain restriction). 
+ if (mesh.Dimension() < 3) + { + return false; + } + const auto geom = fe.GetGeomType(); + const auto *dof_trans = fespace.FEColl()->DofTransformationForGeometry(geom); + return (dof_trans && !dof_trans->IsIdentity()); + } + + const Operator &BuildDiscreteInterpolator() const; + +public: + template + FiniteElementSpace(Mesh &mesh, T &&...args) + : fespace(&mesh.Get(), std::forward(args)...), mesh(mesh), aux_fespace(nullptr) + { + ResetCeedObjects(); + tx.UseDevice(true); + lx.UseDevice(true); + ly.UseDevice(true); + } + virtual ~FiniteElementSpace() { ResetCeedObjects(); } + + const auto &Get() const { return fespace; } + auto &Get() { return fespace; } + + operator const mfem::ParFiniteElementSpace &() const { return Get(); } + operator mfem::ParFiniteElementSpace &() { return Get(); } + + const auto &GetFEColl() const { return *Get().FEColl(); } + auto &GetFEColl() { return *Get().FEColl(); } + + const auto &GetMesh() const { return mesh; } + auto &GetMesh() { return mesh; } + + const auto &GetParMesh() const { return mesh.Get(); } + auto &GetParMesh() { return mesh.Get(); } + + auto GetVDim() const { return Get().GetVDim(); } + auto GetVSize() const { return Get().GetVSize(); } + auto GlobalVSize() const { return Get().GlobalVSize(); } + auto GetTrueVSize() const { return Get().GetTrueVSize(); } + auto GlobalTrueVSize() const { return Get().GlobalTrueVSize(); } + auto Dimension() const { return mesh.Get().Dimension(); } + auto SpaceDimension() const { return mesh.Get().SpaceDimension(); } + auto GetMaxElementOrder() const { return Get().GetMaxElementOrder(); } + + const auto *GetProlongationMatrix() const { return Get().GetProlongationMatrix(); } + const auto *GetRestrictionMatrix() const { return Get().GetRestrictionMatrix(); } + + // Return the discrete gradient, curl, or divergence matrix interpolating from the + // auxiliary to the primal space, constructing it on the fly as necessary. + const auto &GetDiscreteInterpolator(const FiniteElementSpace &aux_fespace_) const + { + if (&aux_fespace_ != aux_fespace) + { + G.reset(); + aux_fespace = &aux_fespace_; + } + return G ? *G : BuildDiscreteInterpolator(); + } + + // Return the basis object for elements of the given element geometry type. + CeedBasis GetCeedBasis(Ceed ceed, mfem::Geometry::Type geom) const; + + // Return the element restriction object for the given element set (all with the same + // geometry type). + CeedElemRestriction GetCeedElemRestriction(Ceed ceed, mfem::Geometry::Type geom, + const std::vector &indices) const; + + // If the space has a special element restriction for discrete interpolators, return that. + // Otherwise return the same restriction as given by GetCeedElemRestriction. + CeedElemRestriction GetInterpCeedElemRestriction(Ceed ceed, mfem::Geometry::Type geom, + const std::vector &indices) const; + + // If the space has a special element restriction for the range space of discrete + // interpolators, return that. Otherwise return the same restriction as given by + // GetCeedElemRestriction. + CeedElemRestriction + GetInterpRangeCeedElemRestriction(Ceed ceed, mfem::Geometry::Type geom, + const std::vector &indices) const; + + // Clear the cached basis and element restriction objects owned by the finite element + // space. 
+ void ResetCeedObjects(); + + void Update() { ResetCeedObjects(); } + + static CeedBasis BuildCeedBasis(const mfem::FiniteElementSpace &fespace, Ceed ceed, + mfem::Geometry::Type geom); + static CeedElemRestriction + BuildCeedElemRestriction(const mfem::FiniteElementSpace &fespace, Ceed ceed, + mfem::Geometry::Type geom, const std::vector &indices, + bool is_interp = false, bool is_interp_range = false); + + template + auto &GetTVector() const + { + tx.SetSize(GetTrueVSize()); + if constexpr (std::is_same::value) + { + return tx; + } + else + { + return tx.Real(); + } + } + + template + auto &GetLVector() const + { + lx.SetSize(GetVSize()); + if constexpr (std::is_same::value) + { + return lx; + } + else + { + return lx.Real(); + } + } + + template + auto &GetLVector2() const + { + ly.SetSize(GetVSize()); + if constexpr (std::is_same::value) + { + return ly; + } + else + { + return ly.Real(); + } + } + + // Get the associated MPI communicator. + MPI_Comm GetComm() const { return fespace.GetComm(); } +}; + +// +// A collection of FiniteElementSpace objects constructed on the same mesh with the ability +// to construct the prolongation operators between them as needed. +// +class FiniteElementSpaceHierarchy +{ +protected: + std::vector> fespaces; + mutable std::vector> P; + + const Operator &BuildProlongationAtLevel(std::size_t l) const; + +public: + FiniteElementSpaceHierarchy() = default; + FiniteElementSpaceHierarchy(std::unique_ptr &&fespace) + { + AddLevel(std::move(fespace)); + } + + auto GetNumLevels() const { return fespaces.size(); } + + void AddLevel(std::unique_ptr &&fespace) + { + fespaces.push_back(std::move(fespace)); + P.push_back(nullptr); + } + + auto &GetFESpaceAtLevel(std::size_t l) + { + MFEM_ASSERT(l < GetNumLevels(), + "Out of bounds request for finite element space at level " << l << "!"); + return *fespaces[l]; + } + const auto &GetFESpaceAtLevel(std::size_t l) const + { + MFEM_ASSERT(l < GetNumLevels(), + "Out of bounds request for finite element space at level " << l << "!"); + return *fespaces[l]; + } + + auto &GetFinestFESpace() + { + MFEM_ASSERT(GetNumLevels() > 0, + "Out of bounds request for finite element space at level 0!"); + return *fespaces.back(); + } + const auto &GetFinestFESpace() const + { + MFEM_ASSERT(GetNumLevels() > 0, + "Out of bounds request for finite element space at level 0!"); + return *fespaces.back(); + } + + const auto &GetProlongationAtLevel(std::size_t l) const + { + MFEM_ASSERT(l + 1 < GetNumLevels(), + "Out of bounds request for finite element space prolongation at level " + << l << "!"); + return P[l] ? 
*P[l] : BuildProlongationAtLevel(l); + } + + std::vector GetProlongationOperators() const + { + MFEM_ASSERT(GetNumLevels() > 1, + "Out of bounds request for finite element space prolongation at level 0!"); + std::vector P_(GetNumLevels() - 1); + for (std::size_t l = 0; l < P_.size(); l++) + { + P_[l] = &GetProlongationAtLevel(l); + } + return P_; + } + + const auto &GetDiscreteInterpolatorAtLevel(std::size_t l, + const FiniteElementSpace &aux_fespace) const + { + return GetFESpaceAtLevel(l).GetDiscreteInterpolator(aux_fespace); + } + + std::vector + GetDiscreteInterpolators(const FiniteElementSpaceHierarchy &aux_fespaces) const + { + std::vector G_(GetNumLevels()); + G_[0] = nullptr; // No discrete interpolator for coarsest level + for (std::size_t l = 1; l < G_.size(); l++) + { + G_[l] = &GetDiscreteInterpolatorAtLevel(l, aux_fespaces.GetFESpaceAtLevel(l)); + } + return G_; + } +}; + +} // namespace palace + +#endif // PALACE_FEM_FESPACE_HPP diff --git a/palace/fem/integ/curlcurl.cpp b/palace/fem/integ/curlcurl.cpp index 3f4dd13c81..8d89ddd6ef 100644 --- a/palace/fem/integ/curlcurl.cpp +++ b/palace/fem/integ/curlcurl.cpp @@ -1,119 +1,77 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "fem/integrator.hpp" - -#include -#include -#include "fem/libceed/coefficient.hpp" -#include "fem/libceed/integrator.hpp" - -#include "fem/qfunctions/curlcurl_qf.h" - -namespace palace -{ - -struct CurlCurlIntegratorInfo : public ceed::IntegratorInfo -{ - CurlCurlContext ctx; -}; - -namespace -{ - -CurlCurlIntegratorInfo -InitializeIntegratorInfo(const mfem::ParFiniteElementSpace &fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, - bool use_bdr, mfem::Coefficient *Q, mfem::VectorCoefficient *VQ, - mfem::MatrixCoefficient *MQ, - std::vector &coeff) -{ - MFEM_VERIFY(fespace.GetVDim() == 1, - "libCEED interface for CurlCurlIntegrator does not support vdim > 1!"); - - CurlCurlIntegratorInfo info = {{0}}; - - mfem::ParMesh &mesh = *fespace.GetParMesh(); - info.ctx.dim = mesh.Dimension() - use_bdr; - info.ctx.space_dim = mesh.SpaceDimension(); - info.ctx.curl_dim = (info.ctx.dim < 3) ? 1 : info.ctx.dim; - - info.trial_op = ceed::EvalMode::Curl; - info.test_op = ceed::EvalMode::Curl; - info.qdata_size = (info.ctx.curl_dim * (info.ctx.curl_dim + 1)) / 2; - - mfem::ConstantCoefficient *const_coeff = dynamic_cast(Q); - if (const_coeff || !(Q || VQ || MQ)) - { - info.ctx.coeff = const_coeff ? 
const_coeff->constant : 1.0; - - info.build_qf = f_build_curlcurl_const_scalar; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_curlcurl_const_scalar_loc); - } - else if (Q) - { - ceed::InitCoefficient(*Q, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - info.build_qf = f_build_curlcurl_quad_scalar; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_curlcurl_quad_scalar_loc); - } - else if (VQ) - { - MFEM_VERIFY(VQ->GetVDim() == info.ctx.curl_dim, - "Invalid vector coefficient dimension for CurlCurlIntegrator!"); - ceed::InitCoefficient(*VQ, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - info.build_qf = f_build_curlcurl_quad_vector; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_curlcurl_quad_vector_loc); - } - else if (MQ) - { - MFEM_VERIFY(MQ->GetVDim() == info.ctx.curl_dim, - "Invalid matrix coefficient dimension for CurlCurlIntegrator!"); - ceed::InitCoefficient(*MQ, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - info.build_qf = f_build_curlcurl_quad_matrix; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_curlcurl_quad_matrix_loc); - } - - info.apply_qf = f_apply_curlcurl; - info.apply_qf_path = PalaceQFunctionRelativePath(f_apply_curlcurl_loc); - - return info; -} - -} // namespace - -void CurlCurlIntegrator::Assemble(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, - const std::vector &indices, Ceed ceed, - CeedOperator *op, CeedOperator *op_t) -{ - MFEM_VERIFY(&trial_fespace == &test_fespace, - "CurlCurlIntegrator requires the same test and trial spaces!"); - constexpr bool use_bdr = false; - std::vector coeff; - const auto info = - InitializeIntegratorInfo(trial_fespace, ir, indices, use_bdr, Q, VQ, MQ, coeff); - ceed::AssembleCeedOperator(info, trial_fespace, test_fespace, ir, indices, use_bdr, coeff, - ceed, op, op_t); -} - -void CurlCurlIntegrator::AssembleBoundary(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, - const std::vector &indices, Ceed ceed, - CeedOperator *op, CeedOperator *op_t) -{ - MFEM_VERIFY(&trial_fespace == &test_fespace, - "CurlCurlIntegrator requires the same test and trial spaces!"); - constexpr bool use_bdr = true; - std::vector coeff; - const auto info = - InitializeIntegratorInfo(trial_fespace, ir, indices, use_bdr, Q, VQ, MQ, coeff); - ceed::AssembleCeedOperator(info, trial_fespace, test_fespace, ir, indices, use_bdr, coeff, - ceed, op, op_t); -} - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "fem/integrator.hpp" + +#include "fem/libceed/libceed_coefficient.hpp" +#include "fem/libceed/libceed_integrator.hpp" +#include "utils/diagnostic.hpp" + +PalacePragmaDiagnosticPush +PalacePragmaDiagnosticDisableUnused + +#include "fem/qfunctions/hdiv_qf.h" +#include "fem/qfunctions/l2_qf.h" + +PalacePragmaDiagnosticPop + +namespace palace +{ + +using namespace ceed; + +void CurlCurlIntegrator::Assemble(Ceed ceed, CeedElemRestriction trial_restr, + CeedElemRestriction test_restr, CeedBasis trial_basis, + CeedBasis test_basis, CeedVector geom_data, + CeedElemRestriction geom_data_restr, + CeedOperator *op) const +{ + CeedQFunctionInfo info; + info.assemble_q_data = assemble_q_data; + + // Set up QFunctions. 
+ CeedInt dim, space_dim, trial_num_comp, test_num_comp; + PalaceCeedCall(ceed, CeedBasisGetDimension(trial_basis, &dim)); + PalaceCeedCall(ceed, CeedGeometryDataGetSpaceDimension(geom_data_restr, dim, &space_dim)); + PalaceCeedCall(ceed, CeedBasisGetNumComponents(trial_basis, &trial_num_comp)); + PalaceCeedCall(ceed, CeedBasisGetNumComponents(test_basis, &test_num_comp)); + MFEM_VERIFY(trial_num_comp == test_num_comp && trial_num_comp == 1, + "CurlCurlIntegrator requires test and trial spaces with a single component!"); + switch (10 * space_dim + dim) + { + case 22: + // Curl in 2D has a single component. + info.apply_qf = assemble_q_data ? f_build_l2_1 : f_apply_l2_1; + info.apply_qf_path = PalaceQFunctionRelativePath(assemble_q_data ? f_build_l2_1_loc + : f_apply_l2_1_loc); + break; + case 33: + info.apply_qf = assemble_q_data ? f_build_hdiv_33 : f_apply_hdiv_33; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hdiv_33_loc : f_apply_hdiv_33_loc); + break; + case 32: + // Curl in 2D has a single component. + info.apply_qf = assemble_q_data ? f_build_l2_1 : f_apply_l2_1; + info.apply_qf_path = PalaceQFunctionRelativePath(assemble_q_data ? f_build_l2_1_loc + : f_apply_l2_1_loc); + break; + default: + MFEM_ABORT("Invalid value of (dim, space_dim) = (" << dim << ", " << space_dim + << ") for CurlCurlIntegrator!"); + } + info.trial_ops = EvalMode::Curl; + info.test_ops = EvalMode::Curl; + if (dim < 3) + { + info.trial_ops |= EvalMode::Weight; + } + + // Set up the coefficient and assemble. + auto ctx = PopulateCoefficientContext((dim < 3) ? 1 : dim, Q, transpose); + AssembleCeedOperator(info, (void *)ctx.data(), ctx.size() * sizeof(CeedIntScalar), ceed, + trial_restr, test_restr, trial_basis, test_basis, geom_data, + geom_data_restr, op); +} + +} // namespace palace diff --git a/palace/fem/integ/curlcurlmass.cpp b/palace/fem/integ/curlcurlmass.cpp index b7937b3025..51f75e0fd3 100644 --- a/palace/fem/integ/curlcurlmass.cpp +++ b/palace/fem/integ/curlcurlmass.cpp @@ -1,193 +1,70 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "fem/integrator.hpp" - -#include -#include -#include "fem/libceed/coefficient.hpp" -#include "fem/libceed/integrator.hpp" - -#include "fem/qfunctions/curlcurlmass_qf.h" - -namespace palace -{ - -struct CurlCurlMassIntegratorInfo : public ceed::IntegratorInfo -{ - CurlCurlMassContext ctx; -}; - -namespace -{ - -CurlCurlMassIntegratorInfo -InitializeIntegratorInfo(const mfem::ParFiniteElementSpace &fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, - bool use_bdr, mfem::Coefficient *Qc, mfem::VectorCoefficient *VQc, - mfem::MatrixCoefficient *MQc, mfem::Coefficient *Qm, - mfem::VectorCoefficient *VQm, mfem::MatrixCoefficient *MQm, - std::vector &coeff) -{ - MFEM_VERIFY(fespace.GetVDim() == 1, - "libCEED interface for CurlCurlMassIntegrator does not support vdim > 1!"); - - CurlCurlMassIntegratorInfo info = {{0}}; - - mfem::ParMesh &mesh = *fespace.GetParMesh(); - info.ctx.dim = mesh.Dimension() - use_bdr; - info.ctx.space_dim = mesh.SpaceDimension(); - info.ctx.curl_dim = (info.ctx.dim < 3) ? 
1 : info.ctx.dim; - - info.trial_op = ceed::EvalMode::InterpAndCurl; - info.test_op = ceed::EvalMode::InterpAndCurl; - info.qdata_size = (info.ctx.curl_dim * (info.ctx.curl_dim + 1)) / 2 + - (info.ctx.dim * (info.ctx.dim + 1)) / 2; - - MFEM_VERIFY((Qc || VQc || MQc) && (Qm || VQm || MQm), - "libCEED CurlCurlMassIntegrator requires both a " - "curl-curl and a mass integrator coefficient!"); - if (Qc) - { - ceed::InitCoefficient(*Qc, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - if (Qm) - { - ceed::InitCoefficient(*Qm, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - info.build_qf = f_build_curlcurl_mass_quad_scalar_scalar; - info.build_qf_path = - PalaceQFunctionRelativePath(f_build_curlcurl_mass_quad_scalar_scalar_loc); - } - else if (VQm) - { - MFEM_VERIFY(VQm->GetVDim() == info.ctx.space_dim, - "Invalid vector coefficient dimension for CurlCurlMassIntegrator!"); - ceed::InitCoefficient(*VQm, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - info.build_qf = f_build_curlcurl_mass_quad_scalar_vector; - info.build_qf_path = - PalaceQFunctionRelativePath(f_build_curlcurl_mass_quad_scalar_vector_loc); - } - else if (MQm) - { - MFEM_VERIFY(MQm->GetVDim() == info.ctx.space_dim, - "Invalid matrix coefficient dimension for CurlCurlMassIntegrator!"); - ceed::InitCoefficient(*MQm, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - info.build_qf = f_build_curlcurl_mass_quad_scalar_matrix; - info.build_qf_path = - PalaceQFunctionRelativePath(f_build_curlcurl_mass_quad_scalar_matrix_loc); - } - } - else if (VQc) - { - MFEM_VERIFY(VQc->GetVDim() == info.ctx.curl_dim, - "Invalid vector coefficient dimension for CurlCurlMassIntegrator!"); - ceed::InitCoefficient(*VQc, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - if (Qm) - { - ceed::InitCoefficient(*Qm, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - info.build_qf = f_build_curlcurl_mass_quad_vector_scalar; - info.build_qf_path = - PalaceQFunctionRelativePath(f_build_curlcurl_mass_quad_vector_scalar_loc); - } - else if (VQm) - { - MFEM_VERIFY(VQm->GetVDim() == info.ctx.space_dim, - "Invalid vector coefficient dimension for CurlCurlMassIntegrator!"); - ceed::InitCoefficient(*VQm, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - info.build_qf = f_build_curlcurl_mass_quad_vector_vector; - info.build_qf_path = - PalaceQFunctionRelativePath(f_build_curlcurl_mass_quad_vector_vector_loc); - } - else if (MQm) - { - MFEM_VERIFY(MQm->GetVDim() == info.ctx.space_dim, - "Invalid matrix coefficient dimension for CurlCurlMassIntegrator!"); - ceed::InitCoefficient(*MQm, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - info.build_qf = f_build_curlcurl_mass_quad_vector_matrix; - info.build_qf_path = - PalaceQFunctionRelativePath(f_build_curlcurl_mass_quad_vector_matrix_loc); - } - } - else if (MQc) - { - MFEM_VERIFY(MQc->GetVDim() == info.ctx.curl_dim, - "Invalid matrix coefficient dimension for CurlCurlMassIntegrator!"); - ceed::InitCoefficient(*MQc, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - if (Qm) - { - ceed::InitCoefficient(*Qm, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - info.build_qf = f_build_curlcurl_mass_quad_matrix_scalar; - info.build_qf_path = - PalaceQFunctionRelativePath(f_build_curlcurl_mass_quad_matrix_scalar_loc); - } - else if (VQm) - { - MFEM_VERIFY(VQm->GetVDim() == info.ctx.space_dim, - "Invalid vector coefficient dimension for CurlCurlMassIntegrator!"); - ceed::InitCoefficient(*VQm, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - info.build_qf = 
f_build_curlcurl_mass_quad_matrix_vector; - info.build_qf_path = - PalaceQFunctionRelativePath(f_build_curlcurl_mass_quad_matrix_vector_loc); - } - else if (MQm) - { - MFEM_VERIFY(MQm->GetVDim() == info.ctx.space_dim, - "Invalid matrix coefficient dimension for CurlCurlMassIntegrator!"); - ceed::InitCoefficient(*MQm, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - info.build_qf = f_build_curlcurl_mass_quad_matrix_matrix; - info.build_qf_path = - PalaceQFunctionRelativePath(f_build_curlcurl_mass_quad_matrix_matrix_loc); - } - } - - info.apply_qf = f_apply_curlcurl_mass; - info.apply_qf_path = PalaceQFunctionRelativePath(f_apply_curlcurl_mass_loc); - - return info; -} - -} // namespace - -void CurlCurlMassIntegrator::Assemble(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, - const std::vector &indices, Ceed ceed, - CeedOperator *op, CeedOperator *op_t) -{ - MFEM_VERIFY(&trial_fespace == &test_fespace, - "CurlCurlMassIntegrator requires the same test and trial spaces!"); - constexpr bool use_bdr = false; - std::vector coeff; - const auto info = InitializeIntegratorInfo(trial_fespace, ir, indices, use_bdr, Qc, VQc, - MQc, Qm, VQm, MQm, coeff); - ceed::AssembleCeedOperator(info, trial_fespace, test_fespace, ir, indices, use_bdr, coeff, - ceed, op, op_t); -} - -void CurlCurlMassIntegrator::AssembleBoundary( - const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, const mfem::IntegrationRule &ir, - const std::vector &indices, Ceed ceed, CeedOperator *op, CeedOperator *op_t) -{ - MFEM_VERIFY(&trial_fespace == &test_fespace, - "CurlCurlMassIntegrator requires the same test and trial spaces!"); - constexpr bool use_bdr = true; - std::vector coeff; - const auto info = InitializeIntegratorInfo(trial_fespace, ir, indices, use_bdr, Qc, VQc, - MQc, Qm, VQm, MQm, coeff); - ceed::AssembleCeedOperator(info, trial_fespace, test_fespace, ir, indices, use_bdr, coeff, - ceed, op, op_t); -} - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "fem/integrator.hpp" + +#include "fem/libceed/libceed_coefficient.hpp" +#include "fem/libceed/libceed_integrator.hpp" + +#include "fem/qfunctions/hdivmass_qf.h" + +namespace palace +{ + +using namespace ceed; + +void CurlCurlMassIntegrator::Assemble(Ceed ceed, CeedElemRestriction trial_restr, + CeedElemRestriction test_restr, CeedBasis trial_basis, + CeedBasis test_basis, CeedVector geom_data, + CeedElemRestriction geom_data_restr, + CeedOperator *op) const +{ + CeedQFunctionInfo info; + info.assemble_q_data = assemble_q_data; + + // Set up QFunctions. + CeedInt dim, space_dim, trial_num_comp, test_num_comp; + PalaceCeedCall(ceed, CeedBasisGetDimension(trial_basis, &dim)); + PalaceCeedCall(ceed, CeedGeometryDataGetSpaceDimension(geom_data_restr, dim, &space_dim)); + PalaceCeedCall(ceed, CeedBasisGetNumComponents(trial_basis, &trial_num_comp)); + PalaceCeedCall(ceed, CeedBasisGetNumComponents(test_basis, &test_num_comp)); + MFEM_VERIFY( + trial_num_comp == test_num_comp && trial_num_comp == 1, + "CurlCurlMassIntegrator requires test and trial spaces with a single component!"); + switch (10 * space_dim + dim) + { + case 22: + info.apply_qf = assemble_q_data ? f_build_hdivmass_22 : f_apply_hdivmass_22; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? 
f_build_hdivmass_22_loc : f_apply_hdivmass_22_loc); + break; + case 33: + info.apply_qf = assemble_q_data ? f_build_hdivmass_33 : f_apply_hdivmass_33; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hdivmass_33_loc : f_apply_hdivmass_33_loc); + break; + case 32: + info.apply_qf = assemble_q_data ? f_build_hdivmass_32 : f_apply_hdivmass_32; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hdivmass_32_loc : f_apply_hdivmass_32_loc); + break; + default: + MFEM_ABORT("Invalid value of (dim, space_dim) = (" + << dim << ", " << space_dim << ") for CurlCurlMassIntegrator!"); + } + info.trial_ops = EvalMode::Curl | EvalMode::Interp; + info.test_ops = EvalMode::Curl | EvalMode::Interp; + if (dim < 3) + { + info.trial_ops |= EvalMode::Weight; + } + + // Set up the coefficient and assemble. Mass goes first. + auto ctx = PopulateCoefficientContext(space_dim, Q_mass, (dim < 3) ? 1 : dim, Q, + transpose_mass, transpose); + AssembleCeedOperator(info, (void *)ctx.data(), ctx.size() * sizeof(CeedIntScalar), ceed, + trial_restr, test_restr, trial_basis, test_basis, geom_data, + geom_data_restr, op); +} + +} // namespace palace diff --git a/palace/fem/integ/diffusion.cpp b/palace/fem/integ/diffusion.cpp index f54f1b8d03..52b6c97ef2 100644 --- a/palace/fem/integ/diffusion.cpp +++ b/palace/fem/integ/diffusion.cpp @@ -1,118 +1,70 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "fem/integrator.hpp" - -#include -#include -#include "fem/libceed/coefficient.hpp" -#include "fem/libceed/integrator.hpp" - -#include "fem/qfunctions/diffusion_qf.h" - -namespace palace -{ - -struct DiffusionIntegratorInfo : public ceed::IntegratorInfo -{ - DiffusionContext ctx; -}; - -namespace -{ - -DiffusionIntegratorInfo -InitializeIntegratorInfo(const mfem::ParFiniteElementSpace &fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, - bool use_bdr, mfem::Coefficient *Q, mfem::VectorCoefficient *VQ, - mfem::MatrixCoefficient *MQ, - std::vector &coeff) -{ - MFEM_VERIFY(fespace.GetVDim() == 1, - "libCEED interface for DiffusionIntegrator does not support vdim > 1!"); - - DiffusionIntegratorInfo info = {{0}}; - - mfem::ParMesh &mesh = *fespace.GetParMesh(); - info.ctx.dim = mesh.Dimension() - use_bdr; - info.ctx.space_dim = mesh.SpaceDimension(); - - info.trial_op = ceed::EvalMode::Grad; - info.test_op = ceed::EvalMode::Grad; - info.qdata_size = (info.ctx.dim * (info.ctx.dim + 1)) / 2; - - mfem::ConstantCoefficient *const_coeff = dynamic_cast(Q); - if (const_coeff || !(Q || VQ || MQ)) - { - info.ctx.coeff = const_coeff ? 
const_coeff->constant : 1.0; - - info.build_qf = f_build_diff_const_scalar; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_diff_const_scalar_loc); - } - else if (Q) - { - ceed::InitCoefficient(*Q, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - info.build_qf = f_build_diff_quad_scalar; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_diff_quad_scalar_loc); - } - else if (VQ) - { - MFEM_VERIFY(VQ->GetVDim() == info.ctx.space_dim, - "Invalid vector coefficient dimension for DiffusionIntegrator!"); - ceed::InitCoefficient(*VQ, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - info.build_qf = f_build_diff_quad_vector; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_diff_quad_vector_loc); - } - else if (MQ) - { - MFEM_VERIFY(MQ->GetVDim() == info.ctx.space_dim, - "Invalid matrix coefficient dimension for DiffusionIntegrator!"); - ceed::InitCoefficient(*MQ, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - info.build_qf = f_build_diff_quad_matrix; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_diff_quad_matrix_loc); - } - - info.apply_qf = f_apply_diff; - info.apply_qf_path = PalaceQFunctionRelativePath(f_apply_diff_loc); - - return info; -} - -} // namespace - -void DiffusionIntegrator::Assemble(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, - const std::vector &indices, Ceed ceed, - CeedOperator *op, CeedOperator *op_t) -{ - MFEM_VERIFY(&trial_fespace == &test_fespace, - "DiffusionIntegrator requires the same test and trial spaces!"); - constexpr bool use_bdr = false; - std::vector coeff; - const auto info = - InitializeIntegratorInfo(trial_fespace, ir, indices, use_bdr, Q, VQ, MQ, coeff); - ceed::AssembleCeedOperator(info, trial_fespace, test_fespace, ir, indices, use_bdr, coeff, - ceed, op, op_t); -} - -void DiffusionIntegrator::AssembleBoundary(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, - const std::vector &indices, Ceed ceed, - CeedOperator *op, CeedOperator *op_t) -{ - MFEM_VERIFY(&trial_fespace == &test_fespace, - "DiffusionIntegrator requires the same test and trial spaces!"); - constexpr bool use_bdr = true; - std::vector coeff; - const auto info = - InitializeIntegratorInfo(trial_fespace, ir, indices, use_bdr, Q, VQ, MQ, coeff); - ceed::AssembleCeedOperator(info, trial_fespace, test_fespace, ir, indices, use_bdr, coeff, - ceed, op, op_t); -} - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "fem/integrator.hpp" + +#include "fem/libceed/libceed_coefficient.hpp" +#include "fem/libceed/libceed_integrator.hpp" + +#include "fem/qfunctions/hcurl_qf.h" + +namespace palace +{ + +using namespace ceed; + +void DiffusionIntegrator::Assemble(Ceed ceed, CeedElemRestriction trial_restr, + CeedElemRestriction test_restr, CeedBasis trial_basis, + CeedBasis test_basis, CeedVector geom_data, + CeedElemRestriction geom_data_restr, + CeedOperator *op) const +{ + CeedQFunctionInfo info; + info.assemble_q_data = assemble_q_data; + + // Set up QFunctions. 
+ CeedInt dim, space_dim, trial_num_comp, test_num_comp; + PalaceCeedCall(ceed, CeedBasisGetDimension(trial_basis, &dim)); + PalaceCeedCall(ceed, CeedGeometryDataGetSpaceDimension(geom_data_restr, dim, &space_dim)); + PalaceCeedCall(ceed, CeedBasisGetNumComponents(trial_basis, &trial_num_comp)); + PalaceCeedCall(ceed, CeedBasisGetNumComponents(test_basis, &test_num_comp)); + MFEM_VERIFY( + trial_num_comp == test_num_comp && trial_num_comp == 1, + "DiffusionIntegrator requires test and trial spaces with a single component!"); + switch (10 * space_dim + dim) + { + case 22: + info.apply_qf = assemble_q_data ? f_build_hcurl_22 : f_apply_hcurl_22; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hcurl_22_loc : f_apply_hcurl_22_loc); + break; + case 33: + info.apply_qf = assemble_q_data ? f_build_hcurl_33 : f_apply_hcurl_33; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hcurl_33_loc : f_apply_hcurl_33_loc); + break; + case 21: + info.apply_qf = assemble_q_data ? f_build_hcurl_21 : f_apply_hcurl_21; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hcurl_21_loc : f_apply_hcurl_21_loc); + break; + case 32: + info.apply_qf = assemble_q_data ? f_build_hcurl_32 : f_apply_hcurl_32; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hcurl_32_loc : f_apply_hcurl_32_loc); + break; + default: + MFEM_ABORT("Invalid value of (dim, space_dim) = (" << dim << ", " << space_dim + << ") for DiffusionIntegrator!"); + } + info.trial_ops = EvalMode::Grad; + info.test_ops = EvalMode::Grad; + + // Set up the coefficient and assemble. + auto ctx = PopulateCoefficientContext(space_dim, Q, transpose); + AssembleCeedOperator(info, (void *)ctx.data(), ctx.size() * sizeof(CeedIntScalar), ceed, + trial_restr, test_restr, trial_basis, test_basis, geom_data, + geom_data_restr, op); +} + +} // namespace palace diff --git a/palace/fem/integ/diffusionmass.cpp b/palace/fem/integ/diffusionmass.cpp index a9b633c75b..7e49bb2f5a 100644 --- a/palace/fem/integ/diffusionmass.cpp +++ b/palace/fem/integ/diffusionmass.cpp @@ -1,112 +1,71 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 - -#include "fem/integrator.hpp" - -#include -#include -#include "fem/libceed/coefficient.hpp" -#include "fem/libceed/integrator.hpp" - -#include "fem/qfunctions/diffusionmass_qf.h" - -namespace palace -{ - -struct DiffusionMassIntegratorInfo : public ceed::IntegratorInfo -{ - DiffusionMassContext ctx; -}; - -namespace -{ - -DiffusionMassIntegratorInfo -InitializeIntegratorInfo(const mfem::ParFiniteElementSpace &fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, - bool use_bdr, mfem::Coefficient *Qd, mfem::VectorCoefficient *VQd, - mfem::MatrixCoefficient *MQd, mfem::Coefficient *Qm, - std::vector &coeff) -{ - MFEM_VERIFY(fespace.GetVDim() == 1, - "libCEED interface for DiffusionMassIntegrator does not support vdim > 1!"); - - DiffusionMassIntegratorInfo info = {{0}}; - - mfem::ParMesh &mesh = *fespace.GetParMesh(); - info.ctx.dim = mesh.Dimension() - use_bdr; - info.ctx.space_dim = mesh.SpaceDimension(); - - info.trial_op = ceed::EvalMode::InterpAndGrad; - info.test_op = ceed::EvalMode::InterpAndGrad; - info.qdata_size = (info.ctx.dim * (info.ctx.dim + 1)) / 2 + 1; - - MFEM_VERIFY((Qd || VQd || MQd) && Qm, "libCEED DiffusionMassIntegrator requires both a " - "diffusion and a mass integrator coefficient!"); - if (Qd) - { - ceed::InitCoefficient(*Qd, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - info.build_qf = f_build_diff_mass_quad_scalar; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_diff_mass_quad_scalar_loc); - } - else if (VQd) - { - MFEM_VERIFY(VQd->GetVDim() == info.ctx.space_dim, - "Invalid vector coefficient dimension for DiffusionMassIntegrator!"); - ceed::InitCoefficient(*VQd, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - info.build_qf = f_build_diff_mass_quad_vector; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_diff_mass_quad_vector_loc); - } - else if (MQd) - { - MFEM_VERIFY(MQd->GetVDim() == info.ctx.space_dim, - "Invalid matrix coefficient dimension for DiffusionMassIntegrator!"); - ceed::InitCoefficient(*MQd, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - info.build_qf = f_build_diff_mass_quad_matrix; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_diff_mass_quad_matrix_loc); - } - ceed::InitCoefficient(*Qm, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - info.apply_qf = f_apply_diff_mass; - info.apply_qf_path = PalaceQFunctionRelativePath(f_apply_diff_mass_loc); - - return info; -} - -} // namespace - -void DiffusionMassIntegrator::Assemble(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, - const std::vector &indices, Ceed ceed, - CeedOperator *op, CeedOperator *op_t) -{ - MFEM_VERIFY(&trial_fespace == &test_fespace, - "DiffusionMassIntegrator requires the same test and trial spaces!"); - constexpr bool use_bdr = false; - std::vector coeff; - const auto info = InitializeIntegratorInfo(trial_fespace, ir, indices, use_bdr, Qd, VQd, - MQd, Qm, coeff); - ceed::AssembleCeedOperator(info, trial_fespace, test_fespace, ir, indices, use_bdr, coeff, - ceed, op, op_t); -} - -void DiffusionMassIntegrator::AssembleBoundary( - const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, const mfem::IntegrationRule &ir, - const std::vector &indices, Ceed ceed, CeedOperator *op, CeedOperator *op_t) -{ - MFEM_VERIFY(&trial_fespace == &test_fespace, - "DiffusionMassIntegrator requires the same test and trial spaces!"); - constexpr bool 
use_bdr = true; - std::vector coeff; - const auto info = InitializeIntegratorInfo(trial_fespace, ir, indices, use_bdr, Qd, VQd, - MQd, Qm, coeff); - ceed::AssembleCeedOperator(info, trial_fespace, test_fespace, ir, indices, use_bdr, coeff, - ceed, op, op_t); -} - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "fem/integrator.hpp" + +#include "fem/libceed/libceed_coefficient.hpp" +#include "fem/libceed/libceed_integrator.hpp" + +#include "fem/qfunctions/hcurlmass_qf.h" + +namespace palace +{ + +using namespace ceed; + +void DiffusionMassIntegrator::Assemble(Ceed ceed, CeedElemRestriction trial_restr, + CeedElemRestriction test_restr, + CeedBasis trial_basis, CeedBasis test_basis, + CeedVector geom_data, + CeedElemRestriction geom_data_restr, + CeedOperator *op) const +{ + CeedQFunctionInfo info; + info.assemble_q_data = assemble_q_data; + + // Set up QFunctions. + CeedInt dim, space_dim, trial_num_comp, test_num_comp; + PalaceCeedCall(ceed, CeedBasisGetDimension(trial_basis, &dim)); + PalaceCeedCall(ceed, CeedGeometryDataGetSpaceDimension(geom_data_restr, dim, &space_dim)); + PalaceCeedCall(ceed, CeedBasisGetNumComponents(trial_basis, &trial_num_comp)); + PalaceCeedCall(ceed, CeedBasisGetNumComponents(test_basis, &test_num_comp)); + MFEM_VERIFY( + trial_num_comp == test_num_comp && trial_num_comp == 1, + "DiffusionMassIntegrator requires test and trial spaces with a single component!"); + switch (10 * space_dim + dim) + { + case 22: + info.apply_qf = assemble_q_data ? f_build_hcurlmass_22 : f_apply_hcurlmass_22; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hcurlmass_22_loc : f_apply_hcurlmass_22_loc); + break; + case 33: + info.apply_qf = assemble_q_data ? f_build_hcurlmass_33 : f_apply_hcurlmass_33; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hcurlmass_33_loc : f_apply_hcurlmass_33_loc); + break; + case 21: + info.apply_qf = assemble_q_data ? f_build_hcurlmass_21 : f_apply_hcurlmass_21; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hcurlmass_21_loc : f_apply_hcurlmass_21_loc); + break; + case 32: + info.apply_qf = assemble_q_data ? f_build_hcurlmass_32 : f_apply_hcurlmass_32; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hcurlmass_32_loc : f_apply_hcurlmass_32_loc); + break; + default: + MFEM_ABORT("Invalid value of (dim, space_dim) = (" + << dim << ", " << space_dim << ") for DiffusionMassIntegrator!"); + } + info.trial_ops = EvalMode::Grad | EvalMode::Interp; + info.test_ops = EvalMode::Grad | EvalMode::Interp; + + // Set up the coefficient and assemble. Mass goes first. + auto ctx = PopulateCoefficientContext(1, Q_mass, space_dim, Q, transpose_mass, transpose); + AssembleCeedOperator(info, (void *)ctx.data(), ctx.size() * sizeof(CeedIntScalar), ceed, + trial_restr, test_restr, trial_basis, test_basis, geom_data, + geom_data_restr, op); +} + +} // namespace palace diff --git a/palace/fem/integ/divdiv.cpp b/palace/fem/integ/divdiv.cpp index e0d7ba241b..f8ae793431 100644 --- a/palace/fem/integ/divdiv.cpp +++ b/palace/fem/integ/divdiv.cpp @@ -1,97 +1,62 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
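As a side note, a small self-contained sketch of the (space_dim, dim) dispatch key used by the integrator Assemble methods above (the values chosen are illustrative only):

#include <cstdio>

int main()
{
  // A boundary element of a 3D mesh has dim = 2 and space_dim = 3, so the switch key
  // 10 * space_dim + dim evaluates to 32 and selects the corresponding QFunction.
  constexpr int space_dim = 3, dim = 2;
  constexpr int key = 10 * space_dim + dim;
  std::printf("dispatch key = %d\n", key);  // Prints: dispatch key = 32
  return 0;
}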
-// SPDX-License-Identifier: Apache-2.0 - -#include "fem/integrator.hpp" - -#include -#include -#include "fem/libceed/coefficient.hpp" -#include "fem/libceed/integrator.hpp" - -#include "fem/qfunctions/divdiv_qf.h" - -namespace palace -{ - -struct DivDivIntegratorInfo : public ceed::IntegratorInfo -{ - DivDivContext ctx; -}; - -namespace -{ - -DivDivIntegratorInfo -InitializeIntegratorInfo(const mfem::ParFiniteElementSpace &fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, - bool use_bdr, mfem::Coefficient *Q, - std::vector &coeff) -{ - MFEM_VERIFY(fespace.GetVDim() == 1, - "libCEED interface for DivDivIntegrator does not support vdim > 1!"); - - DivDivIntegratorInfo info = {{0}}; - - mfem::ParMesh &mesh = *fespace.GetParMesh(); - info.ctx.dim = mesh.Dimension() - use_bdr; - info.ctx.space_dim = mesh.SpaceDimension(); - - info.trial_op = ceed::EvalMode::Div; - info.test_op = ceed::EvalMode::Div; - info.qdata_size = 1; - - mfem::ConstantCoefficient *const_coeff = dynamic_cast(Q); - if (const_coeff || !Q) - { - info.ctx.coeff = const_coeff ? const_coeff->constant : 1.0; - - info.build_qf = f_build_divdiv_const; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_divdiv_const_loc); - } - else if (Q) - { - ceed::InitCoefficient(*Q, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - info.build_qf = f_build_divdiv_quad; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_divdiv_quad_loc); - } - - info.apply_qf = f_apply_divdiv; - info.apply_qf_path = PalaceQFunctionRelativePath(f_apply_divdiv_loc); - - return info; -} - -} // namespace - -void DivDivIntegrator::Assemble(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, - const std::vector &indices, Ceed ceed, - CeedOperator *op, CeedOperator *op_t) -{ - MFEM_VERIFY(&trial_fespace == &test_fespace, - "DivDivIntegrator requires the same test and trial spaces!"); - constexpr bool use_bdr = false; - std::vector coeff; - const auto info = InitializeIntegratorInfo(trial_fespace, ir, indices, use_bdr, Q, coeff); - ceed::AssembleCeedOperator(info, trial_fespace, test_fespace, ir, indices, use_bdr, coeff, - ceed, op, op_t); -} - -void DivDivIntegrator::AssembleBoundary(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, - const std::vector &indices, Ceed ceed, - CeedOperator *op, CeedOperator *op_t) -{ - MFEM_VERIFY(&trial_fespace == &test_fespace, - "DivDivIntegrator requires the same test and trial spaces!"); - constexpr bool use_bdr = true; - std::vector coeff; - const auto info = InitializeIntegratorInfo(trial_fespace, ir, indices, use_bdr, Q, coeff); - ceed::AssembleCeedOperator(info, trial_fespace, test_fespace, ir, indices, use_bdr, coeff, - ceed, op, op_t); -} - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "fem/integrator.hpp" + +#include "fem/libceed/libceed_coefficient.hpp" +#include "fem/libceed/libceed_integrator.hpp" + +#include "fem/qfunctions/l2_qf.h" + +namespace palace +{ + +using namespace ceed; + +void DivDivIntegrator::Assemble(Ceed ceed, CeedElemRestriction trial_restr, + CeedElemRestriction test_restr, CeedBasis trial_basis, + CeedBasis test_basis, CeedVector geom_data, + CeedElemRestriction geom_data_restr, CeedOperator *op) const +{ + CeedQFunctionInfo info; + info.assemble_q_data = assemble_q_data; + + // Set up QFunctions. 
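+  // The L2 QFunction below is keyed on the shared trial/test component count; when
+  // assemble_q_data is set, the f_build_* variant assembles quadrature data rather
+  // than applying the operator on the fly.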
+ CeedInt trial_num_comp, test_num_comp; + PalaceCeedCall(ceed, CeedBasisGetNumComponents(trial_basis, &trial_num_comp)); + PalaceCeedCall(ceed, CeedBasisGetNumComponents(test_basis, &test_num_comp)); + MFEM_VERIFY( + trial_num_comp == test_num_comp, + "DivDivIntegrator requires test and trial spaces with same number of components!"); + switch (trial_num_comp) + { + case 1: + info.apply_qf = assemble_q_data ? f_build_l2_1 : f_apply_l2_1; + info.apply_qf_path = PalaceQFunctionRelativePath(assemble_q_data ? f_build_l2_1_loc + : f_apply_l2_1_loc); + break; + case 2: + info.apply_qf = assemble_q_data ? f_build_l2_2 : f_apply_l2_2; + info.apply_qf_path = PalaceQFunctionRelativePath(assemble_q_data ? f_build_l2_2_loc + : f_apply_l2_2_loc); + break; + case 3: + info.apply_qf = assemble_q_data ? f_build_l2_3 : f_apply_l2_3; + info.apply_qf_path = PalaceQFunctionRelativePath(assemble_q_data ? f_build_l2_3_loc + : f_apply_l2_3_loc); + break; + default: + MFEM_ABORT("Invalid value of num_comp = " << trial_num_comp + << " for DivDivIntegrator!"); + } + info.trial_ops = EvalMode::Div | EvalMode::Weight; + info.test_ops = EvalMode::Div; + + // Set up the coefficient and assemble. + auto ctx = PopulateCoefficientContext(trial_num_comp, Q, transpose); + AssembleCeedOperator(info, (void *)ctx.data(), ctx.size() * sizeof(CeedIntScalar), ceed, + trial_restr, test_restr, trial_basis, test_basis, geom_data, + geom_data_restr, op); +} + +} // namespace palace diff --git a/palace/fem/integ/divdivmass.cpp b/palace/fem/integ/divdivmass.cpp index 01e9dfafff..6e743788f6 100644 --- a/palace/fem/integ/divdivmass.cpp +++ b/palace/fem/integ/divdivmass.cpp @@ -1,112 +1,70 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "fem/integrator.hpp" - -#include -#include -#include "fem/libceed/coefficient.hpp" -#include "fem/libceed/integrator.hpp" - -#include "fem/qfunctions/divdivmass_qf.h" - -namespace palace -{ - -struct DivDivMassIntegratorInfo : public ceed::IntegratorInfo -{ - DivDivMassContext ctx; -}; - -namespace -{ - -DivDivMassIntegratorInfo -InitializeIntegratorInfo(const mfem::ParFiniteElementSpace &fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, - bool use_bdr, mfem::Coefficient *Qd, mfem::Coefficient *Qm, - mfem::VectorCoefficient *VQm, mfem::MatrixCoefficient *MQm, - std::vector &coeff) -{ - MFEM_VERIFY(fespace.GetVDim() == 1, - "libCEED interface for DivDivMassIntegrator does not support vdim > 1!"); - - DivDivMassIntegratorInfo info = {{0}}; - - mfem::ParMesh &mesh = *fespace.GetParMesh(); - info.ctx.dim = mesh.Dimension() - use_bdr; - info.ctx.space_dim = mesh.SpaceDimension(); - - info.trial_op = ceed::EvalMode::InterpAndDiv; - info.test_op = ceed::EvalMode::InterpAndDiv; - info.qdata_size = 1 + (info.ctx.dim * (info.ctx.dim + 1)) / 2; - - MFEM_VERIFY(Qd && (Qm || VQm || MQm), "libCEED DivDivMassIntegrator requires both a " - "div-div and a mass integrator coefficient!"); - ceed::InitCoefficient(*Qd, mesh, ir, indices, use_bdr, coeff.emplace_back()); - if (Qm) - { - ceed::InitCoefficient(*Qm, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - info.build_qf = f_build_divdiv_mass_quad_scalar; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_divdiv_mass_quad_scalar_loc); - } - else if (VQm) - { - MFEM_VERIFY(VQm->GetVDim() == info.ctx.space_dim, - "Invalid vector coefficient dimension for DivDivMassIntegrator!"); - ceed::InitCoefficient(*VQm, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - 
info.build_qf = f_build_divdiv_mass_quad_vector; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_divdiv_mass_quad_vector_loc); - } - else if (MQm) - { - MFEM_VERIFY(MQm->GetVDim() == info.ctx.space_dim, - "Invalid matrix coefficient dimension for DivDivMassIntegrator!"); - ceed::InitCoefficient(*MQm, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - info.build_qf = f_build_divdiv_mass_quad_matrix; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_divdiv_mass_quad_matrix_loc); - } - - info.apply_qf = f_apply_divdiv_mass; - info.apply_qf_path = PalaceQFunctionRelativePath(f_apply_divdiv_mass_loc); - - return info; -} - -} // namespace - -void DivDivMassIntegrator::Assemble(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, - const std::vector &indices, Ceed ceed, - CeedOperator *op, CeedOperator *op_t) -{ - MFEM_VERIFY(&trial_fespace == &test_fespace, - "DivDivMassIntegrator requires the same test and trial spaces!"); - constexpr bool use_bdr = false; - std::vector coeff; - const auto info = InitializeIntegratorInfo(trial_fespace, ir, indices, use_bdr, Qd, Qm, - VQm, MQm, coeff); - ceed::AssembleCeedOperator(info, trial_fespace, test_fespace, ir, indices, use_bdr, coeff, - ceed, op, op_t); -} - -void DivDivMassIntegrator::AssembleBoundary( - const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, const mfem::IntegrationRule &ir, - const std::vector &indices, Ceed ceed, CeedOperator *op, CeedOperator *op_t) -{ - MFEM_VERIFY(&trial_fespace == &test_fespace, - "DivDivMassIntegrator requires the same test and trial spaces!"); - constexpr bool use_bdr = true; - std::vector coeff; - const auto info = InitializeIntegratorInfo(trial_fespace, ir, indices, use_bdr, Qd, Qm, - VQm, MQm, coeff); - ceed::AssembleCeedOperator(info, trial_fespace, test_fespace, ir, indices, use_bdr, coeff, - ceed, op, op_t); -} - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "fem/integrator.hpp" + +#include "fem/libceed/libceed_coefficient.hpp" +#include "fem/libceed/libceed_integrator.hpp" + +#include "fem/qfunctions/l2mass_qf.h" + +namespace palace +{ + +using namespace ceed; + +void DivDivMassIntegrator::Assemble(Ceed ceed, CeedElemRestriction trial_restr, + CeedElemRestriction test_restr, CeedBasis trial_basis, + CeedBasis test_basis, CeedVector geom_data, + CeedElemRestriction geom_data_restr, + CeedOperator *op) const +{ + CeedQFunctionInfo info; + info.assemble_q_data = assemble_q_data; + + // Set up QFunctions. + CeedInt dim, space_dim, trial_num_comp, test_num_comp; + PalaceCeedCall(ceed, CeedBasisGetDimension(trial_basis, &dim)); + PalaceCeedCall(ceed, CeedGeometryDataGetSpaceDimension(geom_data_restr, dim, &space_dim)); + PalaceCeedCall(ceed, CeedBasisGetNumComponents(trial_basis, &trial_num_comp)); + PalaceCeedCall(ceed, CeedBasisGetNumComponents(test_basis, &test_num_comp)); + MFEM_VERIFY( + trial_num_comp == test_num_comp && trial_num_comp == 1, + "DivDivMassIntegrator requires test and trial spaces with a single component!"); + switch (10 * space_dim + dim) + { + case 22: + info.apply_qf = assemble_q_data ? f_build_l2mass_22 : f_apply_l2mass_22; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_l2mass_22_loc : f_apply_l2mass_22_loc); + break; + case 33: + info.apply_qf = assemble_q_data ? 
f_build_l2mass_33 : f_apply_l2mass_33; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_l2mass_33_loc : f_apply_l2mass_33_loc); + break; + case 21: + info.apply_qf = assemble_q_data ? f_build_l2mass_21 : f_apply_l2mass_21; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_l2mass_21_loc : f_apply_l2mass_21_loc); + break; + case 32: + info.apply_qf = assemble_q_data ? f_build_l2mass_32 : f_apply_l2mass_32; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_l2mass_32_loc : f_apply_l2mass_32_loc); + break; + default: + MFEM_ABORT("Invalid value of (dim, space_dim) = (" << dim << ", " << space_dim + << ") for DivDivMassIntegrator!"); + } + info.trial_ops = EvalMode::Div | EvalMode::Interp | EvalMode::Weight; + info.test_ops = EvalMode::Div | EvalMode::Interp; + + // Set up the coefficient and assemble. Mass goes first. + auto ctx = PopulateCoefficientContext(space_dim, Q_mass, 1, Q, transpose_mass, transpose); + AssembleCeedOperator(info, (void *)ctx.data(), ctx.size() * sizeof(CeedIntScalar), ceed, + trial_restr, test_restr, trial_basis, test_basis, geom_data, + geom_data_restr, op); +} + +} // namespace palace diff --git a/palace/fem/integ/grad.cpp b/palace/fem/integ/grad.cpp index aaf0ddb8d9..3b8ca5113c 100644 --- a/palace/fem/integ/grad.cpp +++ b/palace/fem/integ/grad.cpp @@ -1,115 +1,70 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "fem/integrator.hpp" - -#include -#include -#include "fem/libceed/coefficient.hpp" -#include "fem/libceed/integrator.hpp" - -#include "fem/qfunctions/grad_qf.h" - -namespace palace -{ - -struct GradientIntegratorInfo : public ceed::IntegratorInfo -{ - GradContext ctx; -}; - -namespace -{ - -GradientIntegratorInfo -InitializeIntegratorInfo(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, - bool use_bdr, mfem::Coefficient *Q, mfem::VectorCoefficient *VQ, - mfem::MatrixCoefficient *MQ, - std::vector &coeff) -{ - GradientIntegratorInfo info = {{0}}; - - mfem::ParMesh &mesh = *trial_fespace.GetParMesh(); - info.ctx.dim = mesh.Dimension() - use_bdr; - info.ctx.space_dim = mesh.SpaceDimension(); - MFEM_VERIFY(trial_fespace.GetVDim() == 1 && test_fespace.GetVDim() == info.ctx.space_dim, - "libCEED interface for GradientIntegrator requires trial space vdim == 1 and " - "test space vdim == space dimension!"); - - info.trial_op = ceed::EvalMode::Grad; - info.test_op = ceed::EvalMode::Interp; - info.qdata_size = info.ctx.space_dim * info.ctx.dim; - - mfem::ConstantCoefficient *const_coeff = dynamic_cast(Q); - if (const_coeff || !(Q || VQ || MQ)) - { - info.ctx.coeff = const_coeff ? 
const_coeff->constant : 1.0; - - info.build_qf = f_build_grad_const_scalar; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_grad_const_scalar_loc); - } - else if (Q) - { - ceed::InitCoefficient(*Q, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - info.build_qf = f_build_grad_quad_scalar; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_grad_quad_scalar_loc); - } - else if (VQ) - { - MFEM_VERIFY(VQ->GetVDim() == info.ctx.space_dim, - "Invalid vector coefficient dimension for GradientIntegrator integrator!"); - ceed::InitCoefficient(*VQ, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - info.build_qf = f_build_grad_quad_vector; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_grad_quad_vector_loc); - } - else if (MQ) - { - MFEM_VERIFY(MQ->GetVDim() == info.ctx.space_dim, - "Invalid matrix coefficient dimension for GradientIntegrator integrator!"); - ceed::InitCoefficient(*MQ, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - info.build_qf = f_build_grad_quad_matrix; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_grad_quad_matrix_loc); - } - - info.apply_qf = f_apply_grad; - info.apply_qf_path = PalaceQFunctionRelativePath(f_apply_grad_loc); - - return info; -} - -} // namespace - -void GradientIntegrator::Assemble(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, - const std::vector &indices, Ceed ceed, - CeedOperator *op, CeedOperator *op_t) -{ - constexpr bool use_bdr = false; - std::vector coeff; - const auto info = InitializeIntegratorInfo(trial_fespace, test_fespace, ir, indices, - use_bdr, Q, VQ, MQ, coeff); - ceed::AssembleCeedOperator(info, trial_fespace, test_fespace, ir, indices, use_bdr, coeff, - ceed, op, op_t); -} - -void GradientIntegrator::AssembleBoundary(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, - const std::vector &indices, Ceed ceed, - CeedOperator *op, CeedOperator *op_t) -{ - constexpr bool use_bdr = true; - std::vector coeff; - const auto info = InitializeIntegratorInfo(trial_fespace, test_fespace, ir, indices, - use_bdr, Q, VQ, MQ, coeff); - ceed::AssembleCeedOperator(info, trial_fespace, test_fespace, ir, indices, use_bdr, coeff, - ceed, op, op_t); -} - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "fem/integrator.hpp" + +#include "fem/libceed/libceed_coefficient.hpp" +#include "fem/libceed/libceed_integrator.hpp" + +#include "fem/qfunctions/hcurlh1d_qf.h" + +namespace palace +{ + +using namespace ceed; + +void GradientIntegrator::Assemble(Ceed ceed, CeedElemRestriction trial_restr, + CeedElemRestriction test_restr, CeedBasis trial_basis, + CeedBasis test_basis, CeedVector geom_data, + CeedElemRestriction geom_data_restr, + CeedOperator *op) const +{ + CeedQFunctionInfo info; + info.assemble_q_data = assemble_q_data; + + // Set up QFunctions. 
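+  // The QFunction is selected by the encoded pair 10 * space_dim + dim (e.g. 32 is a
+  // 2D element embedded in 3D space), picking the matching hcurlh1d build/apply kernel.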
+ CeedInt dim, space_dim, trial_num_comp, test_num_comp; + PalaceCeedCall(ceed, CeedBasisGetDimension(trial_basis, &dim)); + PalaceCeedCall(ceed, CeedGeometryDataGetSpaceDimension(geom_data_restr, dim, &space_dim)); + PalaceCeedCall(ceed, CeedBasisGetNumComponents(trial_basis, &trial_num_comp)); + PalaceCeedCall(ceed, CeedBasisGetNumComponents(test_basis, &test_num_comp)); + MFEM_VERIFY(trial_num_comp == 1 && test_num_comp == space_dim, + "GradientIntegrator requires trial space with a single component and test " + "space with space_dim components!"); + switch (10 * space_dim + dim) + { + case 22: + info.apply_qf = assemble_q_data ? f_build_hcurlh1d_22 : f_apply_hcurlh1d_22; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hcurlh1d_22_loc : f_apply_hcurlh1d_22_loc); + break; + case 33: + info.apply_qf = assemble_q_data ? f_build_hcurlh1d_33 : f_apply_hcurlh1d_33; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hcurlh1d_33_loc : f_apply_hcurlh1d_33_loc); + break; + case 21: + info.apply_qf = assemble_q_data ? f_build_hcurlh1d_21 : f_apply_hcurlh1d_21; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hcurlh1d_21_loc : f_apply_hcurlh1d_21_loc); + break; + case 32: + info.apply_qf = assemble_q_data ? f_build_hcurlh1d_32 : f_apply_hcurlh1d_32; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hcurlh1d_32_loc : f_apply_hcurlh1d_32_loc); + break; + default: + MFEM_ABORT("Invalid value of (dim, space_dim) = (" << dim << ", " << space_dim + << ") for GradientIntegrator!"); + } + info.trial_ops = EvalMode::Grad; + info.test_ops = EvalMode::Interp; + + // Set up the coefficient and assemble. + auto ctx = PopulateCoefficientContext(space_dim, Q, transpose); + AssembleCeedOperator(info, (void *)ctx.data(), ctx.size() * sizeof(CeedIntScalar), ceed, + trial_restr, test_restr, trial_basis, test_basis, geom_data, + geom_data_restr, op); +} + +} // namespace palace diff --git a/palace/fem/integ/mass.cpp b/palace/fem/integ/mass.cpp index adecb6b5bd..8ea0a86556 100644 --- a/palace/fem/integ/mass.cpp +++ b/palace/fem/integ/mass.cpp @@ -1,129 +1,62 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "fem/integrator.hpp" - -#include -#include -#include "fem/libceed/coefficient.hpp" -#include "fem/libceed/integrator.hpp" - -#include "fem/qfunctions/mass_qf.h" - -namespace palace -{ - -struct MassIntegratorInfo : public ceed::IntegratorInfo -{ - MassContext ctx; -}; - -namespace -{ - -MassIntegratorInfo InitializeIntegratorInfo(const mfem::ParFiniteElementSpace &fespace, - const mfem::IntegrationRule &ir, - const std::vector &indices, bool use_bdr, - mfem::Coefficient *Q, - mfem::VectorCoefficient *VQ, - mfem::MatrixCoefficient *MQ, - std::vector &coeff) -{ - MassIntegratorInfo info = {{0}}; - - mfem::ParMesh &mesh = *fespace.GetParMesh(); - info.ctx.dim = mesh.Dimension() - use_bdr; - info.ctx.space_dim = mesh.SpaceDimension(); - info.ctx.vdim = fespace.GetVDim(); - - info.trial_op = ceed::EvalMode::Interp; - info.test_op = ceed::EvalMode::Interp; - - mfem::ConstantCoefficient *const_coeff = dynamic_cast(Q); - if (const_coeff || !(Q || VQ || MQ)) - { - info.qdata_size = 1; - info.ctx.coeff = const_coeff ? 
const_coeff->constant : 1.0; - - info.build_qf = f_build_mass_const_scalar; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_mass_const_scalar_loc); - - info.apply_qf = f_apply_mass_scalar; - info.apply_qf_path = PalaceQFunctionRelativePath(f_apply_mass_scalar_loc); - } - else if (Q) - { - info.qdata_size = 1; - ceed::InitCoefficient(*Q, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - info.build_qf = f_build_mass_quad_scalar; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_mass_quad_scalar_loc); - - info.apply_qf = f_apply_mass_scalar; - info.apply_qf_path = PalaceQFunctionRelativePath(f_apply_mass_scalar_loc); - } - else if (VQ) - { - MFEM_VERIFY(VQ->GetVDim() == info.ctx.vdim, - "Invalid vector coefficient dimension for vector MassIntegrator!"); - info.qdata_size = info.ctx.vdim; - ceed::InitCoefficient(*VQ, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - info.build_qf = f_build_mass_quad_vector; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_mass_quad_vector_loc); - - info.apply_qf = f_apply_mass_vector; - info.apply_qf_path = PalaceQFunctionRelativePath(f_apply_mass_vector_loc); - } - else if (MQ) - { - MFEM_VERIFY(MQ->GetVDim() == info.ctx.vdim, - "Invalid matrix coefficient dimension for vector MassIntegrator!"); - info.qdata_size = (info.ctx.vdim * (info.ctx.vdim + 1)) / 2; - ceed::InitCoefficient(*MQ, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - info.build_qf = f_build_mass_quad_matrix; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_mass_quad_matrix_loc); - - info.apply_qf = f_apply_mass_matrix; - info.apply_qf_path = PalaceQFunctionRelativePath(f_apply_mass_matrix_loc); - } - - return info; -} - -} // namespace - -void MassIntegrator::Assemble(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, - const std::vector &indices, Ceed ceed, CeedOperator *op, - CeedOperator *op_t) -{ - MFEM_VERIFY(&trial_fespace == &test_fespace, - "MassIntegrator requires the same test and trial spaces!"); - constexpr bool use_bdr = false; - std::vector coeff; - const auto info = - InitializeIntegratorInfo(trial_fespace, ir, indices, use_bdr, Q, VQ, MQ, coeff); - ceed::AssembleCeedOperator(info, trial_fespace, test_fespace, ir, indices, use_bdr, coeff, - ceed, op, op_t); -} - -void MassIntegrator::AssembleBoundary(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, - const std::vector &indices, Ceed ceed, - CeedOperator *op, CeedOperator *op_t) -{ - MFEM_VERIFY(&trial_fespace == &test_fespace, - "MassIntegrator requires the same test and trial spaces!"); - constexpr bool use_bdr = true; - std::vector coeff; - const auto info = - InitializeIntegratorInfo(trial_fespace, ir, indices, use_bdr, Q, VQ, MQ, coeff); - ceed::AssembleCeedOperator(info, trial_fespace, test_fespace, ir, indices, use_bdr, coeff, - ceed, op, op_t); -} - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#include "fem/integrator.hpp" + +#include "fem/libceed/libceed_coefficient.hpp" +#include "fem/libceed/libceed_integrator.hpp" + +#include "fem/qfunctions/h1_qf.h" + +namespace palace +{ + +using namespace ceed; + +void MassIntegrator::Assemble(Ceed ceed, CeedElemRestriction trial_restr, + CeedElemRestriction test_restr, CeedBasis trial_basis, + CeedBasis test_basis, CeedVector geom_data, + CeedElemRestriction geom_data_restr, CeedOperator *op) const +{ + CeedQFunctionInfo info; + info.assemble_q_data = assemble_q_data; + + // Set up QFunctions. + CeedInt trial_num_comp, test_num_comp; + PalaceCeedCall(ceed, CeedBasisGetNumComponents(trial_basis, &trial_num_comp)); + PalaceCeedCall(ceed, CeedBasisGetNumComponents(test_basis, &test_num_comp)); + MFEM_VERIFY( + trial_num_comp == test_num_comp, + "MassIntegrator requires test and trial spaces with same number of components!"); + switch (trial_num_comp) + { + case 1: + info.apply_qf = assemble_q_data ? f_build_h1_1 : f_apply_h1_1; + info.apply_qf_path = PalaceQFunctionRelativePath(assemble_q_data ? f_build_h1_1_loc + : f_apply_h1_1_loc); + break; + case 2: + info.apply_qf = assemble_q_data ? f_build_h1_2 : f_apply_h1_2; + info.apply_qf_path = PalaceQFunctionRelativePath(assemble_q_data ? f_build_h1_2_loc + : f_apply_h1_2_loc); + break; + case 3: + info.apply_qf = assemble_q_data ? f_build_h1_3 : f_apply_h1_3; + info.apply_qf_path = PalaceQFunctionRelativePath(assemble_q_data ? f_build_h1_3_loc + : f_apply_h1_3_loc); + break; + default: + MFEM_ABORT("Invalid value of num_comp = " << trial_num_comp + << " for MassIntegrator!"); + } + info.trial_ops = EvalMode::Interp; + info.test_ops = EvalMode::Interp; + + // Set up the coefficient and assemble. + auto ctx = PopulateCoefficientContext(trial_num_comp, Q, transpose); + AssembleCeedOperator(info, (void *)ctx.data(), ctx.size() * sizeof(CeedIntScalar), ceed, + trial_restr, test_restr, trial_basis, test_basis, geom_data, + geom_data_restr, op); +} + +} // namespace palace diff --git a/palace/fem/integ/mixedveccurl.cpp b/palace/fem/integ/mixedveccurl.cpp index 23068297ed..a1f8724b76 100644 --- a/palace/fem/integ/mixedveccurl.cpp +++ b/palace/fem/integ/mixedveccurl.cpp @@ -1,249 +1,117 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 - -#include "fem/integrator.hpp" - -#include -#include -#include "fem/libceed/coefficient.hpp" -#include "fem/libceed/integrator.hpp" - -#include "fem/qfunctions/hcurlhdiv_qf.h" -#include "fem/qfunctions/hdiv_qf.h" - -namespace palace -{ - -struct MixedVectorCurlIntegratorInfo : public ceed::IntegratorInfo -{ - VectorFEMassContext ctx; -}; - -namespace -{ - -MixedVectorCurlIntegratorInfo -InitializeIntegratorInfo(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, - bool use_bdr, mfem::Coefficient *Q, mfem::VectorCoefficient *VQ, - mfem::MatrixCoefficient *MQ, - std::vector &coeff, - ceed::EvalMode trial_op, ceed::EvalMode test_op) -{ - MFEM_VERIFY( - trial_fespace.GetVDim() == 1 && test_fespace.GetVDim() == 1, - "libCEED interface for MixedVectorCurlIntegrator/MixedVectorWeakCurlIntegrator does " - "not support vdim > 1!"); - - MixedVectorCurlIntegratorInfo info = {{0}}; - - mfem::ParMesh &mesh = *trial_fespace.GetParMesh(); - info.ctx.dim = mesh.Dimension() - use_bdr; - info.ctx.space_dim = mesh.SpaceDimension(); - MFEM_VERIFY( - info.ctx.dim == 3 && info.ctx.space_dim == 3, - "MixedVectorCurlIntegrator/MixedVectorWeakCurlIntegrator is only availble in 3D!"); - - int trial_map_type = trial_fespace.FEColl()->GetMapType(info.ctx.dim); - int test_map_type = test_fespace.FEColl()->GetMapType(info.ctx.dim); - MFEM_VERIFY( - (trial_op == ceed::EvalMode::Curl && trial_map_type == mfem::FiniteElement::H_CURL && - (test_op == ceed::EvalMode::Interp && - (test_map_type == mfem::FiniteElement::H_CURL || - test_map_type == mfem::FiniteElement::H_DIV))) || - (test_op == ceed::EvalMode::Curl && - test_map_type == mfem::FiniteElement::H_CURL && - (trial_op == ceed::EvalMode::Interp && - (trial_map_type == mfem::FiniteElement::H_CURL || - trial_map_type == mfem::FiniteElement::H_DIV))), - "libCEED interface for MixedVectorCurlIntegrator/MixedVectorWeakCurlIntegrator " - "requires H(curl) or mixed H(curl) and H(div) FE spaces!"); - - info.trial_op = trial_op; - info.test_op = test_op; - if (trial_map_type == mfem::FiniteElement::H_CURL && - test_map_type == mfem::FiniteElement::H_CURL) - { - // Quadrature data is nonsymmetric in this case. - info.qdata_size = info.ctx.dim * info.ctx.dim; - info.ctx.sym = false; - } - else - { - info.qdata_size = (info.ctx.dim * (info.ctx.dim + 1)) / 2; - info.ctx.sym = true; - } - - mfem::ConstantCoefficient *const_coeff = dynamic_cast(Q); - if (const_coeff || !(Q || VQ || MQ)) - { - info.ctx.coeff = const_coeff ? 
const_coeff->constant : 1.0; - - if (trial_map_type == mfem::FiniteElement::H_CURL && - test_map_type == mfem::FiniteElement::H_CURL) - { - if (trial_op == ceed::EvalMode::Curl) - { - info.build_qf = f_build_hdivhcurl_const_scalar; - info.build_qf_path = - PalaceQFunctionRelativePath(f_build_hdivhcurl_const_scalar_loc); - } - else // test_op == ceed::EvalMode::Curl - { - info.build_qf = f_build_hcurlhdiv_const_scalar; - info.build_qf_path = - PalaceQFunctionRelativePath(f_build_hcurlhdiv_const_scalar_loc); - } - } - else - { - info.build_qf = f_build_hdiv_const_scalar; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_hdiv_const_scalar_loc); - } - } - else if (Q) - { - ceed::InitCoefficient(*Q, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - if (trial_map_type == mfem::FiniteElement::H_CURL && - test_map_type == mfem::FiniteElement::H_CURL) - { - if (trial_op == ceed::EvalMode::Curl) - { - info.build_qf = f_build_hdivhcurl_quad_scalar; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_hdivhcurl_quad_scalar_loc); - } - else // test_op == ceed::EvalMode::Curl - { - info.build_qf = f_build_hcurlhdiv_quad_scalar; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_hcurlhdiv_quad_scalar_loc); - } - } - else - { - info.build_qf = f_build_hdiv_quad_scalar; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_hdiv_quad_scalar_loc); - } - } - else if (VQ) - { - MFEM_VERIFY(VQ->GetVDim() == info.ctx.space_dim, - "Invalid vector coefficient dimension for " - "MixedVectorCurlIntegrator/MixedVectorWeakCurlIntegrator integrator!"); - ceed::InitCoefficient(*VQ, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - if (trial_map_type == mfem::FiniteElement::H_CURL && - test_map_type == mfem::FiniteElement::H_CURL) - { - if (trial_op == ceed::EvalMode::Curl) - { - info.build_qf = f_build_hdivhcurl_quad_vector; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_hdivhcurl_quad_vector_loc); - } - else // test_op == ceed::EvalMode::Curl - { - info.build_qf = f_build_hcurlhdiv_quad_vector; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_hcurlhdiv_quad_vector_loc); - } - } - else - { - info.build_qf = f_build_hdiv_quad_vector; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_hdiv_quad_vector_loc); - } - } - else if (MQ) - { - MFEM_VERIFY(MQ->GetVDim() == info.ctx.space_dim, - "Invalid matrix coefficient dimension for " - "MixedVectorCurlIntegrator/MixedVectorWeakCurlIntegrator integrator!"); - ceed::InitCoefficient(*MQ, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - if (trial_map_type == mfem::FiniteElement::H_CURL && - test_map_type == mfem::FiniteElement::H_CURL) - { - if (trial_op == ceed::EvalMode::Curl) - { - info.build_qf = f_build_hdivhcurl_quad_matrix; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_hdivhcurl_quad_matrix_loc); - } - else // test_op == ceed::EvalMode::Curl - { - info.build_qf = f_build_hcurlhdiv_quad_matrix; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_hcurlhdiv_quad_matrix_loc); - } - } - else - { - info.build_qf = f_build_hdiv_quad_matrix; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_hdiv_quad_matrix_loc); - } - } - - info.apply_qf = f_apply_vecfemass; - info.apply_qf_path = PalaceQFunctionRelativePath(f_apply_vecfemass_loc); - - return info; -} - -} // namespace - -void MixedVectorCurlIntegrator::Assemble(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, - const std::vector &indices, Ceed 
ceed, - CeedOperator *op, CeedOperator *op_t) -{ - constexpr bool use_bdr = false; - std::vector coeff; - const auto info = - InitializeIntegratorInfo(trial_fespace, test_fespace, ir, indices, use_bdr, Q, VQ, MQ, - coeff, ceed::EvalMode::Curl, ceed::EvalMode::Interp); - ceed::AssembleCeedOperator(info, trial_fespace, test_fespace, ir, indices, use_bdr, coeff, - ceed, op, op_t); -} - -void MixedVectorCurlIntegrator::AssembleBoundary( - const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, const mfem::IntegrationRule &ir, - const std::vector &indices, Ceed ceed, CeedOperator *op, CeedOperator *op_t) -{ - constexpr bool use_bdr = true; - std::vector coeff; - const auto info = - InitializeIntegratorInfo(trial_fespace, test_fespace, ir, indices, use_bdr, Q, VQ, MQ, - coeff, ceed::EvalMode::Curl, ceed::EvalMode::Interp); - ceed::AssembleCeedOperator(info, trial_fespace, test_fespace, ir, indices, use_bdr, coeff, - ceed, op, op_t); -} - -void MixedVectorWeakCurlIntegrator::Assemble( - const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, const mfem::IntegrationRule &ir, - const std::vector &indices, Ceed ceed, CeedOperator *op, CeedOperator *op_t) -{ - constexpr bool use_bdr = false; - std::vector coeff; - const auto info = - InitializeIntegratorInfo(trial_fespace, test_fespace, ir, indices, use_bdr, Q, VQ, MQ, - coeff, ceed::EvalMode::Interp, ceed::EvalMode::Curl); - ceed::AssembleCeedOperator(info, trial_fespace, test_fespace, ir, indices, use_bdr, coeff, - ceed, op, op_t); -} - -void MixedVectorWeakCurlIntegrator::AssembleBoundary( - const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, const mfem::IntegrationRule &ir, - const std::vector &indices, Ceed ceed, CeedOperator *op, CeedOperator *op_t) -{ - constexpr bool use_bdr = true; - std::vector coeff; - const auto info = - InitializeIntegratorInfo(trial_fespace, test_fespace, ir, indices, use_bdr, Q, VQ, MQ, - coeff, ceed::EvalMode::Interp, ceed::EvalMode::Curl); - ceed::AssembleCeedOperator(info, trial_fespace, test_fespace, ir, indices, use_bdr, coeff, - ceed, op, op_t); -} - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "fem/integrator.hpp" + +#include "fem/libceed/libceed_coefficient.hpp" +#include "fem/libceed/libceed_integrator.hpp" +#include "utils/diagnostic.hpp" + +PalacePragmaDiagnosticPush +PalacePragmaDiagnosticDisableUnused + +#include "fem/qfunctions/hcurlhdiv_qf.h" +#include "fem/qfunctions/hdiv_qf.h" + +PalacePragmaDiagnosticPop + +namespace palace +{ + +using namespace ceed; + +void MixedVectorCurlIntegrator::Assemble(Ceed ceed, CeedElemRestriction trial_restr, + CeedElemRestriction test_restr, + CeedBasis trial_basis, CeedBasis test_basis, + CeedVector geom_data, + CeedElemRestriction geom_data_restr, + CeedOperator *op) const +{ + CeedQFunctionInfo info; + info.assemble_q_data = assemble_q_data; + + // Set up QFunctions. 
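+  // Only the 3D case is supported; the QFunction is chosen from the test space map
+  // type (H(div) or H(curl)), since the trial space supplies the curl.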
+ CeedInt dim, space_dim, trial_num_comp, test_num_comp; + PalaceCeedCall(ceed, CeedBasisGetDimension(trial_basis, &dim)); + PalaceCeedCall(ceed, CeedGeometryDataGetSpaceDimension(geom_data_restr, dim, &space_dim)); + MFEM_VERIFY(dim == 3 && space_dim == 3, + "MixedVectorCurlIntegrator is only available in 3D!"); + PalaceCeedCall(ceed, CeedBasisGetNumComponents(trial_basis, &trial_num_comp)); + PalaceCeedCall(ceed, CeedBasisGetNumComponents(test_basis, &test_num_comp)); + MFEM_VERIFY( + trial_num_comp == test_num_comp && trial_num_comp == 1, + "MixedVectorCurlIntegrator requires test and trial spaces with a single component!"); + if (test_map_type == mfem::FiniteElement::H_DIV) + { + info.apply_qf = assemble_q_data ? f_build_hdiv_33 : f_apply_hdiv_33; + info.apply_qf_path = PalaceQFunctionRelativePath(assemble_q_data ? f_build_hdiv_33_loc + : f_apply_hdiv_33_loc); + } + else if (test_map_type == mfem::FiniteElement::H_CURL) + { + info.apply_qf = assemble_q_data ? f_build_hdivhcurl_33 : f_apply_hdivhcurl_33; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hdivhcurl_33_loc : f_apply_hdivhcurl_33_loc); + } + else + { + MFEM_ABORT("Invalid trial/test element map type for MixedVectorCurlIntegrator!"); + } + info.trial_ops = EvalMode::Curl; + info.test_ops = EvalMode::Interp; + + // Set up the coefficient and assemble. + auto ctx = PopulateCoefficientContext(space_dim, Q, transpose); + AssembleCeedOperator(info, (void *)ctx.data(), ctx.size() * sizeof(CeedIntScalar), ceed, + trial_restr, test_restr, trial_basis, test_basis, geom_data, + geom_data_restr, op); +} + +void MixedVectorWeakCurlIntegrator::Assemble(Ceed ceed, CeedElemRestriction trial_restr, + CeedElemRestriction test_restr, + CeedBasis trial_basis, CeedBasis test_basis, + CeedVector geom_data, + CeedElemRestriction geom_data_restr, + CeedOperator *op) const +{ + CeedQFunctionInfo info; + info.assemble_q_data = assemble_q_data; + + // Set up QFunctions. + CeedInt dim, space_dim, trial_num_comp, test_num_comp; + PalaceCeedCall(ceed, CeedBasisGetDimension(trial_basis, &dim)); + PalaceCeedCall(ceed, CeedGeometryDataGetSpaceDimension(geom_data_restr, dim, &space_dim)); + MFEM_VERIFY(dim == 3 && space_dim == 3, + "MixedVectorWeakCurlIntegrator is only available in 3D!"); + PalaceCeedCall(ceed, CeedBasisGetNumComponents(trial_basis, &trial_num_comp)); + PalaceCeedCall(ceed, CeedBasisGetNumComponents(test_basis, &test_num_comp)); + MFEM_VERIFY(trial_num_comp == test_num_comp && trial_num_comp == 1, + "MixedVectorWeakCurlIntegrator requires test and trial spaces with a single " + "component!"); + if (trial_map_type == mfem::FiniteElement::H_DIV) + { + info.apply_qf = assemble_q_data ? f_build_hdiv_33 : f_apply_hdiv_33; + info.apply_qf_path = PalaceQFunctionRelativePath(assemble_q_data ? f_build_hdiv_33_loc + : f_apply_hdiv_33_loc); + } + else if (trial_map_type == mfem::FiniteElement::H_CURL) + { + info.apply_qf = assemble_q_data ? f_build_hcurlhdiv_33 : f_apply_hcurlhdiv_33; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hcurlhdiv_33_loc : f_apply_hcurlhdiv_33_loc); + } + else + { + MFEM_ABORT("Invalid trial/test element map type for MixedVectorWeakCurlIntegrator!"); + } + info.trial_ops = EvalMode::Interp; + info.test_ops = EvalMode::Curl; + + // Set up the coefficient and assemble. 
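+  // Trial values are interpolated and the test space takes the curl; the trailing
+  // -1.0 scales the coefficient (assumed to account for the sign of the weak form).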
+ auto ctx = PopulateCoefficientContext(space_dim, Q, transpose, -1.0); + AssembleCeedOperator(info, (void *)ctx.data(), ctx.size() * sizeof(CeedIntScalar), ceed, + trial_restr, test_restr, trial_basis, test_basis, geom_data, + geom_data_restr, op); +} + +} // namespace palace diff --git a/palace/fem/integ/mixedvecgrad.cpp b/palace/fem/integ/mixedvecgrad.cpp index 6ee3280c4b..0809da62da 100644 --- a/palace/fem/integ/mixedvecgrad.cpp +++ b/palace/fem/integ/mixedvecgrad.cpp @@ -1,170 +1,185 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "fem/integrator.hpp" - -#include -#include -#include "fem/libceed/coefficient.hpp" -#include "fem/libceed/integrator.hpp" - -#include "fem/qfunctions/hcurl_qf.h" - -namespace palace -{ - -struct MixedVectorGradientIntegratorInfo : public ceed::IntegratorInfo -{ - VectorFEMassContext ctx; -}; - -namespace -{ - -MixedVectorGradientIntegratorInfo -InitializeIntegratorInfo(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, - bool use_bdr, mfem::Coefficient *Q, mfem::VectorCoefficient *VQ, - mfem::MatrixCoefficient *MQ, - std::vector &coeff) -{ - MFEM_VERIFY(trial_fespace.GetVDim() == 1 && test_fespace.GetVDim() == 1, - "libCEED interface for " - "MixedVectorGradientIntegrator/MixedVectorWeakDivergenceIntegrator does not " - "support vdim > 1!"); - - MixedVectorGradientIntegratorInfo info = {{0}}; - - mfem::ParMesh &mesh = *trial_fespace.GetParMesh(); - info.ctx.dim = mesh.Dimension() - use_bdr; - info.ctx.space_dim = mesh.SpaceDimension(); - - int trial_map_type = trial_fespace.FEColl()->GetMapType(info.ctx.dim); - int trial_deriv_map_type = trial_fespace.FEColl()->GetDerivMapType(info.ctx.dim); - int test_map_type = test_fespace.FEColl()->GetMapType(info.ctx.dim); - int test_deriv_map_type = test_fespace.FEColl()->GetDerivMapType(info.ctx.dim); - MFEM_VERIFY((trial_map_type == mfem::FiniteElement::H_CURL && - test_deriv_map_type == mfem::FiniteElement::H_CURL) || - (trial_deriv_map_type == mfem::FiniteElement::H_CURL && - test_map_type == mfem::FiniteElement::H_CURL), - "libCEED interface for " - "MixedVectorGradientIntegrator/MixedVectorWeakDivergenceIntegrator requires " - "mixed H1 and H(curl) FE spaces!"); - - info.trial_op = (trial_map_type == mfem::FiniteElement::H_CURL) ? ceed::EvalMode::Interp - : ceed::EvalMode::Grad; - info.test_op = (test_map_type == mfem::FiniteElement::H_CURL) ? ceed::EvalMode::Interp - : ceed::EvalMode::Grad; - info.qdata_size = (info.ctx.dim * (info.ctx.dim + 1)) / 2; - info.ctx.sym = true; - - mfem::ConstantCoefficient *const_coeff = dynamic_cast(Q); - if (const_coeff || !(Q || VQ || MQ)) - { - info.ctx.coeff = const_coeff ? 
const_coeff->constant : 1.0; - - info.build_qf = f_build_hcurl_const_scalar; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_hcurl_const_scalar_loc); - } - else if (Q) - { - ceed::InitCoefficient(*Q, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - info.build_qf = f_build_hcurl_quad_scalar; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_hcurl_quad_scalar_loc); - } - else if (VQ) - { - MFEM_VERIFY(VQ->GetVDim() == info.ctx.space_dim, - "Invalid vector coefficient dimension for " - "MixedVectorGradient/MixedVectorWeakDivergenceIntegrator integrator!"); - ceed::InitCoefficient(*VQ, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - info.build_qf = f_build_hcurl_quad_vector; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_hcurl_quad_vector_loc); - } - else if (MQ) - { - MFEM_VERIFY(MQ->GetVDim() == info.ctx.space_dim, - "Invalid matrix coefficient dimension for " - "MixedVectorGradient/MixedVectorWeakDivergenceIntegrator integrator!"); - ceed::InitCoefficient(*MQ, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - info.build_qf = f_build_hcurl_quad_matrix; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_hcurl_quad_matrix_loc); - } - - info.apply_qf = f_apply_vecfemass; - info.apply_qf_path = PalaceQFunctionRelativePath(f_apply_vecfemass_loc); - - return info; -} - -} // namespace - -void MixedVectorGradientIntegrator::Assemble( - const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, const mfem::IntegrationRule &ir, - const std::vector &indices, Ceed ceed, CeedOperator *op, CeedOperator *op_t) -{ - constexpr bool use_bdr = false; - std::vector coeff; - const auto info = InitializeIntegratorInfo(trial_fespace, test_fespace, ir, indices, - use_bdr, Q, VQ, MQ, coeff); - ceed::AssembleCeedOperator(info, trial_fespace, test_fespace, ir, indices, use_bdr, coeff, - ceed, op, op_t); -} - -void MixedVectorGradientIntegrator::AssembleBoundary( - const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, const mfem::IntegrationRule &ir, - const std::vector &indices, Ceed ceed, CeedOperator *op, CeedOperator *op_t) -{ - constexpr bool use_bdr = true; - std::vector coeff; - const auto info = InitializeIntegratorInfo(trial_fespace, test_fespace, ir, indices, - use_bdr, Q, VQ, MQ, coeff); - ceed::AssembleCeedOperator(info, trial_fespace, test_fespace, ir, indices, use_bdr, coeff, - ceed, op, op_t); -} - -void MixedVectorWeakDivergenceIntegrator::Assemble( - const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, const mfem::IntegrationRule &ir, - const std::vector &indices, Ceed ceed, CeedOperator *op, CeedOperator *op_t) -{ - // Negative coefficient comes from definition of integrator as -(Q u, grad v). - constexpr bool use_bdr = false; - std::vector coeff; - auto info = InitializeIntegratorInfo(trial_fespace, test_fespace, ir, indices, use_bdr, Q, - VQ, MQ, coeff); - info.ctx.coeff *= -1.0; - for (auto &c : coeff) - { - c.data *= -1.0; - } - ceed::AssembleCeedOperator(info, trial_fespace, test_fespace, ir, indices, use_bdr, coeff, - ceed, op, op_t); -} - -void MixedVectorWeakDivergenceIntegrator::AssembleBoundary( - const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, const mfem::IntegrationRule &ir, - const std::vector &indices, Ceed ceed, CeedOperator *op, CeedOperator *op_t) -{ - // Negative coefficient comes from definition of integrator as -(Q u, grad v). 
- constexpr bool use_bdr = true; - std::vector coeff; - auto info = InitializeIntegratorInfo(trial_fespace, test_fespace, ir, indices, use_bdr, Q, - VQ, MQ, coeff); - info.ctx.coeff *= -1.0; - for (auto &c : coeff) - { - c.data *= -1.0; - } - ceed::AssembleCeedOperator(info, trial_fespace, test_fespace, ir, indices, use_bdr, coeff, - ceed, op, op_t); -} - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "fem/integrator.hpp" + +#include "fem/libceed/libceed_coefficient.hpp" +#include "fem/libceed/libceed_integrator.hpp" +#include "utils/diagnostic.hpp" + +PalacePragmaDiagnosticPush +PalacePragmaDiagnosticDisableUnused + +#include "fem/qfunctions/hcurl_qf.h" +#include "fem/qfunctions/hcurlhdiv_qf.h" + +PalacePragmaDiagnosticPop + +namespace palace +{ + +using namespace ceed; + +void MixedVectorGradientIntegrator::Assemble(Ceed ceed, CeedElemRestriction trial_restr, + CeedElemRestriction test_restr, + CeedBasis trial_basis, CeedBasis test_basis, + CeedVector geom_data, + CeedElemRestriction geom_data_restr, + CeedOperator *op) const +{ + CeedQFunctionInfo info; + info.assemble_q_data = assemble_q_data; + + // Set up QFunctions. + CeedInt dim, space_dim, trial_num_comp, test_num_comp; + PalaceCeedCall(ceed, CeedBasisGetDimension(trial_basis, &dim)); + PalaceCeedCall(ceed, CeedGeometryDataGetSpaceDimension(geom_data_restr, dim, &space_dim)); + PalaceCeedCall(ceed, CeedBasisGetNumComponents(trial_basis, &trial_num_comp)); + PalaceCeedCall(ceed, CeedBasisGetNumComponents(test_basis, &test_num_comp)); + MFEM_VERIFY(trial_num_comp == test_num_comp && trial_num_comp == 1, + "MixedVectorGradientIntegrator requires test and trial spaces with a single " + "component!"); + switch (10 * space_dim + dim) + { + case 22: + if (test_map_type == mfem::FiniteElement::H_CURL) + { + info.apply_qf = assemble_q_data ? f_build_hcurl_22 : f_apply_hcurl_22; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hcurl_22_loc : f_apply_hcurl_22_loc); + } + else if (test_map_type == mfem::FiniteElement::H_DIV) + { + info.apply_qf = assemble_q_data ? f_build_hcurlhdiv_22 : f_apply_hcurlhdiv_22; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hcurlhdiv_22_loc : f_apply_hcurlhdiv_22_loc); + } + else + { + MFEM_ABORT("Invalid test map type for MixedVectorGradientIntegrator!"); + } + break; + case 33: + if (test_map_type == mfem::FiniteElement::H_CURL) + { + info.apply_qf = assemble_q_data ? f_build_hcurl_33 : f_apply_hcurl_33; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hcurl_33_loc : f_apply_hcurl_33_loc); + } + else if (test_map_type == mfem::FiniteElement::H_DIV) + { + info.apply_qf = assemble_q_data ? f_build_hcurlhdiv_33 : f_apply_hcurlhdiv_33; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hcurlhdiv_33_loc : f_apply_hcurlhdiv_33_loc); + } + else + { + MFEM_ABORT("Invalid test map type for MixedVectorGradientIntegrator!"); + } + break; + case 21: + if (test_map_type == mfem::FiniteElement::H_CURL) + { + info.apply_qf = assemble_q_data ? f_build_hcurl_21 : f_apply_hcurl_21; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hcurl_21_loc : f_apply_hcurl_21_loc); + } + else if (test_map_type == mfem::FiniteElement::H_DIV) + { + info.apply_qf = assemble_q_data ? 
f_build_hcurlhdiv_21 : f_apply_hcurlhdiv_21; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hcurlhdiv_21_loc : f_apply_hcurlhdiv_21_loc); + } + else + { + MFEM_ABORT("Invalid test map type for MixedVectorGradientIntegrator!"); + } + break; + case 32: + if (test_map_type == mfem::FiniteElement::H_CURL) + { + info.apply_qf = assemble_q_data ? f_build_hcurl_32 : f_apply_hcurl_32; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hcurl_32_loc : f_apply_hcurl_32_loc); + } + else if (test_map_type == mfem::FiniteElement::H_DIV) + { + info.apply_qf = assemble_q_data ? f_build_hcurlhdiv_32 : f_apply_hcurlhdiv_32; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hcurlhdiv_32_loc : f_apply_hcurlhdiv_32_loc); + } + else + { + MFEM_ABORT("Invalid test map type for MixedVectorGradientIntegrator!"); + } + break; + default: + MFEM_ABORT("Invalid value of (dim, space_dim) = (" + << dim << ", " << space_dim << ") for MixedVectorGradientIntegrator!"); + } + info.trial_ops = EvalMode::Grad; + info.test_ops = EvalMode::Interp; + + // Set up the coefficient and assemble. + auto ctx = PopulateCoefficientContext(space_dim, Q, transpose); + AssembleCeedOperator(info, (void *)ctx.data(), ctx.size() * sizeof(CeedIntScalar), ceed, + trial_restr, test_restr, trial_basis, test_basis, geom_data, + geom_data_restr, op); +} + +void MixedVectorWeakDivergenceIntegrator::Assemble( + Ceed ceed, CeedElemRestriction trial_restr, CeedElemRestriction test_restr, + CeedBasis trial_basis, CeedBasis test_basis, CeedVector geom_data, + CeedElemRestriction geom_data_restr, CeedOperator *op) const +{ + CeedQFunctionInfo info; + info.assemble_q_data = assemble_q_data; + + // Set up QFunctions. + CeedInt dim, space_dim, trial_num_comp, test_num_comp; + PalaceCeedCall(ceed, CeedBasisGetDimension(trial_basis, &dim)); + PalaceCeedCall(ceed, CeedGeometryDataGetSpaceDimension(geom_data_restr, dim, &space_dim)); + PalaceCeedCall(ceed, CeedBasisGetNumComponents(trial_basis, &trial_num_comp)); + PalaceCeedCall(ceed, CeedBasisGetNumComponents(test_basis, &test_num_comp)); + MFEM_VERIFY( + trial_num_comp == test_num_comp && trial_num_comp == 1, + "MixedVectorWeakDivergenceIntegrator requires test and trial spaces with a single " + "component!"); + switch (10 * space_dim + dim) + { + case 22: + info.apply_qf = assemble_q_data ? f_build_hcurl_22 : f_apply_hcurl_22; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hcurl_22_loc : f_apply_hcurl_22_loc); + break; + case 33: + info.apply_qf = assemble_q_data ? f_build_hcurl_33 : f_apply_hcurl_33; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hcurl_33_loc : f_apply_hcurl_33_loc); + break; + case 21: + info.apply_qf = assemble_q_data ? f_build_hcurl_21 : f_apply_hcurl_21; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hcurl_21_loc : f_apply_hcurl_21_loc); + break; + case 32: + info.apply_qf = assemble_q_data ? f_build_hcurl_32 : f_apply_hcurl_32; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hcurl_32_loc : f_apply_hcurl_32_loc); + break; + default: + MFEM_ABORT("Invalid value of (dim, space_dim) = (" + << dim << ", " << space_dim + << ") for MixedVectorWeakDivergenceIntegrator!"); + } + info.trial_ops = EvalMode::Interp; + info.test_ops = EvalMode::Grad; + + // Set up the coefficient and assemble. 
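+  // The -1.0 scaling keeps the negative coefficient from the definition of the
+  // integrator as -(Q u, grad v), previously applied by negating the coefficient data.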
+ auto ctx = PopulateCoefficientContext(space_dim, Q, transpose, -1.0); + AssembleCeedOperator(info, (void *)ctx.data(), ctx.size() * sizeof(CeedIntScalar), ceed, + trial_restr, test_restr, trial_basis, test_basis, geom_data, + geom_data_restr, op); +} + +} // namespace palace diff --git a/palace/fem/integ/vecfemass.cpp b/palace/fem/integ/vecfemass.cpp index a0d8ef7d5b..83143a96ab 100644 --- a/palace/fem/integ/vecfemass.cpp +++ b/palace/fem/integ/vecfemass.cpp @@ -1,222 +1,188 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "fem/integrator.hpp" - -#include -#include -#include "fem/libceed/coefficient.hpp" -#include "fem/libceed/integrator.hpp" - -#include "fem/qfunctions/hcurl_qf.h" -#include "fem/qfunctions/hcurlhdiv_qf.h" -#include "fem/qfunctions/hdiv_qf.h" - -namespace palace -{ - -struct VectorFEMassIntegratorInfo : public ceed::IntegratorInfo -{ - VectorFEMassContext ctx; -}; - -namespace -{ - -VectorFEMassIntegratorInfo -InitializeIntegratorInfo(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, - bool use_bdr, mfem::Coefficient *Q, mfem::VectorCoefficient *VQ, - mfem::MatrixCoefficient *MQ, - std::vector &coeff) -{ - MFEM_VERIFY(trial_fespace.GetVDim() == 1 && test_fespace.GetVDim() == 1, - "libCEED interface for VectorFEMassIntegrator does not support vdim > 1!"); - - VectorFEMassIntegratorInfo info = {{0}}; - - mfem::ParMesh &mesh = *trial_fespace.GetParMesh(); - info.ctx.dim = mesh.Dimension() - use_bdr; - info.ctx.space_dim = mesh.SpaceDimension(); - - int trial_map_type = trial_fespace.FEColl()->GetMapType(info.ctx.dim); - int test_map_type = test_fespace.FEColl()->GetMapType(info.ctx.dim); - MFEM_VERIFY((trial_map_type == mfem::FiniteElement::H_CURL || - trial_map_type == mfem::FiniteElement::H_DIV) && - (test_map_type == mfem::FiniteElement::H_CURL || - test_map_type == mfem::FiniteElement::H_DIV), - "VectorFEMassIntegrator requires H(div) or H(curl) FE spaces!"); - - info.trial_op = ceed::EvalMode::Interp; - info.test_op = ceed::EvalMode::Interp; - if (trial_map_type != test_map_type) - { - // Quadrature data is nonsymmetric in this case. - info.qdata_size = info.ctx.dim * info.ctx.dim; - info.ctx.sym = false; - } - else - { - info.qdata_size = (info.ctx.dim * (info.ctx.dim + 1)) / 2; - info.ctx.sym = true; - } - - mfem::ConstantCoefficient *const_coeff = dynamic_cast(Q); - if (const_coeff || !(Q || VQ || MQ)) - { - info.ctx.coeff = const_coeff ? 
const_coeff->constant : 1.0; - - if (trial_map_type == mfem::FiniteElement::H_CURL && - test_map_type == mfem::FiniteElement::H_CURL) - { - info.build_qf = f_build_hcurl_const_scalar; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_hcurl_const_scalar_loc); - } - else if (trial_map_type == mfem::FiniteElement::H_DIV && - test_map_type == mfem::FiniteElement::H_DIV) - { - info.build_qf = f_build_hdiv_const_scalar; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_hdiv_const_scalar_loc); - } - else if (trial_map_type == mfem::FiniteElement::H_CURL && - test_map_type == mfem::FiniteElement::H_DIV) - { - info.build_qf = f_build_hcurlhdiv_const_scalar; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_hcurlhdiv_const_scalar_loc); - } - else // trial_map_type == mfem::FiniteElement::H_DIV && test_map_type == - // mfem::FiniteElement::H_CURL - { - info.build_qf = f_build_hdivhcurl_const_scalar; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_hdivhcurl_const_scalar_loc); - } - } - else if (Q) - { - ceed::InitCoefficient(*Q, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - if (trial_map_type == mfem::FiniteElement::H_CURL && - test_map_type == mfem::FiniteElement::H_CURL) - { - info.build_qf = f_build_hcurl_quad_scalar; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_hcurl_quad_scalar_loc); - } - else if (trial_map_type == mfem::FiniteElement::H_DIV && - test_map_type == mfem::FiniteElement::H_DIV) - { - info.build_qf = f_build_hdiv_quad_scalar; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_hdiv_quad_scalar_loc); - } - else if (trial_map_type == mfem::FiniteElement::H_CURL && - test_map_type == mfem::FiniteElement::H_DIV) - { - info.build_qf = f_build_hcurlhdiv_quad_scalar; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_hcurlhdiv_quad_scalar_loc); - } - else // trial_map_type == mfem::FiniteElement::H_DIV && test_map_type == - // mfem::FiniteElement::H_CURL - { - info.build_qf = f_build_hdivhcurl_quad_scalar; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_hdivhcurl_quad_scalar_loc); - } - } - else if (VQ) - { - MFEM_VERIFY(VQ->GetVDim() == info.ctx.space_dim, - "Invalid vector coefficient dimension for VectorFEMassIntegrator!"); - ceed::InitCoefficient(*VQ, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - if (trial_map_type == mfem::FiniteElement::H_CURL && - test_map_type == mfem::FiniteElement::H_CURL) - { - info.build_qf = f_build_hcurl_quad_vector; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_hcurl_quad_vector_loc); - } - else if (trial_map_type == mfem::FiniteElement::H_DIV && - test_map_type == mfem::FiniteElement::H_DIV) - { - info.build_qf = f_build_hdiv_quad_vector; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_hdiv_quad_vector_loc); - } - else if (trial_map_type == mfem::FiniteElement::H_CURL && - test_map_type == mfem::FiniteElement::H_DIV) - { - info.build_qf = f_build_hcurlhdiv_quad_vector; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_hcurlhdiv_quad_vector_loc); - } - else // trial_map_type == mfem::FiniteElement::H_DIV && test_map_type == - // mfem::FiniteElement::H_CURL - { - info.build_qf = f_build_hdivhcurl_quad_vector; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_hdivhcurl_quad_vector_loc); - } - } - else if (MQ) - { - MFEM_VERIFY(MQ->GetVDim() == info.ctx.space_dim, - "Invalid matrix coefficient dimension for VectorFEMassIntegrator!"); - ceed::InitCoefficient(*MQ, mesh, ir, indices, use_bdr, coeff.emplace_back()); - - if 
(trial_map_type == mfem::FiniteElement::H_CURL && - test_map_type == mfem::FiniteElement::H_CURL) - { - info.build_qf = f_build_hcurl_quad_matrix; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_hcurl_quad_matrix_loc); - } - else if (trial_map_type == mfem::FiniteElement::H_DIV && - test_map_type == mfem::FiniteElement::H_DIV) - { - info.build_qf = f_build_hdiv_quad_matrix; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_hdiv_quad_matrix_loc); - } - else if (trial_map_type == mfem::FiniteElement::H_CURL && - test_map_type == mfem::FiniteElement::H_DIV) - { - info.build_qf = f_build_hcurlhdiv_quad_matrix; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_hcurlhdiv_quad_matrix_loc); - } - else // trial_map_type == mfem::FiniteElement::H_DIV && test_map_type == - // mfem::FiniteElement::H_CURL - { - info.build_qf = f_build_hdivhcurl_quad_matrix; - info.build_qf_path = PalaceQFunctionRelativePath(f_build_hdivhcurl_quad_matrix_loc); - } - } - - info.apply_qf = f_apply_vecfemass; - info.apply_qf_path = PalaceQFunctionRelativePath(f_apply_vecfemass_loc); - - return info; -} - -} // namespace - -void VectorFEMassIntegrator::Assemble(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, - const std::vector &indices, Ceed ceed, - CeedOperator *op, CeedOperator *op_t) -{ - constexpr bool use_bdr = false; - std::vector coeff; - const auto info = InitializeIntegratorInfo(trial_fespace, test_fespace, ir, indices, - use_bdr, Q, VQ, MQ, coeff); - ceed::AssembleCeedOperator(info, trial_fespace, test_fespace, ir, indices, use_bdr, coeff, - ceed, op, op_t); -} - -void VectorFEMassIntegrator::AssembleBoundary( - const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, const mfem::IntegrationRule &ir, - const std::vector &indices, Ceed ceed, CeedOperator *op, CeedOperator *op_t) -{ - constexpr bool use_bdr = true; - std::vector coeff; - const auto info = InitializeIntegratorInfo(trial_fespace, test_fespace, ir, indices, - use_bdr, Q, VQ, MQ, coeff); - ceed::AssembleCeedOperator(info, trial_fespace, test_fespace, ir, indices, use_bdr, coeff, - ceed, op, op_t); -} - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "fem/integrator.hpp" + +#include "fem/libceed/libceed_coefficient.hpp" +#include "fem/libceed/libceed_integrator.hpp" + +#include "fem/qfunctions/hcurl_qf.h" +#include "fem/qfunctions/hcurlhdiv_qf.h" +#include "fem/qfunctions/hdiv_qf.h" + +namespace palace +{ + +using namespace ceed; + +void VectorFEMassIntegrator::Assemble(Ceed ceed, CeedElemRestriction trial_restr, + CeedElemRestriction test_restr, CeedBasis trial_basis, + CeedBasis test_basis, CeedVector geom_data, + CeedElemRestriction geom_data_restr, + CeedOperator *op) const +{ + CeedQFunctionInfo info; + info.assemble_q_data = assemble_q_data; + + // Set up QFunctions. 
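  // The switch below selects the QFunction by encoding the pair (space_dim, dim) as
  // 10 * space_dim + dim: cases 22 and 33 are volume elements in 2D and 3D, and cases 21
  // and 32 are boundary elements (dim = space_dim - 1) embedded in 2D and 3D space. Each
  // case then dispatches on the trial/test map types (H(curl) vs. H(div)) and on whether
  // quadrature data assembly was requested via assemble_q_data.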
+ CeedInt dim, space_dim, trial_num_comp, test_num_comp; + PalaceCeedCall(ceed, CeedBasisGetDimension(trial_basis, &dim)); + PalaceCeedCall(ceed, CeedGeometryDataGetSpaceDimension(geom_data_restr, dim, &space_dim)); + PalaceCeedCall(ceed, CeedBasisGetNumComponents(trial_basis, &trial_num_comp)); + PalaceCeedCall(ceed, CeedBasisGetNumComponents(test_basis, &test_num_comp)); + MFEM_VERIFY( + trial_num_comp == test_num_comp && trial_num_comp == 1, + "VectorFEMassIntegrator requires test and trial spaces with a single component!"); + switch (10 * space_dim + dim) + { + case 22: + if (trial_map_type == mfem::FiniteElement::H_CURL && + test_map_type == mfem::FiniteElement::H_CURL) + { + info.apply_qf = assemble_q_data ? f_build_hcurl_22 : f_apply_hcurl_22; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hcurl_22_loc : f_apply_hcurl_22_loc); + } + else if (trial_map_type == mfem::FiniteElement::H_DIV && + test_map_type == mfem::FiniteElement::H_DIV) + { + info.apply_qf = assemble_q_data ? f_build_hdiv_22 : f_apply_hdiv_22; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hdiv_22_loc : f_apply_hdiv_22_loc); + } + else if (trial_map_type == mfem::FiniteElement::H_CURL && + test_map_type == mfem::FiniteElement::H_DIV) + { + info.apply_qf = assemble_q_data ? f_build_hcurlhdiv_22 : f_apply_hcurlhdiv_22; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hcurlhdiv_22_loc : f_apply_hcurlhdiv_22_loc); + } + else if (trial_map_type == mfem::FiniteElement::H_DIV && + test_map_type == mfem::FiniteElement::H_CURL) + { + info.apply_qf = assemble_q_data ? f_build_hdivhcurl_22 : f_apply_hdivhcurl_22; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hdivhcurl_22_loc : f_apply_hdivhcurl_22_loc); + } + else + { + MFEM_ABORT("Invalid trial/test element map type for VectorFEMassIntegrator!"); + } + break; + case 33: + if (trial_map_type == mfem::FiniteElement::H_CURL && + test_map_type == mfem::FiniteElement::H_CURL) + { + info.apply_qf = assemble_q_data ? f_build_hcurl_33 : f_apply_hcurl_33; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hcurl_33_loc : f_apply_hcurl_33_loc); + } + else if (trial_map_type == mfem::FiniteElement::H_DIV && + test_map_type == mfem::FiniteElement::H_DIV) + { + info.apply_qf = assemble_q_data ? f_build_hdiv_33 : f_apply_hdiv_33; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hdiv_33_loc : f_apply_hdiv_33_loc); + } + else if (trial_map_type == mfem::FiniteElement::H_CURL && + test_map_type == mfem::FiniteElement::H_DIV) + { + info.apply_qf = assemble_q_data ? f_build_hcurlhdiv_33 : f_apply_hcurlhdiv_33; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hcurlhdiv_33_loc : f_apply_hcurlhdiv_33_loc); + } + else if (trial_map_type == mfem::FiniteElement::H_DIV && + test_map_type == mfem::FiniteElement::H_CURL) + { + info.apply_qf = assemble_q_data ? f_build_hdivhcurl_33 : f_apply_hdivhcurl_33; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hdivhcurl_33_loc : f_apply_hdivhcurl_33_loc); + } + else + { + MFEM_ABORT("Invalid trial/test element map type for VectorFEMassIntegrator!"); + } + break; + case 21: + if (trial_map_type == mfem::FiniteElement::H_CURL && + test_map_type == mfem::FiniteElement::H_CURL) + { + info.apply_qf = assemble_q_data ? 
f_build_hcurl_21 : f_apply_hcurl_21; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hcurl_21_loc : f_apply_hcurl_21_loc); + } + else if (trial_map_type == mfem::FiniteElement::H_DIV && + test_map_type == mfem::FiniteElement::H_DIV) + { + info.apply_qf = assemble_q_data ? f_build_hdiv_21 : f_apply_hdiv_21; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hdiv_21_loc : f_apply_hdiv_21_loc); + } + else if (trial_map_type == mfem::FiniteElement::H_CURL && + test_map_type == mfem::FiniteElement::H_DIV) + { + info.apply_qf = assemble_q_data ? f_build_hcurlhdiv_21 : f_apply_hcurlhdiv_21; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hcurlhdiv_21_loc : f_apply_hcurlhdiv_21_loc); + } + else if (trial_map_type == mfem::FiniteElement::H_DIV && + test_map_type == mfem::FiniteElement::H_CURL) + { + info.apply_qf = assemble_q_data ? f_build_hdivhcurl_21 : f_apply_hdivhcurl_21; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hdivhcurl_21_loc : f_apply_hdivhcurl_21_loc); + } + else + { + MFEM_ABORT("Invalid trial/test element map type for VectorFEMassIntegrator!"); + } + break; + case 32: + if (trial_map_type == mfem::FiniteElement::H_CURL && + test_map_type == mfem::FiniteElement::H_CURL) + { + info.apply_qf = assemble_q_data ? f_build_hcurl_32 : f_apply_hcurl_32; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hcurl_32_loc : f_apply_hcurl_32_loc); + } + else if (trial_map_type == mfem::FiniteElement::H_DIV && + test_map_type == mfem::FiniteElement::H_DIV) + { + info.apply_qf = assemble_q_data ? f_build_hdiv_32 : f_apply_hdiv_32; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hdiv_32_loc : f_apply_hdiv_32_loc); + } + else if (trial_map_type == mfem::FiniteElement::H_CURL && + test_map_type == mfem::FiniteElement::H_DIV) + { + info.apply_qf = assemble_q_data ? f_build_hcurlhdiv_32 : f_apply_hcurlhdiv_32; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hcurlhdiv_32_loc : f_apply_hcurlhdiv_32_loc); + } + else if (trial_map_type == mfem::FiniteElement::H_DIV && + test_map_type == mfem::FiniteElement::H_CURL) + { + info.apply_qf = assemble_q_data ? f_build_hdivhcurl_32 : f_apply_hdivhcurl_32; + info.apply_qf_path = PalaceQFunctionRelativePath( + assemble_q_data ? f_build_hdivhcurl_32_loc : f_apply_hdivhcurl_32_loc); + } + else + { + MFEM_ABORT("Invalid trial/test element map type for VectorFEMassIntegrator!"); + } + break; + default: + MFEM_ABORT("Invalid value of (dim, space_dim) = (" + << dim << ", " << space_dim << ") for VectorFEMassIntegrator!"); + } + info.trial_ops = EvalMode::Interp; + info.test_ops = EvalMode::Interp; + + // Set up the coefficient and assemble. + auto ctx = PopulateCoefficientContext(space_dim, Q, transpose); + AssembleCeedOperator(info, (void *)ctx.data(), ctx.size() * sizeof(CeedIntScalar), ceed, + trial_restr, test_restr, trial_basis, test_basis, geom_data, + geom_data_restr, op); +} + +} // namespace palace diff --git a/palace/fem/integrator.cpp b/palace/fem/integrator.cpp index 6b7ac1edc0..555180b71b 100644 --- a/palace/fem/integrator.cpp +++ b/palace/fem/integrator.cpp @@ -1,69 +1,96 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 - -#include "integrator.hpp" - -#include "fem/libceed/integrator.hpp" - -namespace palace -{ - -void DiscreteInterpolator::Assemble(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, - const std::vector &indices, Ceed ceed, - CeedOperator *op, CeedOperator *op_t) -{ - // Interpolators do not use an integration rule to map between the test and trial spaces. - ceed::AssembleCeedInterpolator(trial_fespace, test_fespace, indices, ceed, op, op_t); -} - -void VectorFEBoundaryLFIntegrator::AssembleRHSElementVect(const mfem::FiniteElement &fe, - mfem::ElementTransformation &T, - mfem::Vector &elvect) -{ - const int dof = fe.GetDof(); - const int dim = fe.GetDim(); - const int q_order = fem::GetDefaultIntegrationOrder(fe, fe, T, q_extra); - const mfem::IntegrationRule &ir = mfem::IntRules.Get(fe.GetGeomType(), q_order); - f_hat.SetSize(dim); - vshape.SetSize(dof, dim); - elvect.SetSize(dof); - elvect = 0.0; - - for (int i = 0; i < ir.GetNPoints(); i++) - { - const mfem::IntegrationPoint &ip = ir.IntPoint(i); - T.SetIntPoint(&ip); - fe.CalcVShape(ip, vshape); - - Q.Eval(f_loc, T, ip); - T.InverseJacobian().Mult(f_loc, f_hat); - f_hat *= ip.weight * T.Weight(); - vshape.AddMult(f_hat, elvect); - } -} - -void BoundaryLFIntegrator::AssembleRHSElementVect(const mfem::FiniteElement &fe, - mfem::ElementTransformation &T, - mfem::Vector &elvect) -{ - const int dof = fe.GetDof(); - const int q_order = fem::GetDefaultIntegrationOrder(fe, fe, T, q_extra); - const mfem::IntegrationRule &ir = mfem::IntRules.Get(fe.GetGeomType(), q_order); - shape.SetSize(dof); - elvect.SetSize(dof); - elvect = 0.0; - - for (int i = 0; i < ir.GetNPoints(); i++) - { - const mfem::IntegrationPoint &ip = ir.IntPoint(i); - T.SetIntPoint(&ip); - fe.CalcShape(ip, shape); - - double val = ip.weight * T.Weight() * Q.Eval(T, ip); - elvect.Add(val, shape); - } -} - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "integrator.hpp" + +#include "fem/libceed/libceed_integrator.hpp" + +namespace palace +{ + +namespace fem +{ + +int DefaultIntegrationOrder::Get(const mfem::IsoparametricTransformation &T) +{ + return 2 * p_trial + (q_order_jac ? T.OrderW() : 0) + + (T.GetFE()->Space() == mfem::FunctionSpace::Pk ? q_order_extra_pk + : q_order_extra_qk); +} + +int DefaultIntegrationOrder::Get(const mfem::ElementTransformation &T) +{ + const auto *T_iso = dynamic_cast(&T); + MFEM_VERIFY( + T_iso, + "Unexpected non-isoparametric element transformation to calculate quadrature order!"); + return Get(*T_iso); +} + +int DefaultIntegrationOrder::Get(const mfem::Mesh &mesh, mfem::Geometry::Type geom) +{ + MFEM_VERIFY(mesh.GetNodes(), "The mesh has no nodal FE space!"); + mfem::IsoparametricTransformation T; + T.SetFE(mesh.GetNodalFESpace()->FEColl()->FiniteElementForGeometry(geom)); + return Get(T); +} + +} // namespace fem + +void DiscreteInterpolator::Assemble(Ceed ceed, CeedElemRestriction trial_restr, + CeedElemRestriction test_restr, CeedBasis interp_basis, + CeedOperator *op, CeedOperator *op_t) +{ + // Interpolators do not use an integration rule to map between the test and trial spaces. 
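  // AssembleCeedInterpolator constructs both the primal interpolation operator and its
  // transpose (op and op_t) from the shared interpolation basis and the trial/test
  // element restrictions.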
+ ceed::AssembleCeedInterpolator(ceed, trial_restr, test_restr, interp_basis, op, op_t); +} + +void VectorFEBoundaryLFIntegrator::AssembleRHSElementVect(const mfem::FiniteElement &fe, + mfem::ElementTransformation &T, + mfem::Vector &elvect) +{ + const int dof = fe.GetDof(); + const int dim = fe.GetDim(); + const int q_order = fem::DefaultIntegrationOrder::Get(T); + const mfem::IntegrationRule &ir = mfem::IntRules.Get(fe.GetGeomType(), q_order); + f_hat.SetSize(dim); + vshape.SetSize(dof, dim); + elvect.SetSize(dof); + elvect = 0.0; + + for (int i = 0; i < ir.GetNPoints(); i++) + { + const mfem::IntegrationPoint &ip = ir.IntPoint(i); + T.SetIntPoint(&ip); + fe.CalcVShape(ip, vshape); + + Q.Eval(f_loc, T, ip); + T.InverseJacobian().Mult(f_loc, f_hat); + f_hat *= ip.weight * T.Weight(); + vshape.AddMult(f_hat, elvect); + } +} + +void BoundaryLFIntegrator::AssembleRHSElementVect(const mfem::FiniteElement &fe, + mfem::ElementTransformation &T, + mfem::Vector &elvect) +{ + const int dof = fe.GetDof(); + const int q_order = fem::DefaultIntegrationOrder::Get(T); + const mfem::IntegrationRule &ir = mfem::IntRules.Get(fe.GetGeomType(), q_order); + shape.SetSize(dof); + elvect.SetSize(dof); + elvect = 0.0; + + for (int i = 0; i < ir.GetNPoints(); i++) + { + const mfem::IntegrationPoint &ip = ir.IntPoint(i); + T.SetIntPoint(&ip); + fe.CalcShape(ip, shape); + + double val = ip.weight * T.Weight() * Q.Eval(T, ip); + elvect.Add(val, shape); + } +} + +} // namespace palace diff --git a/palace/fem/integrator.hpp b/palace/fem/integrator.hpp index 2f232033aa..cc987fbe47 100644 --- a/palace/fem/integrator.hpp +++ b/palace/fem/integrator.hpp @@ -1,554 +1,339 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_FEM_INTEGRATOR_HPP -#define PALACE_FEM_INTEGRATOR_HPP - -#include -#include - -// Forward declarations of libCEED objects. -typedef struct Ceed_private *Ceed; -typedef struct CeedOperator_private *CeedOperator; - -namespace palace -{ - -// -// Classes which implement or extend bilinear and linear form integrators. -// - -namespace fem -{ - -// Helper functions for creating an integration rule to exactly integrate polynomials of -// order 2p + w + q_extra. -inline int GetDefaultIntegrationOrder(const mfem::FiniteElement &trial_fe, - const mfem::FiniteElement &test_fe, - const mfem::ElementTransformation &T, int q_extra_pk, - int q_extra_qk) -{ - return trial_fe.GetOrder() + test_fe.GetOrder() + T.OrderW() + - (trial_fe.Space() == mfem::FunctionSpace::Pk ? q_extra_pk : q_extra_qk); -} - -inline int GetDefaultIntegrationOrder(const mfem::FiniteElement &trial_fe, - const mfem::FiniteElement &test_fe, - const mfem::ElementTransformation &T, int q_extra = 0) -{ - return GetDefaultIntegrationOrder(trial_fe, test_fe, T, q_extra, q_extra); -} - -inline int GetDefaultIntegrationOrder(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const std::vector &indices, bool use_bdr, - int q_extra_pk = 0, int q_extra_qk = 0) -{ - // Every process is guaranteed to have at least one element, and assumes no variable order - // spaces are used. 
- mfem::ParMesh &mesh = *trial_fespace.GetParMesh(); - mfem::IsoparametricTransformation T; - if (use_bdr) - { - const mfem::FiniteElement &trial_fe = *trial_fespace.GetBE(indices[0]); - const mfem::FiniteElement &test_fe = *test_fespace.GetBE(indices[0]); - mesh.GetBdrElementTransformation(indices[0], &T); - return GetDefaultIntegrationOrder(trial_fe, test_fe, T, q_extra_pk, q_extra_qk); - } - else - { - const mfem::FiniteElement &trial_fe = *trial_fespace.GetFE(indices[0]); - const mfem::FiniteElement &test_fe = *test_fespace.GetFE(indices[0]); - mesh.GetElementTransformation(indices[0], &T); - return GetDefaultIntegrationOrder(trial_fe, test_fe, T, q_extra_pk, q_extra_qk); - } -} - -} // namespace fem - -// Base class for libCEED-based bilinear form integrators. -class BilinearFormIntegrator -{ -public: - virtual ~BilinearFormIntegrator() = default; - - virtual void Assemble(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, - Ceed ceed, CeedOperator *op, CeedOperator *op_t) = 0; - - virtual void AssembleBoundary(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, - const std::vector &indices, Ceed ceed, - CeedOperator *op, CeedOperator *op_t) = 0; -}; - -// Integrator for a(u, v) = (Q u, v) for H1 elements (also for vector (H1)ᵈ spaces). -class MassIntegrator : public BilinearFormIntegrator -{ -protected: - mfem::Coefficient *Q; - mfem::VectorCoefficient *VQ; - mfem::MatrixCoefficient *MQ; - -public: - MassIntegrator() : Q(nullptr), VQ(nullptr), MQ(nullptr) {} - MassIntegrator(mfem::Coefficient &Q) : Q(&Q), VQ(nullptr), MQ(nullptr) {} - MassIntegrator(mfem::VectorCoefficient &VQ) : Q(nullptr), VQ(&VQ), MQ(nullptr) {} - MassIntegrator(mfem::MatrixCoefficient &MQ) : Q(nullptr), VQ(nullptr), MQ(&MQ) {} - - void Assemble(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, Ceed ceed, - CeedOperator *op, CeedOperator *op_t) override; - - void AssembleBoundary(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, - Ceed ceed, CeedOperator *op, CeedOperator *op_t) override; -}; - -// Integrator for a(u, v) = (Q u, v) for vector finite elements. 
-class VectorFEMassIntegrator : public BilinearFormIntegrator -{ -protected: - mfem::Coefficient *Q; - mfem::VectorCoefficient *VQ; - mfem::MatrixCoefficient *MQ; - -public: - VectorFEMassIntegrator() : Q(nullptr), VQ(nullptr), MQ(nullptr) {} - VectorFEMassIntegrator(mfem::Coefficient &Q) : Q(&Q), VQ(nullptr), MQ(nullptr) {} - VectorFEMassIntegrator(mfem::VectorCoefficient &VQ) : Q(nullptr), VQ(&VQ), MQ(nullptr) {} - VectorFEMassIntegrator(mfem::MatrixCoefficient &MQ) : Q(nullptr), VQ(nullptr), MQ(&MQ) {} - - void Assemble(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, Ceed ceed, - CeedOperator *op, CeedOperator *op_t) override; - - void AssembleBoundary(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, - Ceed ceed, CeedOperator *op, CeedOperator *op_t) override; -}; - -// Integrator for a(u, v) = (Q curl u, curl v) for Nedelec elements. -class CurlCurlIntegrator : public BilinearFormIntegrator -{ -protected: - mfem::Coefficient *Q; - mfem::VectorCoefficient *VQ; - mfem::MatrixCoefficient *MQ; - -public: - CurlCurlIntegrator() : Q(nullptr), VQ(nullptr), MQ(nullptr) {} - CurlCurlIntegrator(mfem::Coefficient &Q) : Q(&Q), VQ(nullptr), MQ(nullptr) {} - CurlCurlIntegrator(mfem::VectorCoefficient &VQ) : Q(nullptr), VQ(&VQ), MQ(nullptr) {} - CurlCurlIntegrator(mfem::MatrixCoefficient &MQ) : Q(nullptr), VQ(nullptr), MQ(&MQ) {} - - void Assemble(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, Ceed ceed, - CeedOperator *op, CeedOperator *op_t) override; - - void AssembleBoundary(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, - Ceed ceed, CeedOperator *op, CeedOperator *op_t) override; -}; - -// Integrator for a(u, v) = (Qc curl u, curl v) + (Qm u, v) for Nedelec elements. 
-class CurlCurlMassIntegrator : public BilinearFormIntegrator -{ -protected: - mfem::Coefficient *Qc, *Qm; - mfem::VectorCoefficient *VQc, *VQm; - mfem::MatrixCoefficient *MQc, *MQm; - -public: - CurlCurlMassIntegrator(mfem::Coefficient &Qc, mfem::Coefficient &Qm) - : Qc(&Qc), Qm(&Qm), VQc(nullptr), VQm(nullptr), MQc(nullptr), MQm(nullptr) - { - } - CurlCurlMassIntegrator(mfem::Coefficient &Qc, mfem::VectorCoefficient &VQm) - : Qc(&Qc), Qm(nullptr), VQc(nullptr), VQm(&VQm), MQc(nullptr), MQm(nullptr) - { - } - CurlCurlMassIntegrator(mfem::Coefficient &Qc, mfem::MatrixCoefficient &MQm) - : Qc(&Qc), Qm(nullptr), VQc(nullptr), VQm(nullptr), MQc(nullptr), MQm(&MQm) - { - } - CurlCurlMassIntegrator(mfem::VectorCoefficient &VQc, mfem::Coefficient &Qm) - : Qc(nullptr), Qm(&Qm), VQc(&VQc), VQm(nullptr), MQc(nullptr), MQm(nullptr) - { - } - CurlCurlMassIntegrator(mfem::VectorCoefficient &VQc, mfem::VectorCoefficient &VQm) - : Qc(nullptr), Qm(nullptr), VQc(&VQc), VQm(&VQm), MQc(nullptr), MQm(nullptr) - { - } - CurlCurlMassIntegrator(mfem::VectorCoefficient &VQc, mfem::MatrixCoefficient &MQm) - : Qc(nullptr), Qm(nullptr), VQc(&VQc), VQm(nullptr), MQc(nullptr), MQm(&MQm) - { - } - CurlCurlMassIntegrator(mfem::MatrixCoefficient &MQc, mfem::Coefficient &Qm) - : Qc(nullptr), Qm(&Qm), VQc(nullptr), VQm(nullptr), MQc(&MQc), MQm(nullptr) - { - } - CurlCurlMassIntegrator(mfem::MatrixCoefficient &MQc, mfem::VectorCoefficient &VQm) - : Qc(nullptr), Qm(nullptr), VQc(nullptr), VQm(&VQm), MQc(&MQc), MQm(nullptr) - { - } - CurlCurlMassIntegrator(mfem::MatrixCoefficient &MQc, mfem::MatrixCoefficient &MQm) - : Qc(nullptr), Qm(nullptr), VQc(nullptr), VQm(nullptr), MQc(&MQc), MQm(&MQm) - { - } - - void Assemble(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, Ceed ceed, - CeedOperator *op, CeedOperator *op_t) override; - - void AssembleBoundary(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, - Ceed ceed, CeedOperator *op, CeedOperator *op_t) override; -}; - -// Integrator for a(u, v) = (Q grad u, grad v) for H1 elements. -class DiffusionIntegrator : public BilinearFormIntegrator -{ -protected: - mfem::Coefficient *Q; - mfem::VectorCoefficient *VQ; - mfem::MatrixCoefficient *MQ; - -public: - DiffusionIntegrator() : Q(nullptr), VQ(nullptr), MQ(nullptr) {} - DiffusionIntegrator(mfem::Coefficient &Q) : Q(&Q), VQ(nullptr), MQ(nullptr) {} - DiffusionIntegrator(mfem::VectorCoefficient &VQ) : Q(nullptr), VQ(&VQ), MQ(nullptr) {} - DiffusionIntegrator(mfem::MatrixCoefficient &MQ) : Q(nullptr), VQ(nullptr), MQ(&MQ) {} - - void Assemble(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, Ceed ceed, - CeedOperator *op, CeedOperator *op_t) override; - - void AssembleBoundary(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, - Ceed ceed, CeedOperator *op, CeedOperator *op_t) override; -}; - -// Integrator for a(u, v) = (Qd grad u, grad v) + (Qm u, v) for H1 elements. 
-class DiffusionMassIntegrator : public BilinearFormIntegrator -{ -protected: - mfem::Coefficient *Qd, *Qm; - mfem::VectorCoefficient *VQd; - mfem::MatrixCoefficient *MQd; - -public: - DiffusionMassIntegrator(mfem::Coefficient &Qd, mfem::Coefficient &Qm) - : Qd(&Qd), Qm(&Qm), VQd(nullptr), MQd(nullptr) - { - } - DiffusionMassIntegrator(mfem::VectorCoefficient &VQd, mfem::Coefficient &Qm) - : Qd(nullptr), Qm(&Qm), VQd(&VQd), MQd(nullptr) - { - } - DiffusionMassIntegrator(mfem::MatrixCoefficient &MQd, mfem::Coefficient &Qm) - : Qd(nullptr), Qm(&Qm), VQd(nullptr), MQd(&MQd) - { - } - - void Assemble(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, Ceed ceed, - CeedOperator *op, CeedOperator *op_t) override; - - void AssembleBoundary(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, - Ceed ceed, CeedOperator *op, CeedOperator *op_t) override; -}; - -// Integrator for a(u, v) = (Q div u, div v) for Raviart-Thomas elements. -class DivDivIntegrator : public BilinearFormIntegrator -{ -protected: - mfem::Coefficient *Q; - -public: - DivDivIntegrator() : Q(nullptr) {} - DivDivIntegrator(mfem::Coefficient &Q) : Q(&Q) {} - - void Assemble(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, Ceed ceed, - CeedOperator *op, CeedOperator *op_t) override; - - void AssembleBoundary(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, - Ceed ceed, CeedOperator *op, CeedOperator *op_t) override; -}; - -// Integrator for a(u, v) = (Qd div u, div v) + (Qm u, v) for Raviart-Thomas elements. -class DivDivMassIntegrator : public BilinearFormIntegrator -{ -protected: - mfem::Coefficient *Qd, *Qm; - mfem::VectorCoefficient *VQm; - mfem::MatrixCoefficient *MQm; - -public: - DivDivMassIntegrator(mfem::Coefficient &Qd, mfem::Coefficient &Qm) - : Qd(&Qd), Qm(&Qm), VQm(nullptr), MQm(nullptr) - { - } - DivDivMassIntegrator(mfem::Coefficient &Qd, mfem::VectorCoefficient &VQm) - : Qd(&Qd), Qm(nullptr), VQm(&VQm), MQm(nullptr) - { - } - DivDivMassIntegrator(mfem::Coefficient &Qd, mfem::MatrixCoefficient &MQm) - : Qd(&Qd), Qm(nullptr), VQm(nullptr), MQm(&MQm) - { - } - - void Assemble(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, Ceed ceed, - CeedOperator *op, CeedOperator *op_t) override; - - void AssembleBoundary(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, - Ceed ceed, CeedOperator *op, CeedOperator *op_t) override; -}; - -// Integrator for a(u, v) = (Q grad u, v) for u in H1 and v in H(curl). 
-class MixedVectorGradientIntegrator : public BilinearFormIntegrator -{ -protected: - mfem::Coefficient *Q; - mfem::VectorCoefficient *VQ; - mfem::MatrixCoefficient *MQ; - -public: - MixedVectorGradientIntegrator() : Q(nullptr), VQ(nullptr), MQ(nullptr) {} - MixedVectorGradientIntegrator(mfem::Coefficient &Q) : Q(&Q), VQ(nullptr), MQ(nullptr) {} - MixedVectorGradientIntegrator(mfem::VectorCoefficient &VQ) - : Q(nullptr), VQ(&VQ), MQ(nullptr) - { - } - MixedVectorGradientIntegrator(mfem::MatrixCoefficient &MQ) - : Q(nullptr), VQ(nullptr), MQ(&MQ) - { - } - - void Assemble(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, Ceed ceed, - CeedOperator *op, CeedOperator *op_t) override; - - void AssembleBoundary(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, - Ceed ceed, CeedOperator *op, CeedOperator *op_t) override; -}; - -// Integrator for a(u, v) = -(Q u, grad v) for u in H(curl) and v in H1. -class MixedVectorWeakDivergenceIntegrator : public BilinearFormIntegrator -{ -protected: - mfem::Coefficient *Q; - mfem::VectorCoefficient *VQ; - mfem::MatrixCoefficient *MQ; - -public: - MixedVectorWeakDivergenceIntegrator() : Q(nullptr), VQ(nullptr), MQ(nullptr) {} - MixedVectorWeakDivergenceIntegrator(mfem::Coefficient &Q) - : Q(&Q), VQ(nullptr), MQ(nullptr) - { - } - MixedVectorWeakDivergenceIntegrator(mfem::VectorCoefficient &VQ) - : Q(nullptr), VQ(&VQ), MQ(nullptr) - { - } - MixedVectorWeakDivergenceIntegrator(mfem::MatrixCoefficient &MQ) - : Q(nullptr), VQ(nullptr), MQ(&MQ) - { - } - - void Assemble(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, Ceed ceed, - CeedOperator *op, CeedOperator *op_t) override; - - void AssembleBoundary(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, - Ceed ceed, CeedOperator *op, CeedOperator *op_t) override; -}; - -// Integrator for a(u, v) = (Q curl u, v) for u in H(curl) and v in H(div). -class MixedVectorCurlIntegrator : public BilinearFormIntegrator -{ -protected: - mfem::Coefficient *Q; - mfem::VectorCoefficient *VQ; - mfem::MatrixCoefficient *MQ; - -public: - MixedVectorCurlIntegrator() : Q(nullptr), VQ(nullptr), MQ(nullptr) {} - MixedVectorCurlIntegrator(mfem::Coefficient &Q) : Q(&Q), VQ(nullptr), MQ(nullptr) {} - MixedVectorCurlIntegrator(mfem::VectorCoefficient &VQ) : Q(nullptr), VQ(&VQ), MQ(nullptr) - { - } - MixedVectorCurlIntegrator(mfem::MatrixCoefficient &MQ) : Q(nullptr), VQ(nullptr), MQ(&MQ) - { - } - - void Assemble(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, Ceed ceed, - CeedOperator *op, CeedOperator *op_t) override; - - void AssembleBoundary(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, - Ceed ceed, CeedOperator *op, CeedOperator *op_t) override; -}; - -// Integrator for a(u, v) = (Q u, curl v) for u in H(div) and v in H(curl). 
-class MixedVectorWeakCurlIntegrator : public BilinearFormIntegrator -{ -protected: - mfem::Coefficient *Q; - mfem::VectorCoefficient *VQ; - mfem::MatrixCoefficient *MQ; - -public: - MixedVectorWeakCurlIntegrator() : Q(nullptr), VQ(nullptr), MQ(nullptr) {} - MixedVectorWeakCurlIntegrator(mfem::Coefficient &Q) : Q(&Q), VQ(nullptr), MQ(nullptr) {} - MixedVectorWeakCurlIntegrator(mfem::VectorCoefficient &VQ) - : Q(nullptr), VQ(&VQ), MQ(nullptr) - { - } - MixedVectorWeakCurlIntegrator(mfem::MatrixCoefficient &MQ) - : Q(nullptr), VQ(nullptr), MQ(&MQ) - { - } - - void Assemble(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, Ceed ceed, - CeedOperator *op, CeedOperator *op_t) override; - - void AssembleBoundary(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, - Ceed ceed, CeedOperator *op, CeedOperator *op_t) override; -}; - -// Integrator for a(u, v) = (Q grad u, v) for u in H1 and v in (H1)ᵈ. -class GradientIntegrator : public BilinearFormIntegrator -{ -protected: - mfem::Coefficient *Q; - mfem::VectorCoefficient *VQ; - mfem::MatrixCoefficient *MQ; - -public: - GradientIntegrator() : Q(nullptr), VQ(nullptr), MQ(nullptr) {} - GradientIntegrator(mfem::Coefficient &Q) : Q(&Q), VQ(nullptr), MQ(nullptr) {} - GradientIntegrator(mfem::VectorCoefficient &VQ) : Q(nullptr), VQ(&VQ), MQ(nullptr) {} - GradientIntegrator(mfem::MatrixCoefficient &MQ) : Q(nullptr), VQ(nullptr), MQ(&MQ) {} - - void Assemble(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, Ceed ceed, - CeedOperator *op, CeedOperator *op_t) override; - - void AssembleBoundary(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, - Ceed ceed, CeedOperator *op, CeedOperator *op_t) override; -}; - -// Base class for all discrete interpolators. -class DiscreteInterpolator : public BilinearFormIntegrator -{ -public: - void Assemble(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, Ceed ceed, - CeedOperator *op, CeedOperator *op_t) override; - - void AssembleBoundary(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, - Ceed ceed, CeedOperator *op, CeedOperator *op_t) override - { - MFEM_ABORT("Boundary assembly is not implemented for DiscreteInterpolator objects!"); - } -}; - -// Interpolator for the identity map, where the domain space is a subspace of the range -// space (discrete embedding matrix). -using IdentityInterpolator = DiscreteInterpolator; - -// Interpolator for the discrete gradient map from an H1 space to an H(curl) space. -using GradientInterpolator = DiscreteInterpolator; - -// Interpolator for the discrete curl map from an H(curl) space to an H(div) space. -using CurlInterpolator = DiscreteInterpolator; - -// Interpolator for the discrete divergence map from an H(div) space to an L2 space. 
-using DivergenceInterpolator = DiscreteInterpolator; - -// Similar to MFEM's VectorFEBoundaryTangentLFIntegrator for ND spaces, but instead of -// computing (n x f, v), this just computes (f, v). Also eliminates the a and b quadrature -// parameters and uses fem::GetDefaultIntegrationOrder instead. -class VectorFEBoundaryLFIntegrator : public mfem::LinearFormIntegrator -{ -private: - mfem::VectorCoefficient &Q; - mfem::DenseMatrix vshape; - mfem::Vector f_loc, f_hat; - int q_extra; - -public: - VectorFEBoundaryLFIntegrator(mfem::VectorCoefficient &QG, int q_extra = 0) - : Q(QG), q_extra(q_extra) - { - } - - void AssembleRHSElementVect(const mfem::FiniteElement &fe, mfem::ElementTransformation &T, - mfem::Vector &elvect) override; -}; - -// Similar to MFEM's BoundaryLFIntegrator for H1 spaces, but eliminates the a and b -// quadrature parameters and uses fem::GetDefaultIntegrationOrder instead. -class BoundaryLFIntegrator : public mfem::LinearFormIntegrator -{ -private: - mfem::Coefficient &Q; - mfem::Vector shape; - int q_extra; - -public: - BoundaryLFIntegrator(mfem::Coefficient &QG, int q_extra = 0) : Q(QG), q_extra(q_extra) {} - - void AssembleRHSElementVect(const mfem::FiniteElement &fe, mfem::ElementTransformation &T, - mfem::Vector &elvect) override; -}; - -using VectorFEDomainLFIntegrator = VectorFEBoundaryLFIntegrator; -using DomainLFIntegrator = BoundaryLFIntegrator; - -} // namespace palace - -#endif // PALACE_FEM_INTEGRATOR_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_FEM_INTEGRATOR_HPP +#define PALACE_FEM_INTEGRATOR_HPP + +#include +#include "fem/libceed/ceed.hpp" + +namespace palace +{ + +class MaterialPropertyCoefficient; + +// +// Classes which implement or extend bilinear and linear form integrators. In doc strings u +// refers to the trial function, and v the test function. +// + +namespace fem +{ + +// Helper functions for creating an integration rule to exactly integrate polynomials of +// order 2 * p_trial + order(|J|) + q_extra. +struct DefaultIntegrationOrder +{ + inline static int p_trial = 1; + inline static bool q_order_jac = true; + inline static int q_order_extra_pk = 0; + inline static int q_order_extra_qk = 0; + static int Get(const mfem::IsoparametricTransformation &T); + static int Get(const mfem::ElementTransformation &T); + static int Get(const mfem::Mesh &mesh, mfem::Geometry::Type geom); +}; + +} // namespace fem + +// Base class for libCEED-based bilinear form integrators. +class BilinearFormIntegrator +{ +protected: + const MaterialPropertyCoefficient *Q; + bool assemble_q_data; + bool transpose; + +public: + BilinearFormIntegrator(const MaterialPropertyCoefficient *Q = nullptr, + const bool transpose = false) + : Q(Q), assemble_q_data(false), transpose(transpose) + { + } + BilinearFormIntegrator(const MaterialPropertyCoefficient &Q, const bool transpose = false) + : Q(&Q), assemble_q_data(false), transpose(transpose) + { + } + virtual ~BilinearFormIntegrator() = default; + + virtual void Assemble(Ceed ceed, CeedElemRestriction trial_restr, + CeedElemRestriction test_restr, CeedBasis trial_basis, + CeedBasis test_basis, CeedVector geom_data, + CeedElemRestriction geom_data_restr, CeedOperator *op) const = 0; + + virtual void SetMapTypes(int trial_type, int test_type) {} + + void AssembleQuadratureData() { assemble_q_data = true; } +}; + +// Integrator for a(u, v) = (Q u, v) for H1 elements (also for vector (H1)ᵈ spaces). 
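// The coefficient is supplied through the inherited BilinearFormIntegrator constructor;
// when it is omitted, the integrator presumably falls back to a unit coefficient, as the
// previous implementation did when Q was null.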
+class MassIntegrator : public BilinearFormIntegrator +{ +public: + using BilinearFormIntegrator::BilinearFormIntegrator; + + void Assemble(Ceed ceed, CeedElemRestriction trial_restr, CeedElemRestriction test_restr, + CeedBasis trial_basis, CeedBasis test_basis, CeedVector geom_data, + CeedElemRestriction geom_data_restr, CeedOperator *op) const override; +}; + +// Integrator for a(u, v) = (Q u, v) for vector finite elements. +class VectorFEMassIntegrator : public BilinearFormIntegrator +{ +protected: + int trial_map_type = mfem::FiniteElement::UNKNOWN_MAP_TYPE; + int test_map_type = mfem::FiniteElement::UNKNOWN_MAP_TYPE; + +public: + using BilinearFormIntegrator::BilinearFormIntegrator; + + void Assemble(Ceed ceed, CeedElemRestriction trial_restr, CeedElemRestriction test_restr, + CeedBasis trial_basis, CeedBasis test_basis, CeedVector geom_data, + CeedElemRestriction geom_data_restr, CeedOperator *op) const override; + + void SetMapTypes(int trial_type, int test_type) override + { + trial_map_type = trial_type; + test_map_type = test_type; + } +}; + +// Integrator for a(u, v) = (Q grad u, grad v) for H1 elements. +class DiffusionIntegrator : public BilinearFormIntegrator +{ +public: + using BilinearFormIntegrator::BilinearFormIntegrator; + + void Assemble(Ceed ceed, CeedElemRestriction trial_restr, CeedElemRestriction test_restr, + CeedBasis trial_basis, CeedBasis test_basis, CeedVector geom_data, + CeedElemRestriction geom_data_restr, CeedOperator *op) const override; +}; + +// Integrator for a(u, v) = (Q curl u, curl v) for Nedelec elements. +class CurlCurlIntegrator : public BilinearFormIntegrator +{ +public: + using BilinearFormIntegrator::BilinearFormIntegrator; + + void Assemble(Ceed ceed, CeedElemRestriction trial_restr, CeedElemRestriction test_restr, + CeedBasis trial_basis, CeedBasis test_basis, CeedVector geom_data, + CeedElemRestriction geom_data_restr, CeedOperator *op) const override; +}; + +// Integrator for a(u, v) = (Q div u, div v) for Raviart-Thomas elements. +class DivDivIntegrator : public BilinearFormIntegrator +{ +public: + using BilinearFormIntegrator::BilinearFormIntegrator; + + void Assemble(Ceed ceed, CeedElemRestriction trial_restr, CeedElemRestriction test_restr, + CeedBasis trial_basis, CeedBasis test_basis, CeedVector geom_data, + CeedElemRestriction geom_data_restr, CeedOperator *op) const override; +}; + +// Integrator for a(u, v) = (Qd grad u, grad v) + (Qm u, v) for H1 elements. +class DiffusionMassIntegrator : public BilinearFormIntegrator +{ +protected: + const MaterialPropertyCoefficient *Q_mass; + bool transpose_mass; + +public: + using BilinearFormIntegrator::BilinearFormIntegrator; + DiffusionMassIntegrator(const MaterialPropertyCoefficient &Q, + const MaterialPropertyCoefficient &Q_mass, + const bool transpose = false, const bool transpose_mass = false) + : BilinearFormIntegrator(Q, transpose), Q_mass(&Q_mass), transpose_mass(transpose_mass) + { + } + + void Assemble(Ceed ceed, CeedElemRestriction trial_restr, CeedElemRestriction test_restr, + CeedBasis trial_basis, CeedBasis test_basis, CeedVector geom_data, + CeedElemRestriction geom_data_restr, CeedOperator *op) const override; +}; + +// Integrator for a(u, v) = (Qc curl u, curl v) + (Qm u, v) for Nedelec elements. 
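// Both the curl-curl coefficient Q and the mass coefficient Q_mass are passed to the
// constructor; Q is forwarded to the BilinearFormIntegrator base, and the transpose and
// transpose_mass flags apply to the respective coefficients.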
+class CurlCurlMassIntegrator : public BilinearFormIntegrator +{ +protected: + const MaterialPropertyCoefficient *Q_mass; + bool transpose_mass; + +public: + using BilinearFormIntegrator::BilinearFormIntegrator; + CurlCurlMassIntegrator(const MaterialPropertyCoefficient &Q, + const MaterialPropertyCoefficient &Q_mass, + const bool transpose = false, const bool transpose_mass = false) + : BilinearFormIntegrator(Q, transpose), Q_mass(&Q_mass), transpose_mass(transpose_mass) + { + } + + void Assemble(Ceed ceed, CeedElemRestriction trial_restr, CeedElemRestriction test_restr, + CeedBasis trial_basis, CeedBasis test_basis, CeedVector geom_data, + CeedElemRestriction geom_data_restr, CeedOperator *op) const override; +}; + +// Integrator for a(u, v) = (Qd div u, div v) + (Qm u, v) for Raviart-Thomas elements. +class DivDivMassIntegrator : public BilinearFormIntegrator +{ +protected: + const MaterialPropertyCoefficient *Q_mass; + bool transpose_mass; + +public: + using BilinearFormIntegrator::BilinearFormIntegrator; + DivDivMassIntegrator(const MaterialPropertyCoefficient &Q, + const MaterialPropertyCoefficient &Q_mass, + const bool transpose = false, const bool transpose_mass = false) + : BilinearFormIntegrator(Q, transpose), Q_mass(&Q_mass), transpose_mass(transpose_mass) + { + } + + void Assemble(Ceed ceed, CeedElemRestriction trial_restr, CeedElemRestriction test_restr, + CeedBasis trial_basis, CeedBasis test_basis, CeedVector geom_data, + CeedElemRestriction geom_data_restr, CeedOperator *op) const override; +}; + +// Integrator for a(u, v) = (Q grad u, v) for u in H1 and v in H(curl) or H(div). +class MixedVectorGradientIntegrator : public BilinearFormIntegrator +{ +protected: + int trial_map_type = mfem::FiniteElement::UNKNOWN_MAP_TYPE; + int test_map_type = mfem::FiniteElement::UNKNOWN_MAP_TYPE; + +public: + using BilinearFormIntegrator::BilinearFormIntegrator; + + void Assemble(Ceed ceed, CeedElemRestriction trial_restr, CeedElemRestriction test_restr, + CeedBasis trial_basis, CeedBasis test_basis, CeedVector geom_data, + CeedElemRestriction geom_data_restr, CeedOperator *op) const override; + + void SetMapTypes(int trial_type, int test_type) override + { + trial_map_type = trial_type; + test_map_type = test_type; + } +}; + +// Integrator for a(u, v) = -(Q u, grad v) for u in H(curl) and v in H1. +class MixedVectorWeakDivergenceIntegrator : public BilinearFormIntegrator +{ +public: + using BilinearFormIntegrator::BilinearFormIntegrator; + + void Assemble(Ceed ceed, CeedElemRestriction trial_restr, CeedElemRestriction test_restr, + CeedBasis trial_basis, CeedBasis test_basis, CeedVector geom_data, + CeedElemRestriction geom_data_restr, CeedOperator *op) const override; +}; + +// Integrator for a(u, v) = (Q curl u, v) for u in H(curl) and v in H(div) or H(curl). +class MixedVectorCurlIntegrator : public BilinearFormIntegrator +{ +protected: + int trial_map_type = mfem::FiniteElement::UNKNOWN_MAP_TYPE; + int test_map_type = mfem::FiniteElement::UNKNOWN_MAP_TYPE; + +public: + using BilinearFormIntegrator::BilinearFormIntegrator; + + void Assemble(Ceed ceed, CeedElemRestriction trial_restr, CeedElemRestriction test_restr, + CeedBasis trial_basis, CeedBasis test_basis, CeedVector geom_data, + CeedElemRestriction geom_data_restr, CeedOperator *op) const override; + + void SetMapTypes(int trial_type, int test_type) override + { + trial_map_type = trial_type; + test_map_type = test_type; + } +}; + +// Integrator for a(u, v) = -(Q u, curl v) for u in H(div) or H(curl) and v in H(curl). 
+class MixedVectorWeakCurlIntegrator : public BilinearFormIntegrator +{ +protected: + int trial_map_type = mfem::FiniteElement::UNKNOWN_MAP_TYPE; + int test_map_type = mfem::FiniteElement::UNKNOWN_MAP_TYPE; + +public: + using BilinearFormIntegrator::BilinearFormIntegrator; + + void Assemble(Ceed ceed, CeedElemRestriction trial_restr, CeedElemRestriction test_restr, + CeedBasis trial_basis, CeedBasis test_basis, CeedVector geom_data, + CeedElemRestriction geom_data_restr, CeedOperator *op) const override; + + void SetMapTypes(int trial_type, int test_type) override + { + trial_map_type = trial_type; + test_map_type = test_type; + } +}; + +// Integrator for a(u, v) = (Q grad u, v) for u in H1 and v in (H1)ᵈ. +class GradientIntegrator : public BilinearFormIntegrator +{ +public: + using BilinearFormIntegrator::BilinearFormIntegrator; + + void Assemble(Ceed ceed, CeedElemRestriction trial_restr, CeedElemRestriction test_restr, + CeedBasis trial_basis, CeedBasis test_basis, CeedVector geom_data, + CeedElemRestriction geom_data_restr, CeedOperator *op) const override; +}; + +// Base class for all discrete interpolators. +class DiscreteInterpolator +{ +public: + void Assemble(Ceed ceed, CeedElemRestriction trial_restr, CeedElemRestriction test_restr, + CeedBasis interp_basis, CeedOperator *op, CeedOperator *op_t); +}; + +// Interpolator for the identity map, where the domain space is a subspace of the range +// space (discrete embedding matrix). +using IdentityInterpolator = DiscreteInterpolator; + +// Interpolator for the discrete gradient map from an H1 space to an H(curl) space. +using GradientInterpolator = DiscreteInterpolator; + +// Interpolator for the discrete curl map from an H(curl) space to an H(div) space. +using CurlInterpolator = DiscreteInterpolator; + +// Interpolator for the discrete divergence map from an H(div) space to an L2 space. +using DivergenceInterpolator = DiscreteInterpolator; + +// Similar to MFEM's VectorFEBoundaryTangentLFIntegrator for ND spaces, but instead of +// computing (n x f, v), this just computes (f, v). Also eliminates the a and b quadrature +// parameters and uses fem::DefaultIntegrationOrder instead. +class VectorFEBoundaryLFIntegrator : public mfem::LinearFormIntegrator +{ +private: + mfem::VectorCoefficient &Q; + mfem::DenseMatrix vshape; + mfem::Vector f_loc, f_hat; + +public: + VectorFEBoundaryLFIntegrator(mfem::VectorCoefficient &QG) : Q(QG) {} + + void AssembleRHSElementVect(const mfem::FiniteElement &fe, mfem::ElementTransformation &T, + mfem::Vector &elvect) override; +}; + +// Similar to MFEM's BoundaryLFIntegrator for H1 spaces, but eliminates the a and b +// quadrature parameters and uses fem::DefaultIntegrationOrder instead. +class BoundaryLFIntegrator : public mfem::LinearFormIntegrator +{ +private: + mfem::Coefficient &Q; + mfem::Vector shape; + +public: + BoundaryLFIntegrator(mfem::Coefficient &QG) : Q(QG) {} + + void AssembleRHSElementVect(const mfem::FiniteElement &fe, mfem::ElementTransformation &T, + mfem::Vector &elvect) override; +}; + +using VectorFEDomainLFIntegrator = VectorFEBoundaryLFIntegrator; +using DomainLFIntegrator = BoundaryLFIntegrator; + +} // namespace palace + +#endif // PALACE_FEM_INTEGRATOR_HPP diff --git a/palace/fem/interpolator.cpp b/palace/fem/interpolator.cpp index 5458ba7303..b13eead4b5 100644 --- a/palace/fem/interpolator.cpp +++ b/palace/fem/interpolator.cpp @@ -1,109 +1,303 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 - -#include "interpolator.hpp" - -#include -#include "utils/communication.hpp" -#include "utils/iodata.hpp" - -namespace palace -{ - -#if defined(MFEM_USE_GSLIB) -InterpolationOperator::InterpolationOperator(const IoData &iodata, mfem::ParMesh &mesh) - : op(mesh.GetComm()) -#else -InterpolationOperator::InterpolationOperator(const IoData &iodata, mfem::ParMesh &mesh) -#endif -{ -#if defined(MFEM_USE_GSLIB) - // Set up probes interpolation. All processes search for all points. - if (iodata.domains.postpro.probe.empty()) - { - return; - } - const double bb_t = 0.1; // MFEM defaults - const double newton_tol = 1.0e-12; - const int npts = static_cast(iodata.domains.postpro.probe.size()); - MFEM_VERIFY( - mesh.Dimension() == mesh.SpaceDimension(), - "Probe postprocessing functionality requires mesh dimension == space dimension!"); - mfem::Vector xyz(npts * mesh.SpaceDimension()); - op_idx.resize(npts); - int i = 0; - for (const auto &[idx, data] : iodata.domains.postpro.probe) - { - // Use default ordering byNODES. - xyz(i) = data.x; - xyz(npts + i) = data.y; - if (mesh.SpaceDimension() == 3) - { - xyz(2 * npts + i) = data.z; - } - op_idx[i++] = idx; - } - op.Setup(mesh, bb_t, newton_tol, npts); - op.FindPoints(xyz, mfem::Ordering::byNODES); - op.SetDefaultInterpolationValue(0.0); - i = 0; - for (const auto &[idx, data] : iodata.domains.postpro.probe) - { - if (op.GetCode()[i++] == 2) - { - Mpi::Warning("Probe {:d} at ({:.3e}, {:.3e}, {:.3e}) m could not be found!\n" - "Using default value 0.0!\n", - idx, iodata.DimensionalizeValue(IoData::ValueType::LENGTH, data.x), - iodata.DimensionalizeValue(IoData::ValueType::LENGTH, data.y), - iodata.DimensionalizeValue(IoData::ValueType::LENGTH, data.z)); - } - } -#else - MFEM_VERIFY(iodata.domains.postpro.probe.empty(), - "InterpolationOperator class requires MFEM_USE_GSLIB!"); -#endif -} - -std::vector InterpolationOperator::ProbeField(const mfem::ParGridFunction &U) -{ -#if defined(MFEM_USE_GSLIB) - // Interpolated vector values are returned from GSLIB interpolator byNODES, which we - // transform to byVDIM for output. - const int npts = op.GetCode().Size(); - const int dim = U.VectorDim(); - std::vector vals(npts * dim); - mfem::Vector v(npts * dim); - op.Interpolate(U, v); - for (int d = 0; d < dim; d++) - { - for (int i = 0; i < npts; i++) - { - vals[i * dim + d] = v(d * npts + i); - } - } - return vals; -#else - MFEM_ABORT("InterpolationOperator class requires MFEM_USE_GSLIB!"); - return {}; -#endif -} - -std::vector> -InterpolationOperator::ProbeField(const mfem::ParComplexGridFunction &U, bool has_imaginary) -{ - std::vector vr = ProbeField(U.real()); - if (has_imaginary) - { - std::vector vi = ProbeField(U.imag()); - std::vector> vals(vr.size()); - std::transform(vr.begin(), vr.end(), vi.begin(), vals.begin(), - [](double xr, double xi) { return std::complex(xr, xi); }); - return vals; - } - else - { - return std::vector>(vr.begin(), vr.end()); - } -} - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#include "interpolator.hpp" + +#include +#include "fem/fespace.hpp" +#include "fem/gridfunction.hpp" +#include "utils/communication.hpp" +#include "utils/iodata.hpp" + +namespace palace +{ + +namespace +{ + +constexpr auto GSLIB_BB_TOL = 0.01; // MFEM defaults, slightly reduced bounding box +constexpr auto GSLIB_NEWTON_TOL = 1.0e-12; + +} // namespace +InterpolationOperator::InterpolationOperator(const IoData &iodata, + FiniteElementSpace &nd_space) +#if defined(MFEM_USE_GSLIB) + : op(nd_space.GetParMesh().GetComm()), v_dim_fes(nd_space.Get().GetVectorDim()) +{ + auto &mesh = nd_space.GetParMesh(); + // Set up probes interpolation. All processes search for all points. + if (iodata.domains.postpro.probe.empty()) + { + return; + } + const int dim = mesh.SpaceDimension(); + MFEM_VERIFY( + mesh.Dimension() == dim, + "Probe postprocessing functionality requires mesh dimension == space dimension!"); + const int npts = static_cast(iodata.domains.postpro.probe.size()); + mfem::Vector xyz(npts * dim); + op_idx.resize(npts); + int i = 0; + for (const auto &[idx, data] : iodata.domains.postpro.probe) + { + for (int d = 0; d < dim; d++) + { + // Use default ordering byNODES. + xyz(d * npts + i) = data.center[d]; + } + op_idx[i++] = idx; + } + op.Setup(mesh, GSLIB_BB_TOL, GSLIB_NEWTON_TOL, npts); + op.FindPoints(xyz, mfem::Ordering::byNODES); + op.SetDefaultInterpolationValue(0.0); + i = 0; + for (const auto &[idx, data] : iodata.domains.postpro.probe) + { + if (op.GetCode()[i++] == 2) + { + Mpi::Warning( + "Probe {:d} at ({:.3e}) m could not be found!\n Using default value 0.0!\n", idx, + fmt::join(iodata.units.Dimensionalize(data.center), + ", ")); + } + } +} +#else +{ + MFEM_CONTRACT_VAR(GSLIB_BB_TOL); + MFEM_CONTRACT_VAR(GSLIB_NEWTON_TOL); + MFEM_VERIFY(iodata.domains.postpro.probe.empty(), + "InterpolationOperator class requires MFEM_USE_GSLIB!"); +} +#endif + +std::vector InterpolationOperator::ProbeField(const mfem::ParGridFunction &U) +{ +#if defined(MFEM_USE_GSLIB) + // Interpolated vector values are returned from GSLIB interpolator with the same ordering + // as the source grid function, which we transform to byVDIM for output. + const int npts = op.GetCode().Size(); + const int vdim = U.VectorDim(); + std::vector vals(npts * vdim); + if (U.FESpace()->GetOrdering() == mfem::Ordering::byVDIM) + { + mfem::Vector v(vals.data(), npts * vdim); + op.Interpolate(U, v); + } + else + { + mfem::Vector v(npts * vdim); + op.Interpolate(U, v); + for (int d = 0; d < vdim; d++) + { + for (int i = 0; i < npts; i++) + { + vals[i * vdim + d] = v(d * npts + i); + } + } + } + return vals; +#else + MFEM_ABORT("InterpolationOperator class requires MFEM_USE_GSLIB!"); + return {}; +#endif +} + +std::vector> InterpolationOperator::ProbeField(const GridFunction &U) +{ + std::vector vr = ProbeField(U.Real()); + if (U.HasImag()) + { + std::vector vi = ProbeField(U.Imag()); + std::vector> vals(vr.size()); + std::transform(vr.begin(), vr.end(), vi.begin(), vals.begin(), + [](double xr, double xi) { return std::complex(xr, xi); }); + return vals; + } + else + { + return {vr.begin(), vr.end()}; + } +} + +namespace fem +{ + +void InterpolateFunction(const mfem::GridFunction &U, mfem::GridFunction &V) +{ +#if defined(MFEM_USE_GSLIB) + // Generate list of points where the grid function will be evaluated. If the grid function + // to interpolate is an H1 space of the same order as the mesh nodes, we can use the + // mesh node points directly. 
Otherwise, for a different basis order or type, we generate + // the interpolation points in the physical space manually. + auto &dest_mesh = *V.FESpace()->GetMesh(); + MFEM_VERIFY(dest_mesh.GetNodes(), "Destination mesh has no nodal FE space!"); + const int dim = dest_mesh.SpaceDimension(); + mfem::Vector xyz; + mfem::Ordering::Type ordering; + const auto *dest_fec_h1 = + dynamic_cast(V.FESpace()->FEColl()); + const auto *dest_nodes_h1 = dynamic_cast( + dest_mesh.GetNodes()->FESpace()->FEColl()); + int dest_fespace_order = V.FESpace()->GetMaxElementOrder(); + int dest_nodes_order = dest_mesh.GetNodes()->FESpace()->GetMaxElementOrder(); + dest_mesh.GetNodes()->HostRead(); + if (dest_fec_h1 && dest_nodes_h1 && dest_fespace_order == dest_nodes_order) + { + xyz.MakeRef(*dest_mesh.GetNodes(), 0, dest_mesh.GetNodes()->Size()); + ordering = dest_mesh.GetNodes()->FESpace()->GetOrdering(); + } + else + { + int npts = 0, offset = 0; + for (int i = 0; i < dest_mesh.GetNE(); i++) + { + npts += V.FESpace()->GetFE(i)->GetNodes().GetNPoints(); + } + xyz.SetSize(npts * dim); + mfem::DenseMatrix pointmat; + for (int i = 0; i < dest_mesh.GetNE(); i++) + { + const mfem::FiniteElement &fe = *V.FESpace()->GetFE(i); + mfem::ElementTransformation &T = *dest_mesh.GetElementTransformation(i); + T.Transform(fe.GetNodes(), pointmat); + for (int j = 0; j < pointmat.Width(); j++) + { + for (int d = 0; d < dim; d++) + { + // Use default ordering byNODES. + xyz(d * npts + offset + j) = pointmat(d, j); + } + } + offset += pointmat.Width(); + } + ordering = mfem::Ordering::byNODES; + } + const int npts = xyz.Size() / dim; + + // Set up the interpolator. + auto &src_mesh = *U.FESpace()->GetMesh(); + MFEM_VERIFY(src_mesh.GetNodes(), "Source mesh has no nodal FE space!"); + auto *src_pmesh = dynamic_cast(&src_mesh); + MPI_Comm comm = (src_pmesh) ? src_pmesh->GetComm() : MPI_COMM_SELF; + mfem::FindPointsGSLIB op(comm); + op.Setup(src_mesh, GSLIB_BB_TOL, GSLIB_NEWTON_TOL, npts); + + // Perform the interpolation and fill the target GridFunction (see MFEM's field-interp + // miniapp). + const int vdim = U.VectorDim(); + mfem::Vector vals(npts * vdim); + op.SetDefaultInterpolationValue(0.0); + op.SetL2AvgType(mfem::FindPointsGSLIB::NONE); + op.Interpolate(xyz, U, vals, ordering); + const auto *dest_fec_l2 = + dynamic_cast(V.FESpace()->FEColl()); + if (dest_fec_h1 || dest_fec_l2) + { + if (dest_fec_h1 && dest_fespace_order != dest_nodes_order) + { + // H1 with order != mesh order needs to handle duplicated interpolation points. + mfem::Vector elem_vals; + mfem::Array vdofs; + int offset = 0; + for (int i = 0; i < dest_mesh.GetNE(); i++) + { + const mfem::FiniteElement &fe = *V.FESpace()->GetFE(i); + const int elem_npts = fe.GetNodes().GetNPoints(); + elem_vals.SetSize(elem_npts * vdim); + for (int d = 0; d < vdim; d++) + { + for (int j = 0; j < elem_npts; j++) + { + // Arrange element values byNODES to align with GetElementVDofs. + int idx = (U.FESpace()->GetOrdering() == mfem::Ordering::byNODES) + ? d * npts + offset + j + : (offset + j) * vdim + d; + elem_vals(d * elem_npts + j) = vals(idx); + } + } + const auto *dof_trans = V.FESpace()->GetElementVDofs(i, vdofs); + if (dof_trans) + { + dof_trans->TransformPrimal(elem_vals); + } + V.SetSubVector(vdofs, elem_vals); + offset += elem_npts; + } + } + else + { + // Otherwise, H1 and L2 copy interpolated values to vdofs. 
+ MFEM_ASSERT(V.Size() == vals.Size(), + "Unexpected size mismatch for interpolated values and grid function!"); + V = vals; + } + } + else + { + // H(div) or H(curl) use ProjectFromNodes. + mfem::Vector elem_vals, v; + mfem::Array vdofs; + int offset = 0; + for (int i = 0; i < dest_mesh.GetNE(); i++) + { + const mfem::FiniteElement &fe = *V.FESpace()->GetFE(i); + mfem::ElementTransformation &T = *dest_mesh.GetElementTransformation(i); + const int elem_npts = fe.GetNodes().GetNPoints(); + elem_vals.SetSize(elem_npts * vdim); + for (int d = 0; d < vdim; d++) + { + for (int j = 0; j < elem_npts; j++) + { + // Arrange element values byVDIM for ProjectFromNodes. + int idx = (U.FESpace()->GetOrdering() == mfem::Ordering::byNODES) + ? d * npts + offset + j + : (offset + j) * vdim + d; + elem_vals(j * vdim + d) = vals(idx); + } + } + const auto *dof_trans = V.FESpace()->GetElementVDofs(i, vdofs); + v.SetSize(vdofs.Size()); + fe.ProjectFromNodes(elem_vals, T, v); + if (dof_trans) + { + dof_trans->TransformPrimal(v); + } + V.SetSubVector(vdofs, v); + offset += elem_npts; + } + } +#else + MFEM_ABORT("InterpolateFunction requires MFEM_USE_GSLIB!"); +#endif +} + +void InterpolateFunction(const mfem::Vector &xyz, const mfem::GridFunction &U, + mfem::Vector &vals, mfem::Ordering::Type ordering) +{ +#if defined(MFEM_USE_GSLIB) + // Set up the interpolator. + auto &src_mesh = *U.FESpace()->GetMesh(); + MFEM_VERIFY(src_mesh.GetNodes(), "Source mesh has no nodal FE space!"); + const int dim = src_mesh.SpaceDimension(); + const int npts = xyz.Size() / dim; + auto *src_pmesh = dynamic_cast(&src_mesh); + MPI_Comm comm = (src_pmesh) ? src_pmesh->GetComm() : MPI_COMM_SELF; + mfem::FindPointsGSLIB op(comm); + op.Setup(src_mesh, GSLIB_BB_TOL, GSLIB_NEWTON_TOL, npts); + + // Perform the interpolation, with the ordering of the returned values matching the + // ordering of the source grid function. + const int vdim = U.VectorDim(); + MFEM_VERIFY(vals.Size() == npts * vdim, "Incorrect size for interpolated values vector!"); + op.SetDefaultInterpolationValue(0.0); + op.SetL2AvgType(mfem::FindPointsGSLIB::NONE); + op.Interpolate(xyz, U, vals, ordering); +#else + MFEM_ABORT("InterpolateFunction requires MFEM_USE_GSLIB!"); +#endif +} + +} // namespace fem + +} // namespace palace diff --git a/palace/fem/interpolator.hpp b/palace/fem/interpolator.hpp index 9b5ec96854..15602042e3 100644 --- a/palace/fem/interpolator.hpp +++ b/palace/fem/interpolator.hpp @@ -1,40 +1,59 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_FEM_INTERPOLATOR_HPP -#define PALACE_FEM_INTERPOLATOR_HPP - -#include -#include -#include - -namespace palace -{ - -class IoData; - -// -// A class which wraps MFEM's GSLIB interface for high-order field interpolation. -// -class InterpolationOperator -{ -private: -#if defined(MFEM_USE_GSLIB) - mfem::FindPointsGSLIB op; -#endif - std::vector op_idx; - -public: - InterpolationOperator(const IoData &iodata, mfem::ParMesh &mesh); - - const auto &GetProbes() const { return op_idx; } - - std::vector ProbeField(const mfem::ParGridFunction &U); - - std::vector> ProbeField(const mfem::ParComplexGridFunction &U, - bool has_imaginary); -}; - -} // namespace palace - -#endif // PALACE_FEM_INTERPOLATOR_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_FEM_INTERPOLATOR_HPP +#define PALACE_FEM_INTERPOLATOR_HPP + +#include +#include +#include + +namespace palace +{ + +class GridFunction; +class IoData; +class FiniteElementSpace; + +// +// A class which wraps MFEM's GSLIB interface for high-order field interpolation. +// +class InterpolationOperator +{ +private: +#if defined(MFEM_USE_GSLIB) + mfem::FindPointsGSLIB op; +#endif + std::vector op_idx; + + int v_dim_fes; // dimension of interpolated vector from NDSpace + + std::vector ProbeField(const mfem::ParGridFunction &U); + +public: + InterpolationOperator(const IoData &iodata, FiniteElementSpace &nd_space); + + auto GetVDim() const { return v_dim_fes; } + const auto &GetProbes() const { return op_idx; } + + std::vector> ProbeField(const GridFunction &U); +}; + +namespace fem +{ + +// Interpolate a function on a serial or parallel mesh to a different mesh, using GSLIB. +// Similar to MFEM's field-interp miniapp. +void InterpolateFunction(const mfem::GridFunction &U, mfem::GridFunction &V); + +// Interpolate a function at a specific list of points, specified using the provided +// ordering. The output vector values are always arranged byVDIM. +void InterpolateFunction(const mfem::Vector &xyz, const mfem::GridFunction &U, + mfem::Vector &V, + mfem::Ordering::Type ordering = mfem::Ordering::byNODES); + +} // namespace fem + +} // namespace palace + +#endif // PALACE_FEM_INTERPOLATOR_HPP diff --git a/palace/fem/libceed/basis.cpp b/palace/fem/libceed/basis.cpp index 72e8d6288f..003ed4f63f 100644 --- a/palace/fem/libceed/basis.cpp +++ b/palace/fem/libceed/basis.cpp @@ -1,295 +1,217 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "basis.hpp" - -#include "fem/libceed/hash.hpp" -#include "fem/libceed/utils.hpp" -#include "utils/omp.hpp" - -namespace palace::ceed -{ - -namespace internal -{ - -static std::unordered_map basis_map; -static std::unordered_map interp_basis_map; - -void ClearBasisCache() -{ - for (auto [k, v] : basis_map) - { - Ceed ceed; - PalaceCeedCallBackend(CeedBasisGetCeed(v, &ceed)); - PalaceCeedCall(ceed, CeedBasisDestroy(&v)); - } - for (auto [k, v] : interp_basis_map) - { - Ceed ceed; - PalaceCeedCallBackend(CeedBasisGetCeed(v, &ceed)); - PalaceCeedCall(ceed, CeedBasisDestroy(&v)); - } - basis_map.clear(); - interp_basis_map.clear(); -} - -} // namespace internal - -namespace -{ - -inline CeedElemTopology GetCeedTopology(mfem::Geometry::Type geom) -{ - switch (geom) - { - case mfem::Geometry::SEGMENT: - return CEED_TOPOLOGY_LINE; - case mfem::Geometry::TRIANGLE: - return CEED_TOPOLOGY_TRIANGLE; - case mfem::Geometry::SQUARE: - return CEED_TOPOLOGY_QUAD; - case mfem::Geometry::TETRAHEDRON: - return CEED_TOPOLOGY_TET; - case mfem::Geometry::CUBE: - return CEED_TOPOLOGY_HEX; - case mfem::Geometry::PRISM: - return CEED_TOPOLOGY_PRISM; - case mfem::Geometry::PYRAMID: - return CEED_TOPOLOGY_PYRAMID; - default: - MFEM_ABORT("This type of element is not supported!"); - return CEED_TOPOLOGY_LINE; // Silence compiler warning - } -} - -void InitTensorBasis(const mfem::ParFiniteElementSpace &fespace, - const mfem::FiniteElement &fe, const mfem::IntegrationRule &ir, - Ceed ceed, CeedBasis *basis) -{ - const mfem::DofToQuad &maps = fe.GetDofToQuad(ir, mfem::DofToQuad::TENSOR); - const int dim = fe.GetDim(); - const int ncomp = fespace.GetVDim(); - const int P = maps.ndof; - const int Q = maps.nqpt; - mfem::Vector qX(Q), qW(Q); - // The x-coordinates of the 
first `Q` points of the integration rule are the points of - // the corresponding 1D rule. We also scale the weights accordingly. - double w_sum = 0.0; - for (int i = 0; i < Q; i++) - { - const mfem::IntegrationPoint &ip = ir.IntPoint(i); - qX(i) = ip.x; - qW(i) = ip.weight; - w_sum += ip.weight; - } - qW *= 1.0 / w_sum; - PalaceCeedCall(ceed, CeedBasisCreateTensorH1(ceed, dim, ncomp, P, Q, maps.Bt.GetData(), - maps.Gt.GetData(), qX.GetData(), - qW.GetData(), basis)); -} - -void InitNonTensorBasis(const mfem::ParFiniteElementSpace &fespace, - const mfem::FiniteElement &fe, const mfem::IntegrationRule &ir, - Ceed ceed, CeedBasis *basis) -{ - const mfem::DofToQuad &maps = fe.GetDofToQuad(ir, mfem::DofToQuad::FULL); - const int dim = fe.GetDim(); - const int ncomp = fespace.GetVDim(); - const int P = maps.ndof; - const int Q = maps.nqpt; - mfem::DenseMatrix qX(dim, Q); - mfem::Vector qW(Q); - for (int i = 0; i < Q; i++) - { - const mfem::IntegrationPoint &ip = ir.IntPoint(i); - qX(0, i) = ip.x; - if (dim > 1) - { - qX(1, i) = ip.y; - } - if (dim > 2) - { - qX(2, i) = ip.z; - } - qW(i) = ip.weight; - } - if (fe.GetMapType() == mfem::FiniteElement::H_DIV) - { - PalaceCeedCall(ceed, CeedBasisCreateHdiv(ceed, GetCeedTopology(fe.GetGeomType()), ncomp, - P, Q, maps.Bt.GetData(), maps.Gt.GetData(), - qX.GetData(), qW.GetData(), basis)); - } - else if (fe.GetMapType() == mfem::FiniteElement::H_CURL) - { - PalaceCeedCall(ceed, - CeedBasisCreateHcurl(ceed, GetCeedTopology(fe.GetGeomType()), ncomp, P, - Q, maps.Bt.GetData(), maps.Gt.GetData(), - qX.GetData(), qW.GetData(), basis)); - } - else - { - PalaceCeedCall(ceed, CeedBasisCreateH1(ceed, GetCeedTopology(fe.GetGeomType()), ncomp, - P, Q, maps.Bt.GetData(), maps.Gt.GetData(), - qX.GetData(), qW.GetData(), basis)); - } -} - -#if 0 -void InitCeedInterpolatorBasis(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::FiniteElement &trial_fe, - const mfem::FiniteElement &test_fe, - Ceed ceed, - CeedBasis *basis) -{ - // Basis projection operator using libCEED - CeedBasis trial_basis, test_basis; - const int P = std::max(trial_fe.GetDof(), test_fe.GetDof()), ir_order_max = 100; - int ir_order = std::max(trial_fe.GetOrder(), test_fe.GetOrder()); - for (; ir_order < ir_order_max; ir_order++) - { - if (IntRules.Get(trial_fe.GetGeomType(), ir_order).GetNPoints() >= P) { break; } - } - const mfem::IntegrationRule &ir = IntRules.Get(trial_fe.GetGeomType(), ir_order); - InitBasis(trial_fespace, trial_fe, ir, ceed, &trial_basis); - InitBasis(test_fespace, test_fe, ir, ceed, &test_basis); - PalaceCeedCall(ceed, CeedBasisCreateProjection(trial_basis, test_basis, basis)); -} -#endif - -void InitMFEMInterpolatorBasis(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::FiniteElement &trial_fe, - const mfem::FiniteElement &test_fe, Ceed ceed, - CeedBasis *basis) -{ - MFEM_VERIFY( - trial_fespace.GetVDim() == test_fespace.GetVDim(), - "libCEED discrete linear operator requires same vdim for trial and test FE spaces!"); - const int dim = trial_fe.GetDim(); - const int ncomp = trial_fespace.GetVDim(); - const int trial_P = trial_fe.GetDof(); - const int test_P = test_fe.GetDof(); - mfem::DenseMatrix qX(dim, test_P), Gt(trial_P, test_P * dim), Bt; - mfem::Vector qW(test_P); - mfem::IsoparametricTransformation dummy; - dummy.SetIdentityTransformation(trial_fe.GetGeomType()); - if (trial_fe.GetMapType() == test_fe.GetMapType()) - { - // Prolongation - 
test_fe.GetTransferMatrix(trial_fe, dummy, Bt); - } - else if (trial_fe.GetMapType() == mfem::FiniteElement::VALUE && - test_fe.GetMapType() == mfem::FiniteElement::H_CURL) - { - // Discrete gradient interpolator - test_fe.ProjectGrad(trial_fe, dummy, Bt); - } - else if (trial_fe.GetMapType() == mfem::FiniteElement::H_CURL && - test_fe.GetMapType() == mfem::FiniteElement::H_DIV) - { - // Discrete curl interpolator - test_fe.ProjectCurl(trial_fe, dummy, Bt); - } - else if (trial_fe.GetMapType() == mfem::FiniteElement::H_DIV && - test_fe.GetMapType() == mfem::FiniteElement::INTEGRAL) - { - // Discrete divergence interpolator - test_fe.ProjectDiv(trial_fe, dummy, Bt); - } - else - { - MFEM_ABORT("Unsupported trial/test FE spaces for libCEED discrete linear operator!"); - } - Bt.Transpose(); - Gt = 0.0; - qX = 0.0; - qW = 0.0; - PalaceCeedCall(ceed, CeedBasisCreateH1(ceed, GetCeedTopology(trial_fe.GetGeomType()), - ncomp, trial_P, test_P, Bt.GetData(), Gt.GetData(), - qX.GetData(), qW.GetData(), basis)); -} - -} // namespace - -void InitBasis(const mfem::ParFiniteElementSpace &fespace, const mfem::FiniteElement &fe, - const mfem::IntegrationRule &ir, Ceed ceed, CeedBasis *basis) -{ - // Check for fespace -> basis in hash table. - internal::BasisKey key(ceed, fespace, fe, ir); - - // Initialize or retrieve key values (avoid simultaneous search and write). - auto basis_itr = internal::basis_map.end(); - PalacePragmaOmp(critical(InitBasis)) - { - basis_itr = internal::basis_map.find(key); - } - if (basis_itr == internal::basis_map.end()) - { - const bool tensor = dynamic_cast(&fe) != nullptr; - const bool vector = fe.GetRangeType() == mfem::FiniteElement::VECTOR; - if (tensor && !vector) - { - InitTensorBasis(fespace, fe, ir, ceed, basis); - } - else - { - InitNonTensorBasis(fespace, fe, ir, ceed, basis); - } - PalacePragmaOmp(critical(InitBasis)) - { - internal::basis_map[key] = *basis; - } - // std::cout << "New basis (" << ceed << ", " << &fe << ", " << &ir << ")\n"; - } - else - { - *basis = basis_itr->second; - // std::cout << "Reusing basis (" << ceed << ", " << &fe << ", " << &ir << ")\n"; - } -} - -void InitInterpolatorBasis(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::FiniteElement &trial_fe, - const mfem::FiniteElement &test_fe, Ceed ceed, CeedBasis *basis) -{ - // Check for fespace -> basis in hash table. - internal::InterpBasisKey key(ceed, trial_fespace, test_fespace, trial_fe, test_fe); - - // Initialize or retrieve key values (avoid simultaneous search and write). - auto basis_itr = internal::interp_basis_map.end(); - PalacePragmaOmp(critical(InitInterpBasis)) - { - basis_itr = internal::interp_basis_map.find(key); - } - if (basis_itr == internal::interp_basis_map.end()) - { -#if 0 - if (trial_fe.GetMapType() == test_fe.GetMapType()) - { - InitCeedInterpolatorBasis(trial_fespace, test_fespace, trial_fe, test_fe, ceed, basis); - } - else -#endif - { - InitMFEMInterpolatorBasis(trial_fespace, test_fespace, trial_fe, test_fe, ceed, - basis); - } - PalacePragmaOmp(critical(InitInterpBasis)) - { - internal::interp_basis_map[key] = *basis; - } - // std::cout << "New interpolator basis (" << ceed << ", " << &trial_fe - // << ", " << &test_fe << ")\n"; - } - else - { - *basis = basis_itr->second; - // std::cout << "Reusing interpolator basis (" << ceed << ", " << &trial_fe - // << ", " << &test_fe << ")\n"; - } -} - -} // namespace palace::ceed +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#include "basis.hpp" + +#include +#include "utils/diagnostic.hpp" + +namespace palace::ceed +{ + +namespace +{ + +void InitTensorBasis(const mfem::FiniteElement &fe, const mfem::IntegrationRule &ir, + CeedInt num_comp, Ceed ceed, CeedBasis *basis) +{ + // The x-coordinates of the first `Q` points of the integration rule are the points of + // the corresponding 1D rule. We also scale the weights accordingly. + const mfem::DofToQuad &maps = fe.GetDofToQuad(ir, mfem::DofToQuad::TENSOR); + const int dim = fe.GetDim(); + const int P = maps.ndof; + const int Q = maps.nqpt; + mfem::Vector qX(Q), qW(Q); + double w_sum = 0.0; + for (int i = 0; i < Q; i++) + { + const mfem::IntegrationPoint &ip = ir.IntPoint(i); + qX(i) = ip.x; + qW(i) = ip.weight; + w_sum += ip.weight; + } + qW *= 1.0 / w_sum; + + PalaceCeedCall(ceed, CeedBasisCreateTensorH1(ceed, dim, num_comp, P, Q, maps.Bt.GetData(), + maps.Gt.GetData(), qX.GetData(), + qW.GetData(), basis)); +} + +void InitNonTensorBasis(const mfem::FiniteElement &fe, const mfem::IntegrationRule &ir, + CeedInt num_comp, Ceed ceed, CeedBasis *basis) +{ + const mfem::DofToQuad &maps = fe.GetDofToQuad(ir, mfem::DofToQuad::FULL); + const int dim = fe.GetDim(); + const int P = maps.ndof; + const int Q = maps.nqpt; + mfem::DenseMatrix qX(dim, Q); + mfem::Vector qW(Q); + for (int i = 0; i < Q; i++) + { + const mfem::IntegrationPoint &ip = ir.IntPoint(i); + qX(0, i) = ip.x; + if (dim > 1) + { + qX(1, i) = ip.y; + } + if (dim > 2) + { + qX(2, i) = ip.z; + } + qW(i) = ip.weight; + } + + if (fe.GetMapType() == mfem::FiniteElement::H_DIV) + { + PalaceCeedCall(ceed, + CeedBasisCreateHdiv(ceed, GetCeedTopology(fe.GetGeomType()), num_comp, P, + Q, maps.Bt.GetData(), maps.Gt.GetData(), + qX.GetData(), qW.GetData(), basis)); + } + else if (fe.GetMapType() == mfem::FiniteElement::H_CURL) + { + PalaceCeedCall(ceed, + CeedBasisCreateHcurl(ceed, GetCeedTopology(fe.GetGeomType()), num_comp, + P, Q, maps.Bt.GetData(), maps.Gt.GetData(), + qX.GetData(), qW.GetData(), basis)); + } + else + { + PalaceCeedCall(ceed, + CeedBasisCreateH1(ceed, GetCeedTopology(fe.GetGeomType()), num_comp, P, + Q, maps.Bt.GetData(), maps.Gt.GetData(), qX.GetData(), + qW.GetData(), basis)); + } +} + +PalacePragmaDiagnosticPush +PalacePragmaDiagnosticDisableUnused + +void InitCeedInterpolatorBasis(const mfem::FiniteElement &trial_fe, + const mfem::FiniteElement &test_fe, CeedInt trial_num_comp, + CeedInt test_num_comp, Ceed ceed, CeedBasis *basis) +{ + // Basis projection operator using libCEED. 
+ CeedBasis trial_basis, test_basis; + const int P = std::max(trial_fe.GetDof(), test_fe.GetDof()), ir_order_max = 100; + int ir_order = std::max(trial_fe.GetOrder(), test_fe.GetOrder()); + for (; ir_order < ir_order_max; ir_order++) + { + if (mfem::IntRules.Get(trial_fe.GetGeomType(), ir_order).GetNPoints() >= P) + { + break; + } + } + const mfem::IntegrationRule &ir = mfem::IntRules.Get(trial_fe.GetGeomType(), ir_order); + + InitBasis(trial_fe, ir, trial_num_comp, ceed, &trial_basis); + InitBasis(test_fe, ir, test_num_comp, ceed, &test_basis); + PalaceCeedCall(ceed, CeedBasisCreateProjection(trial_basis, test_basis, basis)); + PalaceCeedCall(ceed, CeedBasisDestroy(&trial_basis)); + PalaceCeedCall(ceed, CeedBasisDestroy(&test_basis)); +} + +PalacePragmaDiagnosticPop + +void InitMfemInterpolatorBasis(const mfem::FiniteElement &trial_fe, + const mfem::FiniteElement &test_fe, CeedInt trial_num_comp, + CeedInt test_num_comp, Ceed ceed, CeedBasis *basis) +{ + MFEM_VERIFY(trial_num_comp == test_num_comp && trial_num_comp == 1, + "libCEED discrete linear operator requires same vdim = 1 for trial and test " + "FE spaces!"); + const int trial_P = trial_fe.GetDof(); + const int test_P = test_fe.GetDof(); + mfem::DenseMatrix Bt, Gt(trial_P, test_P); + mfem::Vector qX(test_P), qW(test_P); + mfem::IsoparametricTransformation dummy; + dummy.SetIdentityTransformation(trial_fe.GetGeomType()); + if (trial_fe.GetMapType() == test_fe.GetMapType()) + { + // Prolongation. + test_fe.GetTransferMatrix(trial_fe, dummy, Bt); + } + else if (trial_fe.GetMapType() == mfem::FiniteElement::VALUE && + test_fe.GetMapType() == mfem::FiniteElement::H_CURL) + { + // Discrete gradient interpolator. + test_fe.ProjectGrad(trial_fe, dummy, Bt); + } + else if (trial_fe.GetMapType() == mfem::FiniteElement::H_CURL && + test_fe.GetMapType() == mfem::FiniteElement::H_DIV) + { + // Discrete curl interpolator. + test_fe.ProjectCurl(trial_fe, dummy, Bt); + } + else if (trial_fe.GetMapType() == mfem::FiniteElement::H_DIV && + test_fe.GetMapType() == mfem::FiniteElement::INTEGRAL) + { + // Discrete divergence interpolator. + test_fe.ProjectDiv(trial_fe, dummy, Bt); + } + else + { + MFEM_ABORT("Unsupported trial/test FE spaces for libCEED discrete linear operator!"); + } + Bt.Transpose(); + Gt = 0.0; + qX = 0.0; + qW = 0.0; + + // Note: ceed::GetCeedTopology(CEED_TOPOLOGY_LINE) == 1. 
+ PalaceCeedCall(ceed, CeedBasisCreateH1(ceed, CEED_TOPOLOGY_LINE, trial_num_comp, trial_P, + test_P, Bt.GetData(), Gt.GetData(), qX.GetData(), + qW.GetData(), basis)); +} + +} // namespace + +void InitBasis(const mfem::FiniteElement &fe, const mfem::IntegrationRule &ir, + CeedInt num_comp, Ceed ceed, CeedBasis *basis) +{ + if constexpr (false) + { + std::cout << "New basis (" << ceed << ", " << &fe << ", " << &ir << ")\n"; + } + const bool tensor = dynamic_cast(&fe) != nullptr; + const bool vector = fe.GetRangeType() == mfem::FiniteElement::VECTOR; + if (tensor && !vector) + { + InitTensorBasis(fe, ir, num_comp, ceed, basis); + } + else + { + InitNonTensorBasis(fe, ir, num_comp, ceed, basis); + } +} + +void InitInterpolatorBasis(const mfem::FiniteElement &trial_fe, + const mfem::FiniteElement &test_fe, CeedInt trial_num_comp, + CeedInt test_num_comp, Ceed ceed, CeedBasis *basis) +{ + if constexpr (false) + { + std::cout << "New interpolator basis (" << ceed << ", " << &trial_fe << ", " << &test_fe + << ", " << (trial_fe.GetMapType() == test_fe.GetMapType()) << ")\n"; + } + if constexpr (false) + { + if (trial_fe.GetMapType() == test_fe.GetMapType()) + { + InitCeedInterpolatorBasis(trial_fe, test_fe, trial_num_comp, test_num_comp, ceed, + basis); + } + else + { + InitMfemInterpolatorBasis(trial_fe, test_fe, trial_num_comp, test_num_comp, ceed, + basis); + } + } + else + { + InitMfemInterpolatorBasis(trial_fe, test_fe, trial_num_comp, test_num_comp, ceed, + basis); + } +} + +} // namespace palace::ceed diff --git a/palace/fem/libceed/basis.hpp b/palace/fem/libceed/basis.hpp index d0e2149460..39e94b4b21 100644 --- a/palace/fem/libceed/basis.hpp +++ b/palace/fem/libceed/basis.hpp @@ -1,52 +1,29 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LIBCEED_BASIS_HPP -#define PALACE_LIBCEED_BASIS_HPP - -#include -#include -#include -#include - -namespace palace::ceed -{ - -void InitBasis(const mfem::ParFiniteElementSpace &fespace, const mfem::FiniteElement &fe, - const mfem::IntegrationRule &ir, Ceed ceed, CeedBasis *basis); - -inline void InitBasis(const mfem::ParFiniteElementSpace &fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, - bool use_bdr, Ceed ceed, CeedBasis *basis) -{ - const mfem::FiniteElement &fe = - use_bdr ? *fespace.GetBE(indices[0]) : *fespace.GetFE(indices[0]); - InitBasis(fespace, fe, ir, ceed, basis); -} - -void InitInterpolatorBasis(const mfem::ParFiniteElementSpace &trial_fes, - const mfem::ParFiniteElementSpace &test_fes, - const mfem::FiniteElement &trial_fe, - const mfem::FiniteElement &test_fe, Ceed ceed, CeedBasis *basis); - -inline void InitInterpolatorBasis(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const std::vector &indices, Ceed ceed, - CeedBasis *basis) -{ - const mfem::FiniteElement &trial_fe = *trial_fespace.GetFE(indices[0]); - const mfem::FiniteElement &test_fe = *test_fespace.GetFE(indices[0]); - InitInterpolatorBasis(trial_fespace, test_fespace, trial_fe, test_fe, ceed, basis); -} - -namespace internal -{ - -// Destroy the cached CeedBasis objects. -void ClearBasisCache(); - -} // namespace internal - -} // namespace palace::ceed - -#endif // PALACE_LIBCEED_BASIS_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_BASIS_HPP +#define PALACE_LIBCEED_BASIS_HPP + +#include "fem/libceed/ceed.hpp" + +namespace mfem +{ + +class FiniteElement; +class IntegrationRule; + +} // namespace mfem + +namespace palace::ceed +{ + +void InitBasis(const mfem::FiniteElement &fe, const mfem::IntegrationRule &ir, int num_comp, + Ceed ceed, CeedBasis *basis); + +void InitInterpolatorBasis(const mfem::FiniteElement &trial_fe, + const mfem::FiniteElement &test_fe, int trial_num_comp, + int test_num_comp, Ceed ceed, CeedBasis *basis); + +} // namespace palace::ceed + +#endif // PALACE_LIBCEED_BASIS_HPP diff --git a/palace/fem/libceed/ceed.cpp b/palace/fem/libceed/ceed.cpp new file mode 100644 index 0000000000..d7c3ab9840 --- /dev/null +++ b/palace/fem/libceed/ceed.cpp @@ -0,0 +1,153 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "ceed.hpp" + +#include +#include "utils/omp.hpp" + +namespace palace::ceed +{ + +namespace internal +{ + +static std::vector ceeds; + +const std::vector &GetCeedObjects() +{ + return ceeds; +} + +std::size_t NumCeeds() +{ + return GetCeedObjects().size(); +} + +} // namespace internal + +void Initialize(const char *resource, const char *jit_source_dir) +{ + PalacePragmaOmp(parallel) + { + PalacePragmaOmp(master) + { + // Only parallelize libCEED operators over threads when not using the GPU. + const int nt = !std::string_view(resource).compare(0, 4, "/cpu") + ? utils::GetNumActiveThreads() + : 1; + internal::ceeds.resize(nt, nullptr); + } + } + + // Master thread initializes all Ceed objects (ineherently sequential anyway due to shared + // resources). + for (std::size_t i = 0; i < internal::ceeds.size(); i++) + { + int ierr = CeedInit(resource, &internal::ceeds[i]); + MFEM_VERIFY(!ierr, "Failed to initialize libCEED with resource " << resource << "!"); + Ceed ceed = internal::ceeds[i]; + + // Configure error handling (allow errors to be handled by PalaceCeedCallBackend or + // PalaceCeedCall). + PalaceCeedCall(ceed, CeedSetErrorHandler(ceed, CeedErrorStore)); + + // Configure QFunction search path. + if (jit_source_dir) + { + PalaceCeedCall(ceed, CeedAddJitSourceRoot(ceed, jit_source_dir)); + } + } +} + +void Finalize() +{ + // Destroy Ceed context(s). 
+ for (std::size_t i = 0; i < internal::ceeds.size(); i++) + { + int ierr = CeedDestroy(&internal::ceeds[i]); + MFEM_VERIFY(!ierr, "Failed to finalize libCEED!"); + } + internal::ceeds.clear(); +} + +std::string Print() +{ + MFEM_VERIFY(internal::GetCeedObjects().size() > 0, + "libCEED must be initialized before querying the active backend!"); + Ceed ceed = internal::GetCeedObjects()[0]; + const char *ceed_resource; + PalaceCeedCall(ceed, CeedGetResource(ceed, &ceed_resource)); + return std::string(ceed_resource); +} + +void InitCeedVector(const mfem::Vector &v, Ceed ceed, CeedVector *cv, bool init) +{ + CeedMemType mem; + PalaceCeedCall(ceed, CeedGetPreferredMemType(ceed, &mem)); + if (!mfem::Device::Allows(mfem::Backend::DEVICE_MASK) && mem == CEED_MEM_DEVICE) + { + mem = CEED_MEM_HOST; + } + const auto *data = v.Read(mem == CEED_MEM_DEVICE); + if (init) + { + PalaceCeedCall(ceed, CeedVectorCreate(ceed, v.Size(), cv)); + } + else + { + PalaceCeedCall(ceed, CeedVectorTakeArray(*cv, mem, nullptr)); + } + PalaceCeedCall( + ceed, CeedVectorSetArray(*cv, mem, CEED_USE_POINTER, const_cast(data))); +} + +CeedElemTopology GetCeedTopology(mfem::Geometry::Type geom) +{ + switch (geom) + { + case mfem::Geometry::SEGMENT: + return CEED_TOPOLOGY_LINE; + case mfem::Geometry::TRIANGLE: + return CEED_TOPOLOGY_TRIANGLE; + case mfem::Geometry::SQUARE: + return CEED_TOPOLOGY_QUAD; + case mfem::Geometry::TETRAHEDRON: + return CEED_TOPOLOGY_TET; + case mfem::Geometry::CUBE: + return CEED_TOPOLOGY_HEX; + case mfem::Geometry::PRISM: + return CEED_TOPOLOGY_PRISM; + case mfem::Geometry::PYRAMID: + return CEED_TOPOLOGY_PYRAMID; + default: + MFEM_ABORT("This type of element is not supported!"); + return CEED_TOPOLOGY_LINE; // Silence compiler warning + } +} + +mfem::Geometry::Type GetMfemTopology(CeedElemTopology geom) +{ + switch (geom) + { + case CEED_TOPOLOGY_LINE: + return mfem::Geometry::SEGMENT; + case CEED_TOPOLOGY_TRIANGLE: + return mfem::Geometry::TRIANGLE; + case CEED_TOPOLOGY_QUAD: + return mfem::Geometry::SQUARE; + case CEED_TOPOLOGY_TET: + return mfem::Geometry::TETRAHEDRON; + case CEED_TOPOLOGY_HEX: + return mfem::Geometry::CUBE; + case CEED_TOPOLOGY_PRISM: + return mfem::Geometry::PRISM; + case CEED_TOPOLOGY_PYRAMID: + return mfem::Geometry::PYRAMID; + default: + MFEM_ABORT("This type of element is not supported!"); + return mfem::Geometry::SEGMENT; // Silence compiler warning + } +} + +} // namespace palace::ceed diff --git a/palace/fem/libceed/ceed.hpp b/palace/fem/libceed/ceed.hpp new file mode 100644 index 0000000000..4affd5e658 --- /dev/null +++ b/palace/fem/libceed/ceed.hpp @@ -0,0 +1,80 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_CEED_HPP +#define PALACE_LIBCEED_CEED_HPP + +#include +#include +#include +#include +#include + +#define PalaceCeedCall(ceed, ...) \ + do \ + { \ + int ierr_ = __VA_ARGS__; \ + if (ierr_ != CEED_ERROR_SUCCESS) \ + { \ + const char *msg; \ + CeedGetErrorMessage(ceed, &msg); \ + MFEM_ABORT(msg); \ + } \ + } while (0) + +#define PalaceCeedCallBackend(...) \ + do \ + { \ + int ierr_ = __VA_ARGS__; \ + if (ierr_ != CEED_ERROR_SUCCESS) \ + { \ + MFEM_ABORT("libCEED encountered a fatal error!"); \ + } \ + } while (0) + +#define PalaceQFunctionRelativePath(path) strstr(path, "qfunctions") + +namespace palace::ceed +{ + +// Useful alias templates for libCEED objects specific to a specific Ceed context and +// element geometry type. 
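+// For example, a CeedObjectMap<CeedBasis> caches one CeedBasis per (Ceed, element
+// geometry) pair.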
+template +using GeometryObjectMap = std::unordered_map; +template +using CeedObjectMap = std::unordered_map>; + +// Call libCEED's CeedInit for the given resource. The specific device to use is set prior +// to this using mfem::Device. +void Initialize(const char *resource, const char *jit_source_dir); + +// Finalize libCEED with CeedDestroy. +void Finalize(); + +// Get the configured libCEED backend. +std::string Print(); + +// Initialize a CeedVector from an mfem::Vector. When init is false, expects the CeedVector +// has already been initialized and just sets the data pointer. +void InitCeedVector(const mfem::Vector &v, Ceed ceed, CeedVector *cv, bool init = true); + +// Convert an MFEM geometry type to a libCEED one. +CeedElemTopology GetCeedTopology(mfem::Geometry::Type geom); + +// Convert a libCEED geometry type to an MFEM one. +mfem::Geometry::Type GetMfemTopology(CeedElemTopology geom); + +namespace internal +{ + +// Access the Ceed objects initialized by CeedInit. +const std::vector &GetCeedObjects(); + +// Convenience method for number of ceeds. +std::size_t NumCeeds(); + +} // namespace internal + +} // namespace palace::ceed + +#endif // PALACE_LIBCEED_CEED_HPP diff --git a/palace/fem/libceed/coefficient.hpp b/palace/fem/libceed/coefficient.hpp deleted file mode 100644 index c5c7698c61..0000000000 --- a/palace/fem/libceed/coefficient.hpp +++ /dev/null @@ -1,131 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LIBCEED_COEFFICIENT_HPP -#define PALACE_LIBCEED_COEFFICIENT_HPP - -#include -#include -#include - -namespace palace::ceed -{ - -struct QuadratureCoefficient -{ - int ncomp; - mfem::Vector data; -}; - -inline void InitCoefficient(mfem::Coefficient &Q, mfem::ParMesh &mesh, - const mfem::IntegrationRule &ir, - const std::vector &indices, bool use_bdr, - QuadratureCoefficient &coeff) -{ - const auto ne = indices.size(); - const auto nqpts = ir.GetNPoints(); - coeff.ncomp = 1; - coeff.data.SetSize(ne * nqpts); - auto C = mfem::Reshape(coeff.data.HostWrite(), nqpts, ne); - mfem::IsoparametricTransformation T; - for (std::size_t i = 0; i < ne; ++i) - { - const auto e = indices[i]; - if (use_bdr) - { - mesh.GetBdrElementTransformation(e, &T); - } - else - { - mesh.GetElementTransformation(e, &T); - } - for (int q = 0; q < nqpts; ++q) - { - const mfem::IntegrationPoint &ip = ir.IntPoint(q); - T.SetIntPoint(&ip); - C(q, i) = Q.Eval(T, ip); - } - } -} - -inline void InitCoefficient(mfem::VectorCoefficient &VQ, mfem::ParMesh &mesh, - const mfem::IntegrationRule &ir, - const std::vector &indices, bool use_bdr, - QuadratureCoefficient &coeff) -{ - const auto ne = indices.size(); - const auto vdim = VQ.GetVDim(); - const auto nqpts = ir.GetNPoints(); - coeff.ncomp = vdim; - coeff.data.SetSize(ne * nqpts * vdim); - auto C = mfem::Reshape(coeff.data.HostWrite(), vdim, nqpts, ne); - mfem::IsoparametricTransformation T; - mfem::DenseMatrix Q_ip(vdim, nqpts); - for (std::size_t i = 0; i < ne; ++i) - { - const auto e = indices[i]; - if (use_bdr) - { - mesh.GetBdrElementTransformation(e, &T); - } - else - { - mesh.GetElementTransformation(e, &T); - } - VQ.Eval(Q_ip, T, ir); - for (int q = 0; q < nqpts; ++q) - { - for (int d = 0; d < vdim; ++d) - { - C(d, q, i) = Q_ip(d, q); - } - } - } -} - -inline void InitCoefficient(mfem::MatrixCoefficient &MQ, mfem::ParMesh &mesh, - const mfem::IntegrationRule &ir, - const std::vector &indices, bool use_bdr, - QuadratureCoefficient &coeff) -{ - // Assumes matrix 
coefficient is symmetric. - const auto ne = indices.size(); - const auto vdim = MQ.GetVDim(); - const auto ncomp = (vdim * (vdim + 1)) / 2; - const auto nqpts = ir.GetNPoints(); - coeff.ncomp = ncomp; - coeff.data.SetSize(ne * nqpts * ncomp); - auto C = mfem::Reshape(coeff.data.HostWrite(), ncomp, nqpts, ne); - mfem::IsoparametricTransformation T; - mfem::DenseMatrix Q_ip(vdim); - for (std::size_t i = 0; i < ne; ++i) - { - const auto e = indices[i]; - if (use_bdr) - { - mesh.GetBdrElementTransformation(e, &T); - } - else - { - mesh.GetElementTransformation(e, &T); - } - for (int q = 0; q < nqpts; ++q) - { - const mfem::IntegrationPoint &ip = ir.IntPoint(q); - T.SetIntPoint(&ip); - MQ.Eval(Q_ip, T, ip); - for (int dj = 0; dj < vdim; ++dj) - { - for (int di = dj; di < vdim; ++di) - { - const int idx = (dj * vdim) - (((dj - 1) * dj) / 2) + di - dj; - C(idx, q, i) = Q_ip(di, dj); // Column-major - } - } - } - } -} - -} // namespace palace::ceed - -#endif // PALACE_LIBCEED_COEFFICIENT_HPP diff --git a/palace/fem/libceed/hash.hpp b/palace/fem/libceed/hash.hpp index 83a6f75c48..e8bcf3452c 100644 --- a/palace/fem/libceed/hash.hpp +++ b/palace/fem/libceed/hash.hpp @@ -1,174 +1,174 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LIBCEED_HASH_HPP -#define PALACE_LIBCEED_HASH_HPP - -#include -#include -#include -#include -#include "fem/fespace.hpp" - -namespace palace::ceed -{ - -// Base case for combining hashes. -inline void CeedHashCombine(std::size_t &seed) {} - -// See for example https://onlinelibrary.wiley.com/doi/abs/10.1002/asi.10170, the source -// of https://www.boost.org/doc/libs/1_35_0/doc/html/boost/hash_combine_id241013.html. -template -inline void CeedHashCombine(std::size_t &seed, const T &v, const U &...args) -{ - std::hash hasher; - seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2); - (CeedHashCombine(seed, args), ...); -} - -namespace internal -{ - -struct FiniteElementKey -{ - mfem::Geometry::Type type; - int order, P; - int space, range_type, map_type, deriv_type, deriv_range_type, deriv_map_type; - FiniteElementKey(const mfem::FiniteElement &fe) - : type(fe.GetGeomType()), order(fe.GetOrder()), P(fe.GetDof()), space(fe.Space()), - range_type(fe.GetRangeType()), map_type(fe.GetMapType()), - deriv_type(fe.GetDerivType()), deriv_range_type(fe.GetDerivRangeType()), - deriv_map_type(fe.GetDerivMapType()) - { - } - bool operator==(const FiniteElementKey &k) const - { - return (type == k.type && order == k.order && P == k.P && space == k.space && - range_type == k.range_type && map_type == k.map_type && - deriv_type == k.deriv_type && deriv_range_type == k.deriv_range_type && - deriv_map_type == k.deriv_map_type); - } -}; - -using FiniteElementPairKey = std::pair; - -struct FiniteElementPairHash -{ - std::size_t operator()(const FiniteElementPairKey &k) const - { - std::size_t hash = 0; - CeedHashCombine(hash, k.first, k.second); - return hash; - } -}; - -struct BasisKey -{ - Ceed ceed; - FiniteElementKey fe; - int qorder, nqpts, ncomp; - BasisKey(Ceed ceed, const mfem::ParFiniteElementSpace &fespace, - const mfem::FiniteElement &fe, const mfem::IntegrationRule &ir) - : ceed(ceed), fe(fe), qorder(ir.GetOrder()), nqpts(ir.GetNPoints()), - ncomp(fespace.GetVDim()) - { - } - bool operator==(const BasisKey &k) const - { - return (ceed == k.ceed && fe == k.fe && qorder == k.qorder && nqpts == k.nqpts && - ncomp == k.ncomp); - } -}; - -struct BasisHash -{ - std::size_t operator()(const BasisKey &k) 
const - { - std::size_t hash = 0; - CeedHashCombine(hash, k.ceed, k.fe, k.qorder, k.nqpts, k.ncomp); - return hash; - } -}; - -struct InterpBasisKey -{ - Ceed ceed; - FiniteElementKey trial_fe, test_fe; - int ncomp; - InterpBasisKey(Ceed ceed, const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::FiniteElement &trial_fe, const mfem::FiniteElement &test_fe) - : ceed(ceed), trial_fe(trial_fe), test_fe(test_fe), ncomp(trial_fespace.GetVDim()) - { - } - bool operator==(const InterpBasisKey &k) const - { - return (ceed == k.ceed && trial_fe == k.trial_fe && test_fe == k.test_fe && - ncomp == k.ncomp); - } -}; - -struct InterpBasisHash -{ - std::size_t operator()(const InterpBasisKey &k) const - { - std::size_t hash = 0; - CeedHashCombine(hash, k.ceed, k.trial_fe, k.test_fe, k.ncomp); - return hash; - } -}; - -struct RestrKey -{ - Ceed ceed; - std::size_t fespace, first_elem; - bool use_bdr, unique_interp_restr, unique_interp_range_restr; - RestrKey(Ceed ceed, const FiniteElementSpace &fespace, std::size_t first_elem, - bool use_bdr, bool unique_interp_restr, bool unique_interp_range_restr) - : ceed(ceed), fespace(fespace.GetId()), first_elem(first_elem), use_bdr(use_bdr), - unique_interp_restr(unique_interp_restr), - unique_interp_range_restr(unique_interp_range_restr) - { - } - bool operator==(const RestrKey &k) const - { - return (ceed == k.ceed && fespace == k.fespace && first_elem == k.first_elem && - use_bdr == k.use_bdr && unique_interp_restr == k.unique_interp_restr && - unique_interp_range_restr == k.unique_interp_range_restr); - } -}; - -struct RestrHash -{ - std::size_t operator()(const RestrKey &k) const - { - std::size_t hash = 0; - CeedHashCombine(hash, k.ceed, k.fespace, k.first_elem, k.use_bdr, k.unique_interp_restr, - k.unique_interp_range_restr); - return hash; - } -}; - -} // namespace internal - -} // namespace palace::ceed - -namespace std -{ - -template <> -struct hash -{ - std::size_t operator()(const palace::ceed::internal::FiniteElementKey &k) const noexcept - { - std::size_t hash = 0; - palace::ceed::CeedHashCombine(hash, k.type, k.order, k.P, k.space, k.range_type, - k.map_type, k.deriv_type, k.deriv_range_type, - k.deriv_map_type); - return hash; - } -}; - -} // namespace std - -#endif // PALACE_LIBCEED_HASH_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HASH_HPP +#define PALACE_LIBCEED_HASH_HPP + +#include +#include +#include +#include +#include "fem/fespace.hpp" + +namespace palace::ceed +{ + +// Base case for combining hashes. +inline void CeedHashCombine(std::size_t &seed) {} + +// See for example https://onlinelibrary.wiley.com/doi/abs/10.1002/asi.10170, the source +// of https://www.boost.org/doc/libs/1_35_0/doc/html/boost/hash_combine_id241013.html. 
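+// For example, BasisHash below folds the fields of a BasisKey into a single seed:
+//
+//   std::size_t hash = 0;
+//   CeedHashCombine(hash, k.ceed, k.fe, k.qorder, k.nqpts, k.ncomp);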
+template +inline void CeedHashCombine(std::size_t &seed, const T &v, const U &...args) +{ + std::hash hasher; + seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2); + (CeedHashCombine(seed, args), ...); +} + +namespace internal +{ + +struct FiniteElementKey +{ + mfem::Geometry::Type type; + int order, P; + int space, range_type, map_type, deriv_type, deriv_range_type, deriv_map_type; + FiniteElementKey(const mfem::FiniteElement &fe) + : type(fe.GetGeomType()), order(fe.GetOrder()), P(fe.GetDof()), space(fe.Space()), + range_type(fe.GetRangeType()), map_type(fe.GetMapType()), + deriv_type(fe.GetDerivType()), deriv_range_type(fe.GetDerivRangeType()), + deriv_map_type(fe.GetDerivMapType()) + { + } + bool operator==(const FiniteElementKey &k) const + { + return (type == k.type && order == k.order && P == k.P && space == k.space && + range_type == k.range_type && map_type == k.map_type && + deriv_type == k.deriv_type && deriv_range_type == k.deriv_range_type && + deriv_map_type == k.deriv_map_type); + } +}; + +using FiniteElementPairKey = std::pair; + +struct FiniteElementPairHash +{ + std::size_t operator()(const FiniteElementPairKey &k) const + { + std::size_t hash = 0; + CeedHashCombine(hash, k.first, k.second); + return hash; + } +}; + +struct BasisKey +{ + Ceed ceed; + FiniteElementKey fe; + int qorder, nqpts, ncomp; + BasisKey(Ceed ceed, const mfem::ParFiniteElementSpace &fespace, + const mfem::FiniteElement &fe, const mfem::IntegrationRule &ir) + : ceed(ceed), fe(fe), qorder(ir.GetOrder()), nqpts(ir.GetNPoints()), + ncomp(fespace.GetVDim()) + { + } + bool operator==(const BasisKey &k) const + { + return (ceed == k.ceed && fe == k.fe && qorder == k.qorder && nqpts == k.nqpts && + ncomp == k.ncomp); + } +}; + +struct BasisHash +{ + std::size_t operator()(const BasisKey &k) const + { + std::size_t hash = 0; + CeedHashCombine(hash, k.ceed, k.fe, k.qorder, k.nqpts, k.ncomp); + return hash; + } +}; + +struct InterpBasisKey +{ + Ceed ceed; + FiniteElementKey trial_fe, test_fe; + int ncomp; + InterpBasisKey(Ceed ceed, const mfem::ParFiniteElementSpace &trial_fespace, + const mfem::ParFiniteElementSpace &test_fespace, + const mfem::FiniteElement &trial_fe, const mfem::FiniteElement &test_fe) + : ceed(ceed), trial_fe(trial_fe), test_fe(test_fe), ncomp(trial_fespace.GetVDim()) + { + } + bool operator==(const InterpBasisKey &k) const + { + return (ceed == k.ceed && trial_fe == k.trial_fe && test_fe == k.test_fe && + ncomp == k.ncomp); + } +}; + +struct InterpBasisHash +{ + std::size_t operator()(const InterpBasisKey &k) const + { + std::size_t hash = 0; + CeedHashCombine(hash, k.ceed, k.trial_fe, k.test_fe, k.ncomp); + return hash; + } +}; + +struct RestrKey +{ + Ceed ceed; + std::size_t fespace, first_elem; + bool use_bdr, unique_interp_restr, unique_interp_range_restr; + RestrKey(Ceed ceed, const FiniteElementSpace &fespace, std::size_t first_elem, + bool use_bdr, bool unique_interp_restr, bool unique_interp_range_restr) + : ceed(ceed), fespace(fespace.GetId()), first_elem(first_elem), use_bdr(use_bdr), + unique_interp_restr(unique_interp_restr), + unique_interp_range_restr(unique_interp_range_restr) + { + } + bool operator==(const RestrKey &k) const + { + return (ceed == k.ceed && fespace == k.fespace && first_elem == k.first_elem && + use_bdr == k.use_bdr && unique_interp_restr == k.unique_interp_restr && + unique_interp_range_restr == k.unique_interp_range_restr); + } +}; + +struct RestrHash +{ + std::size_t operator()(const RestrKey &k) const + { + std::size_t hash = 0; + 
CeedHashCombine(hash, k.ceed, k.fespace, k.first_elem, k.use_bdr, k.unique_interp_restr, + k.unique_interp_range_restr); + return hash; + } +}; + +} // namespace internal + +} // namespace palace::ceed + +namespace std +{ + +template <> +struct hash +{ + std::size_t operator()(const palace::ceed::internal::FiniteElementKey &k) const noexcept + { + std::size_t hash = 0; + palace::ceed::CeedHashCombine(hash, k.type, k.order, k.P, k.space, k.range_type, + k.map_type, k.deriv_type, k.deriv_range_type, + k.deriv_map_type); + return hash; + } +}; + +} // namespace std + +#endif // PALACE_LIBCEED_HASH_HPP diff --git a/palace/fem/libceed/integrator.hpp b/palace/fem/libceed/integrator.hpp deleted file mode 100644 index 897d1add9f..0000000000 --- a/palace/fem/libceed/integrator.hpp +++ /dev/null @@ -1,480 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LIBCEED_INTEGRATOR_HPP -#define PALACE_LIBCEED_INTEGRATOR_HPP - -#include -#include -#include -#include -#include "fem/libceed/basis.hpp" -#include "fem/libceed/coefficient.hpp" -#include "fem/libceed/restriction.hpp" -#include "fem/libceed/utils.hpp" - -namespace palace::ceed -{ - -// Evaluation modes for CeedOperator fields for various integrators. -enum class EvalMode -{ - None, - Interp, - Grad, - Div, - Curl, - InterpAndGrad, - InterpAndDiv, - InterpAndCurl -}; - -// Data structure for CeedOperator construction for various integrators. -struct IntegratorInfo -{ - // QFunctions for operator construction and application. - CeedQFunctionUser build_qf, apply_qf; - - // Path and name of the QFunctions for operator construction and application. - std::string build_qf_path, apply_qf_path; - - // Evaluation modes for the test and trial basis. - EvalMode trial_op, test_op; - - // Size of the data at each quadrature point. - int qdata_size; -}; - -// Helper function which combines quadrature data assembly and operator assembly in a single -// method. -template -inline void AssembleCeedOperator(const CeedIntegratorInfo &info, - const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, - const std::vector &indices, const bool use_bdr, - const std::vector &Q, Ceed ceed, - CeedOperator *op, CeedOperator *op_t) -{ - // Assemble quadrature data. - CeedVector qdata; - CeedElemRestriction qdata_restr; - AssembleCeedQuadratureData(info, trial_fespace, test_fespace, ir, indices, use_bdr, Q, - ceed, &qdata, &qdata_restr); - - // Assemble the operator (no transpose). - AssembleCeedOperator(info, trial_fespace, test_fespace, ir, indices, use_bdr, qdata, - qdata_restr, ceed, op); - *op_t = nullptr; - - // Cleanup (these are now owned by the operator). - PalaceCeedCall(ceed, CeedElemRestrictionDestroy(&qdata_restr)); - PalaceCeedCall(ceed, CeedVectorDestroy(&qdata)); -} - -// Create libCEED quadrature data and element restriction for use in a partially assembled -// libCEED operator. 
-template -inline void -AssembleCeedQuadratureData(const CeedIntegratorInfo &info, - const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, const std::vector &indices, - const bool use_bdr, const std::vector &Q, - Ceed ceed, CeedVector *qdata, CeedElemRestriction *qdata_restr) -{ - MFEM_VERIFY(trial_fespace.GetParMesh() == test_fespace.GetParMesh(), - "Trial and test finite element spaces must correspond to the same mesh!"); - const mfem::ParMesh &mesh = *trial_fespace.GetParMesh(); - MFEM_VERIFY(mesh.GetNodes(), "The mesh has no nodal FE space!"); - const mfem::GridFunction &mesh_nodes = *mesh.GetNodes(); - MFEM_VERIFY(dynamic_cast(mesh_nodes.FESpace()), - "Unexpected non-parallel FiniteElementSpace for mesh nodes!"); - const mfem::ParFiniteElementSpace &mesh_fespace = - *dynamic_cast(mesh_nodes.FESpace()); - - CeedInt ne = static_cast(indices.size()); - CeedInt dim = mesh.Dimension() - use_bdr; - CeedInt space_dim = mesh.SpaceDimension(); - - CeedElemRestriction mesh_restr; - CeedBasis mesh_basis; - CeedInt nqpts, qdata_size = info.qdata_size; - InitRestriction(mesh_fespace, indices, use_bdr, ceed, &mesh_restr); - InitBasis(mesh_fespace, ir, indices, use_bdr, ceed, &mesh_basis); - PalaceCeedCall(ceed, CeedBasisGetNumQuadraturePoints(mesh_basis, &nqpts)); - - // Strided restrictions are cheap to construct and not stored in the global cache. - PalaceCeedCall(ceed, CeedVectorCreate(ceed, ne * nqpts * qdata_size, qdata)); - PalaceCeedCall(ceed, CeedElemRestrictionCreateStrided(ceed, ne, nqpts, qdata_size, - ne * nqpts * qdata_size, - CEED_STRIDES_BACKEND, qdata_restr)); - - // Create the QFunction that builds the operator (i.e. computes its quadrature data). - CeedQFunction build_qf; - PalaceCeedCall(ceed, CeedQFunctionCreateInterior(ceed, 1, info.build_qf, - info.build_qf_path.c_str(), &build_qf)); - - CeedQFunctionContext build_ctx; - PalaceCeedCall(ceed, CeedQFunctionContextCreate(ceed, &build_ctx)); - PalaceCeedCall(ceed, - CeedQFunctionContextSetData(build_ctx, CEED_MEM_HOST, CEED_COPY_VALUES, - sizeof(info.ctx), (void *)&info.ctx)); - PalaceCeedCall(ceed, CeedQFunctionSetContext(build_qf, build_ctx)); - PalaceCeedCall(ceed, CeedQFunctionContextDestroy(&build_ctx)); - - // Inputs - for (std::size_t i = 0; i < Q.size(); i++) - { - std::string name = "coeff" + std::to_string(i + 1); - const CeedInt ncomp = Q[i].ncomp; - PalaceCeedCall(ceed, - CeedQFunctionAddInput(build_qf, name.c_str(), ncomp, CEED_EVAL_NONE)); - } - PalaceCeedCall(ceed, - CeedQFunctionAddInput(build_qf, "dx", dim * space_dim, CEED_EVAL_GRAD)); - PalaceCeedCall(ceed, CeedQFunctionAddInput(build_qf, "weights", 1, CEED_EVAL_WEIGHT)); - - // Output - PalaceCeedCall(ceed, - CeedQFunctionAddOutput(build_qf, "qdata", qdata_size, CEED_EVAL_NONE)); - - // Create the operator that builds the quadrature data for the actual operator. 
- CeedOperator build_op; - PalaceCeedCall(ceed, CeedOperatorCreate(ceed, build_qf, nullptr, nullptr, &build_op)); - PalaceCeedCall(ceed, CeedQFunctionDestroy(&build_qf)); - - for (std::size_t i = 0; i < Q.size(); i++) - { - std::string name = "coeff" + std::to_string(i + 1); - const CeedInt ncomp = Q[i].ncomp; - CeedInt strides[3] = {ncomp, 1, ncomp * nqpts}; - CeedElemRestriction coeff_restr; - CeedVector coeff_vector; - - PalaceCeedCall(ceed, CeedElemRestrictionCreateStrided(ceed, ne, nqpts, ncomp, - ne * nqpts * ncomp, strides, - &coeff_restr)); - InitCeedVector(Q[i].data, ceed, &coeff_vector); - - PalaceCeedCall(ceed, CeedOperatorSetField(build_op, name.c_str(), coeff_restr, - CEED_BASIS_NONE, coeff_vector)); - - PalaceCeedCall(ceed, CeedElemRestrictionDestroy(&coeff_restr)); - PalaceCeedCall(ceed, CeedVectorDestroy(&coeff_vector)); - } - PalaceCeedCall(ceed, CeedOperatorSetField(build_op, "dx", mesh_restr, mesh_basis, - CEED_VECTOR_ACTIVE)); - PalaceCeedCall(ceed, CeedOperatorSetField(build_op, "weights", CEED_ELEMRESTRICTION_NONE, - mesh_basis, CEED_VECTOR_NONE)); - PalaceCeedCall(ceed, CeedOperatorSetField(build_op, "qdata", *qdata_restr, - CEED_BASIS_NONE, CEED_VECTOR_ACTIVE)); - - PalaceCeedCall(ceed, CeedOperatorCheckReady(build_op)); - - // Compute the quadrature data for the operator. - CeedVector nodes; - InitCeedVector(mesh_nodes, ceed, &nodes); - - PalaceCeedCall(ceed, CeedOperatorApply(build_op, nodes, *qdata, CEED_REQUEST_IMMEDIATE)); - - PalaceCeedCall(ceed, CeedVectorDestroy(&nodes)); - PalaceCeedCall(ceed, CeedOperatorDestroy(&build_op)); -} - -// Create libCEED operator using the given quadrature data and element restriction. -template -inline void AssembleCeedOperator(const CeedIntegratorInfo &info, - const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const mfem::IntegrationRule &ir, - const std::vector &indices, const bool use_bdr, - CeedVector qdata, CeedElemRestriction qdata_restr, - Ceed ceed, CeedOperator *op) -{ - MFEM_VERIFY(trial_fespace.GetParMesh() == test_fespace.GetParMesh(), - "Trial and test finite element spaces must correspond to the same mesh!"); - const mfem::ParMesh &mesh = *trial_fespace.GetParMesh(); - - CeedInt dim = mesh.Dimension() - use_bdr; - CeedInt curl_dim = (dim < 3) ? 
1 : dim; - CeedInt trial_vdim = trial_fespace.GetVDim(); - CeedInt test_vdim = test_fespace.GetVDim(); - bool trial_vectorfe = - (trial_fespace.FEColl()->GetRangeType(dim) == mfem::FiniteElement::VECTOR); - bool test_vectorfe = - (test_fespace.FEColl()->GetRangeType(dim) == mfem::FiniteElement::VECTOR); - - CeedElemRestriction trial_restr, test_restr; - CeedBasis trial_basis, test_basis; - InitRestriction(trial_fespace, indices, use_bdr, ceed, &trial_restr); - InitRestriction(test_fespace, indices, use_bdr, ceed, &test_restr); - InitBasis(trial_fespace, ir, indices, use_bdr, ceed, &trial_basis); - InitBasis(test_fespace, ir, indices, use_bdr, ceed, &test_basis); - - CeedInt trial_nqpts, test_nqpts, mesh_nqpts, qdata_size; - PalaceCeedCall(ceed, CeedBasisGetNumQuadraturePoints(trial_basis, &trial_nqpts)); - PalaceCeedCall(ceed, CeedBasisGetNumQuadraturePoints(test_basis, &test_nqpts)); - PalaceCeedCall(ceed, CeedElemRestrictionGetElementSize(qdata_restr, &mesh_nqpts)); - PalaceCeedCall(ceed, CeedElemRestrictionGetNumComponents(qdata_restr, &qdata_size)); - MFEM_VERIFY(trial_nqpts == test_nqpts && trial_nqpts == mesh_nqpts, - "Trial and test basis must have the same number of quadrature points!"); - - // Create the QFunction that defines the action of the operator. - CeedQFunction apply_qf; - PalaceCeedCall(ceed, CeedQFunctionCreateInterior(ceed, 1, info.apply_qf, - info.apply_qf_path.c_str(), &apply_qf)); - - CeedQFunctionContext apply_ctx; - PalaceCeedCall(ceed, CeedQFunctionContextCreate(ceed, &apply_ctx)); - PalaceCeedCall(ceed, - CeedQFunctionContextSetData(apply_ctx, CEED_MEM_HOST, CEED_COPY_VALUES, - sizeof(info.ctx), (void *)&info.ctx)); - PalaceCeedCall(ceed, CeedQFunctionSetContext(apply_qf, apply_ctx)); - PalaceCeedCall(ceed, CeedQFunctionContextDestroy(&apply_ctx)); - - // Inputs - switch (info.trial_op) - { - case EvalMode::None: - PalaceCeedCall(ceed, - CeedQFunctionAddInput(apply_qf, "u", trial_vdim, CEED_EVAL_NONE)); - break; - case EvalMode::Interp: - PalaceCeedCall(ceed, CeedQFunctionAddInput(apply_qf, "u", - trial_vdim * (trial_vectorfe ? 
dim : 1), - CEED_EVAL_INTERP)); - break; - case EvalMode::Grad: - MFEM_VERIFY(!trial_vectorfe, "EvalMode::Grad is not intended for vector FE!"); - PalaceCeedCall( - ceed, CeedQFunctionAddInput(apply_qf, "gu", trial_vdim * dim, CEED_EVAL_GRAD)); - break; - case EvalMode::Div: - PalaceCeedCall(ceed, - CeedQFunctionAddInput(apply_qf, "du", trial_vdim, CEED_EVAL_DIV)); - break; - case EvalMode::Curl: - PalaceCeedCall(ceed, CeedQFunctionAddInput(apply_qf, "cu", trial_vdim * curl_dim, - CEED_EVAL_CURL)); - break; - case EvalMode::InterpAndGrad: - MFEM_VERIFY(!trial_vectorfe, - "EvalMode::InterpAndGrad is not intended for vector FE!"); - PalaceCeedCall(ceed, - CeedQFunctionAddInput(apply_qf, "u", trial_vdim, CEED_EVAL_INTERP)); - PalaceCeedCall( - ceed, CeedQFunctionAddInput(apply_qf, "gu", trial_vdim * dim, CEED_EVAL_GRAD)); - break; - case EvalMode::InterpAndDiv: - MFEM_VERIFY(trial_vectorfe, "EvalMode::InterpAndDiv is only intended for vector FE!"); - PalaceCeedCall( - ceed, CeedQFunctionAddInput(apply_qf, "u", trial_vdim * dim, CEED_EVAL_INTERP)); - PalaceCeedCall(ceed, - CeedQFunctionAddInput(apply_qf, "du", trial_vdim, CEED_EVAL_DIV)); - break; - case EvalMode::InterpAndCurl: - MFEM_VERIFY(trial_vectorfe, - "EvalMode::InterpAndCurl is only intended for vector FE!"); - PalaceCeedCall( - ceed, CeedQFunctionAddInput(apply_qf, "u", trial_vdim * dim, CEED_EVAL_INTERP)); - PalaceCeedCall(ceed, CeedQFunctionAddInput(apply_qf, "cu", trial_vdim * curl_dim, - CEED_EVAL_CURL)); - break; - } - PalaceCeedCall(ceed, - CeedQFunctionAddInput(apply_qf, "qdata", qdata_size, CEED_EVAL_NONE)); - - // Output - switch (info.test_op) - { - case EvalMode::None: - PalaceCeedCall(ceed, - CeedQFunctionAddOutput(apply_qf, "v", test_vdim, CEED_EVAL_NONE)); - break; - case EvalMode::Interp: - PalaceCeedCall(ceed, CeedQFunctionAddOutput(apply_qf, "v", - test_vdim * (test_vectorfe ? dim : 1), - CEED_EVAL_INTERP)); - break; - case EvalMode::Grad: - MFEM_VERIFY(!test_vectorfe, "EvalMode::Grad is not intended for vector FE!"); - PalaceCeedCall( - ceed, CeedQFunctionAddOutput(apply_qf, "gv", test_vdim * dim, CEED_EVAL_GRAD)); - break; - case EvalMode::Div: - PalaceCeedCall(ceed, - CeedQFunctionAddOutput(apply_qf, "dv", test_vdim, CEED_EVAL_DIV)); - break; - case EvalMode::Curl: - PalaceCeedCall(ceed, CeedQFunctionAddOutput(apply_qf, "cv", test_vdim * curl_dim, - CEED_EVAL_CURL)); - break; - case EvalMode::InterpAndGrad: - MFEM_VERIFY(!test_vectorfe, "EvalMode::InterpAndGrad is not intended for vector FE!"); - PalaceCeedCall(ceed, - CeedQFunctionAddOutput(apply_qf, "v", test_vdim, CEED_EVAL_INTERP)); - PalaceCeedCall( - ceed, CeedQFunctionAddOutput(apply_qf, "gv", test_vdim * dim, CEED_EVAL_GRAD)); - break; - case EvalMode::InterpAndDiv: - MFEM_VERIFY(test_vectorfe, "EvalMode::InterpAndDiv is only intended for vector FE!"); - PalaceCeedCall( - ceed, CeedQFunctionAddOutput(apply_qf, "v", test_vdim * dim, CEED_EVAL_INTERP)); - PalaceCeedCall(ceed, - CeedQFunctionAddOutput(apply_qf, "dv", test_vdim, CEED_EVAL_DIV)); - break; - case EvalMode::InterpAndCurl: - MFEM_VERIFY(test_vectorfe, "EvalMode::InterpAndCurl is only intended for vector FE!"); - PalaceCeedCall( - ceed, CeedQFunctionAddOutput(apply_qf, "v", test_vdim * dim, CEED_EVAL_INTERP)); - PalaceCeedCall(ceed, CeedQFunctionAddOutput(apply_qf, "cv", test_vdim * curl_dim, - CEED_EVAL_CURL)); - break; - } - - // Create the operator. 
- PalaceCeedCall(ceed, CeedOperatorCreate(ceed, apply_qf, nullptr, nullptr, op)); - PalaceCeedCall(ceed, CeedQFunctionDestroy(&apply_qf)); - - switch (info.trial_op) - { - case EvalMode::None: - PalaceCeedCall(ceed, CeedOperatorSetField(*op, "u", trial_restr, CEED_BASIS_NONE, - CEED_VECTOR_ACTIVE)); - break; - case EvalMode::Interp: - PalaceCeedCall(ceed, CeedOperatorSetField(*op, "u", trial_restr, trial_basis, - CEED_VECTOR_ACTIVE)); - break; - case EvalMode::Grad: - PalaceCeedCall(ceed, CeedOperatorSetField(*op, "gu", trial_restr, trial_basis, - CEED_VECTOR_ACTIVE)); - break; - case EvalMode::Div: - PalaceCeedCall(ceed, CeedOperatorSetField(*op, "du", trial_restr, trial_basis, - CEED_VECTOR_ACTIVE)); - break; - case EvalMode::Curl: - PalaceCeedCall(ceed, CeedOperatorSetField(*op, "cu", trial_restr, trial_basis, - CEED_VECTOR_ACTIVE)); - break; - case EvalMode::InterpAndGrad: - PalaceCeedCall(ceed, CeedOperatorSetField(*op, "u", trial_restr, trial_basis, - CEED_VECTOR_ACTIVE)); - PalaceCeedCall(ceed, CeedOperatorSetField(*op, "gu", trial_restr, trial_basis, - CEED_VECTOR_ACTIVE)); - break; - case EvalMode::InterpAndDiv: - PalaceCeedCall(ceed, CeedOperatorSetField(*op, "u", trial_restr, trial_basis, - CEED_VECTOR_ACTIVE)); - PalaceCeedCall(ceed, CeedOperatorSetField(*op, "du", trial_restr, trial_basis, - CEED_VECTOR_ACTIVE)); - break; - case EvalMode::InterpAndCurl: - PalaceCeedCall(ceed, CeedOperatorSetField(*op, "u", trial_restr, trial_basis, - CEED_VECTOR_ACTIVE)); - PalaceCeedCall(ceed, CeedOperatorSetField(*op, "cu", trial_restr, trial_basis, - CEED_VECTOR_ACTIVE)); - break; - } - PalaceCeedCall(ceed, - CeedOperatorSetField(*op, "qdata", qdata_restr, CEED_BASIS_NONE, qdata)); - switch (info.test_op) - { - case EvalMode::None: - PalaceCeedCall(ceed, CeedOperatorSetField(*op, "v", test_restr, CEED_BASIS_NONE, - CEED_VECTOR_ACTIVE)); - break; - case EvalMode::Interp: - PalaceCeedCall( - ceed, CeedOperatorSetField(*op, "v", test_restr, test_basis, CEED_VECTOR_ACTIVE)); - break; - case EvalMode::Grad: - PalaceCeedCall(ceed, CeedOperatorSetField(*op, "gv", test_restr, test_basis, - CEED_VECTOR_ACTIVE)); - break; - case EvalMode::Div: - PalaceCeedCall(ceed, CeedOperatorSetField(*op, "dv", test_restr, test_basis, - CEED_VECTOR_ACTIVE)); - break; - case EvalMode::Curl: - PalaceCeedCall(ceed, CeedOperatorSetField(*op, "cv", test_restr, test_basis, - CEED_VECTOR_ACTIVE)); - break; - case EvalMode::InterpAndGrad: - PalaceCeedCall( - ceed, CeedOperatorSetField(*op, "v", test_restr, test_basis, CEED_VECTOR_ACTIVE)); - PalaceCeedCall(ceed, CeedOperatorSetField(*op, "gv", test_restr, test_basis, - CEED_VECTOR_ACTIVE)); - break; - case EvalMode::InterpAndDiv: - PalaceCeedCall( - ceed, CeedOperatorSetField(*op, "v", test_restr, test_basis, CEED_VECTOR_ACTIVE)); - PalaceCeedCall(ceed, CeedOperatorSetField(*op, "dv", test_restr, test_basis, - CEED_VECTOR_ACTIVE)); - break; - case EvalMode::InterpAndCurl: - PalaceCeedCall( - ceed, CeedOperatorSetField(*op, "v", test_restr, test_basis, CEED_VECTOR_ACTIVE)); - PalaceCeedCall(ceed, CeedOperatorSetField(*op, "cv", test_restr, test_basis, - CEED_VECTOR_ACTIVE)); - break; - } - - PalaceCeedCall(ceed, CeedOperatorCheckReady(*op)); -} - -// Construct libCEED operators for interpolation operations and their transpose between -// the two spaces. The operation for interpolation is decided by the conformity of the trial -// and test spaces. 
-inline void AssembleCeedInterpolator(const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - const std::vector &indices, Ceed ceed, - CeedOperator *op, CeedOperator *op_t) -{ - CeedInt trial_vdim = trial_fespace.GetVDim(); - CeedInt test_vdim = test_fespace.GetVDim(); - MFEM_VERIFY(trial_vdim == 1 && test_vdim == 1, - "AssembleCeedInterpolator does not support spaces with vdim > 1!"); - - CeedElemRestriction trial_restr, test_restr; - CeedBasis basis_ctof; - InitRestriction(trial_fespace, indices, false, true, false, ceed, &trial_restr); - InitRestriction(test_fespace, indices, false, true, true, ceed, &test_restr); - InitInterpolatorBasis(trial_fespace, test_fespace, indices, ceed, &basis_ctof); - - // Create the QFunction that defines the action of the operator (only an identity as - // element dof multiplicity is handled outside of libCEED). - CeedQFunction apply_qf, apply_qf_t; - PalaceCeedCall(ceed, CeedQFunctionCreateIdentity(ceed, trial_vdim, CEED_EVAL_INTERP, - CEED_EVAL_NONE, &apply_qf)); - PalaceCeedCall(ceed, CeedQFunctionCreateIdentity(ceed, trial_vdim, CEED_EVAL_NONE, - CEED_EVAL_INTERP, &apply_qf_t)); - - // Create the operator. - PalaceCeedCall(ceed, CeedOperatorCreate(ceed, apply_qf, nullptr, nullptr, op)); - PalaceCeedCall(ceed, CeedQFunctionDestroy(&apply_qf)); - - PalaceCeedCall(ceed, CeedOperatorSetField(*op, "input", trial_restr, basis_ctof, - CEED_VECTOR_ACTIVE)); - PalaceCeedCall(ceed, CeedOperatorSetField(*op, "output", test_restr, CEED_BASIS_NONE, - CEED_VECTOR_ACTIVE)); - - PalaceCeedCall(ceed, CeedOperatorCheckReady(*op)); - - // Create the transpose operator. - PalaceCeedCall(ceed, CeedOperatorCreate(ceed, apply_qf_t, nullptr, nullptr, op_t)); - PalaceCeedCall(ceed, CeedQFunctionDestroy(&apply_qf_t)); - - PalaceCeedCall(ceed, CeedOperatorSetField(*op_t, "input", test_restr, CEED_BASIS_NONE, - CEED_VECTOR_ACTIVE)); - PalaceCeedCall(ceed, CeedOperatorSetField(*op_t, "output", trial_restr, basis_ctof, - CEED_VECTOR_ACTIVE)); - - PalaceCeedCall(ceed, CeedOperatorCheckReady(*op_t)); -} - -} // namespace palace::ceed - -#endif // PALACE_LIBCEED_INTEGRATOR_HPP diff --git a/palace/fem/libceed/libceed_coefficient.cpp b/palace/fem/libceed/libceed_coefficient.cpp new file mode 100644 index 0000000000..a78e967eac --- /dev/null +++ b/palace/fem/libceed/libceed_coefficient.cpp @@ -0,0 +1,132 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#include "libceed_coefficient.hpp" + +#include +#include "fem/libceed/ceed.hpp" +#include "models/materialoperator.hpp" + +#include "fem/qfunctions/coeff/coeff_qf.h" + +namespace palace::ceed +{ + +namespace +{ + +inline auto CoeffDim(int dim) +{ + return dim * dim; +} + +inline void MakeDiagonalCoefficient(int dim, CeedIntScalar *mat_coeff, CeedScalar a, + CeedInt k) +{ + const int coeff_dim = CoeffDim(dim); + for (int i = 0; i < coeff_dim; i++) + { + mat_coeff[coeff_dim * k + i].second = 0.0; + } + for (int di = 0; di < dim; ++di) + { + const int idx = di * (dim + 1); + mat_coeff[coeff_dim * k + idx].second = a; + } +} + +inline auto *AttrMat(CeedIntScalar *ctx) +{ + return ctx + 1; +} + +inline auto *MatCoeff(CeedIntScalar *ctx) +{ + const CeedInt num_attr = ctx[0].first; + return ctx + 2 + num_attr; +} + +} // namespace + +std::vector PopulateCoefficientContext(int dim, + const MaterialPropertyCoefficient *Q, + bool transpose, double a) +{ + if (!Q) + { + // No attributes are stored in the map from attributes to material property coefficient, + // indicating that all attributes map to the same identity coefficient. + std::vector ctx(2 + CoeffDim(dim), {0}); + ctx[0].first = 0; + ctx[1].first = 1; + MakeDiagonalCoefficient(dim, MatCoeff(ctx.data()), a, 0); + return ctx; + } + + // Material property coefficients might be empty if all attributes map to zero + // coefficient. + const auto &attr_mat = Q->GetAttributeToMaterial(); + const auto &mat_coeff = Q->GetMaterialProperties(); + MFEM_VERIFY(attr_mat.Size() > 0, "Empty attributes for MaterialPropertyCoefficient!"); + MFEM_VERIFY(attr_mat.Max() < mat_coeff.SizeK(), + "Invalid attribute material property for MaterialPropertyCoefficient (" + << attr_mat.Max() << " vs. " << mat_coeff.SizeK() << ")!"); + MFEM_VERIFY(mat_coeff.SizeI() == mat_coeff.SizeJ() && + (mat_coeff.SizeI() == 1 || mat_coeff.SizeI() == dim), + "Dimension mismatch for MaterialPropertyCoefficient and libCEED integrator!"); + + // Map unassigned attributes to zero material property coefficient (the last material + // property is reserved for zero). + const int coeff_dim = CoeffDim(dim); + std::vector ctx(2 + attr_mat.Size() + coeff_dim * (mat_coeff.SizeK() + 1)); + ctx[0].first = attr_mat.Size(); + const int zero_mat = mat_coeff.SizeK(); + for (int i = 0; i < attr_mat.Size(); i++) + { + const int k = attr_mat[i]; + AttrMat(ctx.data())[i].first = (k < 0) ? zero_mat : k; + } + + // Copy material properties. + ctx[1 + attr_mat.Size()].first = mat_coeff.SizeK() + 1; + for (int k = 0; k < mat_coeff.SizeK(); k++) + { + if (mat_coeff.SizeI() == 1) + { + // Copy as diagonal matrix coefficient. + MakeDiagonalCoefficient(dim, MatCoeff(ctx.data()), a * mat_coeff(0, 0, k), k); + } + else + { + for (int dj = 0; dj < dim; ++dj) + { + for (int di = 0; di < dim; ++di) + { + // Column-major ordering. + const int idx = transpose ? (di * dim) + dj : (dj * dim) + di; + MatCoeff(ctx.data())[coeff_dim * k + idx].second = a * mat_coeff(di, dj, k); + } + } + } + } + for (int d = 0; d < coeff_dim; d++) + { + MatCoeff(ctx.data())[coeff_dim * zero_mat + d].second = 0.0; + } + + return ctx; +} + +std::vector +PopulateCoefficientContext(int dim_mass, const MaterialPropertyCoefficient *Q_mass, int dim, + const MaterialPropertyCoefficient *Q, bool transpose_mass, + bool transpose, double a_mass, double a) +{ + // Mass coefficient comes first, then the other one for the QFunction. 
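// Illustrative sketch (not part of the patch): the context packed above is a flat array of
// CeedIntScalar laid out as [num_attr | attribute-to-material map | num_mat | dim x dim
// material blocks in column-major order], with one extra block at the end reserved for the
// zero coefficient. The union below is a stand-in for CeedIntScalar from coeff_qf.h, and the
// function name is hypothetical; it packs a single attribute with an isotropic 2x2
// coefficient a * I.
#include <vector>

union IntScalarSketch
{
  int first;
  double second;
};

std::vector<IntScalarSketch> PackIsotropicContextSketch(double a)
{
  constexpr int dim = 2, coeff_dim = dim * dim;
  std::vector<IntScalarSketch> ctx(2 + 1 + coeff_dim * 2, {0});  // 1 attribute, 1 material + zero
  ctx[0].first = 1;   // Number of attributes
  ctx[1].first = 0;   // Attribute 1 maps to material 0
  ctx[2].first = 2;   // Number of materials, including the reserved zero material
  ctx[3].second = a;  // Material 0, entry (0, 0); off-diagonal entries stay zero
  ctx[6].second = a;  // Material 0, entry (1, 1)
  return ctx;         // Entries 7-10 are the zero material block
}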
+  auto ctx_mass = PopulateCoefficientContext(dim_mass, Q_mass, transpose_mass, a_mass);
+  auto ctx = PopulateCoefficientContext(dim, Q, transpose, a);
+  ctx_mass.insert(ctx_mass.end(), ctx.begin(), ctx.end());
+  return ctx_mass;
+}
+
+}  // namespace palace::ceed
diff --git a/palace/fem/libceed/libceed_coefficient.hpp b/palace/fem/libceed/libceed_coefficient.hpp
new file mode 100644
index 0000000000..671e3dc1ef
--- /dev/null
+++ b/palace/fem/libceed/libceed_coefficient.hpp
@@ -0,0 +1,33 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef PALACE_LIBCEED_COEFFICIENT_HPP
+#define PALACE_LIBCEED_COEFFICIENT_HPP
+
+#include
+
+union CeedIntScalar;
+
+namespace palace
+{
+
+class MaterialPropertyCoefficient;
+
+namespace ceed
+{
+
+std::vector<CeedIntScalar> PopulateCoefficientContext(int dim,
+                                                      const MaterialPropertyCoefficient *Q,
+                                                      bool transpose = false,
+                                                      double a = 1.0);
+
+std::vector<CeedIntScalar>
+PopulateCoefficientContext(int dim_mass, const MaterialPropertyCoefficient *Q_mass, int dim,
+                           const MaterialPropertyCoefficient *Q, bool transpose_mass = false,
+                           bool transpose = false, double a_mass = 1.0, double a = 1.0);
+
+}  // namespace ceed
+
+}  // namespace palace
+
+#endif  // PALACE_LIBCEED_COEFFICIENT_HPP
diff --git a/palace/fem/libceed/libceed_integrator.cpp b/palace/fem/libceed/libceed_integrator.cpp
new file mode 100644
index 0000000000..9db73d889c
--- /dev/null
+++ b/palace/fem/libceed/libceed_integrator.cpp
@@ -0,0 +1,622 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "libceed_integrator.hpp"
+
+#include
+#include
+#include
+#include "utils/diagnostic.hpp"
+
+PalacePragmaDiagnosticPush
+PalacePragmaDiagnosticDisableUnused
+
+#include "fem/qfunctions/apply_qf.h"
+#include "fem/qfunctions/geom_qf.h"
+
+PalacePragmaDiagnosticPop
+
+namespace palace::ceed
+{
+
+namespace
+{
+
+void AddQFunctionActiveInputs(unsigned int ops, Ceed ceed, CeedBasis basis,
+                              CeedQFunction qf, std::string name = "u")
+{
+  // Add inputs or outputs with evaluation modes for the active vector of a QFunction.
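// Illustrative sketch (names from this patch; the helper function name is hypothetical):
// trial_ops/test_ops in CeedQFunctionInfo are plain bit flags built from the EvalMode enum
// in libceed_integrator.hpp. An integrator that needs both values and gradients of the
// trial space would be configured roughly as follows, after which
// AddQFunctionActiveInputs(info.trial_ops, ...) adds a "u" (CEED_EVAL_INTERP) input and a
// "grad_u" (CEED_EVAL_GRAD) input to the QFunction.
inline CeedQFunctionInfo MakeInterpGradInfoSketch()
{
  CeedQFunctionInfo info;
  info.trial_ops = EvalMode::Interp | EvalMode::Grad;
  info.test_ops = EvalMode::Interp | EvalMode::Grad;
  info.assemble_q_data = false;  // Apply matrix-free instead of storing quadrature data
  return info;
}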
+ CeedInt num_comp; + PalaceCeedCall(ceed, CeedBasisGetNumComponents(basis, &num_comp)); + if (ops & EvalMode::None) + { + PalaceCeedCall(ceed, CeedQFunctionAddInput(qf, name.c_str(), num_comp, CEED_EVAL_NONE)); + } + if (ops & EvalMode::Interp) + { + CeedInt q_comp; + PalaceCeedCall(ceed, + CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp)); + PalaceCeedCall( + ceed, CeedQFunctionAddInput(qf, name.c_str(), num_comp * q_comp, CEED_EVAL_INTERP)); + } + if (ops & EvalMode::Grad) + { + CeedInt q_comp; + PalaceCeedCall(ceed, + CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp)); + PalaceCeedCall(ceed, CeedQFunctionAddInput(qf, (std::string("grad_") + name).c_str(), + num_comp * q_comp, CEED_EVAL_GRAD)); + } + if (ops & EvalMode::Div) + { + CeedInt q_comp; + PalaceCeedCall(ceed, + CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp)); + PalaceCeedCall(ceed, CeedQFunctionAddInput(qf, (std::string("div_") + name).c_str(), + num_comp * q_comp, CEED_EVAL_DIV)); + } + if (ops & EvalMode::Curl) + { + CeedInt q_comp; + PalaceCeedCall(ceed, + CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp)); + PalaceCeedCall(ceed, CeedQFunctionAddInput(qf, (std::string("curl_") + name).c_str(), + num_comp * q_comp, CEED_EVAL_CURL)); + } +} + +void AddQFunctionActiveOutputs(unsigned int ops, Ceed ceed, CeedBasis basis, + CeedQFunction qf, std::string name = "v") +{ + // Add inputs or outputs with evaluation modes for the active vector of a QFunction. + CeedInt num_comp; + PalaceCeedCall(ceed, CeedBasisGetNumComponents(basis, &num_comp)); + if (ops & EvalMode::None) + { + PalaceCeedCall(ceed, + CeedQFunctionAddOutput(qf, name.c_str(), num_comp, CEED_EVAL_NONE)); + } + if (ops & EvalMode::Interp) + { + CeedInt q_comp; + PalaceCeedCall(ceed, + CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp)); + PalaceCeedCall(ceed, CeedQFunctionAddOutput(qf, name.c_str(), num_comp * q_comp, + CEED_EVAL_INTERP)); + } + if (ops & EvalMode::Grad) + { + CeedInt q_comp; + PalaceCeedCall(ceed, + CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp)); + PalaceCeedCall(ceed, CeedQFunctionAddOutput(qf, (std::string("grad_") + name).c_str(), + num_comp * q_comp, CEED_EVAL_GRAD)); + } + if (ops & EvalMode::Div) + { + CeedInt q_comp; + PalaceCeedCall(ceed, + CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp)); + PalaceCeedCall(ceed, CeedQFunctionAddOutput(qf, (std::string("div_") + name).c_str(), + num_comp * q_comp, CEED_EVAL_DIV)); + } + if (ops & EvalMode::Curl) + { + CeedInt q_comp; + PalaceCeedCall(ceed, + CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp)); + PalaceCeedCall(ceed, CeedQFunctionAddOutput(qf, (std::string("curl_") + name).c_str(), + num_comp * q_comp, CEED_EVAL_CURL)); + } +} + +void AddOperatorActiveFields(unsigned int ops, Ceed ceed, CeedElemRestriction restr, + CeedBasis basis, CeedOperator op, const std::string &name, + CeedVector v) +{ + // Set active input or output vector fields of an operator. 
+ if (ops & EvalMode::None) + { + PalaceCeedCall(ceed, CeedOperatorSetField(op, name.c_str(), restr, CEED_BASIS_NONE, v)); + } + if (ops & EvalMode::Interp) + { + PalaceCeedCall(ceed, CeedOperatorSetField(op, name.c_str(), restr, basis, v)); + } + if (ops & EvalMode::Grad) + { + PalaceCeedCall(ceed, CeedOperatorSetField(op, (std::string("grad_") + name).c_str(), + restr, basis, v)); + } + if (ops & EvalMode::Div) + { + PalaceCeedCall(ceed, CeedOperatorSetField(op, (std::string("div_") + name).c_str(), + restr, basis, v)); + } + if (ops & EvalMode::Curl) + { + PalaceCeedCall(ceed, CeedOperatorSetField(op, (std::string("curl_") + name).c_str(), + restr, basis, v)); + } +} + +void AddOperatorActiveInputFields(unsigned int ops, Ceed ceed, CeedElemRestriction restr, + CeedBasis basis, CeedOperator op, std::string name = "u", + CeedVector v = CEED_VECTOR_ACTIVE) +{ + AddOperatorActiveFields(ops, ceed, restr, basis, op, name, v); +} + +void AddOperatorActiveOutputFields(unsigned int ops, Ceed ceed, CeedElemRestriction restr, + CeedBasis basis, CeedOperator op, std::string name = "v", + CeedVector v = CEED_VECTOR_ACTIVE) +{ + AddOperatorActiveFields(ops, ceed, restr, basis, op, name, v); +} + +std::vector QuadratureDataSetup(unsigned int ops, Ceed ceed, + CeedElemRestriction restr, CeedBasis basis, + CeedVector *q_data, + CeedElemRestriction *q_data_restr) +{ + // Operator application at each quadrature point should be square, so just use the inputs + // and ignore the outputs. + CeedInt num_comp; + PalaceCeedCall(ceed, CeedBasisGetNumComponents(basis, &num_comp)); + + std::vector active_input_sizes; + if (ops & EvalMode::None) + { + active_input_sizes.push_back(num_comp); + } + if (ops & EvalMode::Interp) + { + CeedInt q_comp; + PalaceCeedCall(ceed, + CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp)); + active_input_sizes.push_back(num_comp * q_comp); + } + if (ops & EvalMode::Grad) + { + CeedInt q_comp; + PalaceCeedCall(ceed, + CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp)); + active_input_sizes.push_back(num_comp * q_comp); + } + if (ops & EvalMode::Div) + { + CeedInt q_comp; + PalaceCeedCall(ceed, + CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp)); + active_input_sizes.push_back(num_comp * q_comp); + } + if (ops & EvalMode::Curl) + { + CeedInt q_comp; + PalaceCeedCall(ceed, + CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp)); + active_input_sizes.push_back(num_comp * q_comp); + } + + CeedInt num_elem, num_qpts, q_data_size = 0; + PalaceCeedCall(ceed, CeedElemRestrictionGetNumElements(restr, &num_elem)); + PalaceCeedCall(ceed, CeedBasisGetNumQuadraturePoints(basis, &num_qpts)); + for (auto size : active_input_sizes) + { + q_data_size += size * size; + } + + PalaceCeedCall( + ceed, CeedVectorCreate(ceed, (CeedSize)num_elem * num_qpts * q_data_size, q_data)); + PalaceCeedCall( + ceed, CeedElemRestrictionCreateStrided(ceed, num_elem, num_qpts, q_data_size, + (CeedSize)num_elem * num_qpts * q_data_size, + CEED_STRIDES_BACKEND, q_data_restr)); + + return active_input_sizes; +} + +void QuadratureDataAssembly(const std::vector &qf_active_sizes, + const CeedQFunctionInfo &info, Ceed ceed, + CeedElemRestriction trial_restr, CeedElemRestriction test_restr, + CeedBasis trial_basis, CeedBasis test_basis, CeedVector q_data, + CeedElemRestriction q_data_restr, CeedOperator *op) +{ + // Assemble the quadrature data, destroy the operator, and create a new one for the + // actual operator application. 
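// Worked example (illustrative, assuming a 3D H(curl) space where both CEED_EVAL_INTERP and
// CEED_EVAL_CURL report three quadrature components): trial_ops = Interp | Curl gives
// qf_active_sizes = {3, 3}, so the assembled quadrature data stores 3*3 + 3*3 = 18 scalars
// per quadrature point, one dense 3x3 block per active field. The switch below encodes the
// pair of sizes as 10 * qf_size_1 + qf_size_2, so {3, 3} selects f_apply_33 and a single
// active field of size 2 selects f_apply_2 (case 20).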
+ PalaceCeedCall(ceed, + CeedOperatorApply(*op, CEED_VECTOR_NONE, q_data, CEED_REQUEST_IMMEDIATE)); + PalaceCeedCall(ceed, CeedOperatorDestroy(op)); + + MFEM_VERIFY(!qf_active_sizes.empty() && qf_active_sizes.size() <= 2, + "Invalid number of active QFunction input/output fields (" + << qf_active_sizes.size() << ")!"); + CeedQFunction apply_qf; + CeedInt qf_size_1 = qf_active_sizes[0], + qf_size_2 = (qf_active_sizes.size() > 1) ? qf_active_sizes[1] : 0; + switch (10 * qf_size_1 + qf_size_2) + { + case 1: + case 10: + PalaceCeedCall(ceed, CeedQFunctionCreateInterior( + ceed, 1, f_apply_1, + PalaceQFunctionRelativePath(f_apply_1_loc), &apply_qf)); + break; + case 2: + case 20: + PalaceCeedCall(ceed, CeedQFunctionCreateInterior( + ceed, 1, f_apply_2, + PalaceQFunctionRelativePath(f_apply_2_loc), &apply_qf)); + break; + case 3: + case 30: + PalaceCeedCall(ceed, CeedQFunctionCreateInterior( + ceed, 1, f_apply_3, + PalaceQFunctionRelativePath(f_apply_3_loc), &apply_qf)); + break; + case 22: + PalaceCeedCall(ceed, CeedQFunctionCreateInterior( + ceed, 1, f_apply_22, + PalaceQFunctionRelativePath(f_apply_22_loc), &apply_qf)); + break; + case 33: + PalaceCeedCall(ceed, CeedQFunctionCreateInterior( + ceed, 1, f_apply_33, + PalaceQFunctionRelativePath(f_apply_33_loc), &apply_qf)); + break; + case 12: + PalaceCeedCall(ceed, CeedQFunctionCreateInterior( + ceed, 1, f_apply_12, + PalaceQFunctionRelativePath(f_apply_12_loc), &apply_qf)); + break; + case 13: + PalaceCeedCall(ceed, CeedQFunctionCreateInterior( + ceed, 1, f_apply_13, + PalaceQFunctionRelativePath(f_apply_13_loc), &apply_qf)); + break; + case 21: + PalaceCeedCall(ceed, CeedQFunctionCreateInterior( + ceed, 1, f_apply_21, + PalaceQFunctionRelativePath(f_apply_21_loc), &apply_qf)); + break; + case 31: + PalaceCeedCall(ceed, CeedQFunctionCreateInterior( + ceed, 1, f_apply_31, + PalaceQFunctionRelativePath(f_apply_31_loc), &apply_qf)); + break; + default: + MFEM_ABORT("Invalid number of QFunction input/output components (" + << qf_size_1 << ", " << qf_size_2 << ")!"); + apply_qf = nullptr; // Silence compiler warning + } + + // Inputs/outputs. + { + CeedInt q_data_size; + PalaceCeedCall(ceed, CeedElemRestrictionGetNumComponents(q_data_restr, &q_data_size)); + PalaceCeedCall(ceed, + CeedQFunctionAddInput(apply_qf, "q_data", q_data_size, CEED_EVAL_NONE)); + } + AddQFunctionActiveInputs(info.trial_ops, ceed, trial_basis, apply_qf); + AddQFunctionActiveOutputs(info.test_ops, ceed, test_basis, apply_qf); + + // Create the actual operator. 
+ PalaceCeedCall(ceed, CeedOperatorCreate(ceed, apply_qf, nullptr, nullptr, op)); + PalaceCeedCall(ceed, CeedQFunctionDestroy(&apply_qf)); + + PalaceCeedCall( + ceed, CeedOperatorSetField(*op, "q_data", q_data_restr, CEED_BASIS_NONE, q_data)); + AddOperatorActiveInputFields(info.trial_ops, ceed, trial_restr, trial_basis, *op); + AddOperatorActiveOutputFields(info.test_ops, ceed, test_restr, test_basis, *op); + + PalaceCeedCall(ceed, CeedOperatorCheckReady(*op)); +} + +} // namespace + +int CeedGeometryDataGetSpaceDimension(CeedElemRestriction geom_data_restr, CeedInt dim, + CeedInt *space_dim) +{ + if (space_dim) + { + Ceed ceed; + CeedInt geom_data_size; + PalaceCeedCallBackend(CeedElemRestrictionGetCeed(geom_data_restr, &ceed)); + PalaceCeedCall(ceed, + CeedElemRestrictionGetNumComponents(geom_data_restr, &geom_data_size)); + *space_dim = (geom_data_size - 2) / dim; + MFEM_ASSERT(2 + (*space_dim) * dim == geom_data_size, + "Invalid size for geometry quadrature data!"); + } + return CEED_ERROR_SUCCESS; +} + +void AssembleCeedGeometryData(Ceed ceed, CeedElemRestriction mesh_restr, + CeedBasis mesh_basis, CeedVector mesh_nodes, + CeedElemRestriction attr_restr, CeedBasis attr_basis, + CeedVector elem_attr, CeedVector geom_data, + CeedElemRestriction geom_data_restr) +{ + CeedInt dim, space_dim, num_qpts; + PalaceCeedCall(ceed, CeedBasisGetDimension(mesh_basis, &dim)); + PalaceCeedCall(ceed, CeedBasisGetNumComponents(mesh_basis, &space_dim)); + PalaceCeedCall(ceed, CeedBasisGetNumQuadraturePoints(mesh_basis, &num_qpts)); + + // Create the QFunction that computes the quadrature data. + CeedQFunction build_qf; + switch (10 * space_dim + dim) + { + case 22: + PalaceCeedCall(ceed, CeedQFunctionCreateInterior( + ceed, 1, f_build_geom_factor_22, + PalaceQFunctionRelativePath(f_build_geom_factor_22_loc), + &build_qf)); + break; + case 33: + PalaceCeedCall(ceed, CeedQFunctionCreateInterior( + ceed, 1, f_build_geom_factor_33, + PalaceQFunctionRelativePath(f_build_geom_factor_33_loc), + &build_qf)); + break; + case 21: + PalaceCeedCall(ceed, CeedQFunctionCreateInterior( + ceed, 1, f_build_geom_factor_21, + PalaceQFunctionRelativePath(f_build_geom_factor_21_loc), + &build_qf)); + break; + case 32: + PalaceCeedCall(ceed, CeedQFunctionCreateInterior( + ceed, 1, f_build_geom_factor_32, + PalaceQFunctionRelativePath(f_build_geom_factor_32_loc), + &build_qf)); + break; + default: + MFEM_ABORT("Invalid value of (dim, space_dim) = (" + << dim << ", " << space_dim << ") for geometry factor quadrature data!"); + build_qf = nullptr; // Silence compiler warning + } + + // Inputs/outputs. + PalaceCeedCall(ceed, CeedQFunctionAddInput(build_qf, "attr", 1, CEED_EVAL_INTERP)); + PalaceCeedCall(ceed, CeedQFunctionAddInput(build_qf, "q_w", 1, CEED_EVAL_WEIGHT)); + PalaceCeedCall( + ceed, CeedQFunctionAddInput(build_qf, "grad_x", space_dim * dim, CEED_EVAL_GRAD)); + { + CeedInt geom_data_size; + PalaceCeedCall(ceed, + CeedElemRestrictionGetNumComponents(geom_data_restr, &geom_data_size)); + MFEM_VERIFY(geom_data_size == 2 + space_dim * dim, + "Insufficient storage for geometry quadrature data!"); + PalaceCeedCall(ceed, CeedQFunctionAddOutput(build_qf, "geom_data", geom_data_size, + CEED_EVAL_NONE)); + } + + // Create the operator that builds the quadrature data. 
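// Worked example (illustrative): the geometry quadrature data packs 2 + space_dim * dim
// scalars per quadrature point, as checked by the MFEM_VERIFY above. Volume elements in 3D
// (dim = space_dim = 3) therefore use 2 + 9 = 11 components, boundary elements embedded in
// 3D (dim = 2, space_dim = 3) use 2 + 6 = 8, and CeedGeometryDataGetSpaceDimension inverts
// the same formula as space_dim = (geom_data_size - 2) / dim.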
+ CeedOperator build_op; + PalaceCeedCall(ceed, CeedOperatorCreate(ceed, build_qf, nullptr, nullptr, &build_op)); + PalaceCeedCall(ceed, CeedQFunctionDestroy(&build_qf)); + + PalaceCeedCall(ceed, + CeedOperatorSetField(build_op, "attr", attr_restr, attr_basis, elem_attr)); + PalaceCeedCall(ceed, CeedOperatorSetField(build_op, "q_w", CEED_ELEMRESTRICTION_NONE, + mesh_basis, CEED_VECTOR_NONE)); + PalaceCeedCall(ceed, CeedOperatorSetField(build_op, "grad_x", mesh_restr, mesh_basis, + CEED_VECTOR_ACTIVE)); + PalaceCeedCall(ceed, CeedOperatorSetField(build_op, "geom_data", geom_data_restr, + CEED_BASIS_NONE, CEED_VECTOR_ACTIVE)); + + PalaceCeedCall(ceed, CeedOperatorCheckReady(build_op)); + + // Compute the quadrature data for the operator. + PalaceCeedCall( + ceed, CeedOperatorApply(build_op, mesh_nodes, geom_data, CEED_REQUEST_IMMEDIATE)); + PalaceCeedCall(ceed, CeedOperatorDestroy(&build_op)); +} + +void AssembleCeedOperator(const CeedQFunctionInfo &info, void *ctx, std::size_t ctx_size, + Ceed ceed, CeedElemRestriction trial_restr, + CeedElemRestriction test_restr, CeedBasis trial_basis, + CeedBasis test_basis, CeedVector geom_data, + CeedElemRestriction geom_data_restr, CeedOperator *op) +{ + // If we are going to be assembling the quadrature data, construct the storage vector for + // it (to be owned by the operator). + CeedVector q_data = nullptr; + CeedElemRestriction q_data_restr = nullptr; + std::vector qf_active_sizes; + if (info.assemble_q_data) + { + qf_active_sizes = QuadratureDataSetup(info.trial_ops, ceed, trial_restr, trial_basis, + &q_data, &q_data_restr); + } + + // Create the QFunction that defines the action of the operator (or its setup). + CeedQFunction apply_qf; + PalaceCeedCall(ceed, CeedQFunctionCreateInterior(ceed, 1, info.apply_qf, + info.apply_qf_path.c_str(), &apply_qf)); + + CeedQFunctionContext apply_ctx; + PalaceCeedCall(ceed, CeedQFunctionContextCreate(ceed, &apply_ctx)); + PalaceCeedCall(ceed, CeedQFunctionContextSetData(apply_ctx, CEED_MEM_HOST, + CEED_COPY_VALUES, ctx_size, ctx)); + PalaceCeedCall(ceed, CeedQFunctionSetContext(apply_qf, apply_ctx)); + PalaceCeedCall(ceed, CeedQFunctionContextDestroy(&apply_ctx)); + + // Inputs/outputs. + { + CeedInt geom_data_size; + PalaceCeedCall(ceed, + CeedElemRestrictionGetNumComponents(geom_data_restr, &geom_data_size)); + PalaceCeedCall( + ceed, CeedQFunctionAddInput(apply_qf, "geom_data", geom_data_size, CEED_EVAL_NONE)); + } + if (info.trial_ops & EvalMode::Weight) + { + PalaceCeedCall(ceed, CeedQFunctionAddInput(apply_qf, "q_w", 1, CEED_EVAL_WEIGHT)); + } + MFEM_VERIFY(!(info.test_ops & EvalMode::Weight), + "CeedOperator should not have quadrature weight output!"); + if (!info.assemble_q_data) + { + AddQFunctionActiveInputs(info.trial_ops, ceed, trial_basis, apply_qf); + AddQFunctionActiveOutputs(info.test_ops, ceed, test_basis, apply_qf); + } + else + { + CeedInt q_data_size; + PalaceCeedCall(ceed, CeedElemRestrictionGetNumComponents(q_data_restr, &q_data_size)); + PalaceCeedCall(ceed, + CeedQFunctionAddOutput(apply_qf, "q_data", q_data_size, CEED_EVAL_NONE)); + } + + // Create the operator. 
+ PalaceCeedCall(ceed, CeedOperatorCreate(ceed, apply_qf, nullptr, nullptr, op)); + PalaceCeedCall(ceed, CeedQFunctionDestroy(&apply_qf)); + + PalaceCeedCall(ceed, CeedOperatorSetField(*op, "geom_data", geom_data_restr, + CEED_BASIS_NONE, geom_data)); + if (info.trial_ops & EvalMode::Weight) + { + PalaceCeedCall(ceed, CeedOperatorSetField(*op, "q_w", CEED_ELEMRESTRICTION_NONE, + trial_basis, CEED_VECTOR_NONE)); + } + if (!info.assemble_q_data) + { + AddOperatorActiveInputFields(info.trial_ops, ceed, trial_restr, trial_basis, *op); + AddOperatorActiveOutputFields(info.test_ops, ceed, test_restr, test_basis, *op); + } + else + { + PalaceCeedCall(ceed, CeedOperatorSetField(*op, "q_data", q_data_restr, CEED_BASIS_NONE, + CEED_VECTOR_ACTIVE)); + } + + PalaceCeedCall(ceed, CeedOperatorCheckReady(*op)); + + // Assemble the quadrature data and create the actual operator. + if (info.assemble_q_data) + { + QuadratureDataAssembly(qf_active_sizes, info, ceed, trial_restr, test_restr, + trial_basis, test_basis, q_data, q_data_restr, op); + + // Cleanup (these are now owned by the operator). + PalaceCeedCall(ceed, CeedElemRestrictionDestroy(&q_data_restr)); + PalaceCeedCall(ceed, CeedVectorDestroy(&q_data)); + } +} + +void AssembleCeedInterpolator(Ceed ceed, CeedElemRestriction trial_restr, + CeedElemRestriction test_restr, CeedBasis interp_basis, + CeedOperator *op, CeedOperator *op_t) +{ + // Create the QFunction that defines the action of the operator (only an identity as + // element dof multiplicity is handled outside of libCEED). + CeedQFunction apply_qf, apply_qf_t; + PalaceCeedCall(ceed, CeedQFunctionCreateIdentity(ceed, 1, CEED_EVAL_INTERP, + CEED_EVAL_NONE, &apply_qf)); + PalaceCeedCall(ceed, CeedQFunctionCreateIdentity(ceed, 1, CEED_EVAL_NONE, + CEED_EVAL_INTERP, &apply_qf_t)); + + // Create the operator. + PalaceCeedCall(ceed, CeedOperatorCreate(ceed, apply_qf, nullptr, nullptr, op)); + PalaceCeedCall(ceed, CeedQFunctionDestroy(&apply_qf)); + + PalaceCeedCall(ceed, CeedOperatorSetField(*op, "input", trial_restr, interp_basis, + CEED_VECTOR_ACTIVE)); + PalaceCeedCall(ceed, CeedOperatorSetField(*op, "output", test_restr, CEED_BASIS_NONE, + CEED_VECTOR_ACTIVE)); + + PalaceCeedCall(ceed, CeedOperatorCheckReady(*op)); + + // Create the transpose operator. + PalaceCeedCall(ceed, CeedOperatorCreate(ceed, apply_qf_t, nullptr, nullptr, op_t)); + PalaceCeedCall(ceed, CeedQFunctionDestroy(&apply_qf_t)); + + PalaceCeedCall(ceed, CeedOperatorSetField(*op_t, "input", test_restr, CEED_BASIS_NONE, + CEED_VECTOR_ACTIVE)); + PalaceCeedCall(ceed, CeedOperatorSetField(*op_t, "output", trial_restr, interp_basis, + CEED_VECTOR_ACTIVE)); + + PalaceCeedCall(ceed, CeedOperatorCheckReady(*op_t)); +} + +void AssembleCeedElementErrorIntegrator( + const CeedQFunctionInfo &info, void *ctx, std::size_t ctx_size, Ceed ceed, + CeedVector input1, CeedVector input2, CeedElemRestriction input1_restr, + CeedElemRestriction input2_restr, CeedBasis input1_basis, CeedBasis input2_basis, + CeedElemRestriction mesh_elem_restr, CeedVector geom_data, + CeedElemRestriction geom_data_restr, CeedOperator *op) +{ + MFEM_VERIFY(!info.assemble_q_data, + "Quadrature interpolator does not support quadrature data assembly!"); + + // Create basis for summing contributions from all quadrature points on the element. + CeedInt num_qpts; + PalaceCeedCall(ceed, CeedBasisGetNumQuadraturePoints(input1_basis, &num_qpts)); + CeedBasis mesh_elem_basis; + { + // Note: ceed::GetCeedTopology(CEED_TOPOLOGY_LINE) == 1. 
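// Illustrative note: the single-node H1 "basis" constructed below has an interpolation row
// of all ones (Bt = 1.0) over the num_qpts quadrature points, so when it is attached to the
// output field "v" its transpose application sums the QFunction output over the quadrature
// points of each element, yielding one scalar error contribution per element:
//   v_e = sum over q of (QFunction output at quadrature point q of element e).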
+ mfem::Vector Bt(num_qpts), Gt(num_qpts), qX(num_qpts), qW(num_qpts); + Bt = 1.0; + Gt = 0.0; + qX = 0.0; + qW = 0.0; + PalaceCeedCall(ceed, CeedBasisCreateH1(ceed, CEED_TOPOLOGY_LINE, 1, 1, num_qpts, + Bt.GetData(), Gt.GetData(), qX.GetData(), + qW.GetData(), &mesh_elem_basis)); + } + + // Create the QFunction that defines the action of the operator. + CeedQFunction apply_qf; + PalaceCeedCall(ceed, CeedQFunctionCreateInterior(ceed, 1, info.apply_qf, + info.apply_qf_path.c_str(), &apply_qf)); + + CeedQFunctionContext apply_ctx; + PalaceCeedCall(ceed, CeedQFunctionContextCreate(ceed, &apply_ctx)); + PalaceCeedCall(ceed, CeedQFunctionContextSetData(apply_ctx, CEED_MEM_HOST, + CEED_COPY_VALUES, ctx_size, ctx)); + PalaceCeedCall(ceed, CeedQFunctionSetContext(apply_qf, apply_ctx)); + PalaceCeedCall(ceed, CeedQFunctionContextDestroy(&apply_ctx)); + + // Inputs/outputs. "Test" operations are the operations for the second input vector. + { + CeedInt geom_data_size; + PalaceCeedCall(ceed, + CeedElemRestrictionGetNumComponents(geom_data_restr, &geom_data_size)); + PalaceCeedCall( + ceed, CeedQFunctionAddInput(apply_qf, "geom_data", geom_data_size, CEED_EVAL_NONE)); + } + if (info.trial_ops & EvalMode::Weight) + { + PalaceCeedCall(ceed, CeedQFunctionAddInput(apply_qf, "q_w", 1, CEED_EVAL_WEIGHT)); + } + AddQFunctionActiveInputs(info.trial_ops, ceed, input1_basis, apply_qf, "u_1"); + AddQFunctionActiveInputs(info.test_ops, ceed, input2_basis, apply_qf, "u_2"); + PalaceCeedCall(ceed, CeedQFunctionAddOutput(apply_qf, "v", 1, CEED_EVAL_INTERP)); + + // Create the operator. + PalaceCeedCall(ceed, CeedOperatorCreate(ceed, apply_qf, nullptr, nullptr, op)); + PalaceCeedCall(ceed, CeedQFunctionDestroy(&apply_qf)); + + PalaceCeedCall(ceed, CeedOperatorSetField(*op, "geom_data", geom_data_restr, + CEED_BASIS_NONE, geom_data)); + if (info.trial_ops & EvalMode::Weight) + { + PalaceCeedCall(ceed, CeedOperatorSetField(*op, "q_w", CEED_ELEMRESTRICTION_NONE, + input1_basis, CEED_VECTOR_NONE)); + } + AddOperatorActiveInputFields(info.trial_ops, ceed, input1_restr, input1_basis, *op, "u_1", + input1); + AddOperatorActiveInputFields(info.test_ops, ceed, input2_restr, input2_basis, *op, "u_2", + input2); + PalaceCeedCall(ceed, CeedOperatorSetField(*op, "v", mesh_elem_restr, mesh_elem_basis, + CEED_VECTOR_ACTIVE)); + + PalaceCeedCall(ceed, CeedOperatorCheckReady(*op)); + + // Cleanup (this is now owned by the operator). + PalaceCeedCall(ceed, CeedBasisDestroy(&mesh_elem_basis)); +} + +} // namespace palace::ceed diff --git a/palace/fem/libceed/libceed_integrator.hpp b/palace/fem/libceed/libceed_integrator.hpp new file mode 100644 index 0000000000..cbee373599 --- /dev/null +++ b/palace/fem/libceed/libceed_integrator.hpp @@ -0,0 +1,86 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_INTEGRATOR_HPP +#define PALACE_LIBCEED_INTEGRATOR_HPP + +#include +#include +#include "fem/libceed/ceed.hpp" + +namespace palace::ceed +{ + +// Evaluation modes for CeedOperator fields for various integrators. +enum EvalMode : unsigned int +{ + Weight = 1 << 0, + None = 1 << 1, + Interp = 1 << 2, + Grad = 1 << 3, + Div = 1 << 4, + Curl = 1 << 5 +}; + +// Data structure for CeedOperator construction for various integrators. +struct CeedQFunctionInfo +{ + // QFunctions for operator construction and application. + CeedQFunctionUser apply_qf; + + // Path and name of the QFunctions for operator construction and application. 
+ std::string apply_qf_path; + + // Evaluation modes for the test and trial basis. + unsigned int trial_ops, test_ops; + + // Control whether or not to pre-assemble the quadrature data or compute it during + // operator application in true matrix-free fashion. + bool assemble_q_data; + + CeedQFunctionInfo() + : apply_qf(nullptr), apply_qf_path(""), trial_ops(0), test_ops(0), + assemble_q_data(false) + { + } +}; + +// Helper function to get the geometry space dimension. +int CeedGeometryDataGetSpaceDimension(CeedElemRestriction geom_data_restr, CeedInt dim, + CeedInt *space_dim); + +// Assemble libCEED mesh geometry factor quadrature data for use in a partially assembled +// libCEED operator. +void AssembleCeedGeometryData(Ceed ceed, CeedElemRestriction mesh_restr, + CeedBasis mesh_basis, CeedVector mesh_nodes, + CeedElemRestriction attr_restr, CeedBasis attr_basis, + CeedVector elem_attr, CeedVector geom_data, + CeedElemRestriction geom_data_restr); + +// Construct libCEED operator using the given quadrature data, element restriction, and +// basis objects. +void AssembleCeedOperator(const CeedQFunctionInfo &info, void *ctx, std::size_t ctx_size, + Ceed ceed, CeedElemRestriction trial_restr, + CeedElemRestriction test_restr, CeedBasis trial_basis, + CeedBasis test_basis, CeedVector geom_data, + CeedElemRestriction geom_data_restr, CeedOperator *op); + +// Construct libCEED operators for interpolation operations and their transpose between +// the two spaces. Note that contributions for shared degrees of freedom are added, so the +// output of the operator application must be scaled by the inverse multiplicity. +void AssembleCeedInterpolator(Ceed ceed, CeedElemRestriction trial_restr, + CeedElemRestriction test_restr, CeedBasis interp_basis, + CeedOperator *op, CeedOperator *op_t); + +// Construct a libCEED operator which integrates the squared difference between two +// functions over every element. +void AssembleCeedElementErrorIntegrator( + const CeedQFunctionInfo &info, void *ctx, std::size_t ctx_size, Ceed ceed, + CeedVector input1, CeedVector input2, CeedElemRestriction input1_restr, + CeedElemRestriction input2_restr, CeedBasis input1_basis, CeedBasis input2_basis, + CeedElemRestriction mesh_elem_restr, CeedVector geom_data, + CeedElemRestriction geom_data_restr, CeedOperator *op); + +} // namespace palace::ceed + +#endif // PALACE_LIBCEED_INTEGRATOR_HPP diff --git a/palace/fem/libceed/operator.cpp b/palace/fem/libceed/operator.cpp index d665546ec6..2159678570 100644 --- a/palace/fem/libceed/operator.cpp +++ b/palace/fem/libceed/operator.cpp @@ -1,484 +1,587 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 - -#include "operator.hpp" - -#include -#include -#include -#include "fem/libceed/utils.hpp" -#include "utils/omp.hpp" - -namespace palace::ceed -{ - -Operator::~Operator() -{ - for (std::size_t i = 0; i < ops.size(); i++) - { - Ceed ceed; - PalaceCeedCallBackend(CeedOperatorGetCeed(ops[i], &ceed)); - PalaceCeedCall(ceed, CeedOperatorDestroy(&ops[i])); - PalaceCeedCall(ceed, CeedOperatorDestroy(&ops_t[i])); - PalaceCeedCall(ceed, CeedVectorDestroy(&u[i])); - PalaceCeedCall(ceed, CeedVectorDestroy(&v[i])); - } -} - -void Operator::AddOper(CeedOperator op, CeedOperator op_t) -{ - Ceed ceed; - CeedSize l_in, l_out; - CeedVector loc_u, loc_v; - PalaceCeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - PalaceCeedCall(ceed, CeedOperatorGetActiveVectorLengths(op, &l_in, &l_out)); - MFEM_VERIFY((l_in == 0 && l_out == 0) || (mfem::internal::to_int(l_in) == width && - mfem::internal::to_int(l_out) == height), - "Dimensions mismatch for CeedOperator!"); - if (op_t) - { - CeedSize l_in_t, l_out_t; - PalaceCeedCall(ceed, CeedOperatorGetActiveVectorLengths(op_t, &l_in_t, &l_out_t)); - MFEM_VERIFY((l_in_t == 0 && l_out_t == 0) || (l_in_t == l_out && l_out_t == l_in), - "Dimensions mismatch for transpose CeedOperator!"); - } - PalaceCeedCall(ceed, CeedVectorCreate(ceed, l_in, &loc_u)); - PalaceCeedCall(ceed, CeedVectorCreate(ceed, l_out, &loc_v)); - - PalacePragmaOmp(critical(AddOper)) - { - ops.push_back(op); - ops_t.push_back(op_t); - u.push_back(loc_u); - v.push_back(loc_v); - } -} - -void Operator::AssembleDiagonal(Vector &diag) const -{ - Ceed ceed; - CeedMemType mem; - CeedScalar *data; - - MFEM_VERIFY(diag.Size() == height, "Invalid size for diagonal vector!"); - diag = 0.0; - PalaceCeedCallBackend(CeedOperatorGetCeed(ops[0], &ceed)); - PalaceCeedCall(ceed, CeedGetPreferredMemType(ceed, &mem)); - if (mfem::Device::Allows(mfem::Backend::DEVICE_MASK) && mem == CEED_MEM_DEVICE) - { - data = diag.ReadWrite(); - } - else - { - data = diag.HostReadWrite(); - mem = CEED_MEM_HOST; - } - - PalacePragmaOmp(parallel for private(ceed) schedule(static)) - for (std::size_t i = 0; i < ops.size(); i++) - { - PalaceCeedCallBackend(CeedOperatorGetCeed(ops[i], &ceed)); - PalaceCeedCall(ceed, CeedVectorSetArray(v[i], mem, CEED_USE_POINTER, data)); - PalaceCeedCall( - ceed, CeedOperatorLinearAssembleAddDiagonal(ops[i], v[i], CEED_REQUEST_IMMEDIATE)); - PalaceCeedCall(ceed, CeedVectorTakeArray(v[i], mem, nullptr)); - } -} - -namespace -{ - -inline void CeedAddMult(const std::vector &ops, - const std::vector &u, const std::vector &v, - const Vector &x, Vector &y) -{ - Ceed ceed; - CeedMemType mem; - const CeedScalar *x_data; - CeedScalar *y_data; - - PalaceCeedCallBackend(CeedOperatorGetCeed(ops[0], &ceed)); - PalaceCeedCall(ceed, CeedGetPreferredMemType(ceed, &mem)); - if (mfem::Device::Allows(mfem::Backend::DEVICE_MASK) && mem == CEED_MEM_DEVICE) - { - x_data = x.Read(); - y_data = y.ReadWrite(); - } - else - { - x_data = x.HostRead(); - y_data = y.HostReadWrite(); - mem = CEED_MEM_HOST; - } - - PalacePragmaOmp(parallel for private(ceed) schedule(static)) - for (std::size_t i = 0; i < ops.size(); i++) - { - if (ops[i]) // No-op for an empty operator - { - PalaceCeedCallBackend(CeedOperatorGetCeed(ops[i], &ceed)); - PalaceCeedCall(ceed, CeedVectorSetArray(u[i], mem, CEED_USE_POINTER, - const_cast(x_data))); - PalaceCeedCall(ceed, CeedVectorSetArray(v[i], mem, CEED_USE_POINTER, y_data)); - PalaceCeedCall(ceed, - CeedOperatorApplyAdd(ops[i], u[i], v[i], CEED_REQUEST_IMMEDIATE)); - 
PalaceCeedCall(ceed, CeedVectorTakeArray(u[i], mem, nullptr)); - PalaceCeedCall(ceed, CeedVectorTakeArray(v[i], mem, nullptr)); - } - } -} - -} // namespace - -void Operator::Mult(const Vector &x, Vector &y) const -{ - y = 0.0; - CeedAddMult(ops, u, v, x, y); - if (dof_multiplicity.Size() > 0) - { - y *= dof_multiplicity; - } -} - -void Operator::AddMult(const Vector &x, Vector &y, const double a) const -{ - if (a == 1.0 && dof_multiplicity.Size() == 0) - { - CeedAddMult(ops, u, v, x, y); - } - else - { - Vector &temp = (height == width) ? temp_v : temp_u; - temp.SetSize(height); - temp.UseDevice(true); - temp = 0.0; - CeedAddMult(ops, u, v, x, temp); - if (dof_multiplicity.Size() > 0) - { - temp *= dof_multiplicity; - } - y.Add(a, temp); - } -} - -void Operator::MultTranspose(const Vector &x, Vector &y) const -{ - y = 0.0; - if (dof_multiplicity.Size() > 0) - { - temp_v = x; - temp_v *= dof_multiplicity; - CeedAddMult(ops_t, v, u, temp_v, y); - } - else - { - CeedAddMult(ops_t, v, u, x, y); - } -} - -void Operator::AddMultTranspose(const Vector &x, Vector &y, const double a) const -{ - auto AddMultTransposeImpl = [this](const Vector &x_, Vector &y_, const double a_) - { - if (a_ == 1.0) - { - CeedAddMult(ops_t, v, u, x_, y_); - } - else - { - Vector &temp = (height == width && dof_multiplicity.Size() == 0) ? temp_v : temp_u; - temp.SetSize(width); - temp.UseDevice(true); - temp = 0.0; - CeedAddMult(ops_t, v, u, x_, temp); - y_.Add(a_, temp); - } - }; - if (dof_multiplicity.Size() > 0) - { - temp_v = x; - temp_v *= dof_multiplicity; - AddMultTransposeImpl(temp_v, y, a); - } - else - { - AddMultTransposeImpl(x, y, a); - } -} - -namespace -{ - -int CeedInternalCallocArray(size_t n, size_t unit, void *p) -{ - *(void **)p = calloc(n, unit); - MFEM_ASSERT(!n || !unit || *(void **)p, - "calloc failed to allocate " << n << " members of size " << unit << "!"); - return 0; -} - -int CeedInternalFree(void *p) -{ - free(*(void **)p); - *(void **)p = nullptr; - return 0; -} - -#define CeedInternalCalloc(n, p) CeedInternalCallocArray((n), sizeof(**(p)), p) - -void CeedOperatorAssembleCOORemoveZeros(Ceed ceed, CeedSize *nnz, CeedInt **rows, - CeedInt **cols, CeedVector *vals, CeedMemType *mem) -{ - // Filter out zero entries. For now, eliminating zeros happens all on the host. 
- // XX TODO: Use Thrust for this (thrust::copy_if and thrust::zip_iterator) - CeedInt *new_rows, *new_cols; - PalaceCeedCall(ceed, CeedInternalCalloc(*nnz, &new_rows)); - PalaceCeedCall(ceed, CeedInternalCalloc(*nnz, &new_cols)); - - CeedVector new_vals; - PalaceCeedCall(ceed, CeedVectorCreate(ceed, *nnz, &new_vals)); - - CeedSize q = 0; - const CeedScalar *vals_array; - CeedScalar *new_vals_array; - PalaceCeedCall(ceed, CeedVectorGetArrayRead(*vals, CEED_MEM_HOST, &vals_array)); - PalaceCeedCall(ceed, CeedVectorGetArrayWrite(new_vals, CEED_MEM_HOST, &new_vals_array)); - for (CeedSize k = 0; k < *nnz; k++) - { - if (vals_array[k] != 0.0) - { - new_rows[q] = (*rows)[k]; - new_cols[q] = (*cols)[k]; - new_vals_array[q] = vals_array[k]; - q++; - } - } - PalaceCeedCall(ceed, CeedVectorRestoreArrayRead(*vals, &vals_array)); - PalaceCeedCall(ceed, CeedVectorRestoreArray(new_vals, &new_vals_array)); - - PalaceCeedCall(ceed, CeedInternalFree(rows)); - PalaceCeedCall(ceed, CeedInternalFree(cols)); - PalaceCeedCall(ceed, CeedVectorDestroy(vals)); - - *rows = new_rows; - *cols = new_cols; - *vals = new_vals; - *nnz = q; -} - -void CeedOperatorAssembleCOO(const Operator &op, bool skip_zeros, CeedSize *nnz, - CeedInt **rows, CeedInt **cols, CeedVector *vals, - CeedMemType *mem) -{ - Ceed ceed; - CeedScalar *vals_array; - std::vector loc_nnz(op.Size()), loc_offsets(op.Size() + 1); - std::vector loc_rows(op.Size()), loc_cols(op.Size()); - std::vector loc_vals(op.Size()); - - PalaceCeedCallBackend(CeedOperatorGetCeed(op[0], &ceed)); - PalaceCeedCall(ceed, CeedGetPreferredMemType(ceed, mem)); - if (!mfem::Device::Allows(mfem::Backend::DEVICE_MASK) || *mem != CEED_MEM_DEVICE) - { - *mem = CEED_MEM_HOST; - } - - PalacePragmaOmp(parallel for private(ceed) schedule(static)) - for (std::size_t i = 0; i < op.Size(); i++) - { - // Assemble sparsity pattern (rows, cols are always host pointers). - PalaceCeedCallBackend(CeedOperatorGetCeed(op[i], &ceed)); - PalaceCeedCall(ceed, CeedOperatorLinearAssembleSymbolic(op[i], &loc_nnz[i], - &loc_rows[i], &loc_cols[i])); - - // Assemble values. - PalaceCeedCall(ceed, CeedVectorCreate(ceed, loc_nnz[i], &loc_vals[i])); - PalaceCeedCall(ceed, CeedOperatorLinearAssemble(op[i], loc_vals[i])); - } - - loc_offsets[0] = 0; - std::inclusive_scan(loc_nnz.begin(), loc_nnz.end(), loc_offsets.begin() + 1); - *nnz = loc_offsets.back(); - if (op.Size() == 1) - { - // Assemble values. - *rows = loc_rows[0]; - *cols = loc_cols[0]; - *vals = loc_vals[0]; - } - else - { - // Global assembly. - PalaceCeedCall(ceed, CeedInternalCalloc(*nnz, rows)); - PalaceCeedCall(ceed, CeedInternalCalloc(*nnz, cols)); - PalaceCeedCall(ceed, CeedVectorCreate(ceed, *nnz, vals)); - PalaceCeedCall(ceed, CeedVectorGetArrayWrite(*vals, *mem, &vals_array)); - - PalacePragmaOmp(parallel for private(ceed) schedule(static)) - for (std::size_t i = 0; i < op.Size(); i++) - { - const auto start = loc_offsets[i]; - const auto end = loc_offsets[i + 1]; - for (auto k = start; k < end; k++) - { - (*rows)[k] = loc_rows[i][k - start]; - (*cols)[k] = loc_cols[i][k - start]; - } - - // The CeedVector is on only on device when MFEM is also using the device. 
- const CeedScalar *loc_vals_array; - PalaceCeedCallBackend(CeedVectorGetCeed(loc_vals[i], &ceed)); - PalaceCeedCall(ceed, CeedVectorGetArrayRead(loc_vals[i], *mem, &loc_vals_array)); - if (*mem != CEED_MEM_HOST) - { - mfem::forall(end - start, [=] MFEM_HOST_DEVICE(int k) - { vals_array[k + start] = loc_vals_array[k]; }); - } - else - { - for (auto k = start; k < end; k++) - { - vals_array[k] = loc_vals_array[k - start]; - } - } - PalaceCeedCall(ceed, CeedVectorRestoreArrayRead(loc_vals[i], &loc_vals_array)); - PalaceCeedCall(ceed, CeedInternalFree(&loc_rows[i])); - PalaceCeedCall(ceed, CeedInternalFree(&loc_cols[i])); - PalaceCeedCall(ceed, CeedVectorDestroy(&loc_vals[i])); - } - - PalaceCeedCall(ceed, CeedVectorRestoreArray(*vals, &vals_array)); - } - - // std::cout << " Operator full assembly (COO) has " << *nnz << " NNZ"; - if (skip_zeros && *nnz > 0) - { - CeedOperatorAssembleCOORemoveZeros(ceed, nnz, rows, cols, vals, mem); - // std::cout << " (new NNZ after removal: " << *nnz << ")"; - } - // std::cout << "\n"; -} - -} // namespace - -std::unique_ptr CeedOperatorFullAssemble(const Operator &op, - bool skip_zeros, bool set) -{ - // First, get matrix on master thread in COO format, withs rows/cols always on host and - // vals potentially on the device. Process skipping zeros if desired. - Ceed ceed; - CeedSize nnz; - CeedInt *rows, *cols; - CeedVector vals; - CeedMemType mem; - CeedOperatorAssembleCOO(op, skip_zeros, &nnz, &rows, &cols, &vals, &mem); - PalaceCeedCallBackend(CeedVectorGetCeed(vals, &ceed)); - - // Preallocate CSR memory on host (like PETSc's MatSetValuesCOO). - auto mat = std::make_unique(); - mat->OverrideSize(op.Height(), op.Width()); - mat->GetMemoryI().New(op.Height() + 1); - auto *I = mat->GetI(); - mfem::Array J(nnz), perm(nnz), Jmap(nnz + 1); - - for (int i = 0; i < op.Height() + 1; i++) - { - I[i] = 0; - } - for (int k = 0; k < nnz; k++) - { - perm[k] = k; - } - std::sort(perm.begin(), perm.end(), - [&](const int &i, const int &j) { return (rows[i] < rows[j]); }); - - int q = -1; // True nnz index - for (int k = 0; k < nnz;) - { - // Sort column entries in the row. - const int row = rows[perm[k]]; - const int start = k; - while (k < nnz && rows[perm[k]] == row) - { - k++; - } - std::sort(perm.begin() + start, perm.begin() + k, - [&](const int &i, const int &j) { return (cols[i] < cols[j]); }); - - q++; - I[row + 1] = 1; - J[q] = cols[perm[start]]; - Jmap[q + 1] = 1; - for (int p = start + 1; p < k; p++) - { - if (cols[perm[p]] != cols[perm[p - 1]]) - { - // New nonzero. - q++; - I[row + 1]++; - J[q] = cols[perm[p]]; - Jmap[q + 1] = 1; - } - else - { - Jmap[q + 1]++; - } - } - } - PalaceCeedCall(ceed, CeedInternalFree(&rows)); - PalaceCeedCall(ceed, CeedInternalFree(&cols)); - const int nnz_new = q + 1; - - // Finalize I, Jmap. - I[0] = 0; - for (int i = 0; i < op.Height(); i++) - { - I[i + 1] += I[i]; - } - Jmap[0] = 0; - for (int k = 0; k < nnz; k++) - { - Jmap[k + 1] += Jmap[k]; - } - - mat->GetMemoryJ().New(nnz_new, mat->GetMemoryJ().GetMemoryType()); - mat->GetMemoryData().New(nnz_new, mat->GetMemoryJ().GetMemoryType()); - { - const auto *d_J_old = J.Read(); - auto *d_J = mfem::Write(mat->GetMemoryJ(), nnz_new); - mfem::forall(nnz_new, [=] MFEM_HOST_DEVICE(int k) { d_J[k] = d_J_old[k]; }); - } - - // Fill the values (on device). 
- const CeedScalar *vals_array; - PalaceCeedCall(ceed, CeedVectorGetArrayRead(vals, mem, &vals_array)); - { - const auto *d_perm = perm.Read(); - const auto *d_Jmap = Jmap.Read(); - auto *d_A = mfem::Write(mat->GetMemoryData(), nnz_new); - if (set) - { - mfem::forall(nnz_new, - [=] MFEM_HOST_DEVICE(int k) { d_A[k] = vals_array[d_perm[d_Jmap[k]]]; }); - } - else - { - mfem::forall(nnz_new, - [=] MFEM_HOST_DEVICE(int k) - { - double sum = 0.0; - for (int p = d_Jmap[k]; p < d_Jmap[k + 1]; p++) - { - sum += vals_array[d_perm[p]]; - } - d_A[k] = sum; - }); - } - } - PalaceCeedCall(ceed, CeedVectorRestoreArrayRead(vals, &vals_array)); - PalaceCeedCall(ceed, CeedVectorDestroy(&vals)); - - return mat; -} - -} // namespace palace::ceed +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "operator.hpp" + +#include +#include +#include +#include +#include "fem/fespace.hpp" +#include "linalg/hypre.hpp" +#include "utils/omp.hpp" + +namespace palace::ceed +{ + +Operator::Operator(int h, int w) : palace::Operator(h, w) +{ + const std::size_t nt = internal::GetCeedObjects().size(); + op.resize(nt, nullptr); + op_t.resize(nt, nullptr); + u.resize(nt, nullptr); + v.resize(nt, nullptr); + PalacePragmaOmp(parallel if (op.size() > 1)) + { + const int id = utils::GetThreadNum(); + MFEM_ASSERT(static_cast(id) < op.size(), + "Out of bounds access for thread number " << id << "!"); + Ceed ceed = ceed::internal::GetCeedObjects()[utils::GetThreadNum()]; + CeedOperator loc_op, loc_op_t; + CeedVector loc_u, loc_v; + PalaceCeedCall(ceed, CeedOperatorCreateComposite(ceed, &loc_op)); + PalaceCeedCall(ceed, CeedOperatorCreateComposite(ceed, &loc_op_t)); + PalaceCeedCall(ceed, CeedVectorCreate(ceed, width, &loc_u)); + PalaceCeedCall(ceed, CeedVectorCreate(ceed, height, &loc_v)); + op[id] = loc_op; + op_t[id] = loc_op_t; + u[id] = loc_u; + v[id] = loc_v; + } + temp.UseDevice(true); +} + +Operator::~Operator() +{ + PalacePragmaOmp(parallel if (op.size() > 1)) + { + const int id = utils::GetThreadNum(); + MFEM_ASSERT(static_cast(id) < op.size(), + "Out of bounds access for thread number " << id << "!"); + Ceed ceed; + PalaceCeedCallBackend(CeedOperatorGetCeed(op[id], &ceed)); + PalaceCeedCall(ceed, CeedOperatorDestroy(&op[id])); + PalaceCeedCall(ceed, CeedOperatorDestroy(&op_t[id])); + PalaceCeedCall(ceed, CeedVectorDestroy(&u[id])); + PalaceCeedCall(ceed, CeedVectorDestroy(&v[id])); + } +} + +void Operator::AddSubOperator(CeedOperator sub_op, CeedOperator sub_op_t) +{ + // This should be called from within a OpenMP parallel region. 
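// Illustrative usage sketch (names from this patch; error handling and assembly of the
// thread-local sub-operator are omitted): the composite operator is constructed once, each
// thread adds its sub-operators from inside an OpenMP parallel region so that it uses its
// own Ceed context, and the composite is finalized afterwards.
//
//   auto op = std::make_unique<ceed::Operator>(fespace.GetVSize(), fespace.GetVSize());
//   PalacePragmaOmp(parallel if (op->Size() > 1))
//   {
//     Ceed ceed = ceed::internal::GetCeedObjects()[utils::GetThreadNum()];
//     CeedOperator sub_op;  // Assembled for this thread, e.g. with AssembleCeedOperator
//     ...
//     op->AddSubOperator(sub_op);  // Ownership of sub_op transfers to the ceed::Operator
//   }
//   op->Finalize();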
+ const int id = utils::GetThreadNum(); + MFEM_ASSERT(static_cast(id) < op.size(), + "Out of bounds access for thread number " << id << "!"); + Ceed ceed; + PalaceCeedCallBackend(CeedOperatorGetCeed(sub_op, &ceed)); + CeedSize l_in, l_out; + PalaceCeedCall(ceed, CeedOperatorGetActiveVectorLengths(sub_op, &l_in, &l_out)); + MFEM_VERIFY((l_in < 0 || mfem::internal::to_int(l_in) == width) && + (l_out < 0 || mfem::internal::to_int(l_out) == height), + "Dimensions mismatch for CeedOperator!"); + PalaceCeedCall(ceed, CeedOperatorCompositeAddSub(op[id], sub_op)); + PalaceCeedCall(ceed, CeedOperatorDestroy(&sub_op)); + if (sub_op_t) + { + Ceed ceed_t; + PalaceCeedCallBackend(CeedOperatorGetCeed(sub_op_t, &ceed_t)); + MFEM_VERIFY(ceed_t == ceed, "Ceed context mismatch for transpose CeedOperator!"); + CeedSize l_in_t, l_out_t; + PalaceCeedCall(ceed, CeedOperatorGetActiveVectorLengths(sub_op_t, &l_in_t, &l_out_t)); + MFEM_VERIFY(l_in_t == l_out && l_out_t == l_in, + "Dimensions mismatch for transpose CeedOperator!"); + PalaceCeedCall(ceed, CeedOperatorCompositeAddSub(op_t[id], sub_op_t)); + PalaceCeedCall(ceed, CeedOperatorDestroy(&sub_op_t)); + } +} + +void Operator::Finalize() +{ + PalacePragmaOmp(parallel if (op.size() > 1)) + { + const int id = utils::GetThreadNum(); + MFEM_ASSERT(static_cast(id) < op.size(), + "Out of bounds access for thread number " << id << "!"); + Ceed ceed; + PalaceCeedCallBackend(CeedOperatorGetCeed(op[id], &ceed)); + PalaceCeedCall(ceed, CeedOperatorCheckReady(op[id])); + PalaceCeedCall(ceed, CeedOperatorCheckReady(op_t[id])); + } +} + +void Operator::DestroyAssemblyData() const +{ + PalacePragmaOmp(parallel if (op.size() > 1)) + { + const int id = utils::GetThreadNum(); + MFEM_ASSERT(static_cast(id) < op.size(), + "Out of bounds access for thread number " << id << "!"); + Ceed ceed; + PalaceCeedCallBackend(CeedOperatorGetCeed(op[id], &ceed)); + PalaceCeedCall(ceed, CeedOperatorAssemblyDataStrip(op[id])); + } +} + +void Operator::AssembleDiagonal(Vector &diag) const +{ + Ceed ceed; + CeedMemType mem; + MFEM_VERIFY(diag.Size() == height, "Invalid size for diagonal vector!"); + diag = 0.0; + PalaceCeedCallBackend(CeedOperatorGetCeed(op[0], &ceed)); + PalaceCeedCall(ceed, CeedGetPreferredMemType(ceed, &mem)); + if (!mfem::Device::Allows(mfem::Backend::DEVICE_MASK) && mem == CEED_MEM_DEVICE) + { + mem = CEED_MEM_HOST; + } + auto *diag_data = diag.ReadWrite(mem == CEED_MEM_DEVICE); + + PalacePragmaOmp(parallel if (op.size() > 1)) + { + const int id = utils::GetThreadNum(); + MFEM_ASSERT(static_cast(id) < op.size(), + "Out of bounds access for thread number " << id << "!"); + Ceed ceed; + PalaceCeedCallBackend(CeedOperatorGetCeed(op[id], &ceed)); + PalaceCeedCall(ceed, CeedVectorSetArray(v[id], mem, CEED_USE_POINTER, diag_data)); + PalaceCeedCall( + ceed, CeedOperatorLinearAssembleAddDiagonal(op[id], v[id], CEED_REQUEST_IMMEDIATE)); + PalaceCeedCall(ceed, CeedVectorTakeArray(v[id], mem, nullptr)); + PalaceCeedCall(ceed, CeedOperatorAssemblyDataStrip(op[id])); + } +} + +namespace +{ + +inline void CeedAddMult(const std::vector &op, + const std::vector &u, const std::vector &v, + const Vector &x, Vector &y) +{ + Ceed ceed; + CeedMemType mem; + PalaceCeedCallBackend(CeedOperatorGetCeed(op[0], &ceed)); + PalaceCeedCall(ceed, CeedGetPreferredMemType(ceed, &mem)); + if (!mfem::Device::Allows(mfem::Backend::DEVICE_MASK) && mem == CEED_MEM_DEVICE) + { + mem = CEED_MEM_HOST; + } + const auto *x_data = x.Read(mem == CEED_MEM_DEVICE); + auto *y_data = y.ReadWrite(mem == CEED_MEM_DEVICE); 
+ + PalacePragmaOmp(parallel if (op.size() > 1)) + { + const int id = utils::GetThreadNum(); + MFEM_ASSERT(static_cast(id) < op.size(), + "Out of bounds access for thread number " << id << "!"); + Ceed ceed; + PalaceCeedCallBackend(CeedOperatorGetCeed(op[id], &ceed)); + PalaceCeedCall(ceed, CeedVectorSetArray(u[id], mem, CEED_USE_POINTER, + const_cast(x_data))); + PalaceCeedCall(ceed, CeedVectorSetArray(v[id], mem, CEED_USE_POINTER, y_data)); + PalaceCeedCall(ceed, + CeedOperatorApplyAdd(op[id], u[id], v[id], CEED_REQUEST_IMMEDIATE)); + PalaceCeedCall(ceed, CeedVectorTakeArray(u[id], mem, nullptr)); + PalaceCeedCall(ceed, CeedVectorTakeArray(v[id], mem, nullptr)); + } +} + +} // namespace + +void Operator::Mult(const Vector &x, Vector &y) const +{ + y = 0.0; + CeedAddMult(op, u, v, x, y); + if (dof_multiplicity.Size() > 0) + { + y *= dof_multiplicity; + } +} + +void Operator::AddMult(const Vector &x, Vector &y, const double a) const +{ + MFEM_VERIFY(a == 1.0, "ceed::Operator::AddMult only supports coefficient = 1.0!"); + if (dof_multiplicity.Size() > 0) + { + temp.SetSize(height); + temp = 0.0; + CeedAddMult(op, u, v, x, temp); + { + const auto *d_dof_multiplicity = dof_multiplicity.Read(); + const auto *d_temp = temp.Read(); + auto *d_y = y.ReadWrite(); + mfem::forall(height, [=] MFEM_HOST_DEVICE(int i) + { d_y[i] += d_dof_multiplicity[i] * d_temp[i]; }); + } + } + else + { + CeedAddMult(op, u, v, x, y); + } +} + +void Operator::MultTranspose(const Vector &x, Vector &y) const +{ + y = 0.0; + AddMultTranspose(x, y); +} + +void Operator::AddMultTranspose(const Vector &x, Vector &y, const double a) const +{ + MFEM_VERIFY(a == 1.0, + "ceed::Operator::AddMultTranspose only supports coefficient = 1.0!"); + if (dof_multiplicity.Size() > 0) + { + temp.SetSize(height); + { + const auto *d_dof_multiplicity = dof_multiplicity.Read(); + const auto *d_x = x.Read(); + auto *d_temp = temp.Write(); + mfem::forall(height, [=] MFEM_HOST_DEVICE(int i) + { d_temp[i] = d_dof_multiplicity[i] * d_x[i]; }); + } + CeedAddMult(op_t, v, u, temp, y); + } + else + { + CeedAddMult(op_t, v, u, x, y); + } +} + +namespace +{ + +int CeedInternalCallocArray(size_t n, size_t unit, void *p) +{ + *(void **)p = calloc(n, unit); + MFEM_ASSERT(!n || !unit || *(void **)p, + "calloc failed to allocate " << n << " members of size " << unit << "!"); + return 0; +} + +int CeedInternalFree(void *p) +{ + free(*(void **)p); + *(void **)p = nullptr; + return 0; +} + +#define CeedInternalCalloc(n, p) CeedInternalCallocArray((n), sizeof(**(p)), p) + +void CeedOperatorAssembleCOO(Ceed ceed, CeedOperator op, bool skip_zeros, CeedSize *nnz, + CeedInt **rows, CeedInt **cols, CeedVector *vals, + CeedMemType *mem) +{ + PalaceCeedCall(ceed, CeedGetPreferredMemType(ceed, mem)); + + // Assemble sparsity pattern (rows, cols are always host pointers). + PalaceCeedCall(ceed, CeedOperatorLinearAssembleSymbolic(op, nnz, rows, cols)); + + // Assemble values. + PalaceCeedCall(ceed, CeedVectorCreate(ceed, *nnz, vals)); + PalaceCeedCall(ceed, CeedOperatorLinearAssemble(op, *vals)); + + // Filter out zero entries. For now, eliminating zeros happens all on the host. 
+ // std::cout << " Operator full assembly (COO) has " << *nnz << " NNZ"; + if (skip_zeros && *nnz > 0) + { + // XX TODO: Use Thrust for this (thrust::copy_if and thrust::zip_iterator) + CeedInt *new_rows, *new_cols; + PalaceCeedCall(ceed, CeedInternalCalloc(*nnz, &new_rows)); + PalaceCeedCall(ceed, CeedInternalCalloc(*nnz, &new_cols)); + + CeedVector new_vals; + PalaceCeedCall(ceed, CeedVectorCreate(ceed, *nnz, &new_vals)); + + CeedSize q = 0; + const CeedScalar *vals_array; + CeedScalar *new_vals_array; + PalaceCeedCall(ceed, CeedVectorGetArrayRead(*vals, CEED_MEM_HOST, &vals_array)); + PalaceCeedCall(ceed, CeedVectorGetArrayWrite(new_vals, CEED_MEM_HOST, &new_vals_array)); + for (CeedSize k = 0; k < *nnz; k++) + { + if (vals_array[k] != 0.0) + { + new_rows[q] = (*rows)[k]; + new_cols[q] = (*cols)[k]; + new_vals_array[q] = vals_array[k]; + q++; + } + } + PalaceCeedCall(ceed, CeedVectorRestoreArrayRead(*vals, &vals_array)); + PalaceCeedCall(ceed, CeedVectorRestoreArray(new_vals, &new_vals_array)); + + PalaceCeedCall(ceed, CeedInternalFree(rows)); + PalaceCeedCall(ceed, CeedInternalFree(cols)); + PalaceCeedCall(ceed, CeedVectorDestroy(vals)); + + *nnz = q; + *rows = new_rows; + *cols = new_cols; + *vals = new_vals; + + // std::cout << " (new NNZ after removal: " << *nnz << ")"; + } + // std::cout << "\n"; +} + +std::unique_ptr OperatorCOOtoCSR(Ceed ceed, CeedInt m, CeedInt n, + CeedSize nnz, CeedInt *rows, + CeedInt *cols, CeedVector vals, + CeedMemType mem, bool set) +{ + // Preallocate CSR memory on host (like PETSc's MatSetValuesCOO). Check for overflow for + // large nonzero counts. + const int nnz_int = mfem::internal::to_int(nnz); + mfem::Array I(m + 1), J(nnz_int), perm(nnz_int), Jmap(nnz_int + 1); + I = 0; + for (int k = 0; k < nnz_int; k++) + { + perm[k] = k; + } + std::sort(perm.begin(), perm.end(), + [&](const int &i, const int &j) { return (rows[i] < rows[j]); }); + + int q = -1; // True nnz index + for (int k = 0; k < nnz_int;) + { + // Sort column entries in the row. + const int row = rows[perm[k]]; + const int start = k; + while (k < nnz_int && rows[perm[k]] == row) + { + k++; + } + std::sort(perm.begin() + start, perm.begin() + k, + [&](const int &i, const int &j) { return (cols[i] < cols[j]); }); + + q++; + I[row + 1] = 1; + J[q] = cols[perm[start]]; + Jmap[q + 1] = 1; + for (int p = start + 1; p < k; p++) + { + if (cols[perm[p]] != cols[perm[p - 1]]) + { + // New nonzero. + q++; + I[row + 1]++; + J[q] = cols[perm[p]]; + Jmap[q + 1] = 1; + } + else + { + Jmap[q + 1]++; + } + } + } + PalaceCeedCall(ceed, CeedInternalFree(&rows)); + PalaceCeedCall(ceed, CeedInternalFree(&cols)); + + // Finalize I, Jmap. + const int nnz_new = q + 1; + I[0] = 0; + for (int i = 0; i < m; i++) + { + I[i + 1] += I[i]; + } + Jmap[0] = 0; + for (int k = 0; k < nnz_new; k++) + { + Jmap[k + 1] += Jmap[k]; + } + + // Construct and fill the final CSR matrix. On GPU, MFEM and Hypre share the same memory + // space. On CPU, the inner nested OpenMP loop (if enabled in MFEM) should be ignored. 
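// Worked example (illustrative): for COO triplets (0, 1, 2.0), (0, 1, 3.0), (1, 0, 1.0) on a
// 2 x 2 matrix, the sort/merge above yields nnz_new = 2 with I = {0, 1, 2}, J = {1, 0}, and
// Jmap = {0, 2, 3}. With set = false the duplicate pair is summed into A = {5.0, 1.0}; with
// set = true a single representative of each duplicate group is kept, which presumes that
// duplicated entries carry the same value.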
+ auto mat = std::make_unique(m, n, nnz_new); + { + const auto *d_I_old = I.Read(); + auto *d_I = mat->GetI(); + mfem::forall(m + 1, [=] MFEM_HOST_DEVICE(int i) { d_I[i] = d_I_old[i]; }); + } + { + const auto *d_J_old = J.Read(); + auto *d_J = mat->GetJ(); + mfem::forall(nnz_new, [=] MFEM_HOST_DEVICE(int k) { d_J[k] = d_J_old[k]; }); + } + { + auto FillValues = [&](const double *vals_array) + { + const auto *d_perm = perm.Read(); + const auto *d_Jmap = Jmap.Read(); + auto *d_A = mat->GetData(); + if (set) + { + mfem::forall(nnz_new, [=] MFEM_HOST_DEVICE(int k) + { d_A[k] = vals_array[d_perm[d_Jmap[k]]]; }); + } + else + { + mfem::forall(nnz_new, + [=] MFEM_HOST_DEVICE(int k) + { + double sum = 0.0; + for (int p = d_Jmap[k]; p < d_Jmap[k + 1]; p++) + { + sum += vals_array[d_perm[p]]; + } + d_A[k] = sum; + }); + } + }; + Ceed ceed; + const CeedScalar *vals_array; + PalaceCeedCallBackend(CeedVectorGetCeed(vals, &ceed)); + PalaceCeedCall(ceed, CeedVectorGetArrayRead(vals, mem, &vals_array)); + if (mfem::Device::Allows(mfem::Backend::DEVICE_MASK) && mem != CEED_MEM_DEVICE) + { + // Copy values to device before filling. + Vector d_vals(nnz_int); + { + auto *d_vals_array = d_vals.HostWrite(); + PalacePragmaOmp(parallel for schedule(static)) + for (int k = 0; k < nnz_int; k++) + { + d_vals_array[k] = vals_array[k]; + } + } + FillValues(d_vals.Read()); + } + else + { + // No copy required. + FillValues(vals_array); + } + PalaceCeedCall(ceed, CeedVectorRestoreArrayRead(vals, &vals_array)); + PalaceCeedCall(ceed, CeedVectorDestroy(&vals)); + } + + return mat; +} + +} // namespace + +std::unique_ptr CeedOperatorFullAssemble(const Operator &op, + bool skip_zeros, bool set) +{ + // Assemble operators on each thread. + std::vector> loc_mat(op.Size()); + PalacePragmaOmp(parallel if (op.Size() > 1)) + { + const int id = utils::GetThreadNum(); + MFEM_ASSERT(static_cast(id) < op.Size(), + "Out of bounds access for thread number " << id << "!"); + Ceed ceed; + PalaceCeedCallBackend(CeedOperatorGetCeed(op[id], &ceed)); + + // Check if the operator is empty, otherwise assemble. + CeedInt nsub_ops; + PalaceCeedCall(ceed, CeedOperatorCompositeGetNumSub(op[id], &nsub_ops)); + if (nsub_ops == 0) + { + loc_mat[id] = std::make_unique(op.Height(), op.Width(), 0); + } + else + { + // First, get matrix on master thread in COO format, with rows/cols always on host + // and vals potentially on the device. Process skipping zeros if desired. + CeedSize nnz; + CeedInt *rows, *cols; + CeedVector vals; + CeedMemType mem; + CeedOperatorAssembleCOO(ceed, op[id], skip_zeros, &nnz, &rows, &cols, &vals, &mem); + PalaceCeedCall(ceed, CeedOperatorAssemblyDataStrip(op[id])); + + // Convert COO to CSR (on each thread). The COO memory is free'd internally. + loc_mat[id] = + OperatorCOOtoCSR(ceed, op.Height(), op.Width(), nnz, rows, cols, vals, mem, set); + } + } + + // Add CSR matrix objects from each thread (HYPRE's hypre_CSRMatrixAdd uses threads + // internally as available). We have to scale the duplicated nonzeros when set = true. 
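+  // For example, if two threads both assembled entry (i, j) with value v and set = true,
+  // the summed matrix below holds 2 * v while the corresponding all-ones pattern
+  // matrices sum to B_ij = 2, so dividing by B_ij restores the intended value v. With
+  // set = false the plain sum is already the desired result and no scaling is applied.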
+ auto mat = std::move(loc_mat[0]); + std::unique_ptr b_mat; + if (set && op.Size() > 1) + { + b_mat = std::make_unique(hypre_CSRMatrixClone(*mat, 0)); + hypre_CSRMatrixSetConstantValues(*b_mat, 1.0); + for (std::size_t id = 1; id < op.Size(); id++) + { + hypre_CSRMatrix *b_loc_mat = hypre_CSRMatrixClone(*loc_mat[id], 0); + hypre_CSRMatrixSetConstantValues(b_loc_mat, 1.0); + b_mat = std::make_unique( + hypre_CSRMatrixAdd(1.0, *b_mat, 1.0, b_loc_mat)); + hypre_CSRMatrixDestroy(b_loc_mat); + } + } + for (std::size_t id = 1; id < op.Size(); id++) + { + mat = std::make_unique( + hypre_CSRMatrixAdd(1.0, *mat, 1.0, *loc_mat[id])); + } + if (set && op.Size() > 1) + { + const auto *d_b_data = b_mat->GetData(); + auto *d_data = mat->GetData(); + mfem::forall(mat->NNZ(), + [=] MFEM_HOST_DEVICE(int i) { d_data[i] *= 1.0 / d_b_data[i]; }); + } + + return mat; +} + +std::unique_ptr CeedOperatorCoarsen(const Operator &op_fine, + const FiniteElementSpace &fespace_coarse) +{ + auto SingleOperatorCoarsen = + [&fespace_coarse](Ceed ceed, CeedOperator op_fine, CeedOperator *op_coarse) + { + CeedBasis basis_fine; + CeedElemTopology geom; + PalaceCeedCall(ceed, CeedOperatorGetActiveBasis(op_fine, &basis_fine)); + PalaceCeedCall(ceed, CeedBasisGetTopology(basis_fine, &geom)); + + const auto &geom_data = + fespace_coarse.GetMesh().GetCeedGeomFactorData(ceed).at(GetMfemTopology(geom)); + CeedElemRestriction restr_coarse = fespace_coarse.GetCeedElemRestriction( + ceed, GetMfemTopology(geom), geom_data.indices); + CeedBasis basis_coarse = fespace_coarse.GetCeedBasis(ceed, GetMfemTopology(geom)); + + PalaceCeedCall(ceed, CeedOperatorMultigridLevelCreate(op_fine, nullptr, restr_coarse, + basis_coarse, op_coarse, nullptr, + nullptr)); + PalaceCeedCall(ceed, CeedOperatorAssemblyDataStrip(*op_coarse)); + }; + + // Initialize the coarse operator. + auto op_coarse = std::make_unique(fespace_coarse.GetVSize(), + fespace_coarse.GetVSize()); + + // Assemble the coarse operator by coarsening each sub-operator (over threads, geometry + // types, integrators) of the original fine operator. + PalacePragmaOmp(parallel if (op_fine.Size() > 1)) + { + const int id = utils::GetThreadNum(); + MFEM_ASSERT(static_cast(id) < op_fine.Size(), + "Out of bounds access for thread number " << id << "!"); + Ceed ceed; + PalaceCeedCallBackend(CeedOperatorGetCeed(op_fine[id], &ceed)); + { + Ceed ceed_parent; + PalaceCeedCall(ceed, CeedGetParent(ceed, &ceed_parent)); + if (ceed_parent) + { + ceed = ceed_parent; + } + } + CeedInt nsub_ops_fine; + CeedOperator *sub_ops_fine; + PalaceCeedCall(ceed, CeedOperatorCompositeGetNumSub(op_fine[id], &nsub_ops_fine)); + PalaceCeedCall(ceed, CeedOperatorCompositeGetSubList(op_fine[id], &sub_ops_fine)); + for (CeedInt k = 0; k < nsub_ops_fine; k++) + { + CeedOperator sub_op_coarse; + SingleOperatorCoarsen(ceed, sub_ops_fine[k], &sub_op_coarse); + op_coarse->AddSubOperator(sub_op_coarse); // Sub-operator owned by ceed::Operator + } + } + + // Finalize the operator (call CeedOperatorCheckReady). + op_coarse->Finalize(); + + return op_coarse; +} + +} // namespace palace::ceed diff --git a/palace/fem/libceed/operator.hpp b/palace/fem/libceed/operator.hpp index d352eaea00..25b93410b6 100644 --- a/palace/fem/libceed/operator.hpp +++ b/palace/fem/libceed/operator.hpp @@ -1,77 +1,95 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LIBCEED_OPERATOR_HPP -#define PALACE_LIBCEED_OPERATOR_HPP - -#include -#include -#include - -// Forward declarations of libCEED objects. -typedef struct CeedOperator_private *CeedOperator; -typedef struct CeedVector_private *CeedVector; - -namespace palace -{ - -using Operator = mfem::Operator; -using Vector = mfem::Vector; - -namespace ceed -{ - -// Wrapper class for libCEED's CeedOperator. -class Operator : public palace::Operator -{ -protected: - std::vector ops, ops_t; - std::vector u, v; - Vector dof_multiplicity; - mutable Vector temp_u, temp_v; - -public: - Operator(int h, int w) : palace::Operator(h, w) {} - ~Operator() override; - - CeedOperator operator[](std::size_t i) const { return ops[i]; } - auto Size() const { return ops.size(); } - - void AddOper(CeedOperator op, CeedOperator op_t = nullptr); - - void SetDofMultiplicity(Vector &&mult) { dof_multiplicity = mult; } - - void AssembleDiagonal(Vector &diag) const override; - - void Mult(const Vector &x, Vector &y) const override; - - void AddMult(const Vector &x, Vector &y, const double a = 1.0) const override; - - void MultTranspose(const Vector &x, Vector &y) const override; - - void AddMultTranspose(const Vector &x, Vector &y, const double a = 1.0) const override; -}; - -// A symmetric ceed::Operator replaces *MultTranspose with *Mult (by default, libCEED -// operators do not have a transpose operation). -class SymmetricOperator : public Operator -{ -public: - using Operator::Operator; - - void MultTranspose(const Vector &x, Vector &y) const override { Mult(x, y); } - void AddMultTranspose(const Vector &x, Vector &y, double a = 1.0) const override - { - AddMult(x, y, a); - } -}; - -// Assemble a ceed::Operator as an mfem::SparseMatrix. -std::unique_ptr CeedOperatorFullAssemble(const Operator &op, - bool skip_zeros, bool set); - -} // namespace ceed - -} // namespace palace - -#endif // PALACE_LIBCEED_OPERATOR_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_OPERATOR_HPP +#define PALACE_LIBCEED_OPERATOR_HPP + +#include +#include +#include "fem/libceed/ceed.hpp" +#include "linalg/operator.hpp" +#include "linalg/vector.hpp" + +namespace palace +{ + +class FiniteElementSpace; + +namespace hypre +{ + +class HypreCSRMatrix; + +} // namespace hypre + +namespace ceed +{ + +// +// Wrapper class for libCEED's CeedOperator, supporting composite operator construction and +// application with multiple threads. 
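+// Illustrative usage (a sketch, assuming the per-thread CeedOperator objects have
+// already been constructed elsewhere; the names below are hypothetical):
+//
+//   ceed::Operator A(height, width);
+//   for (std::size_t i = 0; i < thread_ops.size(); i++)
+//   {
+//     A.AddSubOperator(thread_ops[i]);  // Sub-operator becomes owned by A
+//   }
+//   A.Finalize();                       // Checks each sub-operator is ready for use
+//   A.Mult(x, y);                       // y = A x, one thread per sub-operator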
+// +class Operator : public palace::Operator +{ +protected: + std::vector op, op_t; + std::vector u, v; + Vector dof_multiplicity; + mutable Vector temp; + +public: + Operator(int h, int w); + ~Operator() override; + + CeedOperator operator[](std::size_t i) const { return op[i]; } + + auto Size() const { return op.size(); } + + void AddSubOperator(CeedOperator sub_op, CeedOperator sub_op_t = nullptr); + + void Finalize(); + + void DestroyAssemblyData() const; + + void SetDofMultiplicity(Vector &&mult) { dof_multiplicity = std::move(mult); } + + void AssembleDiagonal(Vector &diag) const override; + + void Mult(const Vector &x, Vector &y) const override; + + void AddMult(const Vector &x, Vector &y, const double a = 1.0) const override; + + void MultTranspose(const Vector &x, Vector &y) const override; + + void AddMultTranspose(const Vector &x, Vector &y, const double a = 1.0) const override; +}; + +// A symmetric ceed::Operator replaces *MultTranspose with *Mult (by default, libCEED +// operators do not have a transpose operation). +class SymmetricOperator : public Operator +{ +public: + using Operator::Operator; + + void MultTranspose(const Vector &x, Vector &y) const override { Mult(x, y); } + void AddMultTranspose(const Vector &x, Vector &y, double a = 1.0) const override + { + AddMult(x, y, a); + } +}; + +// Assemble a ceed::Operator as a CSR matrix. +std::unique_ptr CeedOperatorFullAssemble(const Operator &op, + bool skip_zeros, bool set); + +// Construct a coarse-level ceed::Operator, reusing the quadrature data and quadrature +// function from the fine-level operator. Only available for square, symmetric operators +// (same input and output spaces). +std::unique_ptr CeedOperatorCoarsen(const Operator &op_fine, + const FiniteElementSpace &fespace_coarse); + +} // namespace ceed + +} // namespace palace + +#endif // PALACE_LIBCEED_OPERATOR_HPP diff --git a/palace/fem/libceed/restriction.cpp b/palace/fem/libceed/restriction.cpp index 208ef4c678..704cd0e030 100644 --- a/palace/fem/libceed/restriction.cpp +++ b/palace/fem/libceed/restriction.cpp @@ -1,298 +1,428 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "restriction.hpp" - -#include "fem/fespace.hpp" -#include "fem/libceed/hash.hpp" -#include "fem/libceed/utils.hpp" -#include "utils/omp.hpp" - -namespace palace::ceed -{ - -namespace internal -{ - -static std::unordered_map restr_map; - -void ClearRestrictionCache() -{ - for (auto [k, v] : restr_map) - { - Ceed ceed; - PalaceCeedCallBackend(CeedElemRestrictionGetCeed(v, &ceed)); - PalaceCeedCall(ceed, CeedElemRestrictionDestroy(&v)); - } - restr_map.clear(); -} - -} // namespace internal - -namespace -{ - -void InitLexicoRestr(const mfem::ParFiniteElementSpace &fespace, - const std::vector &indices, bool use_bdr, Ceed ceed, - CeedElemRestriction *restr) -{ - const std::size_t ne = indices.size(); - const mfem::FiniteElement &fe = - use_bdr ? *fespace.GetBE(indices[0]) : *fespace.GetFE(indices[0]); - const int P = fe.GetDof(); - const mfem::TensorBasisElement *tfe = dynamic_cast(&fe); - const mfem::Array &dof_map = tfe->GetDofMap(); - CeedInt compstride = - (fespace.GetOrdering() == mfem::Ordering::byVDIM) ? 1 : fespace.GetNDofs(); - const int stride = (compstride == 1) ? 
fespace.GetVDim() : 1; - mfem::Array tp_el_dof(ne * P), dofs; - mfem::Array tp_el_orients(ne * P); - bool use_el_orients = false; - mfem::DofTransformation dof_trans; - - for (std::size_t i = 0; i < ne; i++) - { - // No need to handle DofTransformation for tensor-product elements. - const int elem_index = indices[i]; - if (use_bdr) - { - fespace.GetBdrElementDofs(elem_index, dofs, dof_trans); - } - else - { - fespace.GetElementDofs(elem_index, dofs, dof_trans); - } - MFEM_VERIFY(!dof_trans.GetDofTransformation(), - "Unexpected DofTransformation for lexicographic element " - "restriction."); - for (int j = 0; j < P; j++) - { - const int sdid = dof_map[j]; // signed - const int did = (sdid >= 0) ? sdid : -1 - sdid; - const int sgid = dofs[did]; // signed - const int gid = (sgid >= 0) ? sgid : -1 - sgid; - tp_el_dof[j + P * i] = stride * gid; - tp_el_orients[j + P * i] = (sgid >= 0 && sdid < 0) || (sgid < 0 && sdid >= 0); - use_el_orients = use_el_orients || tp_el_orients[j + P * i]; - } - } - - if (use_el_orients) - { - PalaceCeedCall(ceed, CeedElemRestrictionCreateOriented( - ceed, ne, P, fespace.GetVDim(), compstride, - fespace.GetVDim() * fespace.GetNDofs(), CEED_MEM_HOST, - CEED_COPY_VALUES, tp_el_dof.GetData(), tp_el_orients.GetData(), - restr)); - } - else - { - PalaceCeedCall(ceed, CeedElemRestrictionCreate( - ceed, ne, P, fespace.GetVDim(), compstride, - fespace.GetVDim() * fespace.GetNDofs(), CEED_MEM_HOST, - CEED_COPY_VALUES, tp_el_dof.GetData(), restr)); - } -} - -void InitNativeRestr(const mfem::ParFiniteElementSpace &fespace, - const std::vector &indices, bool use_bdr, bool has_dof_trans, - bool is_interp_range, Ceed ceed, CeedElemRestriction *restr) -{ - const std::size_t ne = indices.size(); - const mfem::FiniteElement &fe = - use_bdr ? *fespace.GetBE(indices[0]) : *fespace.GetFE(indices[0]); - const int P = fe.GetDof(); - CeedInt compstride = - (fespace.GetOrdering() == mfem::Ordering::byVDIM) ? 1 : fespace.GetNDofs(); - const int stride = (compstride == 1) ? fespace.GetVDim() : 1; - mfem::Array tp_el_dof(ne * P), dofs; - mfem::Array tp_el_orients; - mfem::Array tp_el_curl_orients; - bool use_el_orients = false; - mfem::DofTransformation dof_trans; - mfem::Vector el_trans_j; - if (!has_dof_trans) - { - tp_el_orients.SetSize(ne * P); - } - else - { - tp_el_curl_orients.SetSize(ne * P * 3, 0); - el_trans_j.SetSize(P); - } - - for (std::size_t i = 0; i < ne; i++) - { - const auto e = indices[i]; - if (use_bdr) - { - fespace.GetBdrElementDofs(e, dofs, dof_trans); - } - else - { - fespace.GetElementDofs(e, dofs, dof_trans); - } - if (!has_dof_trans) - { - for (int j = 0; j < P; j++) - { - const int sgid = dofs[j]; // signed - const int gid = (sgid >= 0) ? sgid : -1 - sgid; - tp_el_dof[j + P * i] = stride * gid; - tp_el_orients[j + P * i] = (sgid < 0); - use_el_orients = use_el_orients || tp_el_orients[j + P * i]; - } - } - else - { - for (int j = 0; j < P; j++) - { - const int sgid = dofs[j]; // signed - const int gid = (sgid >= 0) ? sgid : -1 - sgid; - tp_el_dof[j + P * i] = stride * gid; - - // Fill column j of element tridiagonal matrix tp_el_curl_orients. - el_trans_j = 0.0; - el_trans_j(j) = 1.0; - if (is_interp_range) - { - dof_trans.InvTransformDual(el_trans_j); - } - else - { - dof_trans.InvTransformPrimal(el_trans_j); - } - double sign_j = (sgid < 0) ? 
-1.0 : 1.0; - tp_el_curl_orients[3 * (j + 0 + P * i) + 1] = - static_cast(sign_j * el_trans_j(j + 0)); - if (j > 0) - { - tp_el_curl_orients[3 * (j - 1 + P * i) + 2] = - static_cast(sign_j * el_trans_j(j - 1)); - } - if (j < P - 1) - { - tp_el_curl_orients[3 * (j + 1 + P * i) + 0] = - static_cast(sign_j * el_trans_j(j + 1)); - } -#ifdef MFEM_DEBUG - int nnz = 0; - for (int k = 0; k < P; k++) - { - if (k < j - 1 && k > j + 1 && el_trans_j(k) != 0.0) - { - nnz++; - } - } - MFEM_ASSERT(nnz == 0, "Element transformation matrix is not tridiagonal at column " - << j << " (nnz = " << nnz << ")!"); -#endif - } - } - } - - if (has_dof_trans) - { - PalaceCeedCall(ceed, CeedElemRestrictionCreateCurlOriented( - ceed, ne, P, fespace.GetVDim(), compstride, - fespace.GetVDim() * fespace.GetNDofs(), CEED_MEM_HOST, - CEED_COPY_VALUES, tp_el_dof.GetData(), - tp_el_curl_orients.GetData(), restr)); - } - else if (use_el_orients) - { - PalaceCeedCall(ceed, CeedElemRestrictionCreateOriented( - ceed, ne, P, fespace.GetVDim(), compstride, - fespace.GetVDim() * fespace.GetNDofs(), CEED_MEM_HOST, - CEED_COPY_VALUES, tp_el_dof.GetData(), tp_el_orients.GetData(), - restr)); - } - else - { - PalaceCeedCall(ceed, CeedElemRestrictionCreate( - ceed, ne, P, fespace.GetVDim(), compstride, - fespace.GetVDim() * fespace.GetNDofs(), CEED_MEM_HOST, - CEED_COPY_VALUES, tp_el_dof.GetData(), restr)); - } -} - -} // namespace - -void InitRestriction(const mfem::ParFiniteElementSpace &fespace, - const std::vector &indices, bool use_bdr, bool is_interp, - bool is_range, Ceed ceed, CeedElemRestriction *restr) -{ - // Check for fespace -> restriction in hash table. - // The restriction for an interpolator range space is slightly different as - // the output is a primal vector instead of a dual vector, and lexicographic - // ordering is never used (no use of tensor-product basis). - // A palace::FiniteElementSpace can be checked for uniqueness so we can use this to reuse - // restrictions across different libCEED operators. For mixed meshes or multiple threads, - // the space elements are partitioned in a non-overlapping manner so we just need the - // index of the first element, and if it is a domain or boundary element, to determine the - // partition. - const FiniteElementSpace *restr_fespace = - dynamic_cast(&fespace); - MFEM_VERIFY(restr_fespace, "ceed::InitRestriction requires a palace::FiniteElementSpace " - "object for space comparisons!"); - const mfem::FiniteElement &fe = - use_bdr ? *fespace.GetBE(indices[0]) : *fespace.GetFE(indices[0]); - const mfem::TensorBasisElement *tfe = dynamic_cast(&fe); - const bool vector = fe.GetRangeType() == mfem::FiniteElement::VECTOR; - mfem::Array dofs; - mfem::DofTransformation dof_trans; - if (use_bdr) - { - fespace.GetBdrElementDofs(indices[0], dofs, dof_trans); - } - else - { - fespace.GetElementDofs(indices[0], dofs, dof_trans); - } - const bool has_dof_trans = dof_trans.GetDofTransformation() && !dof_trans.IsIdentity(); - const bool unique_interp_restr = - (is_interp && tfe && tfe->GetDofMap().Size() > 0 && !vector); - const bool unique_interp_range_restr = (is_interp && is_range && has_dof_trans); - internal::RestrKey key(ceed, *restr_fespace, indices[0], use_bdr, unique_interp_restr, - unique_interp_range_restr); - - // Initialize or retrieve key values (avoid simultaneous search and write). 
- auto restr_itr = internal::restr_map.end(); - PalacePragmaOmp(critical(InitRestriction)) - { - restr_itr = internal::restr_map.find(key); - } - if (restr_itr == internal::restr_map.end()) - { - const bool lexico = (tfe && tfe->GetDofMap().Size() > 0 && !vector && !is_interp); - if (lexico) - { - // Lexicographic ordering using dof_map. - InitLexicoRestr(fespace, indices, use_bdr, ceed, restr); - } - else - { - // Native ordering. - InitNativeRestr(fespace, indices, use_bdr, has_dof_trans, is_interp && is_range, ceed, - restr); - } - PalacePragmaOmp(critical(InitRestriction)) - { - internal::restr_map[key] = *restr; - } - // std::cout << "New element restriction (" << ceed << ", " << &fespace - // << ", " << indices[0] << ", " << use_bdr - // << ", " << unique_interp_restr - // << ", " << unique_interp_range_restr << ")\n"; - } - else - { - *restr = restr_itr->second; - // std::cout << "Reusing element restriction (" << ceed << ", " << &fespace - // << ", " << indices[0] << ", " << use_bdr << ", " - // << ", " << unique_interp_restr - // << ", " << unique_interp_range_restr << ")\n"; - } -} - -} // namespace palace::ceed +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "restriction.hpp" + +#include +#include "utils/omp.hpp" + +namespace palace::ceed +{ + +namespace +{ + +const mfem::FiniteElement *GetTraceElement(const mfem::FiniteElementSpace &fespace, + const std::vector &indices) +{ + int elem_id, face_info; + fespace.GetMesh()->GetBdrElementAdjacentElement(indices[0], elem_id, face_info); + mfem::Geometry::Type face_geom = fespace.GetMesh()->GetBdrElementGeometry(indices[0]); + return fespace.GetTraceElement(elem_id, face_geom); +}; + +mfem::Array GetFaceDofsFromAdjacentElement(const mfem::FiniteElementSpace &fespace, + mfem::DofTransformation &dof_trans, + const int P, const int e) +{ + // Get coordinates of face dofs. + int elem_id, face_info; + fespace.GetMesh()->GetBdrElementAdjacentElement(e, elem_id, face_info); + mfem::Geometry::Type face_geom = fespace.GetMesh()->GetBdrElementGeometry(e); + face_info = fespace.GetMesh()->EncodeFaceInfo( + fespace.GetMesh()->DecodeFaceInfoLocalIndex(face_info), + mfem::Geometry::GetInverseOrientation( + face_geom, fespace.GetMesh()->DecodeFaceInfoOrientation(face_info))); + mfem::IntegrationPointTransformation Loc1; + fespace.GetMesh()->GetLocalFaceTransformation(fespace.GetMesh()->GetBdrElementType(e), + fespace.GetMesh()->GetElementType(elem_id), + Loc1.Transf, face_info); + const mfem::FiniteElement *face_el = fespace.GetTraceElement(elem_id, face_geom); + MFEM_VERIFY(dynamic_cast(face_el), + "Mesh requires nodal Finite Element."); + mfem::IntegrationRule face_ir(face_el->GetDof()); + Loc1.Transf.ElementNo = elem_id; + Loc1.Transf.mesh = fespace.GetMesh(); + Loc1.Transf.ElementType = mfem::ElementTransformation::ELEMENT; + Loc1.Transform(face_el->GetNodes(), face_ir); + mfem::DenseMatrix face_pm; + fespace.GetMesh()->GetNodes()->GetVectorValues(Loc1.Transf, face_ir, face_pm); + + // Get coordinates of element dofs. + mfem::DenseMatrix elem_pm; + const mfem::FiniteElement *fe_elem = fespace.GetFE(elem_id); + mfem::IsoparametricTransformation T; + fespace.GetMesh()->GetElementTransformation(elem_id, &T); + T.Transform(fe_elem->GetNodes(), elem_pm); + + // Find the dofs. 
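+  // Each boundary (face) dof l is matched to the element dof m whose nodal coordinates
+  // agree with it: the match is accepted when the l1-norm coordinate difference
+  // sum_o |elem_pm(o, m) - face_pm(o, l)| falls below tol scaled by the larger of the
+  // two squared coordinate norms (floored at 1e-6 to handle dofs near the origin).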
+ double tol = 1E-5; + mfem::Array elem_dofs, dofs(P); + fespace.GetElementDofs(elem_id, elem_dofs, dof_trans); + for (int l = 0; l < P; l++) + { + double norm2_f = 0.0; + for (int m = 0; m < face_pm.Height(); m++) + { + norm2_f += face_pm(m, l) * face_pm(m, l); + } + + bool found_match = false; + MFEM_CONTRACT_VAR(found_match); // silence unused warning + for (int m = 0; m < elem_pm.Width(); m++) + { + double norm2_e = 0.0; + for (int n = 0; n < elem_pm.Height(); n++) + { + norm2_e += elem_pm(n, m) * elem_pm(n, m); + } + double relative_tol = tol * std::max(std::max(norm2_f, norm2_e), 1.0E-6); + double diff = 0.0; + for (int o = 0; o < elem_pm.Height(); o++) + { + diff += std::fabs(elem_pm(o, m) - face_pm(o, l)); + } + if (diff <= relative_tol) + { + dofs[l] = elem_dofs[m]; + found_match = true; + break; + } + } + + MFEM_ASSERT(found_match, + [&]() + { + std::stringstream msg; + msg << "l " << l << '\n'; + msg << "elem_dofs\n"; + for (auto x : elem_dofs) + msg << x << ' '; + + msg << "\ndofs\n"; + for (auto x : dofs) + msg << x << ' '; + msg << '\n'; + return msg.str(); + }()); + } + + return dofs; +}; + +void InitLexicoRestr(const mfem::FiniteElementSpace &fespace, + const std::vector &indices, bool use_bdr, Ceed ceed, + CeedElemRestriction *restr) +{ + const std::size_t num_elem = indices.size(); + const mfem::FiniteElement *fe; + bool face_flg = false; + if (!use_bdr) + { + fe = fespace.GetFE(indices[0]); + } + else + { + fe = fespace.GetBE(indices[0]); + if (!fe) + { + fe = GetTraceElement(fespace, indices); + face_flg = true; + } + } + const int P = fe->GetDof(); + const mfem::TensorBasisElement *tfe = dynamic_cast(fe); + const mfem::Array &dof_map = tfe->GetDofMap(); + const bool dof_map_is_identity = dof_map.Size() == 0; + const CeedInt comp_stride = + (fespace.GetVDim() == 1 || fespace.GetOrdering() == mfem::Ordering::byVDIM) + ? 1 + : fespace.GetNDofs(); + const int stride = + (fespace.GetOrdering() == mfem::Ordering::byVDIM) ? fespace.GetVDim() : 1; + mfem::Array tp_el_dof(num_elem * P); + mfem::Array tp_el_orients(num_elem * P); + int use_el_orients = 0; + + PalacePragmaOmp(parallel reduction(+ : use_el_orients)) + { + mfem::Array dofs; + mfem::DofTransformation dof_trans; + bool use_el_orients_loc = false; + + PalacePragmaOmp(for schedule(static)) + for (std::size_t i = 0; i < num_elem; i++) + { + // No need to handle DofTransformation for tensor-product elements. + const int e = indices[i]; + if (use_bdr) + { + if (!face_flg) + { + fespace.GetBdrElementDofs(e, dofs, dof_trans); + } + else + { + dofs = GetFaceDofsFromAdjacentElement(fespace, dof_trans, P, e); + } + } + else + { + fespace.GetElementDofs(e, dofs, dof_trans); + } + MFEM_VERIFY(!dof_trans.GetDofTransformation(), + "Unexpected DofTransformation for lexicographic element " + "restriction."); + for (int j = 0; j < P; j++) + { + const int sdid = dof_map_is_identity ? j : dof_map[j]; // signed + const int did = (sdid >= 0) ? sdid : -1 - sdid; + const int sgid = dofs[did]; // signed + const int gid = (sgid >= 0) ? 
sgid : -1 - sgid; + tp_el_dof[j + P * i] = stride * gid; + tp_el_orients[j + P * i] = (sgid >= 0 && sdid < 0) || (sgid < 0 && sdid >= 0); + use_el_orients_loc = use_el_orients_loc || tp_el_orients[j + P * i]; + } + } + use_el_orients += use_el_orients_loc; + } + + if (use_el_orients) + { + PalaceCeedCall(ceed, CeedElemRestrictionCreateOriented( + ceed, num_elem, P, fespace.GetVDim(), comp_stride, + fespace.GetVDim() * fespace.GetNDofs(), CEED_MEM_HOST, + CEED_COPY_VALUES, tp_el_dof.GetData(), tp_el_orients.GetData(), + restr)); + } + else + { + PalaceCeedCall(ceed, CeedElemRestrictionCreate( + ceed, num_elem, P, fespace.GetVDim(), comp_stride, + fespace.GetVDim() * fespace.GetNDofs(), CEED_MEM_HOST, + CEED_COPY_VALUES, tp_el_dof.GetData(), restr)); + } +} + +void InitNativeRestr(const mfem::FiniteElementSpace &fespace, + const std::vector &indices, bool use_bdr, bool is_interp_range, + Ceed ceed, CeedElemRestriction *restr) +{ + const std::size_t num_elem = indices.size(); + const mfem::FiniteElement *fe; + bool face_flg = false; + if (!use_bdr) + { + fe = fespace.GetFE(indices[0]); + } + else + { + fe = fespace.GetBE(indices[0]); + if (!fe) + { + fe = GetTraceElement(fespace, indices); + face_flg = true; + } + } + const int P = fe->GetDof(); + const CeedInt comp_stride = + (fespace.GetVDim() == 1 || fespace.GetOrdering() == mfem::Ordering::byVDIM) + ? 1 + : fespace.GetNDofs(); + const int stride = + (fespace.GetOrdering() == mfem::Ordering::byVDIM) ? fespace.GetVDim() : 1; + const bool has_dof_trans = [&]() + { + if (fespace.GetMesh()->Dimension() < 3) + { + return false; + } + const auto geom = fe->GetGeomType(); + const auto *dof_trans = fespace.FEColl()->DofTransformationForGeometry(geom); + return (dof_trans && !dof_trans->IsIdentity()); + }(); + mfem::Array tp_el_dof(num_elem * P); + mfem::Array tp_el_orients; + mfem::Array tp_el_curl_orients; + if (!has_dof_trans) + { + tp_el_orients.SetSize(num_elem * P); + } + else + { + tp_el_curl_orients.SetSize(num_elem * P * 3, 0); + } + int use_el_orients = 0; + + PalacePragmaOmp(parallel reduction(+ : use_el_orients)) + { + mfem::Array dofs; + mfem::DofTransformation dof_trans; + mfem::Vector el_trans_j; + if (has_dof_trans) + { + el_trans_j.SetSize(P); + el_trans_j = 0.0; + } + bool use_el_orients_loc = false; + + PalacePragmaOmp(for schedule(static)) + for (std::size_t i = 0; i < num_elem; i++) + { + const auto e = indices[i]; + if (use_bdr) + { + if (!face_flg) + { + fespace.GetBdrElementDofs(e, dofs, dof_trans); + } + else + { + dofs = GetFaceDofsFromAdjacentElement(fespace, dof_trans, P, e); + } + } + else + { + fespace.GetElementDofs(e, dofs, dof_trans); + } + if (!has_dof_trans) + { + for (int j = 0; j < P; j++) + { + const int sgid = dofs[j]; // signed + const int gid = (sgid >= 0) ? sgid : -1 - sgid; + tp_el_dof[j + P * i] = stride * gid; + tp_el_orients[j + P * i] = (sgid < 0); + use_el_orients_loc = use_el_orients_loc || tp_el_orients[j + P * i]; + } + } + else + { + for (int j = 0; j < P; j++) + { + const int sgid = dofs[j]; // signed + const int gid = (sgid >= 0) ? sgid : -1 - sgid; + tp_el_dof[j + P * i] = stride * gid; + + // Fill column j of element tridiagonal matrix tp_el_curl_orients. + el_trans_j(j) = 1.0; + if (is_interp_range) + { + dof_trans.InvTransformDual(el_trans_j); + } + else + { + dof_trans.InvTransformPrimal(el_trans_j); + } + double sign_j = (sgid < 0) ? 
-1.0 : 1.0; + tp_el_curl_orients[3 * (j + 0 + P * i) + 1] = + static_cast(sign_j * el_trans_j(j)); + if (j > 0) + { + tp_el_curl_orients[3 * (j - 1 + P * i) + 2] = + static_cast(sign_j * el_trans_j(j - 1)); + } + if (j < P - 1) + { + tp_el_curl_orients[3 * (j + 1 + P * i) + 0] = + static_cast(sign_j * el_trans_j(j + 1)); + } + +#if defined(MFEM_DEBUG) + // Debug check that transformation is actually tridiagonal. + int nnz = 0; + for (int k = 0; k < P; k++) + { + if ((k < j - 1 || k > j + 1) && el_trans_j(k) != 0.0) + { + nnz++; + } + } + MFEM_ASSERT(nnz == 0, + "Element transformation matrix is not tridiagonal at column " + << j << " (nnz = " << nnz << ")!"); +#endif + + // Zero out column vector for next iteration. + el_trans_j(j) = 0.0; + if (j > 0) + { + el_trans_j(j - 1) = 0.0; + } + if (j < P - 1) + { + el_trans_j(j + 1) = 0.0; + } + } + } + } + use_el_orients += use_el_orients_loc; + } + + if (has_dof_trans) + { + PalaceCeedCall(ceed, CeedElemRestrictionCreateCurlOriented( + ceed, num_elem, P, fespace.GetVDim(), comp_stride, + fespace.GetVDim() * fespace.GetNDofs(), CEED_MEM_HOST, + CEED_COPY_VALUES, tp_el_dof.GetData(), + tp_el_curl_orients.GetData(), restr)); + } + else if (use_el_orients) + { + PalaceCeedCall(ceed, CeedElemRestrictionCreateOriented( + ceed, num_elem, P, fespace.GetVDim(), comp_stride, + fespace.GetVDim() * fespace.GetNDofs(), CEED_MEM_HOST, + CEED_COPY_VALUES, tp_el_dof.GetData(), tp_el_orients.GetData(), + restr)); + } + else + { + PalaceCeedCall(ceed, CeedElemRestrictionCreate( + ceed, num_elem, P, fespace.GetVDim(), comp_stride, + fespace.GetVDim() * fespace.GetNDofs(), CEED_MEM_HOST, + CEED_COPY_VALUES, tp_el_dof.GetData(), restr)); + } +} + +} // namespace + +void InitRestriction(const mfem::FiniteElementSpace &fespace, + const std::vector &indices, bool use_bdr, bool is_interp, + bool is_interp_range, Ceed ceed, CeedElemRestriction *restr) +{ + MFEM_ASSERT(!indices.empty(), "Empty element index set for libCEED element restriction!"); + if constexpr (false) + { + std::cout << "New element restriction (" << ceed << ", " << &fespace << ", " + << indices[0] << ", " << use_bdr << ", " << is_interp << ", " + << is_interp_range << ")\n"; + } + const mfem::FiniteElement *fe; + if (!use_bdr) + { + fe = fespace.GetFE(indices[0]); + } + else + { + fe = fespace.GetBE(indices[0]); + if (!fe) + { + fe = GetTraceElement(fespace, indices); + } + } + const mfem::TensorBasisElement *tfe = dynamic_cast(fe); + const bool vector = fe->GetRangeType() == mfem::FiniteElement::VECTOR; + const bool lexico = (tfe && !vector && !is_interp); + if (lexico) + { + // Lexicographic ordering using dof_map. + InitLexicoRestr(fespace, indices, use_bdr, ceed, restr); + } + else + { + // Native ordering. + InitNativeRestr(fespace, indices, use_bdr, is_interp_range, ceed, restr); + } +} + +} // namespace palace::ceed diff --git a/palace/fem/libceed/restriction.hpp b/palace/fem/libceed/restriction.hpp index 22218eef4f..8abc3291a1 100644 --- a/palace/fem/libceed/restriction.hpp +++ b/palace/fem/libceed/restriction.hpp @@ -1,36 +1,26 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LIBCEED_RESTRICTION_HPP -#define PALACE_LIBCEED_RESTRICTION_HPP - -#include -#include -#include -#include - -namespace palace::ceed -{ - -void InitRestriction(const mfem::ParFiniteElementSpace &fespace, - const std::vector &indices, bool use_bdr, bool is_interp, - bool is_range, Ceed ceed, CeedElemRestriction *restr); - -inline void InitRestriction(const mfem::ParFiniteElementSpace &fespace, - const std::vector &indices, bool use_bdr, Ceed ceed, - CeedElemRestriction *restr) -{ - InitRestriction(fespace, indices, use_bdr, false, false, ceed, restr); -} - -namespace internal -{ - -// Destroy the cached CeedElemRestriction objects. -void ClearRestrictionCache(); - -} // namespace internal - -} // namespace palace::ceed - -#endif // PALACE_LIBCEED_RESTRICTION_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_RESTRICTION_HPP +#define PALACE_LIBCEED_RESTRICTION_HPP + +#include +#include "fem/libceed/ceed.hpp" + +namespace mfem +{ + +class FiniteElementSpace; + +} // namespace mfem + +namespace palace::ceed +{ + +void InitRestriction(const mfem::FiniteElementSpace &fespace, + const std::vector &indices, bool use_bdr, bool is_interp, + bool is_interp_range, Ceed ceed, CeedElemRestriction *restr); + +} // namespace palace::ceed + +#endif // PALACE_LIBCEED_RESTRICTION_HPP diff --git a/palace/fem/libceed/utils.cpp b/palace/fem/libceed/utils.cpp index 9ada7e1aba..acee0823a9 100644 --- a/palace/fem/libceed/utils.cpp +++ b/palace/fem/libceed/utils.cpp @@ -1,108 +1,108 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "utils.hpp" - -#include "fem/libceed/basis.hpp" -#include "fem/libceed/restriction.hpp" -#include "utils/omp.hpp" - -#if defined(MFEM_USE_OPENMP) -#include -#endif - -namespace palace::ceed -{ - -namespace internal -{ - -static std::vector ceeds; - -const std::vector &GetCeedObjects() -{ - return ceeds; -} - -} // namespace internal - -void Initialize(const char *resource, const char *jit_source_dir) -{ - PalacePragmaOmp(parallel) - { - PalacePragmaOmp(master) - { -#if defined(MFEM_USE_OPENMP) - const int nt = omp_get_num_threads(); -#else - const int nt = 1; -#endif - internal::ceeds.resize(nt, nullptr); - } - } - - // Master thread initializes all Ceed objects (ineherently sequential anyway due to shared - // resources). - for (std::size_t i = 0; i < internal::ceeds.size(); i++) - { - int ierr = CeedInit(resource, &internal::ceeds[i]); - MFEM_VERIFY(!ierr, "Failed to initialize libCEED with resource " << resource << "!"); - Ceed ceed = internal::ceeds[i]; - - // Configure error handling (allow errors to be handled by PalaceCeedCallBackend or - // PalaceCeedCall). - PalaceCeedCall(ceed, CeedSetErrorHandler(ceed, CeedErrorStore)); - - // Configure QFunction search path. - if (jit_source_dir) - { - PalaceCeedCall(ceed, CeedAddJitSourceRoot(ceed, jit_source_dir)); - } - } -} - -void Finalize() -{ - // Destroy global basis and element restriction caches. - internal::ClearBasisCache(); - internal::ClearRestrictionCache(); - - // Destroy Ceed context(s). 
- for (std::size_t i = 0; i < internal::ceeds.size(); i++) - { - int ierr = CeedDestroy(&internal::ceeds[i]); - MFEM_VERIFY(!ierr, "Failed to finalize libCEED!"); - } - internal::ceeds.clear(); -} - -std::string Print() -{ - MFEM_VERIFY(internal::GetCeedObjects().size() > 0, - "libCEED must be initialized before querying the active backend!"); - Ceed ceed = internal::GetCeedObjects()[0]; - const char *ceed_resource; - PalaceCeedCall(ceed, CeedGetResource(ceed, &ceed_resource)); - return std::string(ceed_resource); -} - -void InitCeedVector(const mfem::Vector &v, Ceed ceed, CeedVector *cv) -{ - CeedMemType mem; - const CeedScalar *data; - PalaceCeedCall(ceed, CeedVectorCreate(ceed, v.Size(), cv)); - PalaceCeedCall(ceed, CeedGetPreferredMemType(ceed, &mem)); - if (mfem::Device::Allows(mfem::Backend::DEVICE_MASK) && mem == CEED_MEM_DEVICE) - { - data = v.Read(); - } - else - { - data = v.HostRead(); - mem = CEED_MEM_HOST; - } - PalaceCeedCall( - ceed, CeedVectorSetArray(*cv, mem, CEED_USE_POINTER, const_cast(data))); -} - -} // namespace palace::ceed +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "utils.hpp" + +#include "fem/libceed/basis.hpp" +#include "fem/libceed/restriction.hpp" +#include "utils/omp.hpp" + +#if defined(MFEM_USE_OPENMP) +#include +#endif + +namespace palace::ceed +{ + +namespace internal +{ + +static std::vector ceeds; + +const std::vector &GetCeedObjects() +{ + return ceeds; +} + +} // namespace internal + +void Initialize(const char *resource, const char *jit_source_dir) +{ + PalacePragmaOmp(parallel) + { + PalacePragmaOmp(master) + { +#if defined(MFEM_USE_OPENMP) + const int nt = omp_get_num_threads(); +#else + const int nt = 1; +#endif + internal::ceeds.resize(nt, nullptr); + } + } + + // Master thread initializes all Ceed objects (ineherently sequential anyway due to shared + // resources). + for (std::size_t i = 0; i < internal::ceeds.size(); i++) + { + int ierr = CeedInit(resource, &internal::ceeds[i]); + MFEM_VERIFY(!ierr, "Failed to initialize libCEED with resource " << resource << "!"); + Ceed ceed = internal::ceeds[i]; + + // Configure error handling (allow errors to be handled by PalaceCeedCallBackend or + // PalaceCeedCall). + PalaceCeedCall(ceed, CeedSetErrorHandler(ceed, CeedErrorStore)); + + // Configure QFunction search path. + if (jit_source_dir) + { + PalaceCeedCall(ceed, CeedAddJitSourceRoot(ceed, jit_source_dir)); + } + } +} + +void Finalize() +{ + // Destroy global basis and element restriction caches. + internal::ClearBasisCache(); + internal::ClearRestrictionCache(); + + // Destroy Ceed context(s). 
+ for (std::size_t i = 0; i < internal::ceeds.size(); i++) + { + int ierr = CeedDestroy(&internal::ceeds[i]); + MFEM_VERIFY(!ierr, "Failed to finalize libCEED!"); + } + internal::ceeds.clear(); +} + +std::string Print() +{ + MFEM_VERIFY(internal::GetCeedObjects().size() > 0, + "libCEED must be initialized before querying the active backend!"); + Ceed ceed = internal::GetCeedObjects()[0]; + const char *ceed_resource; + PalaceCeedCall(ceed, CeedGetResource(ceed, &ceed_resource)); + return std::string(ceed_resource); +} + +void InitCeedVector(const mfem::Vector &v, Ceed ceed, CeedVector *cv) +{ + CeedMemType mem; + const CeedScalar *data; + PalaceCeedCall(ceed, CeedVectorCreate(ceed, v.Size(), cv)); + PalaceCeedCall(ceed, CeedGetPreferredMemType(ceed, &mem)); + if (mfem::Device::Allows(mfem::Backend::DEVICE_MASK) && mem == CEED_MEM_DEVICE) + { + data = v.Read(); + } + else + { + data = v.HostRead(); + mem = CEED_MEM_HOST; + } + PalaceCeedCall( + ceed, CeedVectorSetArray(*cv, mem, CEED_USE_POINTER, const_cast(data))); +} + +} // namespace palace::ceed diff --git a/palace/fem/libceed/utils.hpp b/palace/fem/libceed/utils.hpp index 6009727c64..f5a4ba9c4f 100644 --- a/palace/fem/libceed/utils.hpp +++ b/palace/fem/libceed/utils.hpp @@ -1,63 +1,63 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LIBCEED_UTILS_HPP -#define PALACE_LIBCEED_UTILS_HPP - -#include -#include -#include -#include -#include - -#define PalaceCeedCall(ceed, ...) \ - do \ - { \ - int ierr_ = __VA_ARGS__; \ - if (ierr_ != CEED_ERROR_SUCCESS) \ - { \ - const char *msg; \ - CeedGetErrorMessage(ceed, &msg); \ - MFEM_ABORT(msg); \ - } \ - } while (0) - -#define PalaceCeedCallBackend(...) \ - do \ - { \ - int ierr_ = __VA_ARGS__; \ - if (ierr_ != CEED_ERROR_SUCCESS) \ - { \ - MFEM_ABORT("libCEED encountered a fatal error!"); \ - } \ - } while (0) - -#define PalaceQFunctionRelativePath(path) strstr(path, "qfunctions") - -namespace palace::ceed -{ - -// Call libCEED's CeedInit for the given resource. The specific device to use is set prior -// to this using mfem::Device. -void Initialize(const char *resource, const char *jit_source_dir); - -// Finalize libCEED with CeedDestroy. -void Finalize(); - -// Get the configured libCEED backend. -std::string Print(); - -// Initialize a CeedVector from an mfem::Vector. -void InitCeedVector(const mfem::Vector &v, Ceed ceed, CeedVector *cv); - -namespace internal -{ - -// Access the Ceed objects initialized by CeedInit. -const std::vector &GetCeedObjects(); - -} // namespace internal - -} // namespace palace::ceed - -#endif // PALACE_LIBCEED_UTILS_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_UTILS_HPP +#define PALACE_LIBCEED_UTILS_HPP + +#include +#include +#include +#include +#include + +#define PalaceCeedCall(ceed, ...) \ + do \ + { \ + int ierr_ = __VA_ARGS__; \ + if (ierr_ != CEED_ERROR_SUCCESS) \ + { \ + const char *msg; \ + CeedGetErrorMessage(ceed, &msg); \ + MFEM_ABORT(msg); \ + } \ + } while (0) + +#define PalaceCeedCallBackend(...) \ + do \ + { \ + int ierr_ = __VA_ARGS__; \ + if (ierr_ != CEED_ERROR_SUCCESS) \ + { \ + MFEM_ABORT("libCEED encountered a fatal error!"); \ + } \ + } while (0) + +#define PalaceQFunctionRelativePath(path) strstr(path, "qfunctions") + +namespace palace::ceed +{ + +// Call libCEED's CeedInit for the given resource. The specific device to use is set prior +// to this using mfem::Device. 
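+// Illustrative call sequence (a sketch; the resource string and output are
+// placeholders):
+//
+//   mfem::Device device("cpu");                      // Select the MFEM device first
+//   palace::ceed::Initialize("/cpu/self", nullptr);  // One Ceed object per OpenMP thread
+//   std::cout << palace::ceed::Print() << "\n";      // Report the active libCEED backend
+//   ...
+//   palace::ceed::Finalize();                        // Destroy caches and Ceed objects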
+void Initialize(const char *resource, const char *jit_source_dir); + +// Finalize libCEED with CeedDestroy. +void Finalize(); + +// Get the configured libCEED backend. +std::string Print(); + +// Initialize a CeedVector from an mfem::Vector. +void InitCeedVector(const mfem::Vector &v, Ceed ceed, CeedVector *cv); + +namespace internal +{ + +// Access the Ceed objects initialized by CeedInit. +const std::vector &GetCeedObjects(); + +} // namespace internal + +} // namespace palace::ceed + +#endif // PALACE_LIBCEED_UTILS_HPP diff --git a/palace/fem/lumpedelement.cpp b/palace/fem/lumpedelement.cpp index 40086394d4..27b4713dc7 100644 --- a/palace/fem/lumpedelement.cpp +++ b/palace/fem/lumpedelement.cpp @@ -1,133 +1,123 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "lumpedelement.hpp" - -#include "fem/integrator.hpp" -#include "utils/communication.hpp" - -namespace palace -{ - -double LumpedElementData::GetArea(mfem::ParFiniteElementSpace &fespace) -{ - mfem::ConstantCoefficient one_func(1.0); - mfem::LinearForm s(&fespace); - s.AddBoundaryIntegrator(new BoundaryLFIntegrator(one_func), attr_marker); - s.UseFastAssembly(false); - s.Assemble(); - - mfem::GridFunction ones(&fespace); - ones = 1.0; - double dot = s * ones; - Mpi::GlobalSum(1, &dot, fespace.GetComm()); - return dot; -} - -UniformElementData::UniformElementData(const std::array &input_dir, - const mfem::Array &marker, - mfem::ParFiniteElementSpace &fespace) - : LumpedElementData(fespace.GetParMesh()->SpaceDimension(), marker), - bounding_box(mesh::GetBoundingBox(*fespace.GetParMesh(), marker, true)), direction(3) -{ - // Check that the bounding box discovered matches the area. This validates that the - // boundary elements form a right angled quadrilateral port. - constexpr double rel_tol = 1.0e-6; - double A = GetArea(fespace); - MFEM_VERIFY((!bounding_box.planar || (std::abs(A - bounding_box.Area()) / A < rel_tol)), - "Discovered bounding box area " - << bounding_box.Area() << " and integrated area " << A - << " do not match: Planar port geometry is not a quadrilateral!"); - - // Check the user specified direction aligns with an axis direction. 
- constexpr double angle_warning_deg = 0.1; - constexpr double angle_error_deg = 1.0; - auto lengths = bounding_box.Lengths(); - auto deviation_deg = bounding_box.Deviation(input_dir); - if (std::none_of(deviation_deg.begin(), deviation_deg.end(), - [](double x) { return x < angle_warning_deg; })) - { - auto normal_0 = bounding_box.normals[0]; - for (auto &x : normal_0) - { - x /= lengths[0]; - } - auto normal_1 = bounding_box.normals[1]; - for (auto &x : normal_1) - { - x /= lengths[1]; - } - auto normal_2 = bounding_box.normals[2]; - for (auto &x : normal_2) - { - x /= lengths[2]; - } - Mpi::Warning("User specified direction {} does not align with either bounding box " - "axis up to {:.3e} degrees!\n" - "Axis 1: {} ({:.3e} degrees)\nAxis 2: {} ({:.3e} degrees)\nAxis 3: " - "{} ({:.3e} degrees)!\n", - input_dir, angle_warning_deg, normal_0, deviation_deg[0], normal_1, - deviation_deg[1], normal_2, deviation_deg[2]); - } - MFEM_VERIFY(std::any_of(deviation_deg.begin(), deviation_deg.end(), - [](double x) { return x < angle_error_deg; }), - "Specified direction does not align sufficiently with bounding box axes: " - << deviation_deg[0] << ' ' << deviation_deg[1] << ' ' << deviation_deg[2] - << " tolerance " << angle_error_deg << '!'); - std::copy(input_dir.begin(), input_dir.end(), direction.begin()); - direction /= direction.Norml2(); - - // Compute the length from the most aligned normal direction. - l = lengths[std::distance(deviation_deg.begin(), - std::min_element(deviation_deg.begin(), deviation_deg.end()))]; - MFEM_ASSERT( - (l - mesh::GetProjectedLength(*fespace.GetParMesh(), marker, true, input_dir)) / l < - rel_tol, - "Bounding box discovered length should match projected length!"); - w = A / l; -} - -std::unique_ptr -UniformElementData::GetModeCoefficient(double coef) const -{ - mfem::Vector source = direction; - source *= coef; - return std::make_unique(source); -} - -CoaxialElementData::CoaxialElementData(const std::array &direction, - const mfem::Array &marker, - mfem::ParFiniteElementSpace &fespace) - : LumpedElementData(fespace.GetParMesh()->SpaceDimension(), marker), - bounding_ball(mesh::GetBoundingBall(*fespace.GetParMesh(), marker, true)), - sign(direction[0] > 0) -{ - MFEM_VERIFY(bounding_ball.planar, - "Boundary elements must be coplanar to define a coaxial lumped element!"); - - // Get inner radius of annulus assuming full 2π circumference. - double A = GetArea(fespace); - MFEM_VERIFY(bounding_ball.radius > 0.0 && - std::pow(bounding_ball.radius, 2) - A / M_PI > 0.0, - "Coaxial element boundary is not defined correctly: Radius " - << bounding_ball.radius << ", area " << A << "!"); - ra = std::sqrt(std::pow(bounding_ball.radius, 2) - A / M_PI); -} - -std::unique_ptr -CoaxialElementData::GetModeCoefficient(double coef) const -{ - double scoef = (sign ? 1.0 : -1.0) * coef; - mfem::Vector x0(3); - std::copy(bounding_ball.center.begin(), bounding_ball.center.end(), x0.begin()); - auto Source = [scoef, x0](const mfem::Vector &x, mfem::Vector &f) -> void - { - f = x; - f -= x0; - double oor = 1.0 / f.Norml2(); - f *= scoef * oor * oor; - }; - return std::make_unique(dim, Source); -} - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#include "lumpedelement.hpp" + +#include "fem/coefficient.hpp" +#include "fem/integrator.hpp" +#include "linalg/vector.hpp" +#include "utils/communication.hpp" +#include "utils/geodata.hpp" + +namespace palace +{ + +UniformElementData::UniformElementData(const std::array &input_dir, + const mfem::Array &attr_list, + const mfem::ParMesh &mesh) + : LumpedElementData(attr_list) +{ + int bdr_attr_max = mesh.bdr_attributes.Size() ? mesh.bdr_attributes.Max() : 0; + mfem::Array attr_marker = mesh::AttrToMarker(bdr_attr_max, attr_list); + auto bounding_box = mesh::GetBoundingBox(mesh, attr_marker, true); + + // Check the user specified direction aligns with an axis direction. + constexpr double angle_warning_deg = 0.1; + constexpr double angle_error_deg = 1.0; + auto lengths = bounding_box.Lengths(); + auto deviations_deg = bounding_box.Deviations(input_dir); + if (std::none_of(deviations_deg.begin(), deviations_deg.end(), + [](double x) { return x < angle_warning_deg; })) + { + auto normals = bounding_box.Normals(); + Mpi::Warning("User specified direction {} does not align with either bounding box " + "axis up to {:.3e} degrees!\n" + "Axis 1: {} ({:.3e} degrees)\nAxis 2: {} ({:.3e} degrees)\nAxis 3: " + "{} ({:.3e} degrees)!\n", + input_dir, angle_warning_deg, normals[0], deviations_deg[0], normals[1], + deviations_deg[1], normals[2], deviations_deg[2]); + } + if (std::none_of(deviations_deg.begin(), deviations_deg.end(), + [](double x) { return x < angle_error_deg; })) + { + Mpi::Barrier(mesh.GetComm()); + MFEM_ABORT("Specified direction does not align sufficiently with bounding box axes (" + << deviations_deg[0] << ", " << deviations_deg[1] << ", " + << deviations_deg[2] << " vs. tolerance " << angle_error_deg << ")!"); + } + direction.SetSize(input_dir.size()); + std::copy(input_dir.begin(), input_dir.end(), direction.begin()); + direction /= direction.Norml2(); + + // Compute the length from the most aligned normal direction. + constexpr double rel_tol = 1.0e-6; + auto l_component = + std::distance(deviations_deg.begin(), + std::min_element(deviations_deg.begin(), deviations_deg.end())); + l = lengths[l_component]; + MFEM_VERIFY(std::abs(l - mesh::GetProjectedLength(mesh, attr_marker, true, input_dir)) < + rel_tol * l, + "Bounding box discovered length (" + << l << ") should match projected length (" + << mesh::GetProjectedLength(mesh, attr_marker, true, input_dir) << "!"); + + // Compute the width as area / length. This allows the lumped element to be non-planar, + // and generalizes nicely to the case for an infinitely thin rectangular lumped element + // with elements on both sides (for which the width computed from the bounding box would + // be incorrect by a factor of 2). + double area = mesh::GetSurfaceArea(mesh, attr_marker); + MFEM_VERIFY(area > 0.0, "Uniform lumped element has zero area!"); + w = area / l; +} + +std::unique_ptr +UniformElementData::GetModeCoefficient(double coeff) const +{ + mfem::Vector source = direction; + source *= coeff; + return std::make_unique>( + attr_list, source); +} + +CoaxialElementData::CoaxialElementData(const std::array &input_dir, + const mfem::Array &attr_list, + const mfem::ParMesh &mesh) + : LumpedElementData(attr_list) +{ + int bdr_attr_max = mesh.bdr_attributes.Size() ? 
mesh.bdr_attributes.Max() : 0; + mfem::Array attr_marker = mesh::AttrToMarker(bdr_attr_max, attr_list); + auto bounding_ball = mesh::GetBoundingBall(mesh, attr_marker, true); + MFEM_VERIFY(bounding_ball.planar, + "Boundary elements must be coplanar to define a coaxial lumped element!"); + + // Direction of the excitation as +/-r̂. + direction = (input_dir[0] > 0 ? +1 : -1); + origin.SetSize(bounding_ball.center.size()); + std::copy(bounding_ball.center.begin(), bounding_ball.center.end(), origin.begin()); + + // Get outer and inner radius of the annulus, assuming full 2π circumference. + r_outer = 0.5 * bounding_ball.Lengths()[0]; + r_inner = mesh::GetDistanceFromPoint(mesh, attr_marker, true, bounding_ball.center); + MFEM_VERIFY(r_inner > 0.0, + "Coaxial element annulus has should have positive inner radius!"); + MFEM_VERIFY(r_outer > r_inner, "Coaxial element annulus has unexpected outer radius " + << r_outer << " <= inner radius " << r_inner << "!"); +} + +std::unique_ptr +CoaxialElementData::GetModeCoefficient(double coeff) const +{ + coeff *= direction; + mfem::Vector x0(origin); + auto Source = [coeff, x0](const mfem::Vector &x, mfem::Vector &f) -> void + { + f = x; + f -= x0; + double oor = 1.0 / f.Norml2(); + f *= coeff * oor * oor; + }; + return std::make_unique>( + attr_list, x0.Size(), Source); +} + +} // namespace palace diff --git a/palace/fem/lumpedelement.hpp b/palace/fem/lumpedelement.hpp index 29e8b635c5..16a8ee7df0 100644 --- a/palace/fem/lumpedelement.hpp +++ b/palace/fem/lumpedelement.hpp @@ -1,91 +1,81 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_FEM_LUMPED_ELEMENT_HPP -#define PALACE_FEM_LUMPED_ELEMENT_HPP - -#include -#include -#include "utils/geodata.hpp" - -namespace palace -{ - -// -// Base class handling geometry of lumped elements for uniform and coaxial lumped port and -// surface current source boundaries. -// -class LumpedElementData -{ -protected: - // Spatial dimension. - const int dim; - - // Marker for all boundary attributes making up this lumped element boundary. - mfem::Array attr_marker; - - double GetArea(mfem::ParFiniteElementSpace &fespace); - -public: - LumpedElementData(int d, const mfem::Array &marker) : dim(d), attr_marker(marker) {} - virtual ~LumpedElementData() = default; - - mfem::Array &GetMarker() { return attr_marker; } - const mfem::Array &GetMarker() const { return attr_marker; } - - virtual double GetGeometryLength() const = 0; - virtual double GetGeometryWidth() const = 0; - - virtual std::unique_ptr - GetModeCoefficient(double coef = 1.0) const = 0; -}; - -class UniformElementData : public LumpedElementData -{ -protected: - // Bounding box defining the rectangular lumped port. - mesh::BoundingBox bounding_box; - - // Cartesian vector specifying signed direction of incident field. - mfem::Vector direction; - - // Lumped element length and width. - double l, w; - -public: - UniformElementData(const std::array &input_dir, const mfem::Array &marker, - mfem::ParFiniteElementSpace &fespace); - - double GetGeometryLength() const override { return l; } - double GetGeometryWidth() const override { return w; } - - std::unique_ptr - GetModeCoefficient(double coef = 1.0) const override; -}; - -class CoaxialElementData : public LumpedElementData -{ -protected: - // Bounding ball defined by boundary element. - mesh::BoundingBall bounding_ball; - - // Sign of incident field, +r̂ if true. - bool sign; - - // Inner radius of coaxial annulus. 
- double ra; - -public: - CoaxialElementData(const std::array &direction, const mfem::Array &marker, - mfem::ParFiniteElementSpace &fespace); - - double GetGeometryLength() const override { return std::log(bounding_ball.radius / ra); } - double GetGeometryWidth() const override { return 2.0 * M_PI; } - - std::unique_ptr - GetModeCoefficient(double coef = 1.0) const override; -}; - -} // namespace palace - -#endif // PALACE_FEM_LUMPED_ELEMENT_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_FEM_LUMPED_ELEMENT_HPP +#define PALACE_FEM_LUMPED_ELEMENT_HPP + +#include +#include + +namespace palace +{ + +// +// Base class handling geometry of lumped elements for uniform and coaxial lumped port and +// surface current source boundaries. +// +class LumpedElementData +{ +protected: + // List of all boundary attributes making up this lumped element boundary. + mfem::Array attr_list; + +public: + LumpedElementData(const mfem::Array &attr_list) : attr_list(attr_list) {} + virtual ~LumpedElementData() = default; + + const auto &GetAttrList() const { return attr_list; } + + virtual double GetGeometryLength() const = 0; + virtual double GetGeometryWidth() const = 0; + + virtual std::unique_ptr + GetModeCoefficient(double coeff = 1.0) const = 0; +}; + +class UniformElementData : public LumpedElementData +{ +private: + // Cartesian vector specifying signed direction of incident field. + mfem::Vector direction; + + // Lumped element length and width. + double l, w; + +public: + UniformElementData(const std::array &input_dir, + const mfem::Array &attr_list, const mfem::ParMesh &mesh); + + double GetGeometryLength() const override { return l; } + double GetGeometryWidth() const override { return w; } + + std::unique_ptr + GetModeCoefficient(double coeff = 1.0) const override; +}; + +class CoaxialElementData : public LumpedElementData +{ +private: + // Sign of incident field, +1 for +r̂, -1 for -r̂. + double direction; + + // Origin of the coaxial annulus. + mfem::Vector origin; + + // Outer and inner radii of coaxial annulus. + double r_outer, r_inner; + +public: + CoaxialElementData(const std::array &input_dir, + const mfem::Array &attr_list, const mfem::ParMesh &mesh); + + double GetGeometryLength() const override { return std::log(r_outer / r_inner); } + double GetGeometryWidth() const override { return 2.0 * M_PI; } + + std::unique_ptr + GetModeCoefficient(double coeff = 1.0) const override; +}; + +} // namespace palace + +#endif // PALACE_FEM_LUMPED_ELEMENT_HPP diff --git a/palace/fem/multigrid.hpp b/palace/fem/multigrid.hpp index 1064201867..533d9d6e01 100644 --- a/palace/fem/multigrid.hpp +++ b/palace/fem/multigrid.hpp @@ -1,182 +1,130 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_FEM_MULTIGRID_HPP -#define PALACE_FEM_MULTIGRID_HPP - -#include -#include -#include -#include "fem/fespace.hpp" -#include "utils/iodata.hpp" - -namespace palace::fem -{ - -// -// Methods for constructing hierarchies of finite element spaces for geometric multigrid. -// - -// Construct sequence of FECollection objects. -template -inline std::vector> -ConstructFECollections(int p, int dim, int mg_max_levels, - config::LinearSolverData::MultigridCoarsenType mg_coarsen_type, - bool mat_lor) -{ - // If the solver will use a LOR preconditioner, we need to construct with a specific basis - // type. 
- constexpr int pmin = (std::is_base_of::value || - std::is_base_of::value) - ? 1 - : 0; - MFEM_VERIFY(p >= pmin, "FE space order must not be less than " << pmin << "!"); - int b1 = mfem::BasisType::GaussLobatto, b2 = mfem::BasisType::GaussLegendre; - if (mat_lor) - { - b2 = mfem::BasisType::IntegratedGLL; - } - - // Construct the p-multigrid hierarchy, first finest to coarsest and then reverse the - // order. - std::vector> fecs; - for (int l = 0; l < std::max(1, mg_max_levels); l++) - { - if constexpr (std::is_base_of::value || - std::is_base_of::value) - { - fecs.push_back(std::make_unique(p, dim, b1, b2)); - } - else - { - fecs.push_back(std::make_unique(p, dim, b1)); - MFEM_CONTRACT_VAR(b2); - } - if (p == pmin) - { - break; - } - switch (mg_coarsen_type) - { - case config::LinearSolverData::MultigridCoarsenType::LINEAR: - p--; - break; - case config::LinearSolverData::MultigridCoarsenType::LOGARITHMIC: - p = (p + pmin) / 2; - break; - } - } - std::reverse(fecs.begin(), fecs.end()); - - return fecs; -} - -// Construct a hierarchy of finite element spaces given a sequence of meshes and finite -// element collections. Additionally, Dirichlet boundary conditions are marked. -template -inline FiniteElementSpaceHierarchy ConstructFiniteElementSpaceHierarchy( - int mg_max_levels, const std::vector> &mesh, - const std::vector> &fecs, - const mfem::Array *dbc_marker = nullptr, - std::vector> *dbc_tdof_lists = nullptr) -{ - MFEM_VERIFY(!mesh.empty() && !fecs.empty() && - (!dbc_tdof_lists || dbc_tdof_lists->empty()), - "Empty mesh or FE collection for FE space construction!"); - int coarse_mesh_l = - std::max(0, static_cast(mesh.size() + fecs.size()) - 1 - mg_max_levels); - FiniteElementSpaceHierarchy fespaces( - std::make_unique(mesh[coarse_mesh_l].get(), fecs[0].get())); - if (dbc_marker && dbc_tdof_lists) - { - fespaces.GetFinestFESpace().GetEssentialTrueDofs(*dbc_marker, - dbc_tdof_lists->emplace_back()); - } - - // h-refinement - for (std::size_t l = coarse_mesh_l + 1; l < mesh.size(); l++) - { - fespaces.AddLevel(std::make_unique(mesh[l].get(), fecs[0].get())); - if (dbc_marker && dbc_tdof_lists) - { - fespaces.GetFinestFESpace().GetEssentialTrueDofs(*dbc_marker, - dbc_tdof_lists->emplace_back()); - } - } - - // p-refinement - for (std::size_t l = 1; l < fecs.size(); l++) - { - fespaces.AddLevel( - std::make_unique(mesh.back().get(), fecs[l].get())); - if (dbc_marker && dbc_tdof_lists) - { - fespaces.GetFinestFESpace().GetEssentialTrueDofs(*dbc_marker, - dbc_tdof_lists->emplace_back()); - } - } - - return fespaces; -} - -// Similar to ConstructFiniteElementSpaceHierarchy above, but in this case the finite -// element space at each level is an auxiliary space associated with the coresponding level -// of the provided finite element space objects. 
-template -inline AuxiliaryFiniteElementSpaceHierarchy ConstructAuxiliaryFiniteElementSpaceHierarchy( - const FiniteElementSpaceHierarchy &primal_fespaces, - const std::vector> &fecs, - const mfem::Array *dbc_marker = nullptr, - std::vector> *dbc_tdof_lists = nullptr) -{ - MFEM_VERIFY((primal_fespaces.GetNumLevels() > 0) && !fecs.empty() && - (!dbc_tdof_lists || dbc_tdof_lists->empty()), - "Empty mesh or FE collection for FE space construction!"); - mfem::ParMesh *mesh = primal_fespaces.GetFESpaceAtLevel(0).GetParMesh(); - AuxiliaryFiniteElementSpaceHierarchy fespaces( - std::make_unique(primal_fespaces.GetFESpaceAtLevel(0), - mesh, fecs[0].get())); - if (dbc_marker && dbc_tdof_lists) - { - fespaces.GetFinestFESpace().GetEssentialTrueDofs(*dbc_marker, - dbc_tdof_lists->emplace_back()); - } - - // h-refinement - std::size_t l; - for (l = 1; l < primal_fespaces.GetNumLevels(); l++) - { - if (primal_fespaces.GetFESpaceAtLevel(l).GetParMesh() == mesh) - { - break; - } - fespaces.AddLevel(std::make_unique( - primal_fespaces.GetFESpaceAtLevel(l), - primal_fespaces.GetFESpaceAtLevel(l).GetParMesh(), fecs[0].get())); - if (dbc_marker && dbc_tdof_lists) - { - fespaces.GetFinestFESpace().GetEssentialTrueDofs(*dbc_marker, - dbc_tdof_lists->emplace_back()); - } - mesh = primal_fespaces.GetFESpaceAtLevel(l).GetParMesh(); - } - - // p-refinement - const auto l0 = l - 1; - for (; l < primal_fespaces.GetNumLevels(); l++) - { - fespaces.AddLevel(std::make_unique( - primal_fespaces.GetFESpaceAtLevel(l), mesh, fecs[l - l0].get())); - if (dbc_marker && dbc_tdof_lists) - { - fespaces.GetFinestFESpace().GetEssentialTrueDofs(*dbc_marker, - dbc_tdof_lists->emplace_back()); - } - } - - return fespaces; -} - -} // namespace palace::fem - -#endif // PALACE_FEM_MULTIGRID_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_FEM_MULTIGRID_HPP +#define PALACE_FEM_MULTIGRID_HPP + +#include +#include +#include +#include "fem/fespace.hpp" +#include "fem/mesh.hpp" +#include "utils/geodata.hpp" +#include "utils/iodata.hpp" + +namespace palace::fem +{ + +// +// Methods for constructing hierarchies of finite element spaces for geometric multigrid. +// + +// Construct sequence of FECollection objects. +template +inline std::vector> +ConstructFECollections(int p, int dim, int mg_max_levels, MultigridCoarsening mg_coarsening, + bool mat_lor) +{ + // If the solver will use a LOR preconditioner, we need to construct with a specific basis + // type. + constexpr int pmin = (std::is_base_of::value || + std::is_base_of::value) + ? 1 + : 0; + MFEM_VERIFY(p >= pmin, "FE space order must not be less than " << pmin << "!"); + int b1 = mfem::BasisType::GaussLobatto, b2 = mfem::BasisType::GaussLegendre; + if (mat_lor) + { + b2 = mfem::BasisType::IntegratedGLL; + } + + // Construct the p-multigrid hierarchy, first finest to coarsest and then reverse the + // order. 
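As a quick illustration of the schedule this produces (a standalone sketch with local names, not part of the patch): starting from p = 4 with pmin = 1, LINEAR coarsening yields orders 1, 2, 3, 4 after the reversal, while LOGARITHMIC yields 1, 2, 4, in both cases capped at mg_max_levels entries so the coarsest collection always ends up first.

#include <algorithm>
#include <vector>

// Illustrative only: reproduce the order schedule generated by the loop below for a
// starting order p, minimum order pmin, and level cap (names are local to this sketch).
std::vector<int> CoarseningSchedule(int p, int pmin, int max_levels, bool logarithmic)
{
  std::vector<int> orders;
  for (int l = 0; l < std::max(1, max_levels); l++)
  {
    orders.push_back(p);
    if (p == pmin)
    {
      break;
    }
    p = logarithmic ? (p + pmin) / 2 : p - 1;
  }
  std::reverse(orders.begin(), orders.end());
  return orders;  // e.g. p = 4, pmin = 1: {1, 2, 3, 4} (linear) or {1, 2, 4} (logarithmic)
}

int main()
{
  return CoarseningSchedule(4, 1, 10, true).size() == 3 ? 0 : 1;
}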
+ std::vector> fecs; + for (int l = 0; l < std::max(1, mg_max_levels); l++) + { + if constexpr (std::is_base_of::value || + std::is_base_of::value) + { + fecs.push_back(std::make_unique(p, dim, b1, b2)); + } + else + { + fecs.push_back(std::make_unique(p, dim, b1)); + MFEM_CONTRACT_VAR(b2); + } + if (p == pmin) + { + break; + } + switch (mg_coarsening) + { + case MultigridCoarsening::LINEAR: + p--; + break; + case MultigridCoarsening::LOGARITHMIC: + p = (p + pmin) / 2; + break; + } + } + std::reverse(fecs.begin(), fecs.end()); + + return fecs; +} + +// Construct a hierarchy of finite element spaces given a sequence of meshes and finite +// element collections. Additionally, Dirichlet boundary conditions are marked. +template +inline FiniteElementSpaceHierarchy ConstructFiniteElementSpaceHierarchy( + int mg_max_levels, const std::vector> &mesh, + const std::vector> &fecs, + const mfem::Array *dbc_attr = nullptr, + std::vector> *dbc_tdof_lists = nullptr) +{ + MFEM_VERIFY(!mesh.empty() && !fecs.empty() && + (!dbc_tdof_lists || dbc_tdof_lists->empty()), + "Empty mesh or FE collection for FE space construction!"); + int coarse_mesh_l = std::max(0, static_cast(mesh.size() + fecs.size()) - 1 - + std::max(1, mg_max_levels)); + FiniteElementSpaceHierarchy fespaces( + std::make_unique(*mesh[coarse_mesh_l], fecs[0].get())); + + mfem::Array dbc_marker; + if (dbc_attr && dbc_tdof_lists) + { + int bdr_attr_max = mesh[coarse_mesh_l]->Get().bdr_attributes.Size() + ? mesh[coarse_mesh_l]->Get().bdr_attributes.Max() + : 0; + dbc_marker = mesh::AttrToMarker(bdr_attr_max, *dbc_attr); + fespaces.GetFinestFESpace().Get().GetEssentialTrueDofs(dbc_marker, + dbc_tdof_lists->emplace_back()); + } + + // h-refinement. + for (std::size_t l = coarse_mesh_l + 1; l < mesh.size(); l++) + { + fespaces.AddLevel(std::make_unique(*mesh[l], fecs[0].get())); + if (dbc_attr && dbc_tdof_lists) + { + fespaces.GetFinestFESpace().Get().GetEssentialTrueDofs( + dbc_marker, dbc_tdof_lists->emplace_back()); + } + } + + // p-refinement. + for (std::size_t l = 1; l < fecs.size(); l++) + { + fespaces.AddLevel(std::make_unique(*mesh.back(), fecs[l].get())); + if (dbc_attr && dbc_tdof_lists) + { + fespaces.GetFinestFESpace().Get().GetEssentialTrueDofs( + dbc_marker, dbc_tdof_lists->emplace_back()); + } + } + + return fespaces; +} + +} // namespace palace::fem + +#endif // PALACE_FEM_MULTIGRID_HPP diff --git a/palace/fem/qfunctions/1/h1_1_qf.h b/palace/fem/qfunctions/1/h1_1_qf.h new file mode 100644 index 0000000000..7d94354a25 --- /dev/null +++ b/palace/fem/qfunctions/1/h1_1_qf.h @@ -0,0 +1,24 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_H1_1_QF_H +#define PALACE_LIBCEED_H1_1_QF_H + +#include "../coeff/coeff_1_qf.h" + +CEED_QFUNCTION(f_apply_h1_1)(void *__restrict ctx, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *u = in[1]; + CeedScalar *v = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar coeff = CoeffUnpack1((const CeedIntScalar *)ctx, (CeedInt)attr[i]); + + v[i] = coeff * wdetJ[i] * u[i]; + } + return 0; +} + +#endif // PALACE_LIBCEED_H1_1_QF_H diff --git a/palace/fem/qfunctions/1/h1_build_1_qf.h b/palace/fem/qfunctions/1/h1_build_1_qf.h new file mode 100644 index 0000000000..3a7548734a --- /dev/null +++ b/palace/fem/qfunctions/1/h1_build_1_qf.h @@ -0,0 +1,24 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_H1_BUILD_1_QF_H +#define PALACE_LIBCEED_H1_BUILD_1_QF_H + +#include "../coeff/coeff_1_qf.h" + +CEED_QFUNCTION(f_build_h1_1)(void *__restrict ctx, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q; + CeedScalar *qd = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar coeff = CoeffUnpack1((const CeedIntScalar *)ctx, (CeedInt)attr[i]); + + qd[i] = coeff * wdetJ[i]; + } + return 0; +} + +#endif // PALACE_LIBCEED_H1_BUILD_1_QF_H diff --git a/palace/fem/qfunctions/1/l2_1_qf.h b/palace/fem/qfunctions/1/l2_1_qf.h new file mode 100644 index 0000000000..160f64344b --- /dev/null +++ b/palace/fem/qfunctions/1/l2_1_qf.h @@ -0,0 +1,24 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_L2_1_QF_H +#define PALACE_LIBCEED_L2_1_QF_H + +#include "../coeff/coeff_1_qf.h" + +CEED_QFUNCTION(f_apply_l2_1)(void *__restrict ctx, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *qw = in[1], *u = in[2]; + CeedScalar *v = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar coeff = CoeffUnpack1((const CeedIntScalar *)ctx, (CeedInt)attr[i]); + + v[i] = (coeff * qw[i] * qw[i] / wdetJ[i]) * u[i]; + } + return 0; +} + +#endif // PALACE_LIBCEED_L2_1_QF_H diff --git a/palace/fem/qfunctions/1/l2_build_1_qf.h b/palace/fem/qfunctions/1/l2_build_1_qf.h new file mode 100644 index 0000000000..e5e0a99a53 --- /dev/null +++ b/palace/fem/qfunctions/1/l2_build_1_qf.h @@ -0,0 +1,24 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_L2_BUILD_1_QF_H +#define PALACE_LIBCEED_L2_BUILD_1_QF_H + +#include "../coeff/coeff_1_qf.h" + +CEED_QFUNCTION(f_build_l2_1)(void *__restrict ctx, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *qw = in[1]; + CeedScalar *qd = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar coeff = CoeffUnpack1((const CeedIntScalar *)ctx, (CeedInt)attr[i]); + + qd[i] = coeff * qw[i] * qw[i] / wdetJ[i]; + } + return 0; +} + +#endif // PALACE_LIBCEED_L2_BUILD_1_QF_H diff --git a/palace/fem/qfunctions/2/h1_2_qf.h b/palace/fem/qfunctions/2/h1_2_qf.h new file mode 100644 index 0000000000..3a5de69ada --- /dev/null +++ b/palace/fem/qfunctions/2/h1_2_qf.h @@ -0,0 +1,28 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
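A note on the scaling in the L2 QFunctions above: the quadrature data is coeff * qw^2 / wdetJ rather than coeff * wdetJ. Assuming wdetJ packs the bare quadrature weight times det(J) (as the geometry QFunctions below suggest) and that the incoming values are reference-space quantities carrying a 1/det(J) map to physical space (as the divergence of H(div) and the 2D curl of H(curl) functions do), the mass integrand coeff * (u/detJ) * (v/detJ) * qw * detJ reduces to exactly coeff * qw^2 / wdetJ. A tiny numerical check of that algebra (illustrative only):

#include <cassert>
#include <cmath>

int main()
{
  // Hypothetical numbers for one quadrature point.
  const double coeff = 3.0, qw = 0.25, detJ = 2.0;
  const double wdetJ = qw * detJ;
  // Reference-space (1/detJ-mapped) mass integrand vs. the packed form used above.
  const double direct = coeff * (1.0 / detJ) * (1.0 / detJ) * qw * detJ;
  const double packed = coeff * qw * qw / wdetJ;
  assert(std::abs(direct - packed) < 1e-14);
  return 0;
}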
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_H1_2_QF_H +#define PALACE_LIBCEED_H1_2_QF_H + +#include "../coeff/coeff_2_qf.h" + +CEED_QFUNCTION(f_apply_h1_2)(void *__restrict ctx, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *u = in[1]; + CeedScalar *v = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar coeff[4]; + CoeffUnpack2((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + + const CeedScalar u0 = u[i + Q * 0]; + const CeedScalar u1 = u[i + Q * 1]; + v[i + Q * 0] = wdetJ[i] * (coeff[0] * u0 + coeff[2] * u1); + v[i + Q * 1] = wdetJ[i] * (coeff[1] * u0 + coeff[3] * u1); + } + return 0; +} + +#endif // PALACE_LIBCEED_H1_2_QF_H diff --git a/palace/fem/qfunctions/2/h1_build_2_qf.h b/palace/fem/qfunctions/2/h1_build_2_qf.h new file mode 100644 index 0000000000..497c64b29d --- /dev/null +++ b/palace/fem/qfunctions/2/h1_build_2_qf.h @@ -0,0 +1,28 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_H1_BUILD_2_QF_H +#define PALACE_LIBCEED_H1_BUILD_2_QF_H + +#include "../coeff/coeff_2_qf.h" + +CEED_QFUNCTION(f_build_h1_2)(void *__restrict ctx, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q; + CeedScalar *qd = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar coeff[4]; + CoeffUnpack2((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + + qd[i + Q * 0] = wdetJ[i] * coeff[0]; + qd[i + Q * 1] = wdetJ[i] * coeff[1]; + qd[i + Q * 2] = wdetJ[i] * coeff[2]; + qd[i + Q * 3] = wdetJ[i] * coeff[3]; + } + return 0; +} + +#endif // PALACE_LIBCEED_H1_BUILD_2_QF_H diff --git a/palace/fem/qfunctions/2/l2_2_qf.h b/palace/fem/qfunctions/2/l2_2_qf.h new file mode 100644 index 0000000000..c35571d32c --- /dev/null +++ b/palace/fem/qfunctions/2/l2_2_qf.h @@ -0,0 +1,29 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_L2_2_QF_H +#define PALACE_LIBCEED_L2_2_QF_H + +#include "../coeff/coeff_2_qf.h" + +CEED_QFUNCTION(f_apply_l2_2)(void *__restrict ctx, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *qw = in[1], *u = in[2]; + CeedScalar *v = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar coeff[4]; + CoeffUnpack2((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + const CeedScalar w = qw[i] * qw[i] / wdetJ[i]; + + const CeedScalar u0 = u[i + Q * 0]; + const CeedScalar u1 = u[i + Q * 1]; + v[i + Q * 0] = w * (coeff[0] * u0 + coeff[2] * u1); + v[i + Q * 1] = w * (coeff[1] * u0 + coeff[3] * u1); + } + return 0; +} + +#endif // PALACE_LIBCEED_L2_2_QF_H diff --git a/palace/fem/qfunctions/2/l2_build_2_qf.h b/palace/fem/qfunctions/2/l2_build_2_qf.h new file mode 100644 index 0000000000..606a5f35bd --- /dev/null +++ b/palace/fem/qfunctions/2/l2_build_2_qf.h @@ -0,0 +1,29 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_L2_BUILD_2_QF_H +#define PALACE_LIBCEED_L2_BUILD_2_QF_H + +#include "../coeff/coeff_2_qf.h" + +CEED_QFUNCTION(f_build_l2_2)(void *__restrict ctx, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *qw = in[1]; + CeedScalar *qd = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar coeff[4]; + CoeffUnpack2((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + const CeedScalar w = qw[i] * qw[i] / wdetJ[i]; + + qd[i + Q * 0] = w * coeff[0]; + qd[i + Q * 1] = w * coeff[1]; + qd[i + Q * 2] = w * coeff[2]; + qd[i + Q * 3] = w * coeff[3]; + } + return 0; +} + +#endif // PALACE_LIBCEED_L2_BUILD_2_QF_H diff --git a/palace/fem/qfunctions/21/geom_21_qf.h b/palace/fem/qfunctions/21/geom_21_qf.h new file mode 100644 index 0000000000..02371315a1 --- /dev/null +++ b/palace/fem/qfunctions/21/geom_21_qf.h @@ -0,0 +1,29 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_GEOM_21_QF_H +#define PALACE_LIBCEED_GEOM_21_QF_H + +#include "utils_21_qf.h" + +CEED_QFUNCTION(f_build_geom_factor_21)(void *, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *qw = in[1], *J = in[2]; + CeedScalar *qd_attr = out[0], *qd_wdetJ = out[0] + Q, *qd_adjJt = out[0] + 2 * Q; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar J_loc[2], adjJt_loc[2]; + MatUnpack21(J + i, Q, J_loc); + const CeedScalar detJ = AdjJt21(J_loc, adjJt_loc); + + qd_attr[i] = attr[i]; + qd_wdetJ[i] = qw[i] * detJ; + qd_adjJt[i + Q * 0] = adjJt_loc[0] / detJ; + qd_adjJt[i + Q * 1] = adjJt_loc[1] / detJ; + } + return 0; +} + +#endif // PALACE_LIBCEED_GEOM_21_QF_H diff --git a/palace/fem/qfunctions/21/hcurl_21_qf.h b/palace/fem/qfunctions/21/hcurl_21_qf.h new file mode 100644 index 0000000000..27094e6e0e --- /dev/null +++ b/palace/fem/qfunctions/21/hcurl_21_qf.h @@ -0,0 +1,29 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_21_QF_H +#define PALACE_LIBCEED_HCURL_21_QF_H + +#include "../coeff/coeff_2_qf.h" +#include "utils_21_qf.h" + +CEED_QFUNCTION(f_apply_hcurl_21)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *u = in[1]; + CeedScalar *v = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar u_loc[1] = {u[i + Q * 0]}; + CeedScalar coeff[4], adjJt_loc[2], v_loc[1]; + CoeffUnpack2((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack21(adjJt + i, Q, adjJt_loc); + MultAtBCx21(adjJt_loc, coeff, adjJt_loc, u_loc, v_loc); + + v[i + Q * 0] = wdetJ[i] * v_loc[0]; + } + return 0; +} + +#endif // PALACE_LIBCEED_HCURL_21_QF_H diff --git a/palace/fem/qfunctions/21/hcurl_build_21_qf.h b/palace/fem/qfunctions/21/hcurl_build_21_qf.h new file mode 100644 index 0000000000..1a10ef5c21 --- /dev/null +++ b/palace/fem/qfunctions/21/hcurl_build_21_qf.h @@ -0,0 +1,28 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
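The boundary-element geometry QFunction above packs everything later operators need into one strided block per batch of Q quadrature points: attributes first, then w * det(J), then the entries of adj(J)^T / det(J), which is why the operator QFunctions recover their inputs purely by offsets (in[0], in[0] + Q, in[0] + 2 * Q). A minimal sketch of that packing convention (plain arrays, names local to the sketch):

#include <cstddef>
#include <vector>

// Hypothetical view of the packed 2D-on-1D ("21") geometry data used above:
// [ attr(Q) | wdetJ(Q) | adjJt(2*Q) ], i.e. four scalars per quadrature point.
struct GeomFactor21View
{
  const double *attr, *wdetJ, *adjJt;  // adjJt entry k of point i is adjJt[i + Q * k]
};

GeomFactor21View UnpackGeomFactor21(const double *qdata, std::size_t Q)
{
  return {qdata, qdata + Q, qdata + 2 * Q};
}

int main()
{
  const std::size_t Q = 8;
  std::vector<double> qdata(4 * Q, 0.0);
  const auto view = UnpackGeomFactor21(qdata.data(), Q);
  return (view.wdetJ == qdata.data() + Q) ? 0 : 1;
}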
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_BUILD_21_QF_H +#define PALACE_LIBCEED_HCURL_BUILD_21_QF_H + +#include "../coeff/coeff_2_qf.h" +#include "utils_21_qf.h" + +CEED_QFUNCTION(f_build_hcurl_21)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q; + CeedScalar *qd = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar coeff[4], adjJt_loc[2], qd_loc[1]; + CoeffUnpack2((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack21(adjJt + i, Q, adjJt_loc); + MultAtBA21(adjJt_loc, coeff, qd_loc); + + qd[i + Q * 0] = wdetJ[i] * qd_loc[0]; + } + return 0; +} + +#endif // PALACE_LIBCEED_HCURL_BUILD_21_QF_H diff --git a/palace/fem/qfunctions/21/hcurlh1d_21_qf.h b/palace/fem/qfunctions/21/hcurlh1d_21_qf.h new file mode 100644 index 0000000000..5924450162 --- /dev/null +++ b/palace/fem/qfunctions/21/hcurlh1d_21_qf.h @@ -0,0 +1,30 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_H1D_21_QF_H +#define PALACE_LIBCEED_HCURL_H1D_21_QF_H + +#include "../coeff/coeff_2_qf.h" +#include "utils_21_qf.h" + +CEED_QFUNCTION(f_apply_hcurlh1d_21)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *u = in[1]; + CeedScalar *v = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar u_loc[1] = {u[i + Q * 0]}; + CeedScalar coeff[4], adjJt_loc[2], v_loc[2]; + CoeffUnpack2((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack21(adjJt + i, Q, adjJt_loc); + MultBAx21(adjJt_loc, coeff, u_loc, v_loc); + + v[i + Q * 0] = wdetJ[i] * v_loc[0]; + v[i + Q * 1] = wdetJ[i] * v_loc[1]; + } + return 0; +} + +#endif // PALACE_LIBCEED_HCURL_H1D_21_QF_H diff --git a/palace/fem/qfunctions/21/hcurlh1d_build_21_qf.h b/palace/fem/qfunctions/21/hcurlh1d_build_21_qf.h new file mode 100644 index 0000000000..20a4821f16 --- /dev/null +++ b/palace/fem/qfunctions/21/hcurlh1d_build_21_qf.h @@ -0,0 +1,29 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_H1D_BUILD_21_QF_H +#define PALACE_LIBCEED_HCURL_H1D_BUILD_21_QF_H + +#include "../coeff/coeff_2_qf.h" +#include "utils_21_qf.h" + +CEED_QFUNCTION(f_build_hcurlh1d_21)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q; + CeedScalar *qd = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar coeff[4], adjJt_loc[2], qd_loc[2]; + CoeffUnpack2((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack21(adjJt + i, Q, adjJt_loc); + MultBA21(adjJt_loc, coeff, qd_loc); + + qd[i + Q * 0] = wdetJ[i] * qd_loc[0]; + qd[i + Q * 1] = wdetJ[i] * qd_loc[1]; + } + return 0; +} + +#endif // PALACE_LIBCEED_HCURL_H1D_BUILD_21_QF_H diff --git a/palace/fem/qfunctions/21/hcurlhdiv_21_qf.h b/palace/fem/qfunctions/21/hcurlhdiv_21_qf.h new file mode 100644 index 0000000000..8cc2b33dff --- /dev/null +++ b/palace/fem/qfunctions/21/hcurlhdiv_21_qf.h @@ -0,0 +1,50 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_HDIV_21_QF_H +#define PALACE_LIBCEED_HCURL_HDIV_21_QF_H + +#include "../coeff/coeff_2_qf.h" +#include "utils_21_qf.h" + +CEED_QFUNCTION(f_apply_hcurlhdiv_21)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *u = in[1]; + CeedScalar *v = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar u_loc[1] = {u[i + Q * 0]}; + CeedScalar coeff[4], adjJt_loc[2], J_loc[2], v_loc[1]; + CoeffUnpack2((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack21(adjJt + i, Q, adjJt_loc); + AdjJt21(adjJt_loc, J_loc); + MultAtBCx21(J_loc, coeff, adjJt_loc, u_loc, v_loc); + + v[i + Q * 0] = wdetJ[i] * v_loc[0]; + } + return 0; +} + +CEED_QFUNCTION(f_apply_hdivhcurl_21)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *u = in[1]; + CeedScalar *v = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar u_loc[1] = {u[i + Q * 0]}; + CeedScalar coeff[4], adjJt_loc[2], J_loc[2], v_loc[1]; + CoeffUnpack2((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack21(adjJt + i, Q, adjJt_loc); + AdjJt21(adjJt_loc, J_loc); + MultAtBCx21(adjJt_loc, coeff, J_loc, u_loc, v_loc); + + v[i + Q * 0] = wdetJ[i] * v_loc[0]; + } + return 0; +} + +#endif // PALACE_LIBCEED_HCURL_HDIV_21_QF_H diff --git a/palace/fem/qfunctions/21/hcurlhdiv_build_21_qf.h b/palace/fem/qfunctions/21/hcurlhdiv_build_21_qf.h new file mode 100644 index 0000000000..2785409e7d --- /dev/null +++ b/palace/fem/qfunctions/21/hcurlhdiv_build_21_qf.h @@ -0,0 +1,48 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_HDIV_BUILD_21_QF_H +#define PALACE_LIBCEED_HCURL_HDIV_BUILD_21_QF_H + +#include "../coeff/coeff_2_qf.h" +#include "utils_21_qf.h" + +CEED_QFUNCTION(f_build_hcurlhdiv_21)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q; + CeedScalar *qd = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar coeff[4], adjJt_loc[2], J_loc[2], qd_loc[1]; + CoeffUnpack2((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack21(adjJt + i, Q, adjJt_loc); + AdjJt21(adjJt_loc, J_loc); + MultAtBC21(J_loc, coeff, adjJt_loc, qd_loc); + + qd[i + Q * 0] = wdetJ[i] * qd_loc[0]; + } + return 0; +} + +CEED_QFUNCTION(f_build_hdivhcurl_21)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q; + CeedScalar *qd = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar coeff[4], adjJt_loc[2], J_loc[2], qd_loc[1]; + CoeffUnpack2((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack21(adjJt + i, Q, adjJt_loc); + AdjJt21(adjJt_loc, J_loc); + MultAtBC21(adjJt_loc, coeff, J_loc, qd_loc); + + qd[i + Q * 0] = wdetJ[i] * qd_loc[0]; + } + return 0; +} + +#endif // PALACE_LIBCEED_HCURL_HDIV_BUILD_21_QF_H diff --git a/palace/fem/qfunctions/21/hcurlmass_21_qf.h b/palace/fem/qfunctions/21/hcurlmass_21_qf.h new file mode 100644 index 0000000000..1683c1b394 --- /dev/null +++ b/palace/fem/qfunctions/21/hcurlmass_21_qf.h @@ -0,0 +1,38 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_MASS_21_QF_H +#define PALACE_LIBCEED_HCURL_MASS_21_QF_H + +#include "../coeff/coeff_1_qf.h" +#include "../coeff/coeff_2_qf.h" +#include "utils_21_qf.h" + +CEED_QFUNCTION(f_apply_hcurlmass_21)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *u = in[1], + *gradu = in[2]; + CeedScalar *__restrict v = out[0], *__restrict gradv = out[1]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + { + const CeedScalar coeff = CoeffUnpack1((const CeedIntScalar *)ctx, (CeedInt)attr[i]); + + v[i] = coeff * wdetJ[i] * u[i]; + } + { + const CeedScalar u_loc[1] = {gradu[i + Q * 0]}; + CeedScalar coeff[4], adjJt_loc[2], v_loc[1]; + CoeffUnpack2(CoeffPairSecond<1>((const CeedIntScalar *)ctx), (CeedInt)attr[i], coeff); + MatUnpack21(adjJt + i, Q, adjJt_loc); + MultAtBCx21(adjJt_loc, coeff, adjJt_loc, u_loc, v_loc); + + gradv[i + Q * 0] = wdetJ[i] * v_loc[0]; + } + } + return 0; +} + +#endif // PALACE_LIBCEED_HCURL_MASS_21_QF_H diff --git a/palace/fem/qfunctions/21/hcurlmass_build_21_qf.h b/palace/fem/qfunctions/21/hcurlmass_build_21_qf.h new file mode 100644 index 0000000000..9480f339f1 --- /dev/null +++ b/palace/fem/qfunctions/21/hcurlmass_build_21_qf.h @@ -0,0 +1,36 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_MASS_BUILD_21_QF_H +#define PALACE_LIBCEED_HCURL_MASS_BUILD_21_QF_H + +#include "../coeff/coeff_1_qf.h" +#include "../coeff/coeff_2_qf.h" +#include "utils_21_qf.h" + +CEED_QFUNCTION(f_build_hcurlmass_21)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q; + CeedScalar *__restrict qd1 = out[0], *__restrict qd2 = out[0] + Q; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + { + const CeedScalar coeff = CoeffUnpack1((const CeedIntScalar *)ctx, (CeedInt)attr[i]); + + qd1[i + Q * 0] = coeff * wdetJ[i]; + } + { + CeedScalar coeff[4], adjJt_loc[2], qd_loc[1]; + CoeffUnpack2(CoeffPairSecond<1>((const CeedIntScalar *)ctx), (CeedInt)attr[i], coeff); + MatUnpack21(adjJt + i, Q, adjJt_loc); + MultAtBA21(adjJt_loc, coeff, qd_loc); + + qd2[i + Q * 0] = wdetJ[i] * qd_loc[0]; + } + } + return 0; +} + +#endif // PALACE_LIBCEED_HCURL_MASS_BUILD_21_QF_H diff --git a/palace/fem/qfunctions/21/hdiv_21_qf.h b/palace/fem/qfunctions/21/hdiv_21_qf.h new file mode 100644 index 0000000000..47d5ac546b --- /dev/null +++ b/palace/fem/qfunctions/21/hdiv_21_qf.h @@ -0,0 +1,30 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HDIV_21_QF_H +#define PALACE_LIBCEED_HDIV_21_QF_H + +#include "../coeff/coeff_2_qf.h" +#include "utils_21_qf.h" + +CEED_QFUNCTION(f_apply_hdiv_21)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *u = in[1]; + CeedScalar *v = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar u_loc[1] = {u[i + Q * 0]}; + CeedScalar coeff[4], adjJt_loc[2], J_loc[2], v_loc[1]; + CoeffUnpack2((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack21(adjJt + i, Q, adjJt_loc); + AdjJt21(adjJt_loc, J_loc); + MultAtBCx21(J_loc, coeff, J_loc, u_loc, v_loc); + + v[i + Q * 0] = wdetJ[i] * v_loc[0]; + } + return 0; +} + +#endif // PALACE_LIBCEED_HDIV_21_QF_H diff --git a/palace/fem/qfunctions/21/hdiv_build_21_qf.h b/palace/fem/qfunctions/21/hdiv_build_21_qf.h new file mode 100644 index 0000000000..89852c5066 --- /dev/null +++ b/palace/fem/qfunctions/21/hdiv_build_21_qf.h @@ -0,0 +1,29 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
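One detail of the boundary-element ("21") kernels above that is easy to miss: the stored adj(J)^T / det(J) for a 2x1 Jacobian is J / |J|^2, and AdjJt21 simply renormalizes its input, so the AdjJt21(adjJt_loc, J_loc) calls recover the unit tangent J / |J| rather than J itself. A short numerical check of that reading (illustrative only, not part of the patch):

#include <cassert>
#include <cmath>

// Same normalization as AdjJt21: out = in / |in|, returning |in|.
static double Normalize2(const double in[2], double out[2])
{
  const double d = std::hypot(in[0], in[1]);
  out[0] = in[0] / d;
  out[1] = in[1] / d;
  return d;
}

int main()
{
  const double J[2] = {3.0, 4.0};  // column Jacobian of a 1D element embedded in 2D
  double t_hat[2], adjJt_over_det[2], recovered[2];
  const double detJ = Normalize2(J, t_hat);  // detJ = |J| = 5, t_hat = J / |J|
  adjJt_over_det[0] = t_hat[0] / detJ;       // stored geometry factor: J / |J|^2
  adjJt_over_det[1] = t_hat[1] / detJ;
  Normalize2(adjJt_over_det, recovered);     // what AdjJt21 yields inside the QFunctions
  assert(std::abs(recovered[0] - t_hat[0]) < 1e-14);
  assert(std::abs(recovered[1] - t_hat[1]) < 1e-14);
  return 0;
}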
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HDIV_BUILD_21_QF_H +#define PALACE_LIBCEED_HDIV_BUILD_21_QF_H + +#include "../coeff/coeff_2_qf.h" +#include "utils_21_qf.h" + +CEED_QFUNCTION(f_build_hdiv_21)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q; + CeedScalar *qd = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar coeff[4], adjJt_loc[2], J_loc[2], qd_loc[1]; + CoeffUnpack2((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack21(adjJt + i, Q, adjJt_loc); + AdjJt21(adjJt_loc, J_loc); + MultAtBA21(J_loc, coeff, qd_loc); + + qd[i + Q * 0] = wdetJ[i] * qd_loc[0]; + } + return 0; +} + +#endif // PALACE_LIBCEED_HDIV_BUILD_21_QF_H diff --git a/palace/fem/qfunctions/21/l2mass_21_qf.h b/palace/fem/qfunctions/21/l2mass_21_qf.h new file mode 100644 index 0000000000..203bd4d027 --- /dev/null +++ b/palace/fem/qfunctions/21/l2mass_21_qf.h @@ -0,0 +1,40 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_L2_MASS_21_QF_H +#define PALACE_LIBCEED_L2_MASS_21_QF_H + +#include "../coeff/coeff_1_qf.h" +#include "../coeff/coeff_2_qf.h" +#include "utils_21_qf.h" + +CEED_QFUNCTION(f_apply_l2mass_21)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *qw = in[1], + *u = in[2], *divu = in[3]; + CeedScalar *__restrict v = out[0], *__restrict divv = out[1]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + { + const CeedScalar u_loc[1] = {u[i + Q * 0]}; + CeedScalar coeff[4], adjJt_loc[2], J_loc[2], v_loc[1]; + CoeffUnpack2((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack21(adjJt + i, Q, adjJt_loc); + AdjJt21(adjJt_loc, J_loc); + MultAtBCx21(J_loc, coeff, J_loc, u_loc, v_loc); + + v[i + Q * 0] = wdetJ[i] * v_loc[0]; + } + { + const CeedScalar coeff = + CoeffUnpack1(CoeffPairSecond<2>((const CeedIntScalar *)ctx), (CeedInt)attr[i]); + + divv[i] = (coeff * qw[i] * qw[i] / wdetJ[i]) * divu[i]; + } + } + return 0; +} + +#endif // PALACE_LIBCEED_L2_MASS_21_QF_H diff --git a/palace/fem/qfunctions/21/l2mass_build_21_qf.h b/palace/fem/qfunctions/21/l2mass_build_21_qf.h new file mode 100644 index 0000000000..e6c29bec2a --- /dev/null +++ b/palace/fem/qfunctions/21/l2mass_build_21_qf.h @@ -0,0 +1,38 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_L2_MASS_BUILD_21_QF_H +#define PALACE_LIBCEED_L2_MASS_BUILD_21_QF_H + +#include "../coeff/coeff_1_qf.h" +#include "../coeff/coeff_2_qf.h" +#include "utils_21_qf.h" + +CEED_QFUNCTION(f_build_l2mass_21)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *qw = in[1]; + CeedScalar *qd1 = out[0], *qd2 = out[0] + Q; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + { + CeedScalar coeff[4], adjJt_loc[2], J_loc[2], qd_loc[1]; + CoeffUnpack2((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack21(adjJt + i, Q, adjJt_loc); + AdjJt21(adjJt_loc, J_loc); + MultAtBA21(J_loc, coeff, qd_loc); + + qd1[i + Q * 0] = wdetJ[i] * qd_loc[0]; + } + { + const CeedScalar coeff = + CoeffUnpack1(CoeffPairSecond<2>((const CeedIntScalar *)ctx), (CeedInt)attr[i]); + + qd2[i] = coeff * qw[i] * qw[i] / wdetJ[i]; + } + } + return 0; +} + +#endif // PALACE_LIBCEED_L2_MASS_BUILD_21_QF_H diff --git a/palace/fem/qfunctions/21/utils_21_qf.h b/palace/fem/qfunctions/21/utils_21_qf.h new file mode 100644 index 0000000000..83ebad61d2 --- /dev/null +++ b/palace/fem/qfunctions/21/utils_21_qf.h @@ -0,0 +1,103 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_UTILS_21_QF_H +#define PALACE_LIBCEED_UTILS_21_QF_H + +#ifndef CEED_RUNNING_JIT_PASS +#include +#endif + +CEED_QFUNCTION_HELPER CeedScalar DetJ21(const CeedScalar J[2]) +{ + // J: 0 + // 1 + return sqrt(J[0] * J[0] + J[1] * J[1]); +} + +template +CEED_QFUNCTION_HELPER CeedScalar AdjJt21(const CeedScalar J[2], CeedScalar adjJt[2]) +{ + // Compute adj(J)^T / det(J) and store the result. + // J: 0 adj(J): 1/sqrt(J^T J) J^T + // 1 + const CeedScalar d = sqrt(J[0] * J[0] + J[1] * J[1]); + adjJt[0] = J[0] / d; + adjJt[1] = J[1] / d; + return ComputeDet ? d : 0.0; +} + +CEED_QFUNCTION_HELPER void MatUnpack21(const CeedScalar *A, const CeedInt A_stride, + CeedScalar A_loc[2]) +{ + A_loc[0] = A[A_stride * 0]; + A_loc[1] = A[A_stride * 1]; +} + +CEED_QFUNCTION_HELPER void MultAtBCx21(const CeedScalar A[2], const CeedScalar B[4], + const CeedScalar C[2], const CeedScalar x[1], + CeedScalar y[1]) +{ + // A: 0 B: 0 2 C: 0 + // 1 1 3 1 + CeedScalar z[2], t; + + y[0] = C[0] * x[0]; + t = C[1] * x[0]; + + z[0] = B[0] * y[0] + B[2] * t; + z[1] = B[1] * y[0] + B[3] * t; + + y[0] = A[0] * z[0] + A[1] * z[1]; +} + +CEED_QFUNCTION_HELPER void MultBAx21(const CeedScalar A[2], const CeedScalar B[4], + const CeedScalar x[1], CeedScalar y[2]) +{ + // A: 0 B: 0 2 + // 1 1 3 + CeedScalar z[2]; + + z[0] = A[0] * x[0]; + z[1] = A[1] * x[0]; + + y[0] = B[0] * z[0] + B[2] * z[1]; + y[1] = B[1] * z[0] + B[3] * z[1]; +} + +CEED_QFUNCTION_HELPER void MultAtBA21(const CeedScalar A[2], const CeedScalar B[4], + CeedScalar C[1]) +{ + // A: 0 B: 0 2 C: 0 + // 1 1 3 + + // First compute entries of R = B A. + const CeedScalar R11 = B[0] * A[0] + B[2] * A[1]; + const CeedScalar R21 = B[1] * A[0] + B[3] * A[1]; + + C[0] = A[0] * R11 + A[1] * R21; +} + +CEED_QFUNCTION_HELPER void MultAtBC21(const CeedScalar A[2], const CeedScalar B[4], + const CeedScalar C[2], CeedScalar D[1]) +{ + // A, C: 0 B: 0 2 D: 0 + // 1 1 3 + + // First compute entries of R = B C. 
+ const CeedScalar R11 = B[0] * C[0] + B[2] * C[1]; + const CeedScalar R21 = B[1] * C[0] + B[3] * C[1]; + + D[0] = A[0] * R11 + A[1] * R21; +} + +CEED_QFUNCTION_HELPER void MultBA21(const CeedScalar A[2], const CeedScalar B[4], + CeedScalar C[2]) +{ + // A: 0 B: 0 2 C: 0 + // 1 1 3 1 + C[0] = B[0] * A[0] + B[2] * A[1]; + C[1] = B[1] * A[0] + B[3] * A[1]; +} + +#endif // PALACE_LIBCEED_UTILS_21_QF_H diff --git a/palace/fem/qfunctions/22/geom_22_qf.h b/palace/fem/qfunctions/22/geom_22_qf.h new file mode 100644 index 0000000000..e45d4df0f1 --- /dev/null +++ b/palace/fem/qfunctions/22/geom_22_qf.h @@ -0,0 +1,31 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_GEOM_22_QF_H +#define PALACE_LIBCEED_GEOM_22_QF_H + +#include "utils_22_qf.h" + +CEED_QFUNCTION(f_build_geom_factor_22)(void *, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *qw = in[1], *J = in[2]; + CeedScalar *qd_attr = out[0], *qd_wdetJ = out[0] + Q, *qd_adjJt = out[0] + 2 * Q; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar J_loc[4], adjJt_loc[4]; + MatUnpack22(J + i, Q, J_loc); + const CeedScalar detJ = AdjJt22(J_loc, adjJt_loc); + + qd_attr[i] = attr[i]; + qd_wdetJ[i] = qw[i] * detJ; + qd_adjJt[i + Q * 0] = adjJt_loc[0] / detJ; + qd_adjJt[i + Q * 1] = adjJt_loc[1] / detJ; + qd_adjJt[i + Q * 2] = adjJt_loc[2] / detJ; + qd_adjJt[i + Q * 3] = adjJt_loc[3] / detJ; + } + return 0; +} + +#endif // PALACE_LIBCEED_GEOM_22_QF_H diff --git a/palace/fem/qfunctions/22/hcurl_22_qf.h b/palace/fem/qfunctions/22/hcurl_22_qf.h new file mode 100644 index 0000000000..594f57c9eb --- /dev/null +++ b/palace/fem/qfunctions/22/hcurl_22_qf.h @@ -0,0 +1,30 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_22_QF_H +#define PALACE_LIBCEED_HCURL_22_QF_H + +#include "../coeff/coeff_2_qf.h" +#include "utils_22_qf.h" + +CEED_QFUNCTION(f_apply_hcurl_22)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *u = in[1]; + CeedScalar *v = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar u_loc[2] = {u[i + Q * 0], u[i + Q * 1]}; + CeedScalar coeff[4], adjJt_loc[4], v_loc[2]; + CoeffUnpack2((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack22(adjJt + i, Q, adjJt_loc); + MultAtBCx22(adjJt_loc, coeff, adjJt_loc, u_loc, v_loc); + + v[i + Q * 0] = wdetJ[i] * v_loc[0]; + v[i + Q * 1] = wdetJ[i] * v_loc[1]; + } + return 0; +} + +#endif // PALACE_LIBCEED_HCURL_22_QF_H diff --git a/palace/fem/qfunctions/22/hcurl_build_22_qf.h b/palace/fem/qfunctions/22/hcurl_build_22_qf.h new file mode 100644 index 0000000000..abe6e4f10e --- /dev/null +++ b/palace/fem/qfunctions/22/hcurl_build_22_qf.h @@ -0,0 +1,31 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
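The 2D H(curl) QFunctions here sandwich the material coefficient between adj(J)^T / det(J) and its transpose. Since adj(J)^T / det(J) = J^{-T} for a 2x2 Jacobian, this is the covariant map from reference to physical vectors, so wdetJ * adjJt^T C adjJt u is the quadrature-point contribution to the integral of C (J^{-T} u) . (J^{-T} v) w det(J). A quick check of the identity (not part of the patch):

#include <cassert>
#include <cmath>

int main()
{
  // Column-major 2x2 Jacobian as in MatUnpack22: J = [J0 J2; J1 J3].
  const double J[4] = {2.0, 0.5, 1.0, 3.0};
  const double detJ = J[0] * J[3] - J[1] * J[2];
  // adj(J)^T / det(J), the stored layout used by the QFunctions above.
  const double adjJt[4] = {J[3] / detJ, -J[2] / detJ, -J[1] / detJ, J[0] / detJ};
  // Check adjJt == J^{-T}: the product J^T * adjJt should be the identity.
  const double I00 = J[0] * adjJt[0] + J[1] * adjJt[1];
  const double I10 = J[2] * adjJt[0] + J[3] * adjJt[1];
  const double I01 = J[0] * adjJt[2] + J[1] * adjJt[3];
  const double I11 = J[2] * adjJt[2] + J[3] * adjJt[3];
  assert(std::abs(I00 - 1.0) < 1e-14 && std::abs(I11 - 1.0) < 1e-14);
  assert(std::abs(I10) < 1e-14 && std::abs(I01) < 1e-14);
  return 0;
}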
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_BUILD_22_QF_H +#define PALACE_LIBCEED_HCURL_BUILD_22_QF_H + +#include "../coeff/coeff_2_qf.h" +#include "utils_22_qf.h" + +CEED_QFUNCTION(f_build_hcurl_22)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q; + CeedScalar *qd = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar coeff[4], adjJt_loc[4], qd_loc[4]; + CoeffUnpack2((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack22(adjJt + i, Q, adjJt_loc); + MultAtBA22(adjJt_loc, coeff, qd_loc); + + qd[i + Q * 0] = wdetJ[i] * qd_loc[0]; + qd[i + Q * 1] = wdetJ[i] * qd_loc[1]; + qd[i + Q * 2] = wdetJ[i] * qd_loc[2]; + qd[i + Q * 3] = wdetJ[i] * qd_loc[3]; + } + return 0; +} + +#endif // PALACE_LIBCEED_HCURL_BUILD_22_QF_H diff --git a/palace/fem/qfunctions/22/hcurlh1d_22_qf.h b/palace/fem/qfunctions/22/hcurlh1d_22_qf.h new file mode 100644 index 0000000000..fd973d4172 --- /dev/null +++ b/palace/fem/qfunctions/22/hcurlh1d_22_qf.h @@ -0,0 +1,30 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_H1D_22_QF_H +#define PALACE_LIBCEED_HCURL_H1D_22_QF_H + +#include "../coeff/coeff_2_qf.h" +#include "utils_22_qf.h" + +CEED_QFUNCTION(f_apply_hcurlh1d_22)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *u = in[1]; + CeedScalar *v = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar u_loc[2] = {u[i + Q * 0], u[i + Q * 1]}; + CeedScalar coeff[4], adjJt_loc[4], v_loc[2]; + CoeffUnpack2((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack22(adjJt + i, Q, adjJt_loc); + MultBAx22(adjJt_loc, coeff, u_loc, v_loc); + + v[i + Q * 0] = wdetJ[i] * v_loc[0]; + v[i + Q * 1] = wdetJ[i] * v_loc[1]; + } + return 0; +} + +#endif // PALACE_LIBCEED_HCURL_H1D_22_QF_H diff --git a/palace/fem/qfunctions/22/hcurlh1d_build_22_qf.h b/palace/fem/qfunctions/22/hcurlh1d_build_22_qf.h new file mode 100644 index 0000000000..5781ad3fc9 --- /dev/null +++ b/palace/fem/qfunctions/22/hcurlh1d_build_22_qf.h @@ -0,0 +1,31 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_H1D_BUILD_22_QF_H +#define PALACE_LIBCEED_HCURL_H1D_BUILD_22_QF_H + +#include "../coeff/coeff_2_qf.h" +#include "utils_22_qf.h" + +CEED_QFUNCTION(f_build_hcurlh1d_22)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q; + CeedScalar *qd = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar coeff[4], adjJt_loc[4], qd_loc[4]; + CoeffUnpack2((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack22(adjJt + i, Q, adjJt_loc); + MultBA22(adjJt_loc, coeff, qd_loc); + + qd[i + Q * 0] = wdetJ[i] * qd_loc[0]; + qd[i + Q * 1] = wdetJ[i] * qd_loc[1]; + qd[i + Q * 2] = wdetJ[i] * qd_loc[2]; + qd[i + Q * 3] = wdetJ[i] * qd_loc[3]; + } + return 0; +} + +#endif // PALACE_LIBCEED_HCURL_H1D_BUILD_22_QF_H diff --git a/palace/fem/qfunctions/22/hcurlhdiv_22_qf.h b/palace/fem/qfunctions/22/hcurlhdiv_22_qf.h new file mode 100644 index 0000000000..47ecb41118 --- /dev/null +++ b/palace/fem/qfunctions/22/hcurlhdiv_22_qf.h @@ -0,0 +1,52 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_HDIV_22_QF_H +#define PALACE_LIBCEED_HCURL_HDIV_22_QF_H + +#include "../coeff/coeff_2_qf.h" +#include "utils_22_qf.h" + +CEED_QFUNCTION(f_apply_hcurlhdiv_22)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *u = in[1]; + CeedScalar *v = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar u_loc[2] = {u[i + Q * 0], u[i + Q * 1]}; + CeedScalar coeff[4], adjJt_loc[4], J_loc[4], v_loc[2]; + CoeffUnpack2((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack22(adjJt + i, Q, adjJt_loc); + AdjJt22(adjJt_loc, J_loc); + MultAtBCx22(J_loc, coeff, adjJt_loc, u_loc, v_loc); + + v[i + Q * 0] = wdetJ[i] * v_loc[0]; + v[i + Q * 1] = wdetJ[i] * v_loc[1]; + } + return 0; +} + +CEED_QFUNCTION(f_apply_hdivhcurl_22)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *u = in[1]; + CeedScalar *v = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar u_loc[2] = {u[i + Q * 0], u[i + Q * 1]}; + CeedScalar coeff[4], adjJt_loc[4], J_loc[4], v_loc[2]; + CoeffUnpack2((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack22(adjJt + i, Q, adjJt_loc); + AdjJt22(adjJt_loc, J_loc); + MultAtBCx22(adjJt_loc, coeff, J_loc, u_loc, v_loc); + + v[i + Q * 0] = wdetJ[i] * v_loc[0]; + v[i + Q * 1] = wdetJ[i] * v_loc[1]; + } + return 0; +} + +#endif // PALACE_LIBCEED_HCURL_HDIV_22_QF_H diff --git a/palace/fem/qfunctions/22/hcurlhdiv_build_22_qf.h b/palace/fem/qfunctions/22/hcurlhdiv_build_22_qf.h new file mode 100644 index 0000000000..c650025ccf --- /dev/null +++ b/palace/fem/qfunctions/22/hcurlhdiv_build_22_qf.h @@ -0,0 +1,54 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
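In the mixed H(curl)/H(div) kernels above, the J_loc produced by AdjJt22(adjJt_loc, J_loc) is not J itself: applying the adjugate-transpose shuffle to the stored adj(J)^T / det(J) returns J / det(J), which is exactly the contravariant (Piola) map, while adjJt_loc remains the covariant one (J^{-T}). A two-matrix check of that identity (illustrative only):

#include <cassert>
#include <cmath>

// Same component shuffle as AdjJt22 (column-major 2x2): out = adj(in)^T.
static void AdjTranspose22(const double in[4], double out[4])
{
  out[0] = in[3];
  out[1] = -in[2];
  out[2] = -in[1];
  out[3] = in[0];
}

int main()
{
  const double J[4] = {2.0, 0.5, 1.0, 3.0};
  const double detJ = J[0] * J[3] - J[1] * J[2];
  double adjJt[4], back[4];
  AdjTranspose22(J, adjJt);
  for (double &a : adjJt)
  {
    a /= detJ;  // stored geometry factor: adj(J)^T / det(J) = J^{-T}
  }
  AdjTranspose22(adjJt, back);  // what the mixed QFunctions call J_loc
  for (int k = 0; k < 4; k++)
  {
    assert(std::abs(back[k] - J[k] / detJ) < 1e-14);  // equals J / det(J), not J
  }
  return 0;
}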
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_HDIV_BUILD_22_QF_H +#define PALACE_LIBCEED_HCURL_HDIV_BUILD_22_QF_H + +#include "../coeff/coeff_2_qf.h" +#include "utils_22_qf.h" + +CEED_QFUNCTION(f_build_hcurlhdiv_22)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q; + CeedScalar *qd = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar coeff[4], adjJt_loc[4], J_loc[4], qd_loc[4]; + CoeffUnpack2((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack22(adjJt + i, Q, adjJt_loc); + AdjJt22(adjJt_loc, J_loc); + MultAtBC22(J_loc, coeff, adjJt_loc, qd_loc); + + qd[i + Q * 0] = wdetJ[i] * qd_loc[0]; + qd[i + Q * 1] = wdetJ[i] * qd_loc[1]; + qd[i + Q * 2] = wdetJ[i] * qd_loc[2]; + qd[i + Q * 3] = wdetJ[i] * qd_loc[3]; + } + return 0; +} + +CEED_QFUNCTION(f_build_hdivhcurl_22)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q; + CeedScalar *qd = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar coeff[4], adjJt_loc[4], J_loc[4], qd_loc[4]; + CoeffUnpack2((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack22(adjJt + i, Q, adjJt_loc); + AdjJt22(adjJt_loc, J_loc); + MultAtBC22(adjJt_loc, coeff, J_loc, qd_loc); + + qd[i + Q * 0] = wdetJ[i] * qd_loc[0]; + qd[i + Q * 1] = wdetJ[i] * qd_loc[1]; + qd[i + Q * 2] = wdetJ[i] * qd_loc[2]; + qd[i + Q * 3] = wdetJ[i] * qd_loc[3]; + } + return 0; +} + +#endif // PALACE_LIBCEED_HCURL_HDIV_BUILD_22_QF_H diff --git a/palace/fem/qfunctions/22/hcurlhdiv_error_22_qf.h b/palace/fem/qfunctions/22/hcurlhdiv_error_22_qf.h new file mode 100644 index 0000000000..c84241b754 --- /dev/null +++ b/palace/fem/qfunctions/22/hcurlhdiv_error_22_qf.h @@ -0,0 +1,74 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURLHDIV_ERROR_22_QF_H +#define PALACE_LIBCEED_HCURLHDIV_ERROR_22_QF_H + +#include "../coeff/coeff_2_qf.h" +#include "utils_22_qf.h" + +CEED_QFUNCTION(f_apply_hcurlhdiv_error_22)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, + CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *u1 = in[1], + *u2 = in[2]; + CeedScalar *v = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar adjJt_loc[4], v1_loc[2], v2_loc[2]; + MatUnpack22(adjJt + i, Q, adjJt_loc); + { + const CeedScalar u1_loc[2] = {u1[i + Q * 0], u1[i + Q * 1]}; + CeedScalar coeff[4]; + CoeffUnpack2((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MultBAx22(adjJt_loc, coeff, u1_loc, v1_loc); + } + { + const CeedScalar u2_loc[2] = {u2[i + Q * 0], u2[i + Q * 1]}; + CeedScalar coeff[4], J_loc[4]; + CoeffUnpack2(CoeffPairSecond<2>((const CeedIntScalar *)ctx), (CeedInt)attr[i], coeff); + AdjJt22(adjJt_loc, J_loc); + MultBAx22(J_loc, coeff, u2_loc, v2_loc); + } + v2_loc[0] -= v1_loc[0]; + v2_loc[1] -= v1_loc[1]; + v[i] = wdetJ[i] * (v2_loc[0] * v2_loc[0] + v2_loc[1] * v2_loc[1]); + } + return 0; +} + +CEED_QFUNCTION(f_apply_hdivhcurl_error_22)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, + CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *u1 = in[1], + *u2 = in[2]; + CeedScalar *v = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar adjJt_loc[4], v1_loc[2], v2_loc[2]; + MatUnpack22(adjJt + i, Q, adjJt_loc); + { + const CeedScalar u1_loc[2] = {u1[i + Q * 0], u1[i + Q * 1]}; + CeedScalar coeff[4], J_loc[4]; + CoeffUnpack2((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + AdjJt22(adjJt_loc, J_loc); + MultBAx22(J_loc, coeff, u1_loc, v1_loc); + } + { + const CeedScalar u2_loc[2] = {u2[i + Q * 0], u2[i + Q * 1]}; + CeedScalar coeff[4]; + CoeffUnpack2(CoeffPairSecond<2>((const CeedIntScalar *)ctx), (CeedInt)attr[i], coeff); + MultBAx22(adjJt_loc, coeff, u2_loc, v2_loc); + } + v2_loc[0] -= v1_loc[0]; + v2_loc[1] -= v1_loc[1]; + v[i] = wdetJ[i] * (v2_loc[0] * v2_loc[0] + v2_loc[1] * v2_loc[1]); + } + return 0; +} + +#endif // PALACE_LIBCEED_HCURLHDIV_ERROR_22_QF_H diff --git a/palace/fem/qfunctions/22/hcurlmass_22_qf.h b/palace/fem/qfunctions/22/hcurlmass_22_qf.h new file mode 100644 index 0000000000..9b0585906d --- /dev/null +++ b/palace/fem/qfunctions/22/hcurlmass_22_qf.h @@ -0,0 +1,39 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
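The *_error_22 QFunctions above push one field through the covariant map (J^{-T}) and the other through the contravariant map (J / det(J)), apply each field's own coefficient, and emit wdetJ times the squared pointwise difference, so summing a batch of outputs gives the element value of the integral of |alpha_2 u_2 - alpha_1 u_1|^2, the kind of quantity typically accumulated into a flux-recovery error indicator. A hypothetical post-processing sketch of that accumulation (not part of the patch):

#include <cmath>
#include <vector>

// Hypothetical reduction of the error QFunction output above: each value already
// includes wdetJ, so the element indicator is just the square root of their sum.
double ElementErrorIndicator(const std::vector<double> &v_quadrature)
{
  double eta2 = 0.0;
  for (const double vq : v_quadrature)
  {
    eta2 += vq;  // vq = wdetJ * |alpha_2 u_2 - alpha_1 u_1|^2 at one quadrature point
  }
  return std::sqrt(eta2);
}

int main()
{
  return (ElementErrorIndicator({0.25, 0.75}) == 1.0) ? 0 : 1;
}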
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_MASS_22_QF_H +#define PALACE_LIBCEED_HCURL_MASS_22_QF_H + +#include "../coeff/coeff_1_qf.h" +#include "../coeff/coeff_2_qf.h" +#include "utils_22_qf.h" + +CEED_QFUNCTION(f_apply_hcurlmass_22)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *u = in[1], + *gradu = in[2]; + CeedScalar *__restrict v = out[0], *__restrict gradv = out[1]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + { + const CeedScalar coeff = CoeffUnpack1((const CeedIntScalar *)ctx, (CeedInt)attr[i]); + + v[i] = coeff * wdetJ[i] * u[i]; + } + { + const CeedScalar u_loc[2] = {gradu[i + Q * 0], gradu[i + Q * 1]}; + CeedScalar coeff[4], adjJt_loc[4], v_loc[2]; + CoeffUnpack2(CoeffPairSecond<1>((const CeedIntScalar *)ctx), (CeedInt)attr[i], coeff); + MatUnpack22(adjJt + i, Q, adjJt_loc); + MultAtBCx22(adjJt_loc, coeff, adjJt_loc, u_loc, v_loc); + + gradv[i + Q * 0] = wdetJ[i] * v_loc[0]; + gradv[i + Q * 1] = wdetJ[i] * v_loc[1]; + } + } + return 0; +} + +#endif // PALACE_LIBCEED_HCURL_MASS_22_QF_H diff --git a/palace/fem/qfunctions/22/hcurlmass_build_22_qf.h b/palace/fem/qfunctions/22/hcurlmass_build_22_qf.h new file mode 100644 index 0000000000..7086180d1e --- /dev/null +++ b/palace/fem/qfunctions/22/hcurlmass_build_22_qf.h @@ -0,0 +1,39 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_MASS_BUILD_22_QF_H +#define PALACE_LIBCEED_HCURL_MASS_BUILD_22_QF_H + +#include "../coeff/coeff_1_qf.h" +#include "../coeff/coeff_2_qf.h" +#include "utils_22_qf.h" + +CEED_QFUNCTION(f_build_hcurlmass_22)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q; + CeedScalar *__restrict qd1 = out[0], *__restrict qd2 = out[0] + Q; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + { + const CeedScalar coeff = CoeffUnpack1((const CeedIntScalar *)ctx, (CeedInt)attr[i]); + + qd1[i + Q * 0] = coeff * wdetJ[i]; + } + { + CeedScalar coeff[4], adjJt_loc[4], qd_loc[4]; + CoeffUnpack2(CoeffPairSecond<1>((const CeedIntScalar *)ctx), (CeedInt)attr[i], coeff); + MatUnpack22(adjJt + i, Q, adjJt_loc); + MultAtBA22(adjJt_loc, coeff, qd_loc); + + qd2[i + Q * 0] = wdetJ[i] * qd_loc[0]; + qd2[i + Q * 1] = wdetJ[i] * qd_loc[1]; + qd2[i + Q * 2] = wdetJ[i] * qd_loc[2]; + qd2[i + Q * 3] = wdetJ[i] * qd_loc[3]; + } + } + return 0; +} + +#endif // PALACE_LIBCEED_HCURL_MASS_BUILD_22_QF_H diff --git a/palace/fem/qfunctions/22/hdiv_22_qf.h b/palace/fem/qfunctions/22/hdiv_22_qf.h new file mode 100644 index 0000000000..6e801cd6a5 --- /dev/null +++ b/palace/fem/qfunctions/22/hdiv_22_qf.h @@ -0,0 +1,31 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HDIV_22_QF_H +#define PALACE_LIBCEED_HDIV_22_QF_H + +#include "../coeff/coeff_2_qf.h" +#include "utils_22_qf.h" + +CEED_QFUNCTION(f_apply_hdiv_22)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *u = in[1]; + CeedScalar *v = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar u_loc[2] = {u[i + Q * 0], u[i + Q * 1]}; + CeedScalar coeff[4], adjJt_loc[4], J_loc[4], v_loc[2]; + CoeffUnpack2((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack22(adjJt + i, Q, adjJt_loc); + AdjJt22(adjJt_loc, J_loc); + MultAtBCx22(J_loc, coeff, J_loc, u_loc, v_loc); + + v[i + Q * 0] = wdetJ[i] * v_loc[0]; + v[i + Q * 1] = wdetJ[i] * v_loc[1]; + } + return 0; +} + +#endif // PALACE_LIBCEED_HDIV_22_QF_H diff --git a/palace/fem/qfunctions/22/hdiv_build_22_qf.h b/palace/fem/qfunctions/22/hdiv_build_22_qf.h new file mode 100644 index 0000000000..a2466dbb76 --- /dev/null +++ b/palace/fem/qfunctions/22/hdiv_build_22_qf.h @@ -0,0 +1,32 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HDIV_BUILD_22_QF_H +#define PALACE_LIBCEED_HDIV_BUILD_22_QF_H + +#include "../coeff/coeff_2_qf.h" +#include "utils_22_qf.h" + +CEED_QFUNCTION(f_build_hdiv_22)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q; + CeedScalar *qd = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar coeff[4], adjJt_loc[4], J_loc[4], qd_loc[4]; + CoeffUnpack2((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack22(adjJt + i, Q, adjJt_loc); + AdjJt22(adjJt_loc, J_loc); + MultAtBA22(J_loc, coeff, qd_loc); + + qd[i + Q * 0] = wdetJ[i] * qd_loc[0]; + qd[i + Q * 1] = wdetJ[i] * qd_loc[1]; + qd[i + Q * 2] = wdetJ[i] * qd_loc[2]; + qd[i + Q * 3] = wdetJ[i] * qd_loc[3]; + } + return 0; +} + +#endif // PALACE_LIBCEED_HDIV_BUILD_22_QF_H diff --git a/palace/fem/qfunctions/22/hdivmass_22_qf.h b/palace/fem/qfunctions/22/hdivmass_22_qf.h new file mode 100644 index 0000000000..3e92f31b62 --- /dev/null +++ b/palace/fem/qfunctions/22/hdivmass_22_qf.h @@ -0,0 +1,40 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HDIV_MASS_22_QF_H +#define PALACE_LIBCEED_HDIV_MASS_22_QF_H + +#include "../coeff/coeff_1_qf.h" +#include "../coeff/coeff_2_qf.h" +#include "utils_22_qf.h" + +CEED_QFUNCTION(f_apply_hdivmass_22)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *qw = in[1], + *u = in[2], *curlu = in[3]; + CeedScalar *__restrict v = out[0], *__restrict curlv = out[1]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + { + const CeedScalar u_loc[2] = {u[i + Q * 0], u[i + Q * 1]}; + CeedScalar coeff[4], adjJt_loc[4], v_loc[2]; + CoeffUnpack2((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack22(adjJt + i, Q, adjJt_loc); + MultAtBCx22(adjJt_loc, coeff, adjJt_loc, u_loc, v_loc); + + v[i + Q * 0] = wdetJ[i] * v_loc[0]; + v[i + Q * 1] = wdetJ[i] * v_loc[1]; + } + { + const CeedScalar coeff = + CoeffUnpack1(CoeffPairSecond<2>((const CeedIntScalar *)ctx), (CeedInt)attr[i]); + + curlv[i] = (coeff * qw[i] * qw[i] / wdetJ[i]) * curlu[i]; + } + } + return 0; +} + +#endif // PALACE_LIBCEED_HDIV_MASS_22_QF_H diff --git a/palace/fem/qfunctions/22/hdivmass_build_22_qf.h b/palace/fem/qfunctions/22/hdivmass_build_22_qf.h new file mode 100644 index 0000000000..1e91769140 --- /dev/null +++ b/palace/fem/qfunctions/22/hdivmass_build_22_qf.h @@ -0,0 +1,40 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HDIV_MASS_BUILD_22_QF_H +#define PALACE_LIBCEED_HDIV_MASS_BUILD_22_QF_H + +#include "../coeff/coeff_1_qf.h" +#include "../coeff/coeff_2_qf.h" +#include "utils_22_qf.h" + +CEED_QFUNCTION(f_build_hdivmass_22)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *qw = in[1]; + CeedScalar *__restrict qd1 = out[0], *__restrict qd2 = out[0] + 4 * Q; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + { + CeedScalar coeff[4], adjJt_loc[4], qd_loc[4]; + CoeffUnpack2((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack22(adjJt + i, Q, adjJt_loc); + MultAtBA22(adjJt_loc, coeff, qd_loc); + + qd1[i + Q * 0] = wdetJ[i] * qd_loc[0]; + qd1[i + Q * 1] = wdetJ[i] * qd_loc[1]; + qd1[i + Q * 2] = wdetJ[i] * qd_loc[2]; + qd1[i + Q * 3] = wdetJ[i] * qd_loc[3]; + } + { + const CeedScalar coeff = + CoeffUnpack1(CoeffPairSecond<2>((const CeedIntScalar *)ctx), (CeedInt)attr[i]); + + qd2[i] = coeff * qw[i] * qw[i] / wdetJ[i]; + } + } + return 0; +} + +#endif // PALACE_LIBCEED_HDIV_MASS_BUILD_22_QF_H diff --git a/palace/fem/qfunctions/22/l2mass_22_qf.h b/palace/fem/qfunctions/22/l2mass_22_qf.h new file mode 100644 index 0000000000..d84f6da996 --- /dev/null +++ b/palace/fem/qfunctions/22/l2mass_22_qf.h @@ -0,0 +1,41 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_L2_MASS_22_QF_H +#define PALACE_LIBCEED_L2_MASS_22_QF_H + +#include "../coeff/coeff_1_qf.h" +#include "../coeff/coeff_2_qf.h" +#include "utils_22_qf.h" + +CEED_QFUNCTION(f_apply_l2mass_22)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *qw = in[1], + *u = in[2], *divu = in[3]; + CeedScalar *__restrict v = out[0], *__restrict divv = out[1]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + { + const CeedScalar u_loc[2] = {u[i + Q * 0], u[i + Q * 1]}; + CeedScalar coeff[4], adjJt_loc[4], J_loc[4], v_loc[2]; + CoeffUnpack2((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack22(adjJt + i, Q, adjJt_loc); + AdjJt22(adjJt_loc, J_loc); + MultAtBCx22(J_loc, coeff, J_loc, u_loc, v_loc); + + v[i + Q * 0] = wdetJ[i] * v_loc[0]; + v[i + Q * 1] = wdetJ[i] * v_loc[1]; + } + { + const CeedScalar coeff = + CoeffUnpack1(CoeffPairSecond<2>((const CeedIntScalar *)ctx), (CeedInt)attr[i]); + + divv[i] = (coeff * qw[i] * qw[i] / wdetJ[i]) * divu[i]; + } + } + return 0; +} + +#endif // PALACE_LIBCEED_L2_MASS_22_QF_H diff --git a/palace/fem/qfunctions/22/l2mass_build_22_qf.h b/palace/fem/qfunctions/22/l2mass_build_22_qf.h new file mode 100644 index 0000000000..a082c74780 --- /dev/null +++ b/palace/fem/qfunctions/22/l2mass_build_22_qf.h @@ -0,0 +1,41 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_L2_MASS_BUILD_22_QF_H +#define PALACE_LIBCEED_L2_MASS_BUILD_22_QF_H + +#include "../coeff/coeff_1_qf.h" +#include "../coeff/coeff_2_qf.h" +#include "utils_22_qf.h" + +CEED_QFUNCTION(f_build_l2mass_22)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *qw = in[1]; + CeedScalar *__restrict qd1 = out[0], *__restrict qd2 = out[0] + 4 * Q; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + { + CeedScalar coeff[4], adjJt_loc[4], J_loc[4], qd_loc[4]; + CoeffUnpack2((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack22(adjJt + i, Q, adjJt_loc); + AdjJt22(adjJt_loc, J_loc); + MultAtBA22(J_loc, coeff, qd_loc); + + qd1[i + Q * 0] = wdetJ[i] * qd_loc[0]; + qd1[i + Q * 1] = wdetJ[i] * qd_loc[1]; + qd1[i + Q * 2] = wdetJ[i] * qd_loc[2]; + qd1[i + Q * 3] = wdetJ[i] * qd_loc[3]; + } + { + const CeedScalar coeff = + CoeffUnpack1(CoeffPairSecond<2>((const CeedIntScalar *)ctx), (CeedInt)attr[i]); + + qd2[i] = coeff * qw[i] * qw[i] / wdetJ[i]; + } + } + return 0; +} + +#endif // PALACE_LIBCEED_L2_MASS_BUILD_22_QF_H diff --git a/palace/fem/qfunctions/22/utils_22_qf.h b/palace/fem/qfunctions/22/utils_22_qf.h new file mode 100644 index 0000000000..c943bae941 --- /dev/null +++ b/palace/fem/qfunctions/22/utils_22_qf.h @@ -0,0 +1,128 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_UTILS_22_QF_H +#define PALACE_LIBCEED_UTILS_22_QF_H + +#ifndef CEED_RUNNING_JIT_PASS +#include <math.h> +#endif + +CEED_QFUNCTION_HELPER CeedScalar DetJ22(const CeedScalar J[4]) +{ + // J: 0 2 + // 1 3 + return J[0] * J[3] - J[1] * J[2]; +} + +template <bool ComputeDet = true> +CEED_QFUNCTION_HELPER CeedScalar AdjJt22(const CeedScalar J[4], CeedScalar adjJt[4]) +{ + // Compute adj(J)^T / det(J) and store the result. 
+ // J: 0 2 adj(J): J22 -J12 + // 1 3 -J21 J11 + adjJt[0] = J[3]; + adjJt[1] = -J[2]; + adjJt[2] = -J[1]; + adjJt[3] = J[0]; + return ComputeDet ? (J[0] * J[3] - J[1] * J[2]) : 0.0; +} + +CEED_QFUNCTION_HELPER void MatUnpack22(const CeedScalar *A, const CeedInt A_stride, + CeedScalar A_loc[4]) +{ + A_loc[0] = A[A_stride * 0]; + A_loc[1] = A[A_stride * 1]; + A_loc[2] = A[A_stride * 2]; + A_loc[3] = A[A_stride * 3]; +} + +CEED_QFUNCTION_HELPER void MultBx22(const CeedScalar B[4], const CeedScalar x[2], + CeedScalar y[2]) +{ + // B: 0 2 + // 1 3 + y[0] = B[0] * x[0] + B[2] * x[1]; + y[1] = B[1] * x[0] + B[3] * x[1]; +} + +CEED_QFUNCTION_HELPER void MultAtBCx22(const CeedScalar A[4], const CeedScalar B[4], + const CeedScalar C[4], const CeedScalar x[2], + CeedScalar y[2]) +{ + // A, B, C: 0 2 + // 1 3 + CeedScalar z[2]; + + y[0] = C[0] * x[0] + C[2] * x[1]; + y[1] = C[1] * x[0] + C[3] * x[1]; + + z[0] = B[0] * y[0] + B[2] * y[1]; + z[1] = B[1] * y[0] + B[3] * y[1]; + + y[0] = A[0] * z[0] + A[1] * z[1]; + y[1] = A[2] * z[0] + A[3] * z[1]; +} + +CEED_QFUNCTION_HELPER void MultBAx22(const CeedScalar A[4], const CeedScalar B[4], + const CeedScalar x[2], CeedScalar y[2]) +{ + // A, B: 0 2 + // 1 3 + CeedScalar z[2]; + + z[0] = A[0] * x[0] + A[2] * x[1]; + z[1] = A[1] * x[0] + A[3] * x[1]; + + y[0] = B[0] * z[0] + B[2] * z[1]; + y[1] = B[1] * z[0] + B[3] * z[1]; +} + +CEED_QFUNCTION_HELPER void MultAtBA22(const CeedScalar A[4], const CeedScalar B[4], + CeedScalar C[4]) +{ + // A, B, C: 0 2 + // 1 3 + + // First compute entries of R = B A. + const CeedScalar R11 = B[0] * A[0] + B[2] * A[1]; + const CeedScalar R21 = B[1] * A[0] + B[3] * A[1]; + const CeedScalar R12 = B[0] * A[2] + B[2] * A[3]; + const CeedScalar R22 = B[1] * A[2] + B[3] * A[3]; + + C[0] = A[0] * R11 + A[1] * R21; + C[1] = A[2] * R11 + A[3] * R21; + C[2] = A[0] * R12 + A[1] * R22; + C[3] = A[2] * R12 + A[3] * R22; +} + +CEED_QFUNCTION_HELPER void MultAtBC22(const CeedScalar A[4], const CeedScalar B[4], + const CeedScalar C[4], CeedScalar D[4]) +{ + // A, B, C, D: 0 2 + // 1 3 + + // First compute entries of R = B C. + const CeedScalar R11 = B[0] * C[0] + B[2] * C[1]; + const CeedScalar R21 = B[1] * C[0] + B[3] * C[1]; + const CeedScalar R12 = B[0] * C[2] + B[2] * C[3]; + const CeedScalar R22 = B[1] * C[2] + B[3] * C[3]; + + D[0] = A[0] * R11 + A[1] * R21; + D[1] = A[2] * R11 + A[3] * R21; + D[2] = A[0] * R12 + A[1] * R22; + D[3] = A[2] * R12 + A[3] * R22; +} + +CEED_QFUNCTION_HELPER void MultBA22(const CeedScalar A[4], const CeedScalar B[4], + CeedScalar C[4]) +{ + // A, B, C: 0 2 + // 1 3 + C[0] = B[0] * A[0] + B[2] * A[1]; + C[1] = B[1] * A[0] + B[3] * A[1]; + C[2] = B[0] * A[2] + B[2] * A[3]; + C[3] = B[1] * A[2] + B[3] * A[3]; +} + +#endif // PALACE_LIBCEED_UTILS_22_QF_H diff --git a/palace/fem/qfunctions/3/h1_3_qf.h b/palace/fem/qfunctions/3/h1_3_qf.h new file mode 100644 index 0000000000..6d86a92cc4 --- /dev/null +++ b/palace/fem/qfunctions/3/h1_3_qf.h @@ -0,0 +1,30 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_H1_3_QF_H +#define PALACE_LIBCEED_H1_3_QF_H + +#include "../coeff/coeff_3_qf.h" + +CEED_QFUNCTION(f_apply_h1_3)(void *__restrict ctx, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *u = in[1]; + CeedScalar *v = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar coeff[9]; + CoeffUnpack3((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + + const CeedScalar u0 = u[i + Q * 0]; + const CeedScalar u1 = u[i + Q * 1]; + const CeedScalar u2 = u[i + Q * 2]; + v[i + Q * 0] = wdetJ[i] * (coeff[0] * u0 + coeff[3] * u1 + coeff[6] * u2); + v[i + Q * 1] = wdetJ[i] * (coeff[1] * u0 + coeff[4] * u1 + coeff[7] * u2); + v[i + Q * 2] = wdetJ[i] * (coeff[2] * u0 + coeff[5] * u1 + coeff[8] * u2); + } + return 0; +} + +#endif // PALACE_LIBCEED_H1_3_QF_H diff --git a/palace/fem/qfunctions/3/h1_build_3_qf.h b/palace/fem/qfunctions/3/h1_build_3_qf.h new file mode 100644 index 0000000000..dd2cf7a6ce --- /dev/null +++ b/palace/fem/qfunctions/3/h1_build_3_qf.h @@ -0,0 +1,33 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_H1_BUILD_3_QF_H +#define PALACE_LIBCEED_H1_BUILD_3_QF_H + +#include "../coeff/coeff_3_qf.h" + +CEED_QFUNCTION(f_build_h1_3)(void *__restrict ctx, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q; + CeedScalar *qd = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar coeff[9]; + CoeffUnpack3((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + + qd[i + Q * 0] = wdetJ[i] * coeff[0]; + qd[i + Q * 1] = wdetJ[i] * coeff[1]; + qd[i + Q * 2] = wdetJ[i] * coeff[2]; + qd[i + Q * 3] = wdetJ[i] * coeff[3]; + qd[i + Q * 4] = wdetJ[i] * coeff[4]; + qd[i + Q * 5] = wdetJ[i] * coeff[5]; + qd[i + Q * 6] = wdetJ[i] * coeff[6]; + qd[i + Q * 7] = wdetJ[i] * coeff[7]; + qd[i + Q * 8] = wdetJ[i] * coeff[8]; + } + return 0; +} + +#endif // PALACE_LIBCEED_H1_BUILD_3_QF_H diff --git a/palace/fem/qfunctions/3/l2_3_qf.h b/palace/fem/qfunctions/3/l2_3_qf.h new file mode 100644 index 0000000000..bfad5d83bc --- /dev/null +++ b/palace/fem/qfunctions/3/l2_3_qf.h @@ -0,0 +1,31 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_L2_3_QF_H +#define PALACE_LIBCEED_L2_3_QF_H + +#include "../coeff/coeff_3_qf.h" + +CEED_QFUNCTION(f_apply_l2_3)(void *__restrict ctx, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *qw = in[1], *u = in[2]; + CeedScalar *v = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar coeff[9]; + CoeffUnpack3((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + const CeedScalar w = qw[i] * qw[i] / wdetJ[i]; + + const CeedScalar u0 = u[i + Q * 0]; + const CeedScalar u1 = u[i + Q * 1]; + const CeedScalar u2 = u[i + Q * 2]; + v[i + Q * 0] = w * (coeff[0] * u0 + coeff[3] * u1 + coeff[6] * u2); + v[i + Q * 1] = w * (coeff[1] * u0 + coeff[4] * u1 + coeff[7] * u2); + v[i + Q * 2] = w * (coeff[2] * u0 + coeff[5] * u1 + coeff[8] * u2); + } + return 0; +} + +#endif // PALACE_LIBCEED_L2_3_QF_H diff --git a/palace/fem/qfunctions/3/l2_build_3_qf.h b/palace/fem/qfunctions/3/l2_build_3_qf.h new file mode 100644 index 0000000000..e9d8227d78 --- /dev/null +++ b/palace/fem/qfunctions/3/l2_build_3_qf.h @@ -0,0 +1,34 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_L2_BUILD_3_QF_H +#define PALACE_LIBCEED_L2_BUILD_3_QF_H + +#include "../coeff/coeff_3_qf.h" + +CEED_QFUNCTION(f_build_l2_3)(void *__restrict ctx, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *qw = in[1]; + CeedScalar *qd = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar coeff[9]; + CoeffUnpack3((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + const CeedScalar w = qw[i] * qw[i] / wdetJ[i]; + + qd[i + Q * 0] = w * coeff[0]; + qd[i + Q * 1] = w * coeff[1]; + qd[i + Q * 2] = w * coeff[2]; + qd[i + Q * 3] = w * coeff[3]; + qd[i + Q * 4] = w * coeff[4]; + qd[i + Q * 5] = w * coeff[5]; + qd[i + Q * 6] = w * coeff[6]; + qd[i + Q * 7] = w * coeff[7]; + qd[i + Q * 8] = w * coeff[8]; + } + return 0; +} + +#endif // PALACE_LIBCEED_L2_BUILD_3_QF_H diff --git a/palace/fem/qfunctions/32/geom_32_qf.h b/palace/fem/qfunctions/32/geom_32_qf.h new file mode 100644 index 0000000000..55f561670d --- /dev/null +++ b/palace/fem/qfunctions/32/geom_32_qf.h @@ -0,0 +1,33 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_GEOM_32_QF_H +#define PALACE_LIBCEED_GEOM_32_QF_H + +#include "utils_32_qf.h" + +CEED_QFUNCTION(f_build_geom_factor_32)(void *, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *qw = in[1], *J = in[2]; + CeedScalar *qd_attr = out[0], *qd_wdetJ = out[0] + Q, *qd_adjJt = out[0] + 2 * Q; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar J_loc[6], adjJt_loc[6]; + MatUnpack32(J + i, Q, J_loc); + const CeedScalar detJ = AdjJt32(J_loc, adjJt_loc); + + qd_attr[i] = attr[i]; + qd_wdetJ[i] = qw[i] * detJ; + qd_adjJt[i + Q * 0] = adjJt_loc[0] / detJ; + qd_adjJt[i + Q * 1] = adjJt_loc[1] / detJ; + qd_adjJt[i + Q * 2] = adjJt_loc[2] / detJ; + qd_adjJt[i + Q * 3] = adjJt_loc[3] / detJ; + qd_adjJt[i + Q * 4] = adjJt_loc[4] / detJ; + qd_adjJt[i + Q * 5] = adjJt_loc[5] / detJ; + } + return 0; +} + +#endif // PALACE_LIBCEED_GEOM_32_QF_H diff --git a/palace/fem/qfunctions/32/hcurl_32_qf.h b/palace/fem/qfunctions/32/hcurl_32_qf.h new file mode 100644 index 0000000000..2b841fd61a --- /dev/null +++ b/palace/fem/qfunctions/32/hcurl_32_qf.h @@ -0,0 +1,30 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_32_QF_H +#define PALACE_LIBCEED_HCURL_32_QF_H + +#include "../coeff/coeff_3_qf.h" +#include "utils_32_qf.h" + +CEED_QFUNCTION(f_apply_hcurl_32)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *u = in[1]; + CeedScalar *v = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar u_loc[2] = {u[i + Q * 0], u[i + Q * 1]}; + CeedScalar coeff[9], adjJt_loc[6], v_loc[2]; + CoeffUnpack3((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack32(adjJt + i, Q, adjJt_loc); + MultAtBCx32(adjJt_loc, coeff, adjJt_loc, u_loc, v_loc); + + v[i + Q * 0] = wdetJ[i] * v_loc[0]; + v[i + Q * 1] = wdetJ[i] * v_loc[1]; + } + return 0; +} + +#endif // PALACE_LIBCEED_HCURL_32_QF_H diff --git a/palace/fem/qfunctions/32/hcurl_build_32_qf.h b/palace/fem/qfunctions/32/hcurl_build_32_qf.h new file mode 100644 index 0000000000..5367fcc66a --- /dev/null +++ b/palace/fem/qfunctions/32/hcurl_build_32_qf.h @@ -0,0 +1,31 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_BUILD_32_QF_H +#define PALACE_LIBCEED_HCURL_BUILD_32_QF_H + +#include "../coeff/coeff_3_qf.h" +#include "utils_32_qf.h" + +CEED_QFUNCTION(f_build_hcurl_32)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q; + CeedScalar *qd = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar coeff[9], adjJt_loc[6], qd_loc[4]; + CoeffUnpack3((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack32(adjJt + i, Q, adjJt_loc); + MultAtBA32(adjJt_loc, coeff, qd_loc); + + qd[i + Q * 0] = wdetJ[i] * qd_loc[0]; + qd[i + Q * 1] = wdetJ[i] * qd_loc[1]; + qd[i + Q * 2] = wdetJ[i] * qd_loc[2]; + qd[i + Q * 3] = wdetJ[i] * qd_loc[3]; + } + return 0; +} + +#endif // PALACE_LIBCEED_HCURL_BUILD_32_QF_H diff --git a/palace/fem/qfunctions/32/hcurlh1d_32_qf.h b/palace/fem/qfunctions/32/hcurlh1d_32_qf.h new file mode 100644 index 0000000000..d31ca26ad6 --- /dev/null +++ b/palace/fem/qfunctions/32/hcurlh1d_32_qf.h @@ -0,0 +1,31 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_H1D_32_QF_H +#define PALACE_LIBCEED_HCURL_H1D_32_QF_H + +#include "../coeff/coeff_3_qf.h" +#include "utils_32_qf.h" + +CEED_QFUNCTION(f_apply_hcurlh1d_32)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *u = in[1]; + CeedScalar *v = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar u_loc[2] = {u[i + Q * 0], u[i + Q * 1]}; + CeedScalar coeff[9], adjJt_loc[6], v_loc[3]; + CoeffUnpack3((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack32(adjJt + i, Q, adjJt_loc); + MultBAx32(adjJt_loc, coeff, u_loc, v_loc); + + v[i + Q * 0] = wdetJ[i] * v_loc[0]; + v[i + Q * 1] = wdetJ[i] * v_loc[1]; + v[i + Q * 2] = wdetJ[i] * v_loc[2]; + } + return 0; +} + +#endif // PALACE_LIBCEED_HCURL_H1D_32_QF_H diff --git a/palace/fem/qfunctions/32/hcurlh1d_build_32_qf.h b/palace/fem/qfunctions/32/hcurlh1d_build_32_qf.h new file mode 100644 index 0000000000..7069d92ace --- /dev/null +++ b/palace/fem/qfunctions/32/hcurlh1d_build_32_qf.h @@ -0,0 +1,33 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_H1D_BUILD_32_QF_H +#define PALACE_LIBCEED_HCURL_H1D_BUILD_32_QF_H + +#include "../coeff/coeff_3_qf.h" +#include "utils_32_qf.h" + +CEED_QFUNCTION(f_build_hcurlh1d_32)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q; + CeedScalar *qd = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar coeff[9], adjJt_loc[6], qd_loc[6]; + CoeffUnpack3((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack32(adjJt + i, Q, adjJt_loc); + MultBA32(adjJt_loc, coeff, qd_loc); + + qd[i + Q * 0] = wdetJ[i] * qd_loc[0]; + qd[i + Q * 1] = wdetJ[i] * qd_loc[1]; + qd[i + Q * 2] = wdetJ[i] * qd_loc[2]; + qd[i + Q * 3] = wdetJ[i] * qd_loc[3]; + qd[i + Q * 4] = wdetJ[i] * qd_loc[4]; + qd[i + Q * 5] = wdetJ[i] * qd_loc[5]; + } + return 0; +} + +#endif // PALACE_LIBCEED_HCURL_H1D_BUILD_32_QF_H diff --git a/palace/fem/qfunctions/32/hcurlhdiv_32_qf.h b/palace/fem/qfunctions/32/hcurlhdiv_32_qf.h new file mode 100644 index 0000000000..58b28b564e --- /dev/null +++ b/palace/fem/qfunctions/32/hcurlhdiv_32_qf.h @@ -0,0 +1,52 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_HDIV_32_QF_H +#define PALACE_LIBCEED_HCURL_HDIV_32_QF_H + +#include "../coeff/coeff_3_qf.h" +#include "utils_32_qf.h" + +CEED_QFUNCTION(f_apply_hcurlhdiv_32)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *u = in[1]; + CeedScalar *v = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar u_loc[2] = {u[i + Q * 0], u[i + Q * 1]}; + CeedScalar coeff[9], adjJt_loc[6], J_loc[6], v_loc[2]; + CoeffUnpack3((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack32(adjJt + i, Q, adjJt_loc); + AdjJt32(adjJt_loc, J_loc); + MultAtBCx32(J_loc, coeff, adjJt_loc, u_loc, v_loc); + + v[i + Q * 0] = wdetJ[i] * v_loc[0]; + v[i + Q * 1] = wdetJ[i] * v_loc[1]; + } + return 0; +} + +CEED_QFUNCTION(f_apply_hdivhcurl_32)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *u = in[1]; + CeedScalar *v = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar u_loc[2] = {u[i + Q * 0], u[i + Q * 1]}; + CeedScalar coeff[9], adjJt_loc[6], J_loc[6], v_loc[2]; + CoeffUnpack3((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack32(adjJt + i, Q, adjJt_loc); + AdjJt32(adjJt_loc, J_loc); + MultAtBCx32(adjJt_loc, coeff, J_loc, u_loc, v_loc); + + v[i + Q * 0] = wdetJ[i] * v_loc[0]; + v[i + Q * 1] = wdetJ[i] * v_loc[1]; + } + return 0; +} + +#endif // PALACE_LIBCEED_HCURL_HDIV_32_QF_H diff --git a/palace/fem/qfunctions/32/hcurlhdiv_build_32_qf.h b/palace/fem/qfunctions/32/hcurlhdiv_build_32_qf.h new file mode 100644 index 0000000000..29aa759e11 --- /dev/null +++ b/palace/fem/qfunctions/32/hcurlhdiv_build_32_qf.h @@ -0,0 +1,54 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_HDIV_BUILD_32_QF_H +#define PALACE_LIBCEED_HCURL_HDIV_BUILD_32_QF_H + +#include "../coeff/coeff_3_qf.h" +#include "utils_32_qf.h" + +CEED_QFUNCTION(f_build_hcurlhdiv_32)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q; + CeedScalar *qd = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar coeff[9], adjJt_loc[6], J_loc[6], qd_loc[4]; + CoeffUnpack3((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack32(adjJt + i, Q, adjJt_loc); + AdjJt32(adjJt_loc, J_loc); + MultAtBC32(J_loc, coeff, adjJt_loc, qd_loc); + + qd[i + Q * 0] = wdetJ[i] * qd_loc[0]; + qd[i + Q * 1] = wdetJ[i] * qd_loc[1]; + qd[i + Q * 2] = wdetJ[i] * qd_loc[2]; + qd[i + Q * 3] = wdetJ[i] * qd_loc[3]; + } + return 0; +} + +CEED_QFUNCTION(f_build_hdivhcurl_32)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q; + CeedScalar *qd = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar coeff[9], adjJt_loc[6], J_loc[6], qd_loc[4]; + CoeffUnpack3((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack32(adjJt + i, Q, adjJt_loc); + AdjJt32(adjJt_loc, J_loc); + MultAtBC32(adjJt_loc, coeff, J_loc, qd_loc); + + qd[i + Q * 0] = wdetJ[i] * qd_loc[0]; + qd[i + Q * 1] = wdetJ[i] * qd_loc[1]; + qd[i + Q * 2] = wdetJ[i] * qd_loc[2]; + qd[i + Q * 3] = wdetJ[i] * qd_loc[3]; + } + return 0; +} + +#endif // PALACE_LIBCEED_HCURL_HDIV_BUILD_32_QF_H diff --git a/palace/fem/qfunctions/32/hcurlmass_32_qf.h b/palace/fem/qfunctions/32/hcurlmass_32_qf.h new file mode 100644 index 0000000000..de1b3ad584 --- /dev/null +++ b/palace/fem/qfunctions/32/hcurlmass_32_qf.h @@ -0,0 +1,39 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_MASS_32_QF_H +#define PALACE_LIBCEED_HCURL_MASS_32_QF_H + +#include "../coeff/coeff_1_qf.h" +#include "../coeff/coeff_3_qf.h" +#include "utils_32_qf.h" + +CEED_QFUNCTION(f_apply_hcurlmass_32)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *u = in[1], + *gradu = in[2]; + CeedScalar *__restrict v = out[0], *__restrict gradv = out[1]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + { + const CeedScalar coeff = CoeffUnpack1((const CeedIntScalar *)ctx, (CeedInt)attr[i]); + + v[i] = coeff * wdetJ[i] * u[i]; + } + { + const CeedScalar u_loc[2] = {gradu[i + Q * 0], gradu[i + Q * 1]}; + CeedScalar coeff[9], adjJt_loc[6], v_loc[2]; + CoeffUnpack3(CoeffPairSecond<1>((const CeedIntScalar *)ctx), (CeedInt)attr[i], coeff); + MatUnpack32(adjJt + i, Q, adjJt_loc); + MultAtBCx32(adjJt_loc, coeff, adjJt_loc, u_loc, v_loc); + + gradv[i + Q * 0] = wdetJ[i] * v_loc[0]; + gradv[i + Q * 1] = wdetJ[i] * v_loc[1]; + } + } + return 0; +} + +#endif // PALACE_LIBCEED_HCURL_MASS_32_QF_H diff --git a/palace/fem/qfunctions/32/hcurlmass_build_32_qf.h b/palace/fem/qfunctions/32/hcurlmass_build_32_qf.h new file mode 100644 index 0000000000..f7b9de1cf5 --- /dev/null +++ b/palace/fem/qfunctions/32/hcurlmass_build_32_qf.h @@ -0,0 +1,39 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_MASS_BUILD_32_QF_H +#define PALACE_LIBCEED_HCURL_MASS_BUILD_32_QF_H + +#include "../coeff/coeff_1_qf.h" +#include "../coeff/coeff_3_qf.h" +#include "utils_32_qf.h" + +CEED_QFUNCTION(f_build_hcurlmass_32)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q; + CeedScalar *__restrict qd1 = out[0], *__restrict qd2 = out[0] + Q; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + { + const CeedScalar coeff = CoeffUnpack1((const CeedIntScalar *)ctx, (CeedInt)attr[i]); + + qd1[i + Q * 0] = coeff * wdetJ[i]; + } + { + CeedScalar coeff[9], adjJt_loc[6], qd_loc[4]; + CoeffUnpack3(CoeffPairSecond<1>((const CeedIntScalar *)ctx), (CeedInt)attr[i], coeff); + MatUnpack32(adjJt + i, Q, adjJt_loc); + MultAtBA32(adjJt_loc, coeff, qd_loc); + + qd2[i + Q * 0] = wdetJ[i] * qd_loc[0]; + qd2[i + Q * 1] = wdetJ[i] * qd_loc[1]; + qd2[i + Q * 2] = wdetJ[i] * qd_loc[2]; + qd2[i + Q * 3] = wdetJ[i] * qd_loc[3]; + } + } + return 0; +} + +#endif // PALACE_LIBCEED_HCURL_MASS_BUILD_32_QF_H diff --git a/palace/fem/qfunctions/32/hdiv_32_qf.h b/palace/fem/qfunctions/32/hdiv_32_qf.h new file mode 100644 index 0000000000..022addd9bf --- /dev/null +++ b/palace/fem/qfunctions/32/hdiv_32_qf.h @@ -0,0 +1,31 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HDIV_32_QF_H +#define PALACE_LIBCEED_HDIV_32_QF_H + +#include "../coeff/coeff_3_qf.h" +#include "utils_32_qf.h" + +CEED_QFUNCTION(f_apply_hdiv_32)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *u = in[1]; + CeedScalar *v = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar u_loc[2] = {u[i + Q * 0], u[i + Q * 1]}; + CeedScalar coeff[9], adjJt_loc[6], J_loc[6], v_loc[2]; + CoeffUnpack3((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack32(adjJt + i, Q, adjJt_loc); + AdjJt32(adjJt_loc, J_loc); + MultAtBCx32(J_loc, coeff, J_loc, u_loc, v_loc); + + v[i + Q * 0] = wdetJ[i] * v_loc[0]; + v[i + Q * 1] = wdetJ[i] * v_loc[1]; + } + return 0; +} + +#endif // PALACE_LIBCEED_HDIV_32_QF_H diff --git a/palace/fem/qfunctions/32/hdiv_build_32_qf.h b/palace/fem/qfunctions/32/hdiv_build_32_qf.h new file mode 100644 index 0000000000..ea98b357ef --- /dev/null +++ b/palace/fem/qfunctions/32/hdiv_build_32_qf.h @@ -0,0 +1,32 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HDIV_BUILD_32_QF_H +#define PALACE_LIBCEED_HDIV_BUILD_32_QF_H + +#include "../coeff/coeff_3_qf.h" +#include "utils_32_qf.h" + +CEED_QFUNCTION(f_build_hdiv_32)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q; + CeedScalar *qd = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar coeff[9], adjJt_loc[6], J_loc[6], qd_loc[4]; + CoeffUnpack3((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack32(adjJt + i, Q, adjJt_loc); + AdjJt32(adjJt_loc, J_loc); + MultAtBA32(J_loc, coeff, qd_loc); + + qd[i + Q * 0] = wdetJ[i] * qd_loc[0]; + qd[i + Q * 1] = wdetJ[i] * qd_loc[1]; + qd[i + Q * 2] = wdetJ[i] * qd_loc[2]; + qd[i + Q * 3] = wdetJ[i] * qd_loc[3]; + } + return 0; +} + +#endif // PALACE_LIBCEED_HDIV_BUILD_32_QF_H diff --git a/palace/fem/qfunctions/32/hdivmass_32_qf.h b/palace/fem/qfunctions/32/hdivmass_32_qf.h new file mode 100644 index 0000000000..350610f903 --- /dev/null +++ b/palace/fem/qfunctions/32/hdivmass_32_qf.h @@ -0,0 +1,40 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HDIV_MASS_32_QF_H +#define PALACE_LIBCEED_HDIV_MASS_32_QF_H + +#include "../coeff/coeff_1_qf.h" +#include "../coeff/coeff_3_qf.h" +#include "utils_32_qf.h" + +CEED_QFUNCTION(f_apply_hdivmass_32)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *qw = in[1], + *u = in[2], *curlu = in[3]; + CeedScalar *__restrict v = out[0], *__restrict curlv = out[1]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + { + const CeedScalar u_loc[2] = {u[i + Q * 0], u[i + Q * 1]}; + CeedScalar coeff[9], adjJt_loc[6], v_loc[2]; + CoeffUnpack3((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack32(adjJt + i, Q, adjJt_loc); + MultAtBCx32(adjJt_loc, coeff, adjJt_loc, u_loc, v_loc); + + v[i + Q * 0] = wdetJ[i] * v_loc[0]; + v[i + Q * 1] = wdetJ[i] * v_loc[1]; + } + { + const CeedScalar coeff = + CoeffUnpack1(CoeffPairSecond<3>((const CeedIntScalar *)ctx), (CeedInt)attr[i]); + + curlv[i] = (coeff * qw[i] * qw[i] / wdetJ[i]) * curlu[i]; + } + } + return 0; +} + +#endif // PALACE_LIBCEED_HDIV_MASS_32_QF_H diff --git a/palace/fem/qfunctions/32/hdivmass_build_32_qf.h b/palace/fem/qfunctions/32/hdivmass_build_32_qf.h new file mode 100644 index 0000000000..b8e20eae16 --- /dev/null +++ b/palace/fem/qfunctions/32/hdivmass_build_32_qf.h @@ -0,0 +1,40 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HDIV_MASS_BUILD_32_QF_H +#define PALACE_LIBCEED_HDIV_MASS_BUILD_32_QF_H + +#include "../coeff/coeff_1_qf.h" +#include "../coeff/coeff_3_qf.h" +#include "utils_32_qf.h" + +CEED_QFUNCTION(f_build_hdivmass_32)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *qw = in[1]; + CeedScalar *__restrict qd1 = out[0], *__restrict qd2 = out[0] + 4 * Q; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + { + CeedScalar coeff[9], adjJt_loc[6], qd_loc[4]; + CoeffUnpack3((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack32(adjJt + i, Q, adjJt_loc); + MultAtBA32(adjJt_loc, coeff, qd_loc); + + qd1[i + Q * 0] = wdetJ[i] * qd_loc[0]; + qd1[i + Q * 1] = wdetJ[i] * qd_loc[1]; + qd1[i + Q * 2] = wdetJ[i] * qd_loc[2]; + qd1[i + Q * 3] = wdetJ[i] * qd_loc[3]; + } + { + const CeedScalar coeff = + CoeffUnpack1(CoeffPairSecond<3>((const CeedIntScalar *)ctx), (CeedInt)attr[i]); + + qd2[i] = coeff * qw[i] * qw[i] / wdetJ[i]; + } + } + return 0; +} + +#endif // PALACE_LIBCEED_HDIV_MASS_BUILD_32_QF_H diff --git a/palace/fem/qfunctions/32/l2mass_32_qf.h b/palace/fem/qfunctions/32/l2mass_32_qf.h new file mode 100644 index 0000000000..95b5cfdd11 --- /dev/null +++ b/palace/fem/qfunctions/32/l2mass_32_qf.h @@ -0,0 +1,41 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_L2_MASS_32_QF_H +#define PALACE_LIBCEED_L2_MASS_32_QF_H + +#include "../coeff/coeff_1_qf.h" +#include "../coeff/coeff_3_qf.h" +#include "utils_32_qf.h" + +CEED_QFUNCTION(f_apply_l2mass_32)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *qw = in[1], + *u = in[2], *divu = in[3]; + CeedScalar *__restrict v = out[0], *__restrict divv = out[1]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + { + const CeedScalar u_loc[2] = {u[i + Q * 0], u[i + Q * 1]}; + CeedScalar coeff[9], adjJt_loc[6], J_loc[6], v_loc[2]; + CoeffUnpack3((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack32(adjJt + i, Q, adjJt_loc); + AdjJt32(adjJt_loc, J_loc); + MultAtBCx32(J_loc, coeff, J_loc, u_loc, v_loc); + + v[i + Q * 0] = wdetJ[i] * v_loc[0]; + v[i + Q * 1] = wdetJ[i] * v_loc[1]; + } + { + const CeedScalar coeff = + CoeffUnpack1(CoeffPairSecond<3>((const CeedIntScalar *)ctx), (CeedInt)attr[i]); + + divv[i] = (coeff * qw[i] * qw[i] / wdetJ[i]) * divu[i]; + } + } + return 0; +} + +#endif // PALACE_LIBCEED_L2_MASS_32_QF_H diff --git a/palace/fem/qfunctions/32/l2mass_build_32_qf.h b/palace/fem/qfunctions/32/l2mass_build_32_qf.h new file mode 100644 index 0000000000..6bc4b0afad --- /dev/null +++ b/palace/fem/qfunctions/32/l2mass_build_32_qf.h @@ -0,0 +1,41 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_L2_MASS_BUILD_32_QF_H +#define PALACE_LIBCEED_L2_MASS_BUILD_32_QF_H + +#include "../coeff/coeff_1_qf.h" +#include "../coeff/coeff_3_qf.h" +#include "utils_32_qf.h" + +CEED_QFUNCTION(f_build_l2mass_32)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *qw = in[1]; + CeedScalar *qd1 = out[0], *qd2 = out[0] + 4 * Q; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + { + CeedScalar coeff[9], adjJt_loc[6], J_loc[6], qd_loc[4]; + CoeffUnpack3((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack32(adjJt + i, Q, adjJt_loc); + AdjJt32(adjJt_loc, J_loc); + MultAtBA32(J_loc, coeff, qd_loc); + + qd1[i + Q * 0] = wdetJ[i] * qd_loc[0]; + qd1[i + Q * 1] = wdetJ[i] * qd_loc[1]; + qd1[i + Q * 2] = wdetJ[i] * qd_loc[2]; + qd1[i + Q * 3] = wdetJ[i] * qd_loc[3]; + } + { + const CeedScalar coeff = + CoeffUnpack1(CoeffPairSecond<3>((const CeedIntScalar *)ctx), (CeedInt)attr[i]); + + qd2[i] = coeff * qw[i] * qw[i] / wdetJ[i]; + } + } + return 0; +} + +#endif // PALACE_LIBCEED_L2_MASS_BUILD_32_QF_H diff --git a/palace/fem/qfunctions/32/utils_32_qf.h b/palace/fem/qfunctions/32/utils_32_qf.h new file mode 100644 index 0000000000..e23b69a5de --- /dev/null +++ b/palace/fem/qfunctions/32/utils_32_qf.h @@ -0,0 +1,147 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_UTILS_32_QF_H +#define PALACE_LIBCEED_UTILS_32_QF_H + +#ifndef CEED_RUNNING_JIT_PASS +#include <math.h> +#endif + +CEED_QFUNCTION_HELPER CeedScalar DetJ32(const CeedScalar J[6]) +{ + // J: 0 3 + // 1 4 + // 2 5 + const CeedScalar E = J[0] * J[0] + J[1] * J[1] + J[2] * J[2]; + const CeedScalar G = J[3] * J[3] + J[4] * J[4] + J[5] * J[5]; + const CeedScalar F = J[0] * J[3] + J[1] * J[4] + J[2] * J[5]; + return sqrt(E * G - F * F); +} + +template <bool ComputeDet = true> +CEED_QFUNCTION_HELPER CeedScalar AdjJt32(const CeedScalar J[6], CeedScalar adjJt[6]) +{ + // Compute adj(J)^T / det(J) and store the result. + // J: 0 3 + // 1 4 + // 2 5 + const CeedScalar E = J[0] * J[0] + J[1] * J[1] + J[2] * J[2]; + const CeedScalar G = J[3] * J[3] + J[4] * J[4] + J[5] * J[5]; + const CeedScalar F = J[0] * J[3] + J[1] * J[4] + J[2] * J[5]; + const CeedScalar d = sqrt(E * G - F * F); + adjJt[0] = (G * J[0] - F * J[3]) / d; + adjJt[1] = (G * J[1] - F * J[4]) / d; + adjJt[2] = (G * J[2] - F * J[5]) / d; + adjJt[3] = (E * J[3] - F * J[0]) / d; + adjJt[4] = (E * J[4] - F * J[1]) / d; + adjJt[5] = (E * J[5] - F * J[2]) / d; + return ComputeDet ? 
d : 0.0; +} + +CEED_QFUNCTION_HELPER void MatUnpack32(const CeedScalar *A, const CeedInt A_stride, + CeedScalar A_loc[6]) +{ + A_loc[0] = A[A_stride * 0]; + A_loc[1] = A[A_stride * 1]; + A_loc[2] = A[A_stride * 2]; + A_loc[3] = A[A_stride * 3]; + A_loc[4] = A[A_stride * 4]; + A_loc[5] = A[A_stride * 5]; +} + +CEED_QFUNCTION_HELPER void MultAtBCx32(const CeedScalar A[6], const CeedScalar B[9], + const CeedScalar C[6], const CeedScalar x[2], + CeedScalar y[2]) +{ + // A, C: 0 3 B: 0 3 6 + // 1 4 1 4 7 + // 2 5 2 5 8 + CeedScalar z[3], t; + + y[0] = C[0] * x[0] + C[3] * x[1]; + y[1] = C[1] * x[0] + C[4] * x[1]; + t = C[2] * x[0] + C[5] * x[1]; + + z[0] = B[0] * y[0] + B[3] * y[1] + B[6] * t; + z[1] = B[1] * y[0] + B[4] * y[1] + B[7] * t; + z[2] = B[2] * y[0] + B[5] * y[1] + B[8] * t; + + y[0] = A[0] * z[0] + A[1] * z[1] + A[2] * z[2]; + y[1] = A[3] * z[0] + A[4] * z[1] + A[5] * z[2]; +} + +CEED_QFUNCTION_HELPER void MultBAx32(const CeedScalar A[6], const CeedScalar B[9], + const CeedScalar x[2], CeedScalar y[3]) +{ + // A: 0 3 B: 0 3 6 + // 1 4 1 4 7 + // 2 5 2 5 8 + CeedScalar z[3]; + + z[0] = A[0] * x[0] + A[3] * x[1]; + z[1] = A[1] * x[0] + A[4] * x[1]; + z[2] = A[2] * x[0] + A[5] * x[1]; + + y[0] = B[0] * z[0] + B[3] * z[1] + B[6] * z[2]; + y[1] = B[1] * z[0] + B[4] * z[1] + B[7] * z[2]; + y[2] = B[2] * z[0] + B[5] * z[1] + B[8] * z[2]; +} + +CEED_QFUNCTION_HELPER void MultAtBA32(const CeedScalar A[6], const CeedScalar B[9], + CeedScalar C[4]) +{ + // A: 0 3 B: 0 3 6 C: 0 2 + // 1 4 1 4 7 1 3 + // 2 5 2 5 8 + + // First compute entries of R = B A. + const CeedScalar R11 = B[0] * A[0] + B[3] * A[1] + B[6] * A[2]; + const CeedScalar R21 = B[1] * A[0] + B[4] * A[1] + B[7] * A[2]; + const CeedScalar R31 = B[2] * A[0] + B[5] * A[1] + B[8] * A[2]; + const CeedScalar R12 = B[0] * A[3] + B[3] * A[4] + B[6] * A[5]; + const CeedScalar R22 = B[1] * A[3] + B[4] * A[4] + B[7] * A[5]; + const CeedScalar R32 = B[2] * A[3] + B[5] * A[4] + B[8] * A[5]; + + C[0] = A[0] * R11 + A[1] * R21 + A[2] * R31; + C[1] = A[3] * R11 + A[4] * R21 + A[5] * R31; + C[2] = A[0] * R12 + A[1] * R22 + A[2] * R32; + C[3] = A[3] * R12 + A[4] * R22 + A[5] * R32; +} + +CEED_QFUNCTION_HELPER void MultAtBC32(const CeedScalar A[6], const CeedScalar B[9], + const CeedScalar C[6], CeedScalar D[4]) +{ + // A, C: 0 3 B: 0 3 6 D: 0 2 + // 1 4 1 4 7 1 3 + // 2 5 2 5 8 + + // First compute entries of R = B C. 
+ const CeedScalar R11 = B[0] * C[0] + B[3] * C[1] + B[6] * C[2]; + const CeedScalar R21 = B[1] * C[0] + B[4] * C[1] + B[7] * C[2]; + const CeedScalar R31 = B[2] * C[0] + B[5] * C[1] + B[8] * C[2]; + const CeedScalar R12 = B[0] * C[3] + B[3] * C[4] + B[6] * C[5]; + const CeedScalar R22 = B[1] * C[3] + B[4] * C[4] + B[7] * C[5]; + const CeedScalar R32 = B[2] * C[3] + B[5] * C[4] + B[8] * C[5]; + + D[0] = A[0] * R11 + A[1] * R21 + A[2] * R31; + D[1] = A[3] * R11 + A[4] * R21 + A[5] * R31; + D[2] = A[0] * R12 + A[1] * R22 + A[2] * R32; + D[3] = A[3] * R12 + A[4] * R22 + A[5] * R32; +} + +CEED_QFUNCTION_HELPER void MultBA32(const CeedScalar A[6], const CeedScalar B[9], + CeedScalar C[6]) +{ + // A, C: 0 3 B: 0 3 6 + // 1 4 1 4 7 + // 2 5 2 5 8 + C[0] = B[0] * A[0] + B[3] * A[1] + B[6] * A[2]; + C[1] = B[1] * A[0] + B[4] * A[1] + B[7] * A[2]; + C[2] = B[2] * A[0] + B[5] * A[1] + B[8] * A[2]; + C[3] = B[0] * A[3] + B[3] * A[4] + B[6] * A[5]; + C[4] = B[1] * A[3] + B[4] * A[4] + B[7] * A[5]; + C[5] = B[2] * A[3] + B[5] * A[4] + B[8] * A[5]; +} + +#endif // PALACE_LIBCEED_UTILS_32_QF_H diff --git a/palace/fem/qfunctions/33/geom_33_qf.h b/palace/fem/qfunctions/33/geom_33_qf.h new file mode 100644 index 0000000000..41dc5fbd20 --- /dev/null +++ b/palace/fem/qfunctions/33/geom_33_qf.h @@ -0,0 +1,36 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_GEOM_33_QF_H +#define PALACE_LIBCEED_GEOM_33_QF_H + +#include "utils_33_qf.h" + +CEED_QFUNCTION(f_build_geom_factor_33)(void *, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *qw = in[1], *J = in[2]; + CeedScalar *qd_attr = out[0], *qd_wdetJ = out[0] + Q, *qd_adjJt = out[0] + 2 * Q; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar J_loc[9], adjJt_loc[9]; + MatUnpack33(J + i, Q, J_loc); + const CeedScalar detJ = AdjJt33(J_loc, adjJt_loc); + + qd_attr[i] = attr[i]; + qd_wdetJ[i] = qw[i] * detJ; + qd_adjJt[i + Q * 0] = adjJt_loc[0] / detJ; + qd_adjJt[i + Q * 1] = adjJt_loc[1] / detJ; + qd_adjJt[i + Q * 2] = adjJt_loc[2] / detJ; + qd_adjJt[i + Q * 3] = adjJt_loc[3] / detJ; + qd_adjJt[i + Q * 4] = adjJt_loc[4] / detJ; + qd_adjJt[i + Q * 5] = adjJt_loc[5] / detJ; + qd_adjJt[i + Q * 6] = adjJt_loc[6] / detJ; + qd_adjJt[i + Q * 7] = adjJt_loc[7] / detJ; + qd_adjJt[i + Q * 8] = adjJt_loc[8] / detJ; + } + return 0; +} + +#endif // PALACE_LIBCEED_GEOM_33_QF_H diff --git a/palace/fem/qfunctions/33/hcurl_33_qf.h b/palace/fem/qfunctions/33/hcurl_33_qf.h new file mode 100644 index 0000000000..9bb201ae22 --- /dev/null +++ b/palace/fem/qfunctions/33/hcurl_33_qf.h @@ -0,0 +1,31 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_33_QF_H +#define PALACE_LIBCEED_HCURL_33_QF_H + +#include "../coeff/coeff_3_qf.h" +#include "utils_33_qf.h" + +CEED_QFUNCTION(f_apply_hcurl_33)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *u = in[1]; + CeedScalar *v = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar u_loc[3] = {u[i + Q * 0], u[i + Q * 1], u[i + Q * 2]}; + CeedScalar coeff[9], adjJt_loc[9], v_loc[3]; + CoeffUnpack3((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack33(adjJt + i, Q, adjJt_loc); + MultAtBCx33(adjJt_loc, coeff, adjJt_loc, u_loc, v_loc); + + v[i + Q * 0] = wdetJ[i] * v_loc[0]; + v[i + Q * 1] = wdetJ[i] * v_loc[1]; + v[i + Q * 2] = wdetJ[i] * v_loc[2]; + } + return 0; +} + +#endif // PALACE_LIBCEED_HCURL_33_QF_H diff --git a/palace/fem/qfunctions/33/hcurl_build_33_qf.h b/palace/fem/qfunctions/33/hcurl_build_33_qf.h new file mode 100644 index 0000000000..ce4bec9a30 --- /dev/null +++ b/palace/fem/qfunctions/33/hcurl_build_33_qf.h @@ -0,0 +1,36 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_BUILD_33_QF_H +#define PALACE_LIBCEED_HCURL_BUILD_33_QF_H + +#include "../coeff/coeff_3_qf.h" +#include "utils_33_qf.h" + +CEED_QFUNCTION(f_build_hcurl_33)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q; + CeedScalar *qd = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar coeff[9], adjJt_loc[9], qd_loc[9]; + CoeffUnpack3((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack33(adjJt + i, Q, adjJt_loc); + MultAtBA33(adjJt_loc, coeff, qd_loc); + + qd[i + Q * 0] = wdetJ[i] * qd_loc[0]; + qd[i + Q * 1] = wdetJ[i] * qd_loc[1]; + qd[i + Q * 2] = wdetJ[i] * qd_loc[2]; + qd[i + Q * 3] = wdetJ[i] * qd_loc[3]; + qd[i + Q * 4] = wdetJ[i] * qd_loc[4]; + qd[i + Q * 5] = wdetJ[i] * qd_loc[5]; + qd[i + Q * 6] = wdetJ[i] * qd_loc[6]; + qd[i + Q * 7] = wdetJ[i] * qd_loc[7]; + qd[i + Q * 8] = wdetJ[i] * qd_loc[8]; + } + return 0; +} + +#endif // PALACE_LIBCEED_HCURL_BUILD_33_QF_H diff --git a/palace/fem/qfunctions/33/hcurlh1d_33_qf.h b/palace/fem/qfunctions/33/hcurlh1d_33_qf.h new file mode 100644 index 0000000000..0e4ddf93bd --- /dev/null +++ b/palace/fem/qfunctions/33/hcurlh1d_33_qf.h @@ -0,0 +1,31 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_H1D_33_QF_H +#define PALACE_LIBCEED_HCURL_H1D_33_QF_H + +#include "../coeff/coeff_3_qf.h" +#include "utils_33_qf.h" + +CEED_QFUNCTION(f_apply_hcurlh1d_33)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *u = in[1]; + CeedScalar *v = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar u_loc[3] = {u[i + Q * 0], u[i + Q * 1], u[i + Q * 2]}; + CeedScalar coeff[9], adjJt_loc[9], v_loc[3]; + CoeffUnpack3((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack33(adjJt + i, Q, adjJt_loc); + MultBAx33(adjJt_loc, coeff, u_loc, v_loc); + + v[i + Q * 0] = wdetJ[i] * v_loc[0]; + v[i + Q * 1] = wdetJ[i] * v_loc[1]; + v[i + Q * 2] = wdetJ[i] * v_loc[2]; + } + return 0; +} + +#endif // PALACE_LIBCEED_HCURL_H1D_33_QF_H diff --git a/palace/fem/qfunctions/33/hcurlh1d_build_33_qf.h b/palace/fem/qfunctions/33/hcurlh1d_build_33_qf.h new file mode 100644 index 0000000000..6f3a948342 --- /dev/null +++ b/palace/fem/qfunctions/33/hcurlh1d_build_33_qf.h @@ -0,0 +1,36 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_H1D_BUILD_33_QF_H +#define PALACE_LIBCEED_HCURL_H1D_BUILD_33_QF_H + +#include "../coeff/coeff_3_qf.h" +#include "utils_33_qf.h" + +CEED_QFUNCTION(f_build_hcurlh1d_33)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q; + CeedScalar *qd = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar coeff[9], adjJt_loc[9], qd_loc[9]; + CoeffUnpack3((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack33(adjJt + i, Q, adjJt_loc); + MultBA33(adjJt_loc, coeff, qd_loc); + + qd[i + Q * 0] = wdetJ[i] * qd_loc[0]; + qd[i + Q * 1] = wdetJ[i] * qd_loc[1]; + qd[i + Q * 2] = wdetJ[i] * qd_loc[2]; + qd[i + Q * 3] = wdetJ[i] * qd_loc[3]; + qd[i + Q * 4] = wdetJ[i] * qd_loc[4]; + qd[i + Q * 5] = wdetJ[i] * qd_loc[5]; + qd[i + Q * 6] = wdetJ[i] * qd_loc[6]; + qd[i + Q * 7] = wdetJ[i] * qd_loc[7]; + qd[i + Q * 8] = wdetJ[i] * qd_loc[8]; + } + return 0; +} + +#endif // PALACE_LIBCEED_HCURL_H1D_BUILD_33_QF_H diff --git a/palace/fem/qfunctions/33/hcurlh1d_error_22_qf.h b/palace/fem/qfunctions/33/hcurlh1d_error_22_qf.h new file mode 100644 index 0000000000..6baa331d84 --- /dev/null +++ b/palace/fem/qfunctions/33/hcurlh1d_error_22_qf.h @@ -0,0 +1,72 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURLH1D_ERROR_22_QF_H +#define PALACE_LIBCEED_HCURLH1D_ERROR_22_QF_H + +#include "../coeff/coeff_2_qf.h" +#include "utils_22_qf.h" + +CEED_QFUNCTION(f_apply_hcurlh1d_error_22)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, + CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *u1 = in[1], + *u2 = in[2]; + CeedScalar *v = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar v1_loc[2], v2_loc[2]; + { + const CeedScalar u1_loc[2] = {u1[i + Q * 0], u1[i + Q * 1]}; + CeedScalar coeff[4], adjJt_loc[4]; + CoeffUnpack2((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack22(adjJt + i, Q, adjJt_loc); + MultBAx22(adjJt_loc, coeff, u1_loc, v1_loc); + } + { + const CeedScalar u2_loc[2] = {u2[i + Q * 0], u2[i + Q * 1]}; + CeedScalar coeff[4]; + CoeffUnpack2(CoeffPairSecond<2>((const CeedIntScalar *)ctx), (CeedInt)attr[i], coeff); + MultBx22(coeff, u2_loc, v2_loc); + } + v2_loc[0] -= v1_loc[0]; + v2_loc[1] -= v1_loc[1]; + v[i] = wdetJ[i] * (v2_loc[0] * v2_loc[0] + v2_loc[1] * v2_loc[1]); + } + return 0; +} + +CEED_QFUNCTION(f_apply_h1dhcurl_error_22)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, + CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *u1 = in[1], + *u2 = in[2]; + CeedScalar *v = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar v1_loc[2], v2_loc[2]; + { + const CeedScalar u1_loc[2] = {u1[i + Q * 0], u1[i + Q * 1]}; + CeedScalar coeff[4]; + CoeffUnpack2((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MultBx22(coeff, u1_loc, v1_loc); + } + { + const CeedScalar u2_loc[2] = {u2[i + Q * 0], u2[i + Q * 1]}; + CeedScalar coeff[4], adjJt_loc[4]; + CoeffUnpack2(CoeffPairSecond<2>((const CeedIntScalar *)ctx), (CeedInt)attr[i], coeff); + MatUnpack22(adjJt + i, Q, adjJt_loc); + MultBAx22(adjJt_loc, coeff, u2_loc, v2_loc); + } + v2_loc[0] -= v1_loc[0]; + v2_loc[1] -= v1_loc[1]; + v[i] = wdetJ[i] * (v2_loc[0] * v2_loc[0] + v2_loc[1] * v2_loc[1]); + } + return 0; +} + +#endif // PALACE_LIBCEED_HCURLH1D_ERROR_22_QF_H diff --git a/palace/fem/qfunctions/33/hcurlh1d_error_33_qf.h b/palace/fem/qfunctions/33/hcurlh1d_error_33_qf.h new file mode 100644 index 0000000000..30adc01e92 --- /dev/null +++ b/palace/fem/qfunctions/33/hcurlh1d_error_33_qf.h @@ -0,0 +1,76 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURLH1D_ERROR_33_QF_H +#define PALACE_LIBCEED_HCURLH1D_ERROR_33_QF_H + +#include "../coeff/coeff_3_qf.h" +#include "utils_33_qf.h" + +CEED_QFUNCTION(f_apply_hcurlh1d_error_33)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, + CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *u1 = in[1], + *u2 = in[2]; + CeedScalar *v = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar v1_loc[3], v2_loc[3]; + { + const CeedScalar u1_loc[3] = {u1[i + Q * 0], u1[i + Q * 1], u1[i + Q * 2]}; + CeedScalar coeff[9], adjJt_loc[9]; + CoeffUnpack3((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack33(adjJt + i, Q, adjJt_loc); + MultBAx33(adjJt_loc, coeff, u1_loc, v1_loc); + } + { + const CeedScalar u2_loc[3] = {u2[i + Q * 0], u2[i + Q * 1], u2[i + Q * 2]}; + CeedScalar coeff[9]; + CoeffUnpack3(CoeffPairSecond<3>((const CeedIntScalar *)ctx), (CeedInt)attr[i], coeff); + MultBx33(coeff, u2_loc, v2_loc); + } + v2_loc[0] -= v1_loc[0]; + v2_loc[1] -= v1_loc[1]; + v2_loc[2] -= v1_loc[2]; + v[i] = + wdetJ[i] * (v2_loc[0] * v2_loc[0] + v2_loc[1] * v2_loc[1] + v2_loc[2] * v2_loc[2]); + } + return 0; +} + +CEED_QFUNCTION(f_apply_h1dhcurl_error_33)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, + CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *u1 = in[1], + *u2 = in[2]; + CeedScalar *v = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar v1_loc[3], v2_loc[3]; + { + const CeedScalar u1_loc[3] = {u1[i + Q * 0], u1[i + Q * 1], u1[i + Q * 2]}; + CeedScalar coeff[9]; + CoeffUnpack3((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MultBx33(coeff, u1_loc, v1_loc); + } + { + const CeedScalar u2_loc[3] = {u2[i + Q * 0], u2[i + Q * 1], u2[i + Q * 2]}; + CeedScalar coeff[9], adjJt_loc[9]; + CoeffUnpack3(CoeffPairSecond<3>((const CeedIntScalar *)ctx), (CeedInt)attr[i], coeff); + MatUnpack33(adjJt + i, Q, adjJt_loc); + MultBAx33(adjJt_loc, coeff, u2_loc, v2_loc); + } + v2_loc[0] -= v1_loc[0]; + v2_loc[1] -= v1_loc[1]; + v2_loc[2] -= v1_loc[2]; + v[i] = + wdetJ[i] * (v2_loc[0] * v2_loc[0] + v2_loc[1] * v2_loc[1] + v2_loc[2] * v2_loc[2]); + } + return 0; +} + +#endif // PALACE_LIBCEED_HCURLH1D_ERROR_33_QF_H diff --git a/palace/fem/qfunctions/33/hcurlhdiv_33_qf.h b/palace/fem/qfunctions/33/hcurlhdiv_33_qf.h new file mode 100644 index 0000000000..489f6b6aa3 --- /dev/null +++ b/palace/fem/qfunctions/33/hcurlhdiv_33_qf.h @@ -0,0 +1,54 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_HDIV_33_QF_H +#define PALACE_LIBCEED_HCURL_HDIV_33_QF_H + +#include "../coeff/coeff_3_qf.h" +#include "utils_33_qf.h" + +CEED_QFUNCTION(f_apply_hcurlhdiv_33)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *u = in[1]; + CeedScalar *v = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar u_loc[3] = {u[i + Q * 0], u[i + Q * 1], u[i + Q * 2]}; + CeedScalar coeff[9], adjJt_loc[9], J_loc[9], v_loc[3]; + CoeffUnpack3((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack33(adjJt + i, Q, adjJt_loc); + AdjJt33(adjJt_loc, J_loc); + MultAtBCx33(J_loc, coeff, adjJt_loc, u_loc, v_loc); + + v[i + Q * 0] = wdetJ[i] * v_loc[0]; + v[i + Q * 1] = wdetJ[i] * v_loc[1]; + v[i + Q * 2] = wdetJ[i] * v_loc[2]; + } + return 0; +} + +CEED_QFUNCTION(f_apply_hdivhcurl_33)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *u = in[1]; + CeedScalar *v = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar u_loc[3] = {u[i + Q * 0], u[i + Q * 1], u[i + Q * 2]}; + CeedScalar coeff[9], adjJt_loc[9], J_loc[9], v_loc[3]; + CoeffUnpack3((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack33(adjJt + i, Q, adjJt_loc); + AdjJt33(adjJt_loc, J_loc); + MultAtBCx33(adjJt_loc, coeff, J_loc, u_loc, v_loc); + + v[i + Q * 0] = wdetJ[i] * v_loc[0]; + v[i + Q * 1] = wdetJ[i] * v_loc[1]; + v[i + Q * 2] = wdetJ[i] * v_loc[2]; + } + return 0; +} + +#endif // PALACE_LIBCEED_HCURL_HDIV_33_QF_H diff --git a/palace/fem/qfunctions/33/hcurlhdiv_build_33_qf.h b/palace/fem/qfunctions/33/hcurlhdiv_build_33_qf.h new file mode 100644 index 0000000000..ca8ba99a57 --- /dev/null +++ b/palace/fem/qfunctions/33/hcurlhdiv_build_33_qf.h @@ -0,0 +1,64 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_HDIV_BUILD_33_QF_H +#define PALACE_LIBCEED_HCURL_HDIV_BUILD_33_QF_H + +#include "../coeff/coeff_3_qf.h" +#include "utils_33_qf.h" + +CEED_QFUNCTION(f_build_hcurlhdiv_33)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q; + CeedScalar *qd = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar coeff[9], adjJt_loc[9], J_loc[9], qd_loc[9]; + CoeffUnpack3((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack33(adjJt + i, Q, adjJt_loc); + AdjJt33(adjJt_loc, J_loc); + MultAtBC33(J_loc, coeff, adjJt_loc, qd_loc); + + qd[i + Q * 0] = wdetJ[i] * qd_loc[0]; + qd[i + Q * 1] = wdetJ[i] * qd_loc[1]; + qd[i + Q * 2] = wdetJ[i] * qd_loc[2]; + qd[i + Q * 3] = wdetJ[i] * qd_loc[3]; + qd[i + Q * 4] = wdetJ[i] * qd_loc[4]; + qd[i + Q * 5] = wdetJ[i] * qd_loc[5]; + qd[i + Q * 6] = wdetJ[i] * qd_loc[6]; + qd[i + Q * 7] = wdetJ[i] * qd_loc[7]; + qd[i + Q * 8] = wdetJ[i] * qd_loc[8]; + } + return 0; +} + +CEED_QFUNCTION(f_build_hdivhcurl_33)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q; + CeedScalar *qd = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar coeff[9], adjJt_loc[9], J_loc[9], qd_loc[9]; + CoeffUnpack3((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack33(adjJt + i, Q, adjJt_loc); + AdjJt33(adjJt_loc, J_loc); + MultAtBC33(adjJt_loc, coeff, J_loc, qd_loc); + + qd[i + Q * 0] = wdetJ[i] * qd_loc[0]; + qd[i + Q * 1] = wdetJ[i] * qd_loc[1]; + qd[i + Q * 2] = wdetJ[i] * qd_loc[2]; + qd[i + Q * 3] = wdetJ[i] * qd_loc[3]; + qd[i + Q * 4] = wdetJ[i] * qd_loc[4]; + qd[i + Q * 5] = wdetJ[i] * qd_loc[5]; + qd[i + Q * 6] = wdetJ[i] * qd_loc[6]; + qd[i + Q * 7] = wdetJ[i] * qd_loc[7]; + qd[i + Q * 8] = wdetJ[i] * qd_loc[8]; + } + return 0; +} + +#endif // PALACE_LIBCEED_HCURL_HDIV_BUILD_33_QF_H diff --git a/palace/fem/qfunctions/33/hcurlhdiv_error_33_qf.h b/palace/fem/qfunctions/33/hcurlhdiv_error_33_qf.h new file mode 100644 index 0000000000..bdeec00b70 --- /dev/null +++ b/palace/fem/qfunctions/33/hcurlhdiv_error_33_qf.h @@ -0,0 +1,78 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURLHDIV_ERROR_33_QF_H +#define PALACE_LIBCEED_HCURLHDIV_ERROR_33_QF_H + +#include "../coeff/coeff_3_qf.h" +#include "utils_33_qf.h" + +CEED_QFUNCTION(f_apply_hcurlhdiv_error_33)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, + CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *u1 = in[1], + *u2 = in[2]; + CeedScalar *v = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar adjJt_loc[9], v1_loc[3], v2_loc[3]; + MatUnpack33(adjJt + i, Q, adjJt_loc); + { + const CeedScalar u1_loc[3] = {u1[i + Q * 0], u1[i + Q * 1], u1[i + Q * 2]}; + CeedScalar coeff[9]; + CoeffUnpack3((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MultBAx33(adjJt_loc, coeff, u1_loc, v1_loc); + } + { + const CeedScalar u2_loc[3] = {u2[i + Q * 0], u2[i + Q * 1], u2[i + Q * 2]}; + CeedScalar coeff[9], J_loc[9]; + CoeffUnpack3(CoeffPairSecond<3>((const CeedIntScalar *)ctx), (CeedInt)attr[i], coeff); + AdjJt33(adjJt_loc, J_loc); + MultBAx33(J_loc, coeff, u2_loc, v2_loc); + } + v2_loc[0] -= v1_loc[0]; + v2_loc[1] -= v1_loc[1]; + v2_loc[2] -= v1_loc[2]; + v[i] = + wdetJ[i] * (v2_loc[0] * v2_loc[0] + v2_loc[1] * v2_loc[1] + v2_loc[2] * v2_loc[2]); + } + return 0; +} + +CEED_QFUNCTION(f_apply_hdivhcurl_error_33)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, + CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *u1 = in[1], + *u2 = in[2]; + CeedScalar *v = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar adjJt_loc[9], v1_loc[3], v2_loc[3]; + MatUnpack33(adjJt + i, Q, adjJt_loc); + { + const CeedScalar u1_loc[3] = {u1[i + Q * 0], u1[i + Q * 1], u1[i + Q * 2]}; + CeedScalar coeff[9], J_loc[9]; + CoeffUnpack3((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + AdjJt33(adjJt_loc, J_loc); + MultBAx33(J_loc, coeff, u1_loc, v1_loc); + } + { + const CeedScalar u2_loc[3] = {u2[i + Q * 0], u2[i + Q * 1], u2[i + Q * 2]}; + CeedScalar coeff[9]; + CoeffUnpack3(CoeffPairSecond<3>((const CeedIntScalar *)ctx), (CeedInt)attr[i], coeff); + MultBAx33(adjJt_loc, coeff, u2_loc, v2_loc); + } + v2_loc[0] -= v1_loc[0]; + v2_loc[1] -= v1_loc[1]; + v2_loc[2] -= v1_loc[2]; + v[i] = + wdetJ[i] * (v2_loc[0] * v2_loc[0] + v2_loc[1] * v2_loc[1] + v2_loc[2] * v2_loc[2]); + } + return 0; +} + +#endif // PALACE_LIBCEED_HCURLHDIV_ERROR_33_QF_H diff --git a/palace/fem/qfunctions/33/hcurlmass_33_qf.h b/palace/fem/qfunctions/33/hcurlmass_33_qf.h new file mode 100644 index 0000000000..5fcf14f0b6 --- /dev/null +++ b/palace/fem/qfunctions/33/hcurlmass_33_qf.h @@ -0,0 +1,40 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_MASS_33_QF_H +#define PALACE_LIBCEED_HCURL_MASS_33_QF_H + +#include "../coeff/coeff_1_qf.h" +#include "../coeff/coeff_3_qf.h" +#include "utils_33_qf.h" + +CEED_QFUNCTION(f_apply_hcurlmass_33)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *u = in[1], + *gradu = in[2]; + CeedScalar *__restrict v = out[0], *__restrict gradv = out[1]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + { + const CeedScalar coeff = CoeffUnpack1((const CeedIntScalar *)ctx, (CeedInt)attr[i]); + + v[i] = coeff * wdetJ[i] * u[i]; + } + { + const CeedScalar u_loc[3] = {gradu[i + Q * 0], gradu[i + Q * 1], gradu[i + Q * 2]}; + CeedScalar coeff[9], adjJt_loc[9], v_loc[3]; + CoeffUnpack3(CoeffPairSecond<1>((const CeedIntScalar *)ctx), (CeedInt)attr[i], coeff); + MatUnpack33(adjJt + i, Q, adjJt_loc); + MultAtBCx33(adjJt_loc, coeff, adjJt_loc, u_loc, v_loc); + + gradv[i + Q * 0] = wdetJ[i] * v_loc[0]; + gradv[i + Q * 1] = wdetJ[i] * v_loc[1]; + gradv[i + Q * 2] = wdetJ[i] * v_loc[2]; + } + } + return 0; +} + +#endif // PALACE_LIBCEED_HCURL_MASS_33_QF_H diff --git a/palace/fem/qfunctions/33/hcurlmass_build_33_qf.h b/palace/fem/qfunctions/33/hcurlmass_build_33_qf.h new file mode 100644 index 0000000000..3cdd9e2e3d --- /dev/null +++ b/palace/fem/qfunctions/33/hcurlmass_build_33_qf.h @@ -0,0 +1,44 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_MASS_BUILD_33_QF_H +#define PALACE_LIBCEED_HCURL_MASS_BUILD_33_QF_H + +#include "../coeff/coeff_1_qf.h" +#include "../coeff/coeff_3_qf.h" +#include "utils_33_qf.h" + +CEED_QFUNCTION(f_build_hcurlmass_33)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q; + CeedScalar *__restrict qd1 = out[0], *__restrict qd2 = out[0] + Q; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + { + const CeedScalar coeff = CoeffUnpack1((const CeedIntScalar *)ctx, (CeedInt)attr[i]); + + qd1[i + Q * 0] = coeff * wdetJ[i]; + } + { + CeedScalar coeff[9], adjJt_loc[9], qd_loc[9]; + CoeffUnpack3(CoeffPairSecond<1>((const CeedIntScalar *)ctx), (CeedInt)attr[i], coeff); + MatUnpack33(adjJt + i, Q, adjJt_loc); + MultAtBA33(adjJt_loc, coeff, qd_loc); + + qd2[i + Q * 0] = wdetJ[i] * qd_loc[0]; + qd2[i + Q * 1] = wdetJ[i] * qd_loc[1]; + qd2[i + Q * 2] = wdetJ[i] * qd_loc[2]; + qd2[i + Q * 3] = wdetJ[i] * qd_loc[3]; + qd2[i + Q * 4] = wdetJ[i] * qd_loc[4]; + qd2[i + Q * 5] = wdetJ[i] * qd_loc[5]; + qd2[i + Q * 6] = wdetJ[i] * qd_loc[6]; + qd2[i + Q * 7] = wdetJ[i] * qd_loc[7]; + qd2[i + Q * 8] = wdetJ[i] * qd_loc[8]; + } + } + return 0; +} + +#endif // PALACE_LIBCEED_HCURL_MASS_BUILD_33_QF_H diff --git a/palace/fem/qfunctions/33/hdiv_33_qf.h b/palace/fem/qfunctions/33/hdiv_33_qf.h new file mode 100644 index 0000000000..da84f79445 --- /dev/null +++ b/palace/fem/qfunctions/33/hdiv_33_qf.h @@ -0,0 +1,32 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HDIV_33_QF_H +#define PALACE_LIBCEED_HDIV_33_QF_H + +#include "../coeff/coeff_3_qf.h" +#include "utils_33_qf.h" + +CEED_QFUNCTION(f_apply_hdiv_33)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *u = in[1]; + CeedScalar *v = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar u_loc[3] = {u[i + Q * 0], u[i + Q * 1], u[i + Q * 2]}; + CeedScalar coeff[9], adjJt_loc[9], J_loc[9], v_loc[3]; + CoeffUnpack3((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack33(adjJt + i, Q, adjJt_loc); + AdjJt33(adjJt_loc, J_loc); + MultAtBCx33(J_loc, coeff, J_loc, u_loc, v_loc); + + v[i + Q * 0] = wdetJ[i] * v_loc[0]; + v[i + Q * 1] = wdetJ[i] * v_loc[1]; + v[i + Q * 2] = wdetJ[i] * v_loc[2]; + } + return 0; +} + +#endif // PALACE_LIBCEED_HDIV_33_QF_H diff --git a/palace/fem/qfunctions/33/hdiv_build_33_qf.h b/palace/fem/qfunctions/33/hdiv_build_33_qf.h new file mode 100644 index 0000000000..0233c3642b --- /dev/null +++ b/palace/fem/qfunctions/33/hdiv_build_33_qf.h @@ -0,0 +1,37 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HDIV_BUILD_33_QF_H +#define PALACE_LIBCEED_HDIV_BUILD_33_QF_H + +#include "../coeff/coeff_3_qf.h" +#include "utils_33_qf.h" + +CEED_QFUNCTION(f_build_hdiv_33)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q; + CeedScalar *qd = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar coeff[9], adjJt_loc[9], J_loc[9], qd_loc[9]; + CoeffUnpack3((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack33(adjJt + i, Q, adjJt_loc); + AdjJt33(adjJt_loc, J_loc); + MultAtBA33(J_loc, coeff, qd_loc); + + qd[i + Q * 0] = wdetJ[i] * qd_loc[0]; + qd[i + Q * 1] = wdetJ[i] * qd_loc[1]; + qd[i + Q * 2] = wdetJ[i] * qd_loc[2]; + qd[i + Q * 3] = wdetJ[i] * qd_loc[3]; + qd[i + Q * 4] = wdetJ[i] * qd_loc[4]; + qd[i + Q * 5] = wdetJ[i] * qd_loc[5]; + qd[i + Q * 6] = wdetJ[i] * qd_loc[6]; + qd[i + Q * 7] = wdetJ[i] * qd_loc[7]; + qd[i + Q * 8] = wdetJ[i] * qd_loc[8]; + } + return 0; +} + +#endif // PALACE_LIBCEED_HDIV_BUILD_33_QF_H diff --git a/palace/fem/qfunctions/33/hdivmass_33_qf.h b/palace/fem/qfunctions/33/hdivmass_33_qf.h new file mode 100644 index 0000000000..ca59baa5ba --- /dev/null +++ b/palace/fem/qfunctions/33/hdivmass_33_qf.h @@ -0,0 +1,46 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HDIV_MASS_33_QF_H +#define PALACE_LIBCEED_HDIV_MASS_33_QF_H + +#include "../coeff/coeff_3_qf.h" +#include "utils_33_qf.h" + +CEED_QFUNCTION(f_apply_hdivmass_33)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *u = in[1], + *curlu = in[2]; + CeedScalar *__restrict v = out[0], *__restrict curlv = out[1]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar adjJt_loc[9]; + MatUnpack33(adjJt + i, Q, adjJt_loc); + { + const CeedScalar u_loc[3] = {u[i + Q * 0], u[i + Q * 1], u[i + Q * 2]}; + CeedScalar coeff[9], v_loc[3]; + CoeffUnpack3((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MultAtBCx33(adjJt_loc, coeff, adjJt_loc, u_loc, v_loc); + + v[i + Q * 0] = wdetJ[i] * v_loc[0]; + v[i + Q * 1] = wdetJ[i] * v_loc[1]; + v[i + Q * 2] = wdetJ[i] * v_loc[2]; + } + { + const CeedScalar u_loc[3] = {curlu[i + Q * 0], curlu[i + Q * 1], curlu[i + Q * 2]}; + CeedScalar coeff[9], J_loc[9], v_loc[3]; + CoeffUnpack3(CoeffPairSecond<3>((const CeedIntScalar *)ctx), (CeedInt)attr[i], coeff); + AdjJt33(adjJt_loc, J_loc); + MultAtBCx33(J_loc, coeff, J_loc, u_loc, v_loc); + + curlv[i + Q * 0] = wdetJ[i] * v_loc[0]; + curlv[i + Q * 1] = wdetJ[i] * v_loc[1]; + curlv[i + Q * 2] = wdetJ[i] * v_loc[2]; + } + } + return 0; +} + +#endif // PALACE_LIBCEED_HDIV_MASS_33_QF_H diff --git a/palace/fem/qfunctions/33/hdivmass_build_33_qf.h b/palace/fem/qfunctions/33/hdivmass_build_33_qf.h new file mode 100644 index 0000000000..931451420e --- /dev/null +++ b/palace/fem/qfunctions/33/hdivmass_build_33_qf.h @@ -0,0 +1,55 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HDIV_MASS_BUILD_33_QF_H +#define PALACE_LIBCEED_HDIV_MASS_BUILD_33_QF_H + +#include "../coeff/coeff_3_qf.h" +#include "utils_33_qf.h" + +CEED_QFUNCTION(f_build_hdivmass_33)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q; + CeedScalar *__restrict qd1 = out[0], *__restrict qd2 = out[0] + 9 * Q; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedScalar adjJt_loc[9]; + MatUnpack33(adjJt + i, Q, adjJt_loc); + { + CeedScalar coeff[9], qd_loc[9]; + CoeffUnpack3((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MultAtBA33(adjJt_loc, coeff, qd_loc); + + qd1[i + Q * 0] = wdetJ[i] * qd_loc[0]; + qd1[i + Q * 1] = wdetJ[i] * qd_loc[1]; + qd1[i + Q * 2] = wdetJ[i] * qd_loc[2]; + qd1[i + Q * 3] = wdetJ[i] * qd_loc[3]; + qd1[i + Q * 4] = wdetJ[i] * qd_loc[4]; + qd1[i + Q * 5] = wdetJ[i] * qd_loc[5]; + qd1[i + Q * 6] = wdetJ[i] * qd_loc[6]; + qd1[i + Q * 7] = wdetJ[i] * qd_loc[7]; + qd1[i + Q * 8] = wdetJ[i] * qd_loc[8]; + } + { + CeedScalar coeff[9], J_loc[9], qd_loc[9]; + CoeffUnpack3(CoeffPairSecond<3>((const CeedIntScalar *)ctx), (CeedInt)attr[i], coeff); + AdjJt33(adjJt_loc, J_loc); + MultAtBA33(J_loc, coeff, qd_loc); + + qd2[i + Q * 0] = wdetJ[i] * qd_loc[0]; + qd2[i + Q * 1] = wdetJ[i] * qd_loc[1]; + qd2[i + Q * 2] = wdetJ[i] * qd_loc[2]; + qd2[i + Q * 3] = wdetJ[i] * qd_loc[3]; + qd2[i + Q * 4] = wdetJ[i] * qd_loc[4]; + qd2[i + Q * 5] = wdetJ[i] * qd_loc[5]; + qd2[i + Q * 6] = wdetJ[i] * qd_loc[6]; + qd2[i + Q * 7] = wdetJ[i] * qd_loc[7]; + qd2[i + Q * 8] = wdetJ[i] * qd_loc[8]; + } + } + return 0; +} + +#endif // PALACE_LIBCEED_HDIV_MASS_BUILD_33_QF_H diff --git a/palace/fem/qfunctions/33/l2mass_33_qf.h b/palace/fem/qfunctions/33/l2mass_33_qf.h new file mode 100644 index 0000000000..39169708ff --- /dev/null +++ b/palace/fem/qfunctions/33/l2mass_33_qf.h @@ -0,0 +1,42 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_L2_MASS_33_QF_H +#define PALACE_LIBCEED_L2_MASS_33_QF_H + +#include "../coeff/coeff_1_qf.h" +#include "../coeff/coeff_3_qf.h" +#include "utils_33_qf.h" + +CEED_QFUNCTION(f_apply_l2mass_33)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *qw = in[1], + *u = in[2], *divu = in[3]; + CeedScalar *__restrict v = out[0], *__restrict divv = out[1]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + { + const CeedScalar u_loc[3] = {u[i + Q * 0], u[i + Q * 1], u[i + Q * 2]}; + CeedScalar coeff[9], adjJt_loc[9], J_loc[9], v_loc[3]; + CoeffUnpack3((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack33(adjJt + i, Q, adjJt_loc); + AdjJt33(adjJt_loc, J_loc); + MultAtBCx33(J_loc, coeff, J_loc, u_loc, v_loc); + + v[i + Q * 0] = wdetJ[i] * v_loc[0]; + v[i + Q * 1] = wdetJ[i] * v_loc[1]; + v[i + Q * 2] = wdetJ[i] * v_loc[2]; + } + { + const CeedScalar coeff = + CoeffUnpack1(CoeffPairSecond<3>((const CeedIntScalar *)ctx), (CeedInt)attr[i]); + + divv[i] = (coeff * qw[i] * qw[i] / wdetJ[i]) * divu[i]; + } + } + return 0; +} + +#endif // PALACE_LIBCEED_L2_MASS_33_QF_H diff --git a/palace/fem/qfunctions/33/l2mass_build_33_qf.h b/palace/fem/qfunctions/33/l2mass_build_33_qf.h new file mode 100644 index 0000000000..357f0374e5 --- /dev/null +++ b/palace/fem/qfunctions/33/l2mass_build_33_qf.h @@ -0,0 +1,46 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_L2_MASS_BUILD_33_QF_H +#define PALACE_LIBCEED_L2_MASS_BUILD_33_QF_H + +#include "../coeff/coeff_1_qf.h" +#include "../coeff/coeff_3_qf.h" +#include "utils_33_qf.h" + +CEED_QFUNCTION(f_build_l2mass_33)(void *__restrict ctx, CeedInt Q, + const CeedScalar *const *in, CeedScalar *const *out) +{ + const CeedScalar *attr = in[0], *wdetJ = in[0] + Q, *adjJt = in[0] + 2 * Q, *qw = in[1]; + CeedScalar *qd1 = out[0], *qd2 = out[0] + 9 * Q; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + { + CeedScalar coeff[9], adjJt_loc[9], J_loc[9], qd_loc[9]; + CoeffUnpack3((const CeedIntScalar *)ctx, (CeedInt)attr[i], coeff); + MatUnpack33(adjJt + i, Q, adjJt_loc); + AdjJt33(adjJt_loc, J_loc); + MultAtBA33(J_loc, coeff, qd_loc); + + qd1[i + Q * 0] = wdetJ[i] * qd_loc[0]; + qd1[i + Q * 1] = wdetJ[i] * qd_loc[1]; + qd1[i + Q * 2] = wdetJ[i] * qd_loc[2]; + qd1[i + Q * 3] = wdetJ[i] * qd_loc[3]; + qd1[i + Q * 4] = wdetJ[i] * qd_loc[4]; + qd1[i + Q * 5] = wdetJ[i] * qd_loc[5]; + qd1[i + Q * 6] = wdetJ[i] * qd_loc[6]; + qd1[i + Q * 7] = wdetJ[i] * qd_loc[7]; + qd1[i + Q * 8] = wdetJ[i] * qd_loc[8]; + } + { + const CeedScalar coeff = + CoeffUnpack1(CoeffPairSecond<3>((const CeedIntScalar *)ctx), (CeedInt)attr[i]); + + qd2[i] = coeff * qw[i] * qw[i] / wdetJ[i]; + } + } + return 0; +} + +#endif // PALACE_LIBCEED_L2_MASS_BUILD_33_QF_H diff --git a/palace/fem/qfunctions/33/utils_33_qf.h b/palace/fem/qfunctions/33/utils_33_qf.h new file mode 100644 index 0000000000..6d24fc6bba --- /dev/null +++ b/palace/fem/qfunctions/33/utils_33_qf.h @@ -0,0 +1,178 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef PALACE_LIBCEED_UTILS_33_QF_H
+#define PALACE_LIBCEED_UTILS_33_QF_H
+
+#ifndef CEED_RUNNING_JIT_PASS
+#include <math.h>
+#endif
+
+CEED_QFUNCTION_HELPER CeedScalar DetJ33(const CeedScalar J[9])
+{
+  // J: 0 3 6
+  //    1 4 7
+  //    2 5 8
+  return J[0] * (J[4] * J[8] - J[5] * J[7]) - J[1] * (J[3] * J[8] - J[5] * J[6]) +
+         J[2] * (J[3] * J[7] - J[4] * J[6]);
+}
+
+template <bool ComputeDet = false>
+CEED_QFUNCTION_HELPER CeedScalar AdjJt33(const CeedScalar J[9], CeedScalar adjJt[9])
+{
+  // Compute adj(J)^T / det(J) and store the result.
+  // J: 0 3 6
+  //    1 4 7
+  //    2 5 8
+  adjJt[0] = J[4] * J[8] - J[7] * J[5];
+  adjJt[3] = J[7] * J[2] - J[1] * J[8];
+  adjJt[6] = J[1] * J[5] - J[4] * J[2];
+  adjJt[1] = J[6] * J[5] - J[3] * J[8];
+  adjJt[4] = J[0] * J[8] - J[6] * J[2];
+  adjJt[7] = J[3] * J[2] - J[0] * J[5];
+  adjJt[2] = J[3] * J[7] - J[6] * J[4];
+  adjJt[5] = J[6] * J[1] - J[0] * J[7];
+  adjJt[8] = J[0] * J[4] - J[3] * J[1];
+  return ComputeDet ? (J[0] * adjJt[0] + J[1] * adjJt[1] + J[2] * adjJt[2]) : 0.0;
+}
+
+CEED_QFUNCTION_HELPER void MatUnpack33(const CeedScalar *A, const CeedInt A_stride,
+                                       CeedScalar A_loc[9])
+{
+  A_loc[0] = A[A_stride * 0];
+  A_loc[1] = A[A_stride * 1];
+  A_loc[2] = A[A_stride * 2];
+  A_loc[3] = A[A_stride * 3];
+  A_loc[4] = A[A_stride * 4];
+  A_loc[5] = A[A_stride * 5];
+  A_loc[6] = A[A_stride * 6];
+  A_loc[7] = A[A_stride * 7];
+  A_loc[8] = A[A_stride * 8];
+}
+
+CEED_QFUNCTION_HELPER void MultBx33(const CeedScalar B[9], const CeedScalar x[3],
+                                    CeedScalar y[3])
+{
+  // B: 0 3 6
+  //    1 4 7
+  //    2 5 8
+  y[0] = B[0] * x[0] + B[3] * x[1] + B[6] * x[2];
+  y[1] = B[1] * x[0] + B[4] * x[1] + B[7] * x[2];
+  y[2] = B[2] * x[0] + B[5] * x[1] + B[8] * x[2];
+}
+
+CEED_QFUNCTION_HELPER void MultAtBCx33(const CeedScalar A[9], const CeedScalar B[9],
+                                       const CeedScalar C[9], const CeedScalar x[3],
+                                       CeedScalar y[3])
+{
+  // A, B, C: 0 3 6
+  //          1 4 7
+  //          2 5 8
+  CeedScalar z[3];
+
+  y[0] = C[0] * x[0] + C[3] * x[1] + C[6] * x[2];
+  y[1] = C[1] * x[0] + C[4] * x[1] + C[7] * x[2];
+  y[2] = C[2] * x[0] + C[5] * x[1] + C[8] * x[2];
+
+  z[0] = B[0] * y[0] + B[3] * y[1] + B[6] * y[2];
+  z[1] = B[1] * y[0] + B[4] * y[1] + B[7] * y[2];
+  z[2] = B[2] * y[0] + B[5] * y[1] + B[8] * y[2];
+
+  y[0] = A[0] * z[0] + A[1] * z[1] + A[2] * z[2];
+  y[1] = A[3] * z[0] + A[4] * z[1] + A[5] * z[2];
+  y[2] = A[6] * z[0] + A[7] * z[1] + A[8] * z[2];
+}
+
+CEED_QFUNCTION_HELPER void MultBAx33(const CeedScalar A[9], const CeedScalar B[9],
+                                     const CeedScalar x[3], CeedScalar y[3])
+{
+  // A, B: 0 3 6
+  //       1 4 7
+  //       2 5 8
+  CeedScalar z[3];
+
+  z[0] = A[0] * x[0] + A[3] * x[1] + A[6] * x[2];
+  z[1] = A[1] * x[0] + A[4] * x[1] + A[7] * x[2];
+  z[2] = A[2] * x[0] + A[5] * x[1] + A[8] * x[2];
+
+  y[0] = B[0] * z[0] + B[3] * z[1] + B[6] * z[2];
+  y[1] = B[1] * z[0] + B[4] * z[1] + B[7] * z[2];
+  y[2] = B[2] * z[0] + B[5] * z[1] + B[8] * z[2];
+}
+
+CEED_QFUNCTION_HELPER void MultAtBA33(const CeedScalar A[9], const CeedScalar B[9],
+                                      CeedScalar C[9])
+{
+  // A, B, C: 0 3 6
+  //          1 4 7
+  //          2 5 8
+
+  // First compute entries of R = B A.
+ const CeedScalar R11 = B[0] * A[0] + B[3] * A[1] + B[6] * A[2]; + const CeedScalar R21 = B[1] * A[0] + B[4] * A[1] + B[7] * A[2]; + const CeedScalar R31 = B[2] * A[0] + B[5] * A[1] + B[8] * A[2]; + const CeedScalar R12 = B[0] * A[3] + B[3] * A[4] + B[6] * A[5]; + const CeedScalar R22 = B[1] * A[3] + B[4] * A[4] + B[7] * A[5]; + const CeedScalar R32 = B[2] * A[3] + B[5] * A[4] + B[8] * A[5]; + const CeedScalar R13 = B[0] * A[6] + B[3] * A[7] + B[6] * A[8]; + const CeedScalar R23 = B[1] * A[6] + B[4] * A[7] + B[7] * A[8]; + const CeedScalar R33 = B[2] * A[6] + B[5] * A[7] + B[8] * A[8]; + + C[0] = A[0] * R11 + A[1] * R21 + A[2] * R31; + C[1] = A[3] * R11 + A[4] * R21 + A[5] * R31; + C[2] = A[6] * R11 + A[7] * R21 + A[8] * R31; + C[3] = A[0] * R12 + A[1] * R22 + A[2] * R32; + C[4] = A[3] * R12 + A[4] * R22 + A[5] * R32; + C[5] = A[6] * R12 + A[7] * R22 + A[8] * R32; + C[6] = A[0] * R13 + A[1] * R23 + A[2] * R33; + C[7] = A[3] * R13 + A[4] * R23 + A[5] * R33; + C[8] = A[6] * R13 + A[7] * R23 + A[8] * R33; +} + +CEED_QFUNCTION_HELPER void MultAtBC33(const CeedScalar A[9], const CeedScalar B[9], + const CeedScalar C[9], CeedScalar D[9]) +{ + // A, B, C, D: 0 3 6 + // 1 4 7 + // 2 5 8 + + // First compute entries of R = B C. + const CeedScalar R11 = B[0] * C[0] + B[3] * C[1] + B[6] * C[2]; + const CeedScalar R21 = B[1] * C[0] + B[4] * C[1] + B[7] * C[2]; + const CeedScalar R31 = B[2] * C[0] + B[5] * C[1] + B[8] * C[2]; + const CeedScalar R12 = B[0] * C[3] + B[3] * C[4] + B[6] * C[5]; + const CeedScalar R22 = B[1] * C[3] + B[4] * C[4] + B[7] * C[5]; + const CeedScalar R32 = B[2] * C[3] + B[5] * C[4] + B[8] * C[5]; + const CeedScalar R13 = B[0] * C[6] + B[3] * C[7] + B[6] * C[8]; + const CeedScalar R23 = B[1] * C[6] + B[4] * C[7] + B[7] * C[8]; + const CeedScalar R33 = B[2] * C[6] + B[5] * C[7] + B[8] * C[8]; + + D[0] = A[0] * R11 + A[1] * R21 + A[2] * R31; + D[1] = A[3] * R11 + A[4] * R21 + A[5] * R31; + D[2] = A[6] * R11 + A[7] * R21 + A[8] * R31; + D[3] = A[0] * R12 + A[1] * R22 + A[2] * R32; + D[4] = A[3] * R12 + A[4] * R22 + A[5] * R32; + D[5] = A[6] * R12 + A[7] * R22 + A[8] * R32; + D[6] = A[0] * R13 + A[1] * R23 + A[2] * R33; + D[7] = A[3] * R13 + A[4] * R23 + A[5] * R33; + D[8] = A[6] * R13 + A[7] * R23 + A[8] * R33; +} + +CEED_QFUNCTION_HELPER void MultBA33(const CeedScalar A[9], const CeedScalar B[9], + CeedScalar C[9]) +{ + // A, B, C: 0 3 6 + // 1 4 7 + // 2 5 8 + C[0] = B[0] * A[0] + B[3] * A[1] + B[6] * A[2]; + C[1] = B[1] * A[0] + B[4] * A[1] + B[7] * A[2]; + C[2] = B[2] * A[0] + B[5] * A[1] + B[8] * A[2]; + C[3] = B[0] * A[3] + B[3] * A[4] + B[6] * A[5]; + C[4] = B[1] * A[3] + B[4] * A[4] + B[7] * A[5]; + C[5] = B[2] * A[3] + B[5] * A[4] + B[8] * A[5]; + C[6] = B[0] * A[6] + B[3] * A[7] + B[6] * A[8]; + C[7] = B[1] * A[6] + B[4] * A[7] + B[7] * A[8]; + C[8] = B[2] * A[6] + B[5] * A[7] + B[8] * A[8]; +} + +#endif // PALACE_LIBCEED_UTILS_33_QF_H diff --git a/palace/fem/qfunctions/apply/apply_12_qf.h b/palace/fem/qfunctions/apply/apply_12_qf.h new file mode 100644 index 0000000000..3a440f3732 --- /dev/null +++ b/palace/fem/qfunctions/apply/apply_12_qf.h @@ -0,0 +1,26 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
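A minimal sketch (not from the patch), assuming only what the layout comments above state: the 3x3 helpers in utils_33_qf.h store matrices column-major, A(row, col) = A[row + 3 * col]. A plain triple-loop reference for C = A^T B A such as the following can cross-check MultAtBA33 in a unit test; the name RefMultAtBA33 is illustrative and not part of Palace.

// Reference C = A^T B A for column-major 3x3 matrices, A(r, c) = A[r + 3 * c].
static void RefMultAtBA33(const double A[9], const double B[9], double C[9])
{
  for (int c = 0; c < 3; c++)
  {
    for (int r = 0; r < 3; r++)
    {
      double sum = 0.0;
      for (int l = 0; l < 3; l++)
      {
        for (int k = 0; k < 3; k++)
        {
          // (A^T B A)(r, c) = sum_k sum_l A(k, r) * B(k, l) * A(l, c).
          sum += A[k + 3 * r] * B[k + 3 * l] * A[l + 3 * c];
        }
      }
      C[r + 3 * c] = sum;
    }
  }
}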
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_APPLY_12_QF_H +#define PALACE_LIBCEED_APPLY_12_QF_H + +CEED_QFUNCTION(f_apply_12)(void *, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + const CeedScalar *__restrict qd1 = in[0], *__restrict qd2 = in[0] + Q, + *__restrict u1 = in[1], *__restrict u2 = in[2]; + CeedScalar *__restrict v1 = out[0], *__restrict v2 = out[1]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + v1[i] = qd1[i] * u1[i]; + + const CeedScalar u20 = u2[i + Q * 0]; + const CeedScalar u21 = u2[i + Q * 1]; + v2[i + Q * 0] = qd2[i + Q * 0] * u20 + qd2[i + Q * 2] * u21; + v2[i + Q * 1] = qd2[i + Q * 1] * u20 + qd2[i + Q * 3] * u21; + } + return 0; +} + +#endif // PALACE_LIBCEED_APPLY_12_QF_H diff --git a/palace/fem/qfunctions/apply/apply_13_qf.h b/palace/fem/qfunctions/apply/apply_13_qf.h new file mode 100644 index 0000000000..20e376c65d --- /dev/null +++ b/palace/fem/qfunctions/apply/apply_13_qf.h @@ -0,0 +1,28 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_APPLY_13_QF_H +#define PALACE_LIBCEED_APPLY_13_QF_H + +CEED_QFUNCTION(f_apply_13)(void *, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + const CeedScalar *__restrict qd1 = in[0], *__restrict qd2 = in[0] + Q, + *__restrict u1 = in[1], *__restrict u2 = in[2]; + CeedScalar *__restrict v1 = out[0], *__restrict v2 = out[1]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + v1[i] = qd1[i] * u1[i]; + + const CeedScalar u20 = u2[i + Q * 0]; + const CeedScalar u21 = u2[i + Q * 1]; + const CeedScalar u22 = u2[i + Q * 2]; + v2[i + Q * 0] = qd2[i + Q * 0] * u20 + qd2[i + Q * 3] * u21 + qd2[i + Q * 6] * u22; + v2[i + Q * 1] = qd2[i + Q * 1] * u20 + qd2[i + Q * 4] * u21 + qd2[i + Q * 7] * u22; + v2[i + Q * 2] = qd2[i + Q * 2] * u20 + qd2[i + Q * 5] * u21 + qd2[i + Q * 8] * u22; + } + return 0; +} + +#endif // PALACE_LIBCEED_APPLY_13_QF_H diff --git a/palace/fem/qfunctions/apply/apply_1_qf.h b/palace/fem/qfunctions/apply/apply_1_qf.h new file mode 100644 index 0000000000..91a8480e94 --- /dev/null +++ b/palace/fem/qfunctions/apply/apply_1_qf.h @@ -0,0 +1,20 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_APPLY_1_QF_H +#define PALACE_LIBCEED_APPLY_1_QF_H + +CEED_QFUNCTION(f_apply_1)(void *, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + const CeedScalar *__restrict qd = in[0], *__restrict u = in[1]; + CeedScalar *__restrict v = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + v[i] = qd[i] * u[i]; + } + return 0; +} + +#endif // PALACE_LIBCEED_APPLY_1_QF_H diff --git a/palace/fem/qfunctions/apply/apply_21_qf.h b/palace/fem/qfunctions/apply/apply_21_qf.h new file mode 100644 index 0000000000..722c81bfbc --- /dev/null +++ b/palace/fem/qfunctions/apply/apply_21_qf.h @@ -0,0 +1,26 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_APPLY_21_QF_H +#define PALACE_LIBCEED_APPLY_21_QF_H + +CEED_QFUNCTION(f_apply_21)(void *, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + const CeedScalar *__restrict qd1 = in[0], *__restrict qd2 = in[0] + 4 * Q, + *__restrict u1 = in[1], *__restrict u2 = in[2]; + CeedScalar *__restrict v1 = out[0], *__restrict v2 = out[1]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar u10 = u1[i + Q * 0]; + const CeedScalar u11 = u1[i + Q * 1]; + v1[i + Q * 0] = qd1[i + Q * 0] * u10 + qd1[i + Q * 2] * u11; + v1[i + Q * 1] = qd1[i + Q * 1] * u10 + qd1[i + Q * 3] * u11; + + v2[i] = qd2[i] * u2[i]; + } + return 0; +} + +#endif // PALACE_LIBCEED_APPLY_21_QF_H diff --git a/palace/fem/qfunctions/apply/apply_22_qf.h b/palace/fem/qfunctions/apply/apply_22_qf.h new file mode 100644 index 0000000000..bc3b7db27c --- /dev/null +++ b/palace/fem/qfunctions/apply/apply_22_qf.h @@ -0,0 +1,29 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_APPLY_22_QF_H +#define PALACE_LIBCEED_APPLY_22_QF_H + +CEED_QFUNCTION(f_apply_22)(void *, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + const CeedScalar *__restrict qd1 = in[0], *__restrict qd2 = in[0] + 4 * Q, + *__restrict u1 = in[1], *__restrict u2 = in[2]; + CeedScalar *__restrict v1 = out[0], *__restrict v2 = out[1]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar u10 = u1[i + Q * 0]; + const CeedScalar u11 = u1[i + Q * 1]; + v1[i + Q * 0] = qd1[i + Q * 0] * u10 + qd1[i + Q * 2] * u11; + v1[i + Q * 1] = qd1[i + Q * 1] * u10 + qd1[i + Q * 3] * u11; + + const CeedScalar u20 = u2[i + Q * 0]; + const CeedScalar u21 = u2[i + Q * 1]; + v2[i + Q * 0] = qd2[i + Q * 0] * u20 + qd2[i + Q * 2] * u21; + v2[i + Q * 1] = qd2[i + Q * 1] * u20 + qd2[i + Q * 3] * u21; + } + return 0; +} + +#endif // PALACE_LIBCEED_APPLY_22_QF_H diff --git a/palace/fem/qfunctions/apply/apply_2_qf.h b/palace/fem/qfunctions/apply/apply_2_qf.h new file mode 100644 index 0000000000..8b416d886d --- /dev/null +++ b/palace/fem/qfunctions/apply/apply_2_qf.h @@ -0,0 +1,23 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_APPLY_2_QF_H +#define PALACE_LIBCEED_APPLY_2_QF_H + +CEED_QFUNCTION(f_apply_2)(void *, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + const CeedScalar *__restrict qd = in[0], *__restrict u = in[1]; + CeedScalar *__restrict v = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar u0 = u[i + Q * 0]; + const CeedScalar u1 = u[i + Q * 1]; + v[i + Q * 0] = qd[i + Q * 0] * u0 + qd[i + Q * 2] * u1; + v[i + Q * 1] = qd[i + Q * 1] * u0 + qd[i + Q * 3] * u1; + } + return 0; +} + +#endif // PALACE_LIBCEED_APPLY_2_QF_H diff --git a/palace/fem/qfunctions/apply/apply_31_qf.h b/palace/fem/qfunctions/apply/apply_31_qf.h new file mode 100644 index 0000000000..581db19133 --- /dev/null +++ b/palace/fem/qfunctions/apply/apply_31_qf.h @@ -0,0 +1,28 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_APPLY_31_QF_H +#define PALACE_LIBCEED_APPLY_31_QF_H + +CEED_QFUNCTION(f_apply_31)(void *, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + const CeedScalar *__restrict qd1 = in[0], *__restrict qd2 = in[0] + 9 * Q, + *__restrict u1 = in[1], *__restrict u2 = in[2]; + CeedScalar *__restrict v1 = out[0], *__restrict v2 = out[1]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar u10 = u1[i + Q * 0]; + const CeedScalar u11 = u1[i + Q * 1]; + const CeedScalar u12 = u1[i + Q * 2]; + v1[i + Q * 0] = qd1[i + Q * 0] * u10 + qd1[i + Q * 3] * u11 + qd1[i + Q * 6] * u12; + v1[i + Q * 1] = qd1[i + Q * 1] * u10 + qd1[i + Q * 4] * u11 + qd1[i + Q * 7] * u12; + v1[i + Q * 2] = qd1[i + Q * 2] * u10 + qd1[i + Q * 5] * u11 + qd1[i + Q * 8] * u12; + + v2[i] = qd2[i] * u2[i]; + } + return 0; +} + +#endif // PALACE_LIBCEED_APPLY_31_QF_H diff --git a/palace/fem/qfunctions/apply/apply_33_qf.h b/palace/fem/qfunctions/apply/apply_33_qf.h new file mode 100644 index 0000000000..b589b1fd3a --- /dev/null +++ b/palace/fem/qfunctions/apply/apply_33_qf.h @@ -0,0 +1,33 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_APPLY_33_QF_H +#define PALACE_LIBCEED_APPLY_33_QF_H + +CEED_QFUNCTION(f_apply_33)(void *, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + const CeedScalar *__restrict qd1 = in[0], *__restrict qd2 = in[0] + 9 * Q, + *__restrict u1 = in[1], *__restrict u2 = in[2]; + CeedScalar *__restrict v1 = out[0], *__restrict v2 = out[1]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar u10 = u1[i + Q * 0]; + const CeedScalar u11 = u1[i + Q * 1]; + const CeedScalar u12 = u1[i + Q * 2]; + v1[i + Q * 0] = qd1[i + Q * 0] * u10 + qd1[i + Q * 3] * u11 + qd1[i + Q * 6] * u12; + v1[i + Q * 1] = qd1[i + Q * 1] * u10 + qd1[i + Q * 4] * u11 + qd1[i + Q * 7] * u12; + v1[i + Q * 2] = qd1[i + Q * 2] * u10 + qd1[i + Q * 5] * u11 + qd1[i + Q * 8] * u12; + + const CeedScalar u20 = u2[i + Q * 0]; + const CeedScalar u21 = u2[i + Q * 1]; + const CeedScalar u22 = u2[i + Q * 2]; + v2[i + Q * 0] = qd2[i + Q * 0] * u20 + qd2[i + Q * 3] * u21 + qd2[i + Q * 6] * u22; + v2[i + Q * 1] = qd2[i + Q * 1] * u20 + qd2[i + Q * 4] * u21 + qd2[i + Q * 7] * u22; + v2[i + Q * 2] = qd2[i + Q * 2] * u20 + qd2[i + Q * 5] * u21 + qd2[i + Q * 8] * u22; + } + return 0; +} + +#endif // PALACE_LIBCEED_APPLY_33_QF_H diff --git a/palace/fem/qfunctions/apply/apply_3_qf.h b/palace/fem/qfunctions/apply/apply_3_qf.h new file mode 100644 index 0000000000..63c2a13019 --- /dev/null +++ b/palace/fem/qfunctions/apply/apply_3_qf.h @@ -0,0 +1,25 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
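The f_apply_* QFunctions above, together with the apply_qf.h header that follows, all index the assembled quadrature data the same way: at quadrature point i, the vdim x vdim matrix is stored column-major with stride Q, entry (row, col) at qd[i + Q * (row + vdim * col)], and for the pairwise variants the first vdim1 * vdim1 * Q entries belong to the first input/output (e.g. qd2 = in[0] + 9 * Q in f_apply_33). A minimal sketch of that per-point product, with a hypothetical helper name and double in place of CeedScalar:

// Multiply by the vdim x vdim matrix stored at quadrature point i of Q,
// qd[i + Q * (r + vdim * c)], matching the indexing in the f_apply_* QFunctions above.
static void ApplyQuadDataAtPoint(const double *qd, int Q, int i, int vdim,
                                 const double *u_loc, double *v_loc)
{
  for (int r = 0; r < vdim; r++)
  {
    double sum = 0.0;
    for (int c = 0; c < vdim; c++)
    {
      sum += qd[i + Q * (r + vdim * c)] * u_loc[c];
    }
    v_loc[r] = sum;
  }
}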
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_APPLY_3_QF_H +#define PALACE_LIBCEED_APPLY_3_QF_H + +CEED_QFUNCTION(f_apply_3)(void *, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + const CeedScalar *__restrict qd = in[0], *__restrict u = in[1]; + CeedScalar *__restrict v = out[0]; + + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar u0 = u[i + Q * 0]; + const CeedScalar u1 = u[i + Q * 1]; + const CeedScalar u2 = u[i + Q * 2]; + v[i + Q * 0] = qd[i + Q * 0] * u0 + qd[i + Q * 3] * u1 + qd[i + Q * 6] * u2; + v[i + Q * 1] = qd[i + Q * 1] * u0 + qd[i + Q * 4] * u1 + qd[i + Q * 7] * u2; + v[i + Q * 2] = qd[i + Q * 2] * u0 + qd[i + Q * 5] * u1 + qd[i + Q * 8] * u2; + } + return 0; +} + +#endif // PALACE_LIBCEED_APPLY_3_QF_H diff --git a/palace/fem/qfunctions/apply_qf.h b/palace/fem/qfunctions/apply_qf.h new file mode 100644 index 0000000000..d956e3fa83 --- /dev/null +++ b/palace/fem/qfunctions/apply_qf.h @@ -0,0 +1,26 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_APPLY_QF_H +#define PALACE_LIBCEED_APPLY_QF_H + +// libCEED QFunctions for application of a generic operator with assembled quadrature data. +// in[0] is quadrature data, shape [ncomp=vdim*vdim, Q] +// in[1] is active vector, shape [ncomp=vdim, Q] +// out[0] is active vector, shape [ncomp=vdim, Q] + +// For pairwise apply functions, the inputs and outputs come in pairs and the quadrature +// data is arranged to be applied with the first vdim*vdim components for the first +// input/output and the remainder for the second. + +#include "apply/apply_12_qf.h" +#include "apply/apply_13_qf.h" +#include "apply/apply_1_qf.h" +#include "apply/apply_21_qf.h" +#include "apply/apply_22_qf.h" +#include "apply/apply_2_qf.h" +#include "apply/apply_31_qf.h" +#include "apply/apply_33_qf.h" +#include "apply/apply_3_qf.h" + +#endif // PALACE_LIBCEED_APPLY_QF_H diff --git a/palace/fem/qfunctions/coeff/coeff_1_qf.h b/palace/fem/qfunctions/coeff/coeff_1_qf.h new file mode 100644 index 0000000000..e57d212ce5 --- /dev/null +++ b/palace/fem/qfunctions/coeff/coeff_1_qf.h @@ -0,0 +1,21 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_COEFF_1_QF_H +#define PALACE_LIBCEED_COEFF_1_QF_H + +#include "coeff_qf.h" + +CEED_QFUNCTION_HELPER CeedScalar CoeffUnpack1(const CeedIntScalar *ctx, const CeedInt attr) +{ + const CeedInt k = (NumAttr(ctx) > 0) ? AttrMat(ctx)[attr - 1].first : 0; + return MatCoeff(ctx)[k].second; +} + +CEED_QFUNCTION_HELPER void CoeffUnpack1(const CeedIntScalar *ctx, const CeedInt attr, + CeedScalar coeff[1]) +{ + coeff[0] = CoeffUnpack1(ctx, attr); +} + +#endif // PALACE_LIBCEED_COEFF_1_QF_H diff --git a/palace/fem/qfunctions/coeff/coeff_2_qf.h b/palace/fem/qfunctions/coeff/coeff_2_qf.h new file mode 100644 index 0000000000..882cfb5b67 --- /dev/null +++ b/palace/fem/qfunctions/coeff/coeff_2_qf.h @@ -0,0 +1,20 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_COEFF_2_QF_H +#define PALACE_LIBCEED_COEFF_2_QF_H + +#include "coeff_qf.h" + +CEED_QFUNCTION_HELPER void CoeffUnpack2(const CeedIntScalar *ctx, const CeedInt attr, + CeedScalar coeff[4]) +{ + const CeedInt k = (NumAttr(ctx) > 0) ? 
AttrMat(ctx)[attr - 1].first : 0;
+  const CeedIntScalar *mat_coeff = MatCoeff(ctx);
+  coeff[0] = mat_coeff[4 * k + 0].second;
+  coeff[1] = mat_coeff[4 * k + 1].second;
+  coeff[2] = mat_coeff[4 * k + 2].second;
+  coeff[3] = mat_coeff[4 * k + 3].second;
+}
+
+#endif // PALACE_LIBCEED_COEFF_2_QF_H
diff --git a/palace/fem/qfunctions/coeff/coeff_3_qf.h b/palace/fem/qfunctions/coeff/coeff_3_qf.h
new file mode 100644
index 0000000000..1d018dfe99
--- /dev/null
+++ b/palace/fem/qfunctions/coeff/coeff_3_qf.h
@@ -0,0 +1,25 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef PALACE_LIBCEED_COEFF_3_QF_H
+#define PALACE_LIBCEED_COEFF_3_QF_H
+
+#include "coeff_qf.h"
+
+CEED_QFUNCTION_HELPER void CoeffUnpack3(const CeedIntScalar *ctx, const CeedInt attr,
+                                        CeedScalar coeff[9])
+{
+  const CeedInt k = (NumAttr(ctx) > 0) ? AttrMat(ctx)[attr - 1].first : 0;
+  const CeedIntScalar *mat_coeff = MatCoeff(ctx);
+  coeff[0] = mat_coeff[9 * k + 0].second;
+  coeff[1] = mat_coeff[9 * k + 1].second;
+  coeff[2] = mat_coeff[9 * k + 2].second;
+  coeff[3] = mat_coeff[9 * k + 3].second;
+  coeff[4] = mat_coeff[9 * k + 4].second;
+  coeff[5] = mat_coeff[9 * k + 5].second;
+  coeff[6] = mat_coeff[9 * k + 6].second;
+  coeff[7] = mat_coeff[9 * k + 7].second;
+  coeff[8] = mat_coeff[9 * k + 8].second;
+}
+
+#endif // PALACE_LIBCEED_COEFF_3_QF_H
diff --git a/palace/fem/qfunctions/coeff/coeff_qf.h b/palace/fem/qfunctions/coeff/coeff_qf.h
new file mode 100644
index 0000000000..34164cef3c
--- /dev/null
+++ b/palace/fem/qfunctions/coeff/coeff_qf.h
@@ -0,0 +1,45 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef PALACE_LIBCEED_COEFF_QF_H
+#define PALACE_LIBCEED_COEFF_QF_H
+
+union CeedIntScalar
+{
+  CeedInt first;
+  CeedScalar second;
+};
+
+// The first entry of ctx is the number of (1-based) attributes, followed by the entries of
+// the attribute to material index array (these are 0-based).
+// The next entry is the number of material property coefficients, followed by the
+// coefficients.
+// Pair coefficients are two coefficient contexts arranged contiguously in memory.
+
+CEED_QFUNCTION_HELPER CeedInt NumAttr(const CeedIntScalar *ctx)
+{
+  return ctx[0].first;
+}
+
+CEED_QFUNCTION_HELPER CeedInt NumMat(const CeedIntScalar *ctx)
+{
+  return ctx[1 + NumAttr(ctx)].first;
+}
+
+CEED_QFUNCTION_HELPER const CeedIntScalar *AttrMat(const CeedIntScalar *ctx)
+{
+  return ctx + 1;
+}
+
+CEED_QFUNCTION_HELPER const CeedIntScalar *MatCoeff(const CeedIntScalar *ctx)
+{
+  return ctx + 2 + NumAttr(ctx);
+}
+
+template <int DIM>
+CEED_QFUNCTION_HELPER const CeedIntScalar *CoeffPairSecond(const CeedIntScalar *ctx)
+{
+  return ctx + 2 + NumAttr(ctx) + (DIM * DIM) * NumMat(ctx);
+}
+
+#endif // PALACE_LIBCEED_COEFF_QF_H
diff --git a/palace/fem/qfunctions/curlcurl_qf.h b/palace/fem/qfunctions/curlcurl_qf.h
index ce7b34da67..2e7cca630f 100644
--- a/palace/fem/qfunctions/curlcurl_qf.h
+++ b/palace/fem/qfunctions/curlcurl_qf.h
@@ -1,174 +1,174 @@
-// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
-// SPDX-License-Identifier: Apache-2.0
-
-#ifndef PALACE_LIBCEED_CURLCURL_QF_H
-#define PALACE_LIBCEED_CURLCURL_QF_H
-
-#include "utils_qf.h"
-
-struct CurlCurlContext
-{
-  CeedInt dim, space_dim, curl_dim;
-  CeedScalar coeff;
-};
-
-// libCEED QFunction for building quadrature data for a curl-curl operator with a scalar
-// constant coefficient.
-CEED_QFUNCTION(f_build_curlcurl_const_scalar)(void *ctx, CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw / det(J) J^T C J and store the symmetric part of - // the result. In 2D, compute and store qw * c / det(J). - // in[0] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[1] is quadrature weights, size (Q) - CurlCurlContext *bc = (CurlCurlContext *)ctx; - const CeedScalar coeff = bc->coeff; - const CeedScalar *J = in[0], *qw = in[1]; - CeedScalar *qd = out[0]; - switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim) - { - case 221: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qd[i] = qw[i] * coeff / DetJ22(J + i, Q); - } - break; - case 321: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qd[i] = qw[i] * coeff / DetJ32(J + i, Q); - } - break; - case 333: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ33(J + i, Q, &coeff, 1, 1, qw[i], Q, qd + i); - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for a curl-curl operator with a scalar -// coefficient evaluated at quadrature points. -CEED_QFUNCTION(f_build_curlcurl_quad_scalar)(void *ctx, CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw / det(J) J^T C J and store the symmetric part of - // the result. In 2D, compute and store qw * c / det(J). - // in[0] is coefficients with shape [ncomp=1, Q] - // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[2] is quadrature weights, size (Q) - CurlCurlContext *bc = (CurlCurlContext *)ctx; - const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; - CeedScalar *qd = out[0]; - switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim) - { - case 221: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qd[i] = qw[i] * c[i] / DetJ22(J + i, Q); - } - break; - case 321: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qd[i] = qw[i] * c[i] / DetJ32(J + i, Q); - } - break; - case 333: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ33(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i); - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for a curl-curl operator with a vector -// coefficient evaluated at quadrature points. -CEED_QFUNCTION(f_build_curlcurl_quad_vector)(void *ctx, CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw / det(J) J^T C J and store the symmetric part of - // the result. In 2D, compute and store qw * c / det(J). - // in[0] is coefficients with shape [ncomp=space_dim, Q] - // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[2] is quadrature weights, size (Q) - CurlCurlContext *bc = (CurlCurlContext *)ctx; - const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; - CeedScalar *qd = out[0]; - switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim) - { - case 333: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ33(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i); - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for a curl-curl operator -// with a matrix coefficient evaluated at quadrature points -CEED_QFUNCTION(f_build_curlcurl_quad_matrix)(void *ctx, CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw / det(J) J^T C J and store the symmetric part of - // the result. In 2D, compute and store qw * c / det(J). 
- // in[0] is coefficients with shape [ncomp=space_dim*(space_dim+1)/2, Q] - // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[2] is quadrature weights, size (Q) - CurlCurlContext *bc = (CurlCurlContext *)ctx; - const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; - CeedScalar *qd = out[0]; - switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim) - { - case 333: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ33(J + i, Q, c + i, Q, 6, qw[i], Q, qd + i); - } - break; - } - return 0; -} - -// libCEED QFunction for applying a curl-curl operator. -CEED_QFUNCTION(f_apply_curlcurl)(void *ctx, CeedInt Q, const CeedScalar *const *in, - CeedScalar *const *out) -{ - // in[0], out[0] have shape [curl_dim, ncomp=1, Q] - CurlCurlContext *bc = (CurlCurlContext *)ctx; - const CeedScalar *uc = in[0], *qd = in[1]; - CeedScalar *vc = out[0]; - switch (10 * bc->dim + bc->curl_dim) - { - case 21: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - vc[i] = qd[i] * uc[i]; - } - break; - case 33: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - const CeedScalar uc0 = uc[i + Q * 0]; - const CeedScalar uc1 = uc[i + Q * 1]; - const CeedScalar uc2 = uc[i + Q * 2]; - vc[i + Q * 0] = qd[i + Q * 0] * uc0 + qd[i + Q * 1] * uc1 + qd[i + Q * 2] * uc2; - vc[i + Q * 1] = qd[i + Q * 1] * uc0 + qd[i + Q * 3] * uc1 + qd[i + Q * 4] * uc2; - vc[i + Q * 2] = qd[i + Q * 2] * uc0 + qd[i + Q * 4] * uc1 + qd[i + Q * 5] * uc2; - } - break; - } - return 0; -} - -#endif // PALACE_LIBCEED_CURLCURL_QF_H +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_CURLCURL_QF_H +#define PALACE_LIBCEED_CURLCURL_QF_H + +#include "utils_qf.h" + +struct CurlCurlContext +{ + CeedInt dim, space_dim, curl_dim; + CeedScalar coeff; +}; + +// libCEED QFunction for building quadrature data for a curl-curl operator with a scalar +// constant coefficient. +CEED_QFUNCTION(f_build_curlcurl_const_scalar)(void *ctx, CeedInt Q, + const CeedScalar *const *in, + CeedScalar *const *out) +{ + // At every quadrature point, compute qw / det(J) J^T C J and store the symmetric part of + // the result. In 2D, compute and store qw * c / det(J). + // in[0] is Jacobians with shape [dim, ncomp=space_dim, Q] + // in[1] is quadrature weights, size (Q) + CurlCurlContext *bc = (CurlCurlContext *)ctx; + const CeedScalar coeff = bc->coeff; + const CeedScalar *J = in[0], *qw = in[1]; + CeedScalar *qd = out[0]; + switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim) + { + case 221: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qd[i] = qw[i] * coeff / DetJ22(J + i, Q); + } + break; + case 321: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qd[i] = qw[i] * coeff / DetJ32(J + i, Q); + } + break; + case 333: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultJtCJ33(J + i, Q, &coeff, 1, 1, qw[i], Q, qd + i); + } + break; + } + return 0; +} + +// libCEED QFunction for building quadrature data for a curl-curl operator with a scalar +// coefficient evaluated at quadrature points. +CEED_QFUNCTION(f_build_curlcurl_quad_scalar)(void *ctx, CeedInt Q, + const CeedScalar *const *in, + CeedScalar *const *out) +{ + // At every quadrature point, compute qw / det(J) J^T C J and store the symmetric part of + // the result. In 2D, compute and store qw * c / det(J). 
+ // in[0] is coefficients with shape [ncomp=1, Q] + // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] + // in[2] is quadrature weights, size (Q) + CurlCurlContext *bc = (CurlCurlContext *)ctx; + const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; + CeedScalar *qd = out[0]; + switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim) + { + case 221: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qd[i] = qw[i] * c[i] / DetJ22(J + i, Q); + } + break; + case 321: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qd[i] = qw[i] * c[i] / DetJ32(J + i, Q); + } + break; + case 333: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultJtCJ33(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i); + } + break; + } + return 0; +} + +// libCEED QFunction for building quadrature data for a curl-curl operator with a vector +// coefficient evaluated at quadrature points. +CEED_QFUNCTION(f_build_curlcurl_quad_vector)(void *ctx, CeedInt Q, + const CeedScalar *const *in, + CeedScalar *const *out) +{ + // At every quadrature point, compute qw / det(J) J^T C J and store the symmetric part of + // the result. In 2D, compute and store qw * c / det(J). + // in[0] is coefficients with shape [ncomp=space_dim, Q] + // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] + // in[2] is quadrature weights, size (Q) + CurlCurlContext *bc = (CurlCurlContext *)ctx; + const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; + CeedScalar *qd = out[0]; + switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim) + { + case 333: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultJtCJ33(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i); + } + break; + } + return 0; +} + +// libCEED QFunction for building quadrature data for a curl-curl operator +// with a matrix coefficient evaluated at quadrature points +CEED_QFUNCTION(f_build_curlcurl_quad_matrix)(void *ctx, CeedInt Q, + const CeedScalar *const *in, + CeedScalar *const *out) +{ + // At every quadrature point, compute qw / det(J) J^T C J and store the symmetric part of + // the result. In 2D, compute and store qw * c / det(J). + // in[0] is coefficients with shape [ncomp=space_dim*(space_dim+1)/2, Q] + // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] + // in[2] is quadrature weights, size (Q) + CurlCurlContext *bc = (CurlCurlContext *)ctx; + const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; + CeedScalar *qd = out[0]; + switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim) + { + case 333: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultJtCJ33(J + i, Q, c + i, Q, 6, qw[i], Q, qd + i); + } + break; + } + return 0; +} + +// libCEED QFunction for applying a curl-curl operator. 
+CEED_QFUNCTION(f_apply_curlcurl)(void *ctx, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + // in[0], out[0] have shape [curl_dim, ncomp=1, Q] + CurlCurlContext *bc = (CurlCurlContext *)ctx; + const CeedScalar *uc = in[0], *qd = in[1]; + CeedScalar *vc = out[0]; + switch (10 * bc->dim + bc->curl_dim) + { + case 21: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + vc[i] = qd[i] * uc[i]; + } + break; + case 33: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar uc0 = uc[i + Q * 0]; + const CeedScalar uc1 = uc[i + Q * 1]; + const CeedScalar uc2 = uc[i + Q * 2]; + vc[i + Q * 0] = qd[i + Q * 0] * uc0 + qd[i + Q * 1] * uc1 + qd[i + Q * 2] * uc2; + vc[i + Q * 1] = qd[i + Q * 1] * uc0 + qd[i + Q * 3] * uc1 + qd[i + Q * 4] * uc2; + vc[i + Q * 2] = qd[i + Q * 2] * uc0 + qd[i + Q * 4] * uc1 + qd[i + Q * 5] * uc2; + } + break; + } + return 0; +} + +#endif // PALACE_LIBCEED_CURLCURL_QF_H diff --git a/palace/fem/qfunctions/curlcurlmass_qf.h b/palace/fem/qfunctions/curlcurlmass_qf.h index 39e4092097..45623852d3 100644 --- a/palace/fem/qfunctions/curlcurlmass_qf.h +++ b/palace/fem/qfunctions/curlcurlmass_qf.h @@ -1,402 +1,402 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LIBCEED_CURLCURL_MASS_QF_H -#define PALACE_LIBCEED_CURLCURL_MASS_QF_H - -#include "utils_qf.h" - -struct CurlCurlMassContext -{ - CeedInt dim, space_dim, curl_dim; -}; - -// libCEED QFunction for building quadrature data for a curl-curl + mass operator with -// scalar coefficients evaluated at quadrature points. -CEED_QFUNCTION(f_build_curlcurl_mass_quad_scalar_scalar)(void *ctx, CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw / det(J) J^T C J (3D) or qw * c / det(J) (2D) and - // qw / det(J) adj(J) C adj(J)^T and store the result. - // in[0] is curl-curl coefficients with shape [ncomp=1, Q] - // in[1] is mass coefficients with shape [ncomp=1, Q] - // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[3] is quadrature weights, size (Q) - CurlCurlMassContext *bc = (CurlCurlMassContext *)ctx; - const CeedScalar *cc = in[0], *cm = in[1], *J = in[2], *qw = in[3]; - CeedScalar *qdc = out[0], *qdm = out[0] + Q * bc->curl_dim * (bc->curl_dim + 1) / 2; - switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim) - { - case 221: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qdc[i] = qw[i] * cc[i] / DetJ22(J + i, Q); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt22(J + i, Q, cm + i, Q, 1, qw[i], Q, qdm + i); - } - break; - case 321: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qdc[i] = qw[i] * cc[i] / DetJ32(J + i, Q); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt32(J + i, Q, cm + i, Q, 1, qw[i], Q, qdm + i); - } - break; - case 333: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ33(J + i, Q, cc + i, Q, 1, qw[i], Q, qdc + i); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt33(J + i, Q, cm + i, Q, 1, qw[i], Q, qdm + i); - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for a curl-curl + mass operator with -// scalar and vector coefficients evaluated at quadrature points. 
-CEED_QFUNCTION(f_build_curlcurl_mass_quad_scalar_vector)(void *ctx, CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw / det(J) J^T C J (3D) or qw * c / det(J) (2D) and - // qw / det(J) adj(J) C adj(J)^T and store the result. - // in[0] is curl-curl coefficients with shape [ncomp=1, Q] - // in[1] is mass coefficients with shape [ncomp=space_dim, Q] - // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[3] is quadrature weights, size (Q) - CurlCurlMassContext *bc = (CurlCurlMassContext *)ctx; - const CeedScalar *cc = in[0], *cm = in[1], *J = in[2], *qw = in[3]; - CeedScalar *qdc = out[0], *qdm = out[0] + Q * bc->curl_dim * (bc->curl_dim + 1) / 2; - switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim) - { - case 221: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qdc[i] = qw[i] * cc[i] / DetJ22(J + i, Q); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt22(J + i, Q, cm + i, Q, 2, qw[i], Q, qdm + i); - } - break; - case 321: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qdc[i] = qw[i] * cc[i] / DetJ32(J + i, Q); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt32(J + i, Q, cm + i, Q, 3, qw[i], Q, qdm + i); - } - break; - case 333: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ33(J + i, Q, cc + i, Q, 1, qw[i], Q, qdc + i); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt33(J + i, Q, cm + i, Q, 3, qw[i], Q, qdm + i); - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for a curl-curl + mass operator with -// scalar and matrix coefficients evaluated at quadrature points. -CEED_QFUNCTION(f_build_curlcurl_mass_quad_scalar_matrix)(void *ctx, CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw / det(J) J^T C J (3D) or qw * c / det(J) (2D) and - // qw / det(J) adj(J) C adj(J)^T and store the result. - // in[0] is curl-curl coefficients with shape [ncomp=1, Q] - // in[1] is mass coefficients with shape [ncomp=space_dim*(space_dim+1)/2, Q] - // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[3] is quadrature weights, size (Q) - CurlCurlMassContext *bc = (CurlCurlMassContext *)ctx; - const CeedScalar *cc = in[0], *cm = in[1], *J = in[2], *qw = in[3]; - CeedScalar *qdc = out[0], *qdm = out[0] + Q * bc->curl_dim * (bc->curl_dim + 1) / 2; - switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim) - { - case 221: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qdc[i] = qw[i] * cc[i] / DetJ22(J + i, Q); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt22(J + i, Q, cm + i, Q, 3, qw[i], Q, qdm + i); - } - break; - case 321: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qdc[i] = qw[i] * cc[i] / DetJ32(J + i, Q); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt32(J + i, Q, cm + i, Q, 6, qw[i], Q, qdm + i); - } - break; - case 333: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ33(J + i, Q, cc + i, Q, 1, qw[i], Q, qdc + i); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt33(J + i, Q, cm + i, Q, 6, qw[i], Q, qdm + i); - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for a curl-curl + mass operator with -// vector and scalar coefficients evaluated at quadrature points. 
-CEED_QFUNCTION(f_build_curlcurl_mass_quad_vector_scalar)(void *ctx, CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw / det(J) J^T C J (3D) or qw * c / det(J) (2D) and - // qw / det(J) adj(J) C adj(J)^T and store the result. - // in[0] is curl-curl coefficients with shape [ncomp=space_dim, Q] - // in[1] is mass coefficients with shape [ncomp=1, Q] - // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[3] is quadrature weights, size (Q) - CurlCurlMassContext *bc = (CurlCurlMassContext *)ctx; - const CeedScalar *cc = in[0], *cm = in[1], *J = in[2], *qw = in[3]; - CeedScalar *qdc = out[0], *qdm = out[0] + Q * bc->curl_dim * (bc->curl_dim + 1) / 2; - switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim) - { - case 333: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ33(J + i, Q, cc + i, Q, 3, qw[i], Q, qdc + i); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt33(J + i, Q, cm + i, Q, 1, qw[i], Q, qdm + i); - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for a curl-curl + mass operator with -// vector coefficients evaluated at quadrature points. -CEED_QFUNCTION(f_build_curlcurl_mass_quad_vector_vector)(void *ctx, CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw / det(J) J^T C J (3D) or qw * c / det(J) (2D) and - // qw / det(J) adj(J) C adj(J)^T and store the result. - // in[0] is curl-curl coefficients with shape [ncomp=space_dim, Q] - // in[1] is mass coefficients with shape [ncomp=space_dim, Q] - // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[3] is quadrature weights, size (Q) - CurlCurlMassContext *bc = (CurlCurlMassContext *)ctx; - const CeedScalar *cc = in[0], *cm = in[1], *J = in[2], *qw = in[3]; - CeedScalar *qdc = out[0], *qdm = out[0] + Q * bc->curl_dim * (bc->curl_dim + 1) / 2; - switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim) - { - case 333: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ33(J + i, Q, cc + i, Q, 3, qw[i], Q, qdc + i); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt33(J + i, Q, cm + i, Q, 3, qw[i], Q, qdm + i); - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for a curl-curl + mass operator with -// vector and matrix coefficients evaluated at quadrature points. -CEED_QFUNCTION(f_build_curlcurl_mass_quad_vector_matrix)(void *ctx, CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw / det(J) J^T C J (3D) or qw * c / det(J) (2D) and - // qw / det(J) adj(J) C adj(J)^T and store the result. 
- // in[0] is curl-curl coefficients with shape [ncomp=space_dim, Q] - // in[1] is mass coefficients with shape [ncomp=space_dim*(space_dim+1)/2, Q] - // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[3] is quadrature weights, size (Q) - CurlCurlMassContext *bc = (CurlCurlMassContext *)ctx; - const CeedScalar *cc = in[0], *cm = in[1], *J = in[2], *qw = in[3]; - CeedScalar *qdc = out[0], *qdm = out[0] + Q * bc->curl_dim * (bc->curl_dim + 1) / 2; - switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim) - { - case 333: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ33(J + i, Q, cc + i, Q, 3, qw[i], Q, qdc + i); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt33(J + i, Q, cm + i, Q, 6, qw[i], Q, qdm + i); - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for a curl-curl + mass operator with -// matrix and scalar coefficients evaluated at quadrature points. -CEED_QFUNCTION(f_build_curlcurl_mass_quad_matrix_scalar)(void *ctx, CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw / det(J) J^T C J (3D) or qw * c / det(J) (2D) and - // qw / det(J) adj(J) C adj(J)^T and store the result. - // in[0] is curl-curl coefficients with shape [ncomp=space_dim*(space_dim+1)/2, Q] - // in[1] is mass coefficients with shape [ncomp=1, Q] - // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[3] is quadrature weights, size (Q) - CurlCurlMassContext *bc = (CurlCurlMassContext *)ctx; - const CeedScalar *cc = in[0], *cm = in[1], *J = in[2], *qw = in[3]; - CeedScalar *qdc = out[0], *qdm = out[0] + Q * bc->curl_dim * (bc->curl_dim + 1) / 2; - switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim) - { - case 333: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ33(J + i, Q, cc + i, Q, 6, qw[i], Q, qdc + i); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt33(J + i, Q, cm + i, Q, 1, qw[i], Q, qdm + i); - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for a curl-curl + mass operator with -// matrix and vector coefficients evaluated at quadrature points. -CEED_QFUNCTION(f_build_curlcurl_mass_quad_matrix_vector)(void *ctx, CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw / det(J) J^T C J (3D) or qw * c / det(J) (2D) and - // qw / det(J) adj(J) C adj(J)^T and store the result. - // in[0] is curl-curl coefficients with shape [ncomp=space_dim*(space_dim+1)/2, Q] - // in[1] is mass coefficients with shape [ncomp=space_dim, Q] - // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[3] is quadrature weights, size (Q) - CurlCurlMassContext *bc = (CurlCurlMassContext *)ctx; - const CeedScalar *cc = in[0], *cm = in[1], *J = in[2], *qw = in[3]; - CeedScalar *qdc = out[0], *qdm = out[0] + Q * bc->curl_dim * (bc->curl_dim + 1) / 2; - switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim) - { - case 333: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ33(J + i, Q, cc + i, Q, 6, qw[i], Q, qdc + i); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt33(J + i, Q, cm + i, Q, 3, qw[i], Q, qdm + i); - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for a curl-curl + mass operator with -// matrix coefficients evaluated at quadrature points. 
-CEED_QFUNCTION(f_build_curlcurl_mass_quad_matrix_matrix)(void *ctx, CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw / det(J) J^T C J (3D) or qw * c / det(J) (2D) and - // qw / det(J) adj(J) C adj(J)^T and store the result. - // in[0] is curl-curl coefficients with shape [ncomp=space_dim*(space_dim+1)/2, Q] - // in[1] is mass coefficients with shape [ncomp=space_dim*(space_dim+1)/2, Q] - // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[3] is quadrature weights, size (Q) - CurlCurlMassContext *bc = (CurlCurlMassContext *)ctx; - const CeedScalar *cc = in[0], *cm = in[1], *J = in[2], *qw = in[3]; - CeedScalar *qdc = out[0], *qdm = out[0] + Q * bc->curl_dim * (bc->curl_dim + 1) / 2; - switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim) - { - case 333: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ33(J + i, Q, cc + i, Q, 6, qw[i], Q, qdc + i); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt33(J + i, Q, cm + i, Q, 6, qw[i], Q, qdm + i); - } - break; - } - return 0; -} - -// libCEED QFunction for applying a curl-curl + mass operator. -CEED_QFUNCTION(f_apply_curlcurl_mass)(void *ctx, CeedInt Q, const CeedScalar *const *in, - CeedScalar *const *out) -{ - // in[0], out[0] have shape [dim, ncomp=1, Q] - // in[1], out[1] have shape [curl_dim, ncomp=1, Q] - CurlCurlMassContext *bc = (CurlCurlMassContext *)ctx; - const CeedScalar *u = in[0], *uc = in[1], *qdc = in[2], - *qdm = in[2] + Q * bc->curl_dim * (bc->curl_dim + 1) / 2; - CeedScalar *v = out[0], *vc = out[1]; - switch (10 * bc->dim + bc->curl_dim) - { - case 21: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - vc[i] = qdc[i] * uc[i]; - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - const CeedScalar u0 = u[i + Q * 0]; - const CeedScalar u1 = u[i + Q * 1]; - v[i + Q * 0] = qdm[i + Q * 0] * u0 + qdm[i + Q * 1] * u1; - v[i + Q * 1] = qdm[i + Q * 1] * u0 + qdm[i + Q * 2] * u1; - } - break; - case 33: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - const CeedScalar uc0 = uc[i + Q * 0]; - const CeedScalar uc1 = uc[i + Q * 1]; - const CeedScalar uc2 = uc[i + Q * 2]; - vc[i + Q * 0] = qdc[i + Q * 0] * uc0 + qdc[i + Q * 1] * uc1 + qdc[i + Q * 2] * uc2; - vc[i + Q * 1] = qdc[i + Q * 1] * uc0 + qdc[i + Q * 3] * uc1 + qdc[i + Q * 4] * uc2; - vc[i + Q * 2] = qdc[i + Q * 2] * uc0 + qdc[i + Q * 4] * uc1 + qdc[i + Q * 5] * uc2; - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - const CeedScalar u0 = u[i + Q * 0]; - const CeedScalar u1 = u[i + Q * 1]; - const CeedScalar u2 = u[i + Q * 2]; - v[i + Q * 0] = qdm[i + Q * 0] * u0 + qdm[i + Q * 1] * u1 + qdm[i + Q * 2] * u2; - v[i + Q * 1] = qdm[i + Q * 1] * u0 + qdm[i + Q * 3] * u1 + qdm[i + Q * 4] * u2; - v[i + Q * 2] = qdm[i + Q * 2] * u0 + qdm[i + Q * 4] * u1 + qdm[i + Q * 5] * u2; - } - break; - } - return 0; -} - -#endif // PALACE_LIBCEED_CURLCURL_MASS_QF_H +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_CURLCURL_MASS_QF_H +#define PALACE_LIBCEED_CURLCURL_MASS_QF_H + +#include "utils_qf.h" + +struct CurlCurlMassContext +{ + CeedInt dim, space_dim, curl_dim; +}; + +// libCEED QFunction for building quadrature data for a curl-curl + mass operator with +// scalar coefficients evaluated at quadrature points. 
+CEED_QFUNCTION(f_build_curlcurl_mass_quad_scalar_scalar)(void *ctx, CeedInt Q, + const CeedScalar *const *in, + CeedScalar *const *out) +{ + // At every quadrature point, compute qw / det(J) J^T C J (3D) or qw * c / det(J) (2D) and + // qw / det(J) adj(J) C adj(J)^T and store the result. + // in[0] is curl-curl coefficients with shape [ncomp=1, Q] + // in[1] is mass coefficients with shape [ncomp=1, Q] + // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q] + // in[3] is quadrature weights, size (Q) + CurlCurlMassContext *bc = (CurlCurlMassContext *)ctx; + const CeedScalar *cc = in[0], *cm = in[1], *J = in[2], *qw = in[3]; + CeedScalar *qdc = out[0], *qdm = out[0] + Q * bc->curl_dim * (bc->curl_dim + 1) / 2; + switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim) + { + case 221: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qdc[i] = qw[i] * cc[i] / DetJ22(J + i, Q); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt22(J + i, Q, cm + i, Q, 1, qw[i], Q, qdm + i); + } + break; + case 321: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qdc[i] = qw[i] * cc[i] / DetJ32(J + i, Q); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt32(J + i, Q, cm + i, Q, 1, qw[i], Q, qdm + i); + } + break; + case 333: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultJtCJ33(J + i, Q, cc + i, Q, 1, qw[i], Q, qdc + i); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt33(J + i, Q, cm + i, Q, 1, qw[i], Q, qdm + i); + } + break; + } + return 0; +} + +// libCEED QFunction for building quadrature data for a curl-curl + mass operator with +// scalar and vector coefficients evaluated at quadrature points. +CEED_QFUNCTION(f_build_curlcurl_mass_quad_scalar_vector)(void *ctx, CeedInt Q, + const CeedScalar *const *in, + CeedScalar *const *out) +{ + // At every quadrature point, compute qw / det(J) J^T C J (3D) or qw * c / det(J) (2D) and + // qw / det(J) adj(J) C adj(J)^T and store the result. + // in[0] is curl-curl coefficients with shape [ncomp=1, Q] + // in[1] is mass coefficients with shape [ncomp=space_dim, Q] + // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q] + // in[3] is quadrature weights, size (Q) + CurlCurlMassContext *bc = (CurlCurlMassContext *)ctx; + const CeedScalar *cc = in[0], *cm = in[1], *J = in[2], *qw = in[3]; + CeedScalar *qdc = out[0], *qdm = out[0] + Q * bc->curl_dim * (bc->curl_dim + 1) / 2; + switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim) + { + case 221: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qdc[i] = qw[i] * cc[i] / DetJ22(J + i, Q); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt22(J + i, Q, cm + i, Q, 2, qw[i], Q, qdm + i); + } + break; + case 321: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qdc[i] = qw[i] * cc[i] / DetJ32(J + i, Q); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt32(J + i, Q, cm + i, Q, 3, qw[i], Q, qdm + i); + } + break; + case 333: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultJtCJ33(J + i, Q, cc + i, Q, 1, qw[i], Q, qdc + i); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt33(J + i, Q, cm + i, Q, 3, qw[i], Q, qdm + i); + } + break; + } + return 0; +} + +// libCEED QFunction for building quadrature data for a curl-curl + mass operator with +// scalar and matrix coefficients evaluated at quadrature points. 
+CEED_QFUNCTION(f_build_curlcurl_mass_quad_scalar_matrix)(void *ctx, CeedInt Q, + const CeedScalar *const *in, + CeedScalar *const *out) +{ + // At every quadrature point, compute qw / det(J) J^T C J (3D) or qw * c / det(J) (2D) and + // qw / det(J) adj(J) C adj(J)^T and store the result. + // in[0] is curl-curl coefficients with shape [ncomp=1, Q] + // in[1] is mass coefficients with shape [ncomp=space_dim*(space_dim+1)/2, Q] + // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q] + // in[3] is quadrature weights, size (Q) + CurlCurlMassContext *bc = (CurlCurlMassContext *)ctx; + const CeedScalar *cc = in[0], *cm = in[1], *J = in[2], *qw = in[3]; + CeedScalar *qdc = out[0], *qdm = out[0] + Q * bc->curl_dim * (bc->curl_dim + 1) / 2; + switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim) + { + case 221: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qdc[i] = qw[i] * cc[i] / DetJ22(J + i, Q); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt22(J + i, Q, cm + i, Q, 3, qw[i], Q, qdm + i); + } + break; + case 321: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qdc[i] = qw[i] * cc[i] / DetJ32(J + i, Q); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt32(J + i, Q, cm + i, Q, 6, qw[i], Q, qdm + i); + } + break; + case 333: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultJtCJ33(J + i, Q, cc + i, Q, 1, qw[i], Q, qdc + i); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt33(J + i, Q, cm + i, Q, 6, qw[i], Q, qdm + i); + } + break; + } + return 0; +} + +// libCEED QFunction for building quadrature data for a curl-curl + mass operator with +// vector and scalar coefficients evaluated at quadrature points. +CEED_QFUNCTION(f_build_curlcurl_mass_quad_vector_scalar)(void *ctx, CeedInt Q, + const CeedScalar *const *in, + CeedScalar *const *out) +{ + // At every quadrature point, compute qw / det(J) J^T C J (3D) or qw * c / det(J) (2D) and + // qw / det(J) adj(J) C adj(J)^T and store the result. + // in[0] is curl-curl coefficients with shape [ncomp=space_dim, Q] + // in[1] is mass coefficients with shape [ncomp=1, Q] + // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q] + // in[3] is quadrature weights, size (Q) + CurlCurlMassContext *bc = (CurlCurlMassContext *)ctx; + const CeedScalar *cc = in[0], *cm = in[1], *J = in[2], *qw = in[3]; + CeedScalar *qdc = out[0], *qdm = out[0] + Q * bc->curl_dim * (bc->curl_dim + 1) / 2; + switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim) + { + case 333: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultJtCJ33(J + i, Q, cc + i, Q, 3, qw[i], Q, qdc + i); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt33(J + i, Q, cm + i, Q, 1, qw[i], Q, qdm + i); + } + break; + } + return 0; +} + +// libCEED QFunction for building quadrature data for a curl-curl + mass operator with +// vector coefficients evaluated at quadrature points. +CEED_QFUNCTION(f_build_curlcurl_mass_quad_vector_vector)(void *ctx, CeedInt Q, + const CeedScalar *const *in, + CeedScalar *const *out) +{ + // At every quadrature point, compute qw / det(J) J^T C J (3D) or qw * c / det(J) (2D) and + // qw / det(J) adj(J) C adj(J)^T and store the result. 
+ // in[0] is curl-curl coefficients with shape [ncomp=space_dim, Q] + // in[1] is mass coefficients with shape [ncomp=space_dim, Q] + // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q] + // in[3] is quadrature weights, size (Q) + CurlCurlMassContext *bc = (CurlCurlMassContext *)ctx; + const CeedScalar *cc = in[0], *cm = in[1], *J = in[2], *qw = in[3]; + CeedScalar *qdc = out[0], *qdm = out[0] + Q * bc->curl_dim * (bc->curl_dim + 1) / 2; + switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim) + { + case 333: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultJtCJ33(J + i, Q, cc + i, Q, 3, qw[i], Q, qdc + i); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt33(J + i, Q, cm + i, Q, 3, qw[i], Q, qdm + i); + } + break; + } + return 0; +} + +// libCEED QFunction for building quadrature data for a curl-curl + mass operator with +// vector and matrix coefficients evaluated at quadrature points. +CEED_QFUNCTION(f_build_curlcurl_mass_quad_vector_matrix)(void *ctx, CeedInt Q, + const CeedScalar *const *in, + CeedScalar *const *out) +{ + // At every quadrature point, compute qw / det(J) J^T C J (3D) or qw * c / det(J) (2D) and + // qw / det(J) adj(J) C adj(J)^T and store the result. + // in[0] is curl-curl coefficients with shape [ncomp=space_dim, Q] + // in[1] is mass coefficients with shape [ncomp=space_dim*(space_dim+1)/2, Q] + // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q] + // in[3] is quadrature weights, size (Q) + CurlCurlMassContext *bc = (CurlCurlMassContext *)ctx; + const CeedScalar *cc = in[0], *cm = in[1], *J = in[2], *qw = in[3]; + CeedScalar *qdc = out[0], *qdm = out[0] + Q * bc->curl_dim * (bc->curl_dim + 1) / 2; + switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim) + { + case 333: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultJtCJ33(J + i, Q, cc + i, Q, 3, qw[i], Q, qdc + i); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt33(J + i, Q, cm + i, Q, 6, qw[i], Q, qdm + i); + } + break; + } + return 0; +} + +// libCEED QFunction for building quadrature data for a curl-curl + mass operator with +// matrix and scalar coefficients evaluated at quadrature points. +CEED_QFUNCTION(f_build_curlcurl_mass_quad_matrix_scalar)(void *ctx, CeedInt Q, + const CeedScalar *const *in, + CeedScalar *const *out) +{ + // At every quadrature point, compute qw / det(J) J^T C J (3D) or qw * c / det(J) (2D) and + // qw / det(J) adj(J) C adj(J)^T and store the result. + // in[0] is curl-curl coefficients with shape [ncomp=space_dim*(space_dim+1)/2, Q] + // in[1] is mass coefficients with shape [ncomp=1, Q] + // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q] + // in[3] is quadrature weights, size (Q) + CurlCurlMassContext *bc = (CurlCurlMassContext *)ctx; + const CeedScalar *cc = in[0], *cm = in[1], *J = in[2], *qw = in[3]; + CeedScalar *qdc = out[0], *qdm = out[0] + Q * bc->curl_dim * (bc->curl_dim + 1) / 2; + switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim) + { + case 333: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultJtCJ33(J + i, Q, cc + i, Q, 6, qw[i], Q, qdc + i); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt33(J + i, Q, cm + i, Q, 1, qw[i], Q, qdm + i); + } + break; + } + return 0; +} + +// libCEED QFunction for building quadrature data for a curl-curl + mass operator with +// matrix and vector coefficients evaluated at quadrature points. 
+CEED_QFUNCTION(f_build_curlcurl_mass_quad_matrix_vector)(void *ctx, CeedInt Q, + const CeedScalar *const *in, + CeedScalar *const *out) +{ + // At every quadrature point, compute qw / det(J) J^T C J (3D) or qw * c / det(J) (2D) and + // qw / det(J) adj(J) C adj(J)^T and store the result. + // in[0] is curl-curl coefficients with shape [ncomp=space_dim*(space_dim+1)/2, Q] + // in[1] is mass coefficients with shape [ncomp=space_dim, Q] + // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q] + // in[3] is quadrature weights, size (Q) + CurlCurlMassContext *bc = (CurlCurlMassContext *)ctx; + const CeedScalar *cc = in[0], *cm = in[1], *J = in[2], *qw = in[3]; + CeedScalar *qdc = out[0], *qdm = out[0] + Q * bc->curl_dim * (bc->curl_dim + 1) / 2; + switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim) + { + case 333: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultJtCJ33(J + i, Q, cc + i, Q, 6, qw[i], Q, qdc + i); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt33(J + i, Q, cm + i, Q, 3, qw[i], Q, qdm + i); + } + break; + } + return 0; +} + +// libCEED QFunction for building quadrature data for a curl-curl + mass operator with +// matrix coefficients evaluated at quadrature points. +CEED_QFUNCTION(f_build_curlcurl_mass_quad_matrix_matrix)(void *ctx, CeedInt Q, + const CeedScalar *const *in, + CeedScalar *const *out) +{ + // At every quadrature point, compute qw / det(J) J^T C J (3D) or qw * c / det(J) (2D) and + // qw / det(J) adj(J) C adj(J)^T and store the result. + // in[0] is curl-curl coefficients with shape [ncomp=space_dim*(space_dim+1)/2, Q] + // in[1] is mass coefficients with shape [ncomp=space_dim*(space_dim+1)/2, Q] + // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q] + // in[3] is quadrature weights, size (Q) + CurlCurlMassContext *bc = (CurlCurlMassContext *)ctx; + const CeedScalar *cc = in[0], *cm = in[1], *J = in[2], *qw = in[3]; + CeedScalar *qdc = out[0], *qdm = out[0] + Q * bc->curl_dim * (bc->curl_dim + 1) / 2; + switch (100 * bc->space_dim + 10 * bc->dim + bc->curl_dim) + { + case 333: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultJtCJ33(J + i, Q, cc + i, Q, 6, qw[i], Q, qdc + i); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt33(J + i, Q, cm + i, Q, 6, qw[i], Q, qdm + i); + } + break; + } + return 0; +} + +// libCEED QFunction for applying a curl-curl + mass operator. 
+CEED_QFUNCTION(f_apply_curlcurl_mass)(void *ctx, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + // in[0], out[0] have shape [dim, ncomp=1, Q] + // in[1], out[1] have shape [curl_dim, ncomp=1, Q] + CurlCurlMassContext *bc = (CurlCurlMassContext *)ctx; + const CeedScalar *u = in[0], *uc = in[1], *qdc = in[2], + *qdm = in[2] + Q * bc->curl_dim * (bc->curl_dim + 1) / 2; + CeedScalar *v = out[0], *vc = out[1]; + switch (10 * bc->dim + bc->curl_dim) + { + case 21: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + vc[i] = qdc[i] * uc[i]; + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar u0 = u[i + Q * 0]; + const CeedScalar u1 = u[i + Q * 1]; + v[i + Q * 0] = qdm[i + Q * 0] * u0 + qdm[i + Q * 1] * u1; + v[i + Q * 1] = qdm[i + Q * 1] * u0 + qdm[i + Q * 2] * u1; + } + break; + case 33: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar uc0 = uc[i + Q * 0]; + const CeedScalar uc1 = uc[i + Q * 1]; + const CeedScalar uc2 = uc[i + Q * 2]; + vc[i + Q * 0] = qdc[i + Q * 0] * uc0 + qdc[i + Q * 1] * uc1 + qdc[i + Q * 2] * uc2; + vc[i + Q * 1] = qdc[i + Q * 1] * uc0 + qdc[i + Q * 3] * uc1 + qdc[i + Q * 4] * uc2; + vc[i + Q * 2] = qdc[i + Q * 2] * uc0 + qdc[i + Q * 4] * uc1 + qdc[i + Q * 5] * uc2; + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar u0 = u[i + Q * 0]; + const CeedScalar u1 = u[i + Q * 1]; + const CeedScalar u2 = u[i + Q * 2]; + v[i + Q * 0] = qdm[i + Q * 0] * u0 + qdm[i + Q * 1] * u1 + qdm[i + Q * 2] * u2; + v[i + Q * 1] = qdm[i + Q * 1] * u0 + qdm[i + Q * 3] * u1 + qdm[i + Q * 4] * u2; + v[i + Q * 2] = qdm[i + Q * 2] * u0 + qdm[i + Q * 4] * u1 + qdm[i + Q * 5] * u2; + } + break; + } + return 0; +} + +#endif // PALACE_LIBCEED_CURLCURL_MASS_QF_H diff --git a/palace/fem/qfunctions/diffusion_qf.h b/palace/fem/qfunctions/diffusion_qf.h index 45e0f30d69..6f8c11bc85 100644 --- a/palace/fem/qfunctions/diffusion_qf.h +++ b/palace/fem/qfunctions/diffusion_qf.h @@ -1,239 +1,239 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LIBCEED_DIFFUSION_QF_H -#define PALACE_LIBCEED_DIFFUSION_QF_H - -#include "utils_qf.h" - -struct DiffusionContext -{ - CeedInt dim, space_dim; - CeedScalar coeff; -}; - -// libCEED QFunction for building quadrature data for a diffusion operator with a scalar -// constant coefficient. -CEED_QFUNCTION(f_build_diff_const_scalar)(void *ctx, CeedInt Q, const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw / det(J) adj(J) C adj(J)^T and store the - // symmetric part of the result. 
- // in[0] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[1] is quadrature weights, size (Q) - DiffusionContext *bc = (DiffusionContext *)ctx; - const CeedScalar coeff = bc->coeff; - const CeedScalar *J = in[0], *qw = in[1]; - CeedScalar *qd = out[0]; - switch (10 * bc->space_dim + bc->dim) - { - case 11: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qd[i] = qw[i] * coeff / J[i]; - } - break; - case 21: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt21(J + i, Q, &coeff, 1, 1, qw[i], Q, qd + i); - } - break; - case 22: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt22(J + i, Q, &coeff, 1, 1, qw[i], Q, qd + i); - } - break; - case 32: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt32(J + i, Q, &coeff, 1, 1, qw[i], Q, qd + i); - } - break; - case 33: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt33(J + i, Q, &coeff, 1, 1, qw[i], Q, qd + i); - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for a diffusion operator -// with a scalar coefficient evaluated at quadrature points -CEED_QFUNCTION(f_build_diff_quad_scalar)(void *ctx, CeedInt Q, const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw / det(J) adj(J) C adj(J)^T and store the - // symmetric part of the result. - // in[0] is coefficients with shape [ncomp=1, Q] - // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[2] is quadrature weights, size (Q) - DiffusionContext *bc = (DiffusionContext *)ctx; - const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; - CeedScalar *qd = out[0]; - switch (10 * bc->space_dim + bc->dim) - { - case 11: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qd[i] = qw[i] * c[i] / J[i]; - } - break; - case 21: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt21(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i); - } - break; - case 22: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt22(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i); - } - break; - case 32: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt32(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i); - } - break; - case 33: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt33(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i); - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for a diffusion operator with a vector -// coefficient evaluated at quadrature points. -CEED_QFUNCTION(f_build_diff_quad_vector)(void *ctx, CeedInt Q, const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw / det(J) adj(J) C adj(J)^T and store the - // symmetric part of the result. 
- // in[0] is coefficients with shape [ncomp=space_dim, Q] - // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[2] is quadrature weights, size (Q) - DiffusionContext *bc = (DiffusionContext *)ctx; - const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; - CeedScalar *qd = out[0]; - switch (10 * bc->space_dim + bc->dim) - { - case 21: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt21(J + i, Q, c + i, Q, 2, qw[i], Q, qd + i); - } - break; - case 22: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt22(J + i, Q, c + i, Q, 2, qw[i], Q, qd + i); - } - break; - case 32: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt32(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i); - } - break; - case 33: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt33(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i); - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for a diffusion operator with a matrix -// coefficient evaluated at quadrature points. -CEED_QFUNCTION(f_build_diff_quad_matrix)(void *ctx, CeedInt Q, const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw / det(J) adj(J) C adj(J)^T and store the - // symmetric part of the result. - // in[0] is coefficients with shape [ncomp=space_dim*(space_dim+1)/2, Q] - // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[2] is quadrature weights, size (Q) - DiffusionContext *bc = (DiffusionContext *)ctx; - const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; - CeedScalar *qd = out[0]; - switch (10 * bc->space_dim + bc->dim) - { - case 21: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt21(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i); - } - break; - case 22: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt22(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i); - } - break; - case 32: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt32(J + i, Q, c + i, Q, 6, qw[i], Q, qd + i); - } - break; - case 33: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt33(J + i, Q, c + i, Q, 6, qw[i], Q, qd + i); - } - break; - } - return 0; -} - -// libCEED QFunction for applying a diffusion operator. -CEED_QFUNCTION(f_apply_diff)(void *ctx, CeedInt Q, const CeedScalar *const *in, - CeedScalar *const *out) -{ - // in[0], out[0] have shape [dim, ncomp=1, Q] - DiffusionContext *bc = (DiffusionContext *)ctx; - const CeedScalar *ug = in[0], *qd = in[1]; - CeedScalar *vg = out[0]; - switch (bc->dim) - { - case 1: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - vg[i] = qd[i] * ug[i]; - } - break; - case 2: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - const CeedScalar ug0 = ug[i + Q * 0]; - const CeedScalar ug1 = ug[i + Q * 1]; - vg[i + Q * 0] = qd[i + Q * 0] * ug0 + qd[i + Q * 1] * ug1; - vg[i + Q * 1] = qd[i + Q * 1] * ug0 + qd[i + Q * 2] * ug1; - } - break; - case 3: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - const CeedScalar ug0 = ug[i + Q * 0]; - const CeedScalar ug1 = ug[i + Q * 1]; - const CeedScalar ug2 = ug[i + Q * 2]; - vg[i + Q * 0] = qd[i + Q * 0] * ug0 + qd[i + Q * 1] * ug1 + qd[i + Q * 2] * ug2; - vg[i + Q * 1] = qd[i + Q * 1] * ug0 + qd[i + Q * 3] * ug1 + qd[i + Q * 4] * ug2; - vg[i + Q * 2] = qd[i + Q * 2] * ug0 + qd[i + Q * 4] * ug1 + qd[i + Q * 5] * ug2; - } - break; - } - return 0; -} - -#endif // PALACE_LIBCEED_DIFFUSION_QF_H +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_DIFFUSION_QF_H +#define PALACE_LIBCEED_DIFFUSION_QF_H + +#include "utils_qf.h" + +struct DiffusionContext +{ + CeedInt dim, space_dim; + CeedScalar coeff; +}; + +// libCEED QFunction for building quadrature data for a diffusion operator with a scalar +// constant coefficient. +CEED_QFUNCTION(f_build_diff_const_scalar)(void *ctx, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + // At every quadrature point, compute qw / det(J) adj(J) C adj(J)^T and store the + // symmetric part of the result. + // in[0] is Jacobians with shape [dim, ncomp=space_dim, Q] + // in[1] is quadrature weights, size (Q) + DiffusionContext *bc = (DiffusionContext *)ctx; + const CeedScalar coeff = bc->coeff; + const CeedScalar *J = in[0], *qw = in[1]; + CeedScalar *qd = out[0]; + switch (10 * bc->space_dim + bc->dim) + { + case 11: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qd[i] = qw[i] * coeff / J[i]; + } + break; + case 21: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt21(J + i, Q, &coeff, 1, 1, qw[i], Q, qd + i); + } + break; + case 22: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt22(J + i, Q, &coeff, 1, 1, qw[i], Q, qd + i); + } + break; + case 32: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt32(J + i, Q, &coeff, 1, 1, qw[i], Q, qd + i); + } + break; + case 33: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt33(J + i, Q, &coeff, 1, 1, qw[i], Q, qd + i); + } + break; + } + return 0; +} + +// libCEED QFunction for building quadrature data for a diffusion operator +// with a scalar coefficient evaluated at quadrature points +CEED_QFUNCTION(f_build_diff_quad_scalar)(void *ctx, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + // At every quadrature point, compute qw / det(J) adj(J) C adj(J)^T and store the + // symmetric part of the result. + // in[0] is coefficients with shape [ncomp=1, Q] + // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] + // in[2] is quadrature weights, size (Q) + DiffusionContext *bc = (DiffusionContext *)ctx; + const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; + CeedScalar *qd = out[0]; + switch (10 * bc->space_dim + bc->dim) + { + case 11: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qd[i] = qw[i] * c[i] / J[i]; + } + break; + case 21: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt21(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i); + } + break; + case 22: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt22(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i); + } + break; + case 32: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt32(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i); + } + break; + case 33: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt33(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i); + } + break; + } + return 0; +} + +// libCEED QFunction for building quadrature data for a diffusion operator with a vector +// coefficient evaluated at quadrature points. +CEED_QFUNCTION(f_build_diff_quad_vector)(void *ctx, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + // At every quadrature point, compute qw / det(J) adj(J) C adj(J)^T and store the + // symmetric part of the result. 
+ // in[0] is coefficients with shape [ncomp=space_dim, Q] + // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] + // in[2] is quadrature weights, size (Q) + DiffusionContext *bc = (DiffusionContext *)ctx; + const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; + CeedScalar *qd = out[0]; + switch (10 * bc->space_dim + bc->dim) + { + case 21: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt21(J + i, Q, c + i, Q, 2, qw[i], Q, qd + i); + } + break; + case 22: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt22(J + i, Q, c + i, Q, 2, qw[i], Q, qd + i); + } + break; + case 32: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt32(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i); + } + break; + case 33: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt33(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i); + } + break; + } + return 0; +} + +// libCEED QFunction for building quadrature data for a diffusion operator with a matrix +// coefficient evaluated at quadrature points. +CEED_QFUNCTION(f_build_diff_quad_matrix)(void *ctx, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + // At every quadrature point, compute qw / det(J) adj(J) C adj(J)^T and store the + // symmetric part of the result. + // in[0] is coefficients with shape [ncomp=space_dim*(space_dim+1)/2, Q] + // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] + // in[2] is quadrature weights, size (Q) + DiffusionContext *bc = (DiffusionContext *)ctx; + const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; + CeedScalar *qd = out[0]; + switch (10 * bc->space_dim + bc->dim) + { + case 21: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt21(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i); + } + break; + case 22: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt22(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i); + } + break; + case 32: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt32(J + i, Q, c + i, Q, 6, qw[i], Q, qd + i); + } + break; + case 33: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt33(J + i, Q, c + i, Q, 6, qw[i], Q, qd + i); + } + break; + } + return 0; +} + +// libCEED QFunction for applying a diffusion operator. 
+CEED_QFUNCTION(f_apply_diff)(void *ctx, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + // in[0], out[0] have shape [dim, ncomp=1, Q] + DiffusionContext *bc = (DiffusionContext *)ctx; + const CeedScalar *ug = in[0], *qd = in[1]; + CeedScalar *vg = out[0]; + switch (bc->dim) + { + case 1: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + vg[i] = qd[i] * ug[i]; + } + break; + case 2: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar ug0 = ug[i + Q * 0]; + const CeedScalar ug1 = ug[i + Q * 1]; + vg[i + Q * 0] = qd[i + Q * 0] * ug0 + qd[i + Q * 1] * ug1; + vg[i + Q * 1] = qd[i + Q * 1] * ug0 + qd[i + Q * 2] * ug1; + } + break; + case 3: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar ug0 = ug[i + Q * 0]; + const CeedScalar ug1 = ug[i + Q * 1]; + const CeedScalar ug2 = ug[i + Q * 2]; + vg[i + Q * 0] = qd[i + Q * 0] * ug0 + qd[i + Q * 1] * ug1 + qd[i + Q * 2] * ug2; + vg[i + Q * 1] = qd[i + Q * 1] * ug0 + qd[i + Q * 3] * ug1 + qd[i + Q * 4] * ug2; + vg[i + Q * 2] = qd[i + Q * 2] * ug0 + qd[i + Q * 4] * ug1 + qd[i + Q * 5] * ug2; + } + break; + } + return 0; +} + +#endif // PALACE_LIBCEED_DIFFUSION_QF_H diff --git a/palace/fem/qfunctions/diffusionmass_qf.h b/palace/fem/qfunctions/diffusionmass_qf.h index caead2bd1b..2fb44b2264 100644 --- a/palace/fem/qfunctions/diffusionmass_qf.h +++ b/palace/fem/qfunctions/diffusionmass_qf.h @@ -1,253 +1,253 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LIBCEED_DIFFUSION_MASS_QF_H -#define PALACE_LIBCEED_DIFFUSION_MASS_QF_H - -#include "utils_qf.h" - -struct DiffusionMassContext -{ - CeedInt dim, space_dim; -}; - -// libCEED QFunction for building quadrature data for a diffusion + mass operator with a -// scalar coefficient evaluated at quadrature points. -CEED_QFUNCTION(f_build_diff_mass_quad_scalar)(void *ctx, CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw / det(J) adj(J) C adj(J)^T and qw * c * det(J) - // and store the result. 
- // in[0] is diffusion coefficients with shape [ncomp=1, Q] - // in[1] is mass coefficients with shape [ncomp=1, Q] - // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[3] is quadrature weights, size (Q) - DiffusionMassContext *bc = (DiffusionMassContext *)ctx; - const CeedScalar *cd = in[0], *cm = in[1], *J = in[2], *qw = in[3]; - CeedScalar *qdd = out[0], *qdm = out[0] + Q * bc->dim * (bc->dim + 1) / 2; - switch (10 * bc->space_dim + bc->dim) - { - case 11: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qdd[i] = qw[i] * cd[i] / J[i]; - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qdm[i] = qw[i] * cm[i] * J[i]; - } - break; - case 21: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt21(J + i, Q, cd + i, Q, 1, qw[i], Q, qdd + i); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qdm[i] = qw[i] * cm[i] * DetJ21(J + i, Q); - } - break; - case 22: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt22(J + i, Q, cd + i, Q, 1, qw[i], Q, qdd + i); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qdm[i] = qw[i] * cm[i] * DetJ22(J + i, Q); - } - break; - case 32: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt32(J + i, Q, cd + i, Q, 1, qw[i], Q, qdd + i); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qdm[i] = qw[i] * cm[i] * DetJ32(J + i, Q); - } - break; - case 33: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt33(J + i, Q, cd + i, Q, 1, qw[i], Q, qdd + i); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qdm[i] = qw[i] * cm[i] * DetJ33(J + i, Q); - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for a diffusion + mass operator with a -// vector coefficient evaluated at quadrature points. -CEED_QFUNCTION(f_build_diff_mass_quad_vector)(void *ctx, CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw / det(J) adj(J) C adj(J)^T and qw * c * det(J) - // and store the result. 
- // in[0] is diffusion coefficients with shape [ncomp=space_dim, Q] - // in[1] is mass coefficients with shape [ncomp=1, Q] - // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[3] is quadrature weights, size (Q) - DiffusionMassContext *bc = (DiffusionMassContext *)ctx; - const CeedScalar *cd = in[0], *cm = in[1], *J = in[2], *qw = in[3]; - CeedScalar *qdd = out[0], *qdm = out[0] + Q * bc->dim * (bc->dim + 1) / 2; - switch (10 * bc->space_dim + bc->dim) - { - case 21: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt21(J + i, Q, cd + i, Q, 2, qw[i], Q, qdd + i); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qdm[i] = qw[i] * cm[i] * DetJ21(J + i, Q); - } - break; - case 22: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt22(J + i, Q, cd + i, Q, 2, qw[i], Q, qdd + i); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qdm[i] = qw[i] * cm[i] * DetJ22(J + i, Q); - } - break; - case 32: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt32(J + i, Q, cd + i, Q, 3, qw[i], Q, qdd + i); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qdm[i] = qw[i] * cm[i] * DetJ32(J + i, Q); - } - break; - case 33: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt33(J + i, Q, cd + i, Q, 3, qw[i], Q, qdd + i); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qdm[i] = qw[i] * cm[i] * DetJ33(J + i, Q); - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for a diffusion + mass operator with a -// matrix coefficient evaluated at quadrature points. -CEED_QFUNCTION(f_build_diff_mass_quad_matrix)(void *ctx, CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw / det(J) adj(J) C adj(J)^T and qw * c * det(J) - // and store the result. - // in[0] is diffusion coefficients with shape [ncomp=space_dim*(space_dim+1)/2, Q] - // in[1] is mass coefficients with shape [ncomp=1, Q] - // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[3] is quadrature weights, size (Q) - DiffusionMassContext *bc = (DiffusionMassContext *)ctx; - const CeedScalar *cd = in[0], *cm = in[1], *J = in[2], *qw = in[3]; - CeedScalar *qdd = out[0], *qdm = out[0] + Q * bc->dim * (bc->dim + 1) / 2; - switch (10 * bc->space_dim + bc->dim) - { - case 21: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt21(J + i, Q, cd + i, Q, 3, qw[i], Q, qdd + i); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qdm[i] = qw[i] * cm[i] * DetJ21(J + i, Q); - } - break; - case 22: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt22(J + i, Q, cd + i, Q, 3, qw[i], Q, qdd + i); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qdm[i] = qw[i] * cm[i] * DetJ22(J + i, Q); - } - break; - case 32: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt32(J + i, Q, cd + i, Q, 6, qw[i], Q, qdd + i); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qdm[i] = qw[i] * cm[i] * DetJ32(J + i, Q); - } - break; - case 33: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt33(J + i, Q, cd + i, Q, 6, qw[i], Q, qdd + i); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qdm[i] = qw[i] * cm[i] * DetJ33(J + i, Q); - } - break; - } - return 0; -} - -// libCEED QFunction for applying a diffusion + mass operator. 
-CEED_QFUNCTION(f_apply_diff_mass)(void *ctx, CeedInt Q, const CeedScalar *const *in, - CeedScalar *const *out) -{ - // in[0], out[0] have shape [ncomp=1, Q] - // in[1], out[1] have shape [dim, ncomp=1, Q] - DiffusionMassContext *bc = (DiffusionMassContext *)ctx; - const CeedScalar *u = in[0], *ug = in[1], *qdd = in[2], - *qdm = in[2] + Q * bc->dim * (bc->dim + 1) / 2; - CeedScalar *v = out[0], *vg = out[1]; - switch (bc->dim) - { - case 1: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - vg[i] = qdd[i] * ug[i]; - } - break; - case 2: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - const CeedScalar ug0 = ug[i + Q * 0]; - const CeedScalar ug1 = ug[i + Q * 1]; - vg[i + Q * 0] = qdd[i + Q * 0] * ug0 + qdd[i + Q * 1] * ug1; - vg[i + Q * 1] = qdd[i + Q * 1] * ug0 + qdd[i + Q * 2] * ug1; - } - break; - case 3: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - const CeedScalar ug0 = ug[i + Q * 0]; - const CeedScalar ug1 = ug[i + Q * 1]; - const CeedScalar ug2 = ug[i + Q * 2]; - vg[i + Q * 0] = qdd[i + Q * 0] * ug0 + qdd[i + Q * 1] * ug1 + qdd[i + Q * 2] * ug2; - vg[i + Q * 1] = qdd[i + Q * 1] * ug0 + qdd[i + Q * 3] * ug1 + qdd[i + Q * 4] * ug2; - vg[i + Q * 2] = qdd[i + Q * 2] * ug0 + qdd[i + Q * 4] * ug1 + qdd[i + Q * 5] * ug2; - } - break; - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - v[i] = qdm[i] * u[i]; - } - return 0; -} - -#endif // PALACE_LIBCEED_DIFFUSION_MASS_QF_H +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_DIFFUSION_MASS_QF_H +#define PALACE_LIBCEED_DIFFUSION_MASS_QF_H + +#include "utils_qf.h" + +struct DiffusionMassContext +{ + CeedInt dim, space_dim; +}; + +// libCEED QFunction for building quadrature data for a diffusion + mass operator with a +// scalar coefficient evaluated at quadrature points. +CEED_QFUNCTION(f_build_diff_mass_quad_scalar)(void *ctx, CeedInt Q, + const CeedScalar *const *in, + CeedScalar *const *out) +{ + // At every quadrature point, compute qw / det(J) adj(J) C adj(J)^T and qw * c * det(J) + // and store the result. 
+ // in[0] is diffusion coefficients with shape [ncomp=1, Q] + // in[1] is mass coefficients with shape [ncomp=1, Q] + // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q] + // in[3] is quadrature weights, size (Q) + DiffusionMassContext *bc = (DiffusionMassContext *)ctx; + const CeedScalar *cd = in[0], *cm = in[1], *J = in[2], *qw = in[3]; + CeedScalar *qdd = out[0], *qdm = out[0] + Q * bc->dim * (bc->dim + 1) / 2; + switch (10 * bc->space_dim + bc->dim) + { + case 11: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qdd[i] = qw[i] * cd[i] / J[i]; + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qdm[i] = qw[i] * cm[i] * J[i]; + } + break; + case 21: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt21(J + i, Q, cd + i, Q, 1, qw[i], Q, qdd + i); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qdm[i] = qw[i] * cm[i] * DetJ21(J + i, Q); + } + break; + case 22: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt22(J + i, Q, cd + i, Q, 1, qw[i], Q, qdd + i); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qdm[i] = qw[i] * cm[i] * DetJ22(J + i, Q); + } + break; + case 32: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt32(J + i, Q, cd + i, Q, 1, qw[i], Q, qdd + i); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qdm[i] = qw[i] * cm[i] * DetJ32(J + i, Q); + } + break; + case 33: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt33(J + i, Q, cd + i, Q, 1, qw[i], Q, qdd + i); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qdm[i] = qw[i] * cm[i] * DetJ33(J + i, Q); + } + break; + } + return 0; +} + +// libCEED QFunction for building quadrature data for a diffusion + mass operator with a +// vector coefficient evaluated at quadrature points. +CEED_QFUNCTION(f_build_diff_mass_quad_vector)(void *ctx, CeedInt Q, + const CeedScalar *const *in, + CeedScalar *const *out) +{ + // At every quadrature point, compute qw / det(J) adj(J) C adj(J)^T and qw * c * det(J) + // and store the result. 
+ // in[0] is diffusion coefficients with shape [ncomp=space_dim, Q] + // in[1] is mass coefficients with shape [ncomp=1, Q] + // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q] + // in[3] is quadrature weights, size (Q) + DiffusionMassContext *bc = (DiffusionMassContext *)ctx; + const CeedScalar *cd = in[0], *cm = in[1], *J = in[2], *qw = in[3]; + CeedScalar *qdd = out[0], *qdm = out[0] + Q * bc->dim * (bc->dim + 1) / 2; + switch (10 * bc->space_dim + bc->dim) + { + case 21: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt21(J + i, Q, cd + i, Q, 2, qw[i], Q, qdd + i); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qdm[i] = qw[i] * cm[i] * DetJ21(J + i, Q); + } + break; + case 22: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt22(J + i, Q, cd + i, Q, 2, qw[i], Q, qdd + i); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qdm[i] = qw[i] * cm[i] * DetJ22(J + i, Q); + } + break; + case 32: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt32(J + i, Q, cd + i, Q, 3, qw[i], Q, qdd + i); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qdm[i] = qw[i] * cm[i] * DetJ32(J + i, Q); + } + break; + case 33: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt33(J + i, Q, cd + i, Q, 3, qw[i], Q, qdd + i); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qdm[i] = qw[i] * cm[i] * DetJ33(J + i, Q); + } + break; + } + return 0; +} + +// libCEED QFunction for building quadrature data for a diffusion + mass operator with a +// matrix coefficient evaluated at quadrature points. +CEED_QFUNCTION(f_build_diff_mass_quad_matrix)(void *ctx, CeedInt Q, + const CeedScalar *const *in, + CeedScalar *const *out) +{ + // At every quadrature point, compute qw / det(J) adj(J) C adj(J)^T and qw * c * det(J) + // and store the result. + // in[0] is diffusion coefficients with shape [ncomp=space_dim*(space_dim+1)/2, Q] + // in[1] is mass coefficients with shape [ncomp=1, Q] + // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q] + // in[3] is quadrature weights, size (Q) + DiffusionMassContext *bc = (DiffusionMassContext *)ctx; + const CeedScalar *cd = in[0], *cm = in[1], *J = in[2], *qw = in[3]; + CeedScalar *qdd = out[0], *qdm = out[0] + Q * bc->dim * (bc->dim + 1) / 2; + switch (10 * bc->space_dim + bc->dim) + { + case 21: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt21(J + i, Q, cd + i, Q, 3, qw[i], Q, qdd + i); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qdm[i] = qw[i] * cm[i] * DetJ21(J + i, Q); + } + break; + case 22: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt22(J + i, Q, cd + i, Q, 3, qw[i], Q, qdd + i); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qdm[i] = qw[i] * cm[i] * DetJ22(J + i, Q); + } + break; + case 32: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt32(J + i, Q, cd + i, Q, 6, qw[i], Q, qdd + i); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qdm[i] = qw[i] * cm[i] * DetJ32(J + i, Q); + } + break; + case 33: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultAdjJCAdjJt33(J + i, Q, cd + i, Q, 6, qw[i], Q, qdd + i); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qdm[i] = qw[i] * cm[i] * DetJ33(J + i, Q); + } + break; + } + return 0; +} + +// libCEED QFunction for applying a diffusion + mass operator. 
+CEED_QFUNCTION(f_apply_diff_mass)(void *ctx, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + // in[0], out[0] have shape [ncomp=1, Q] + // in[1], out[1] have shape [dim, ncomp=1, Q] + DiffusionMassContext *bc = (DiffusionMassContext *)ctx; + const CeedScalar *u = in[0], *ug = in[1], *qdd = in[2], + *qdm = in[2] + Q * bc->dim * (bc->dim + 1) / 2; + CeedScalar *v = out[0], *vg = out[1]; + switch (bc->dim) + { + case 1: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + vg[i] = qdd[i] * ug[i]; + } + break; + case 2: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar ug0 = ug[i + Q * 0]; + const CeedScalar ug1 = ug[i + Q * 1]; + vg[i + Q * 0] = qdd[i + Q * 0] * ug0 + qdd[i + Q * 1] * ug1; + vg[i + Q * 1] = qdd[i + Q * 1] * ug0 + qdd[i + Q * 2] * ug1; + } + break; + case 3: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar ug0 = ug[i + Q * 0]; + const CeedScalar ug1 = ug[i + Q * 1]; + const CeedScalar ug2 = ug[i + Q * 2]; + vg[i + Q * 0] = qdd[i + Q * 0] * ug0 + qdd[i + Q * 1] * ug1 + qdd[i + Q * 2] * ug2; + vg[i + Q * 1] = qdd[i + Q * 1] * ug0 + qdd[i + Q * 3] * ug1 + qdd[i + Q * 4] * ug2; + vg[i + Q * 2] = qdd[i + Q * 2] * ug0 + qdd[i + Q * 4] * ug1 + qdd[i + Q * 5] * ug2; + } + break; + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + v[i] = qdm[i] * u[i]; + } + return 0; +} + +#endif // PALACE_LIBCEED_DIFFUSION_MASS_QF_H diff --git a/palace/fem/qfunctions/divdiv_qf.h b/palace/fem/qfunctions/divdiv_qf.h index f95bc27821..fca9cb5281 100644 --- a/palace/fem/qfunctions/divdiv_qf.h +++ b/palace/fem/qfunctions/divdiv_qf.h @@ -1,125 +1,125 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LIBCEED_DIVDIV_QF_H -#define PALACE_LIBCEED_DIVDIV_QF_H - -#include "utils_qf.h" - -struct DivDivContext -{ - CeedInt dim, space_dim; - CeedScalar coeff; -}; - -// libCEED QFunction for building quadrature data for a div-div operator with a constant -// coefficient. -CEED_QFUNCTION(f_build_divdiv_const)(void *ctx, CeedInt Q, const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute and store qw * c / det(J). - // in[0] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[1] is quadrature weights, size (Q) - DivDivContext *bc = (DivDivContext *)ctx; - const CeedScalar coeff = bc->coeff; - const CeedScalar *J = in[0], *qw = in[1]; - CeedScalar *qd = out[0]; - switch (10 * bc->space_dim + bc->dim) - { - case 11: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qd[i] = qw[i] * coeff / J[i]; - } - break; - case 21: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qd[i] = qw[i] * coeff / DetJ21(J + i, Q); - } - break; - case 22: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qd[i] = qw[i] * coeff / DetJ22(J + i, Q); - } - break; - case 32: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qd[i] = qw[i] * coeff / DetJ32(J + i, Q); - } - break; - case 33: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qd[i] = qw[i] * coeff / DetJ33(J + i, Q); - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for a div-div operator with a coefficient -// evaluated at quadrature points. -CEED_QFUNCTION(f_build_divdiv_quad)(void *ctx, CeedInt Q, const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute and store qw * c / det(J). 
- // in[0] is coefficients, size (Q) - // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[2] is quadrature weights, size (Q) - DivDivContext *bc = (DivDivContext *)ctx; - const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; - CeedScalar *qd = out[0]; - switch (10 * bc->space_dim + bc->dim) - { - case 11: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qd[i] = qw[i] * c[i] / J[i]; - } - break; - case 21: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qd[i] = qw[i] * c[i] / DetJ21(J + i, Q); - } - break; - case 22: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qd[i] = qw[i] * c[i] / DetJ22(J + i, Q); - } - break; - case 32: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qd[i] = qw[i] * c[i] / DetJ32(J + i, Q); - } - break; - case 33: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qd[i] = qw[i] * c[i] / DetJ33(J + i, Q); - } - break; - } - return 0; -} - -// libCEED QFunction for applying a div-div operator. -CEED_QFUNCTION(f_apply_divdiv)(void *ctx, CeedInt Q, const CeedScalar *const *in, - CeedScalar *const *out) -{ - // in[0], out[0] have shape [ncomp=1, Q] - const CeedScalar *ud = in[0], *qd = in[1]; - CeedScalar *vd = out[0]; - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - vd[i] = qd[i] * ud[i]; - } - return 0; -} - -#endif // PALACE_LIBCEED_DIVDIV_QF_H +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_DIVDIV_QF_H +#define PALACE_LIBCEED_DIVDIV_QF_H + +#include "utils_qf.h" + +struct DivDivContext +{ + CeedInt dim, space_dim; + CeedScalar coeff; +}; + +// libCEED QFunction for building quadrature data for a div-div operator with a constant +// coefficient. +CEED_QFUNCTION(f_build_divdiv_const)(void *ctx, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + // At every quadrature point, compute and store qw * c / det(J). + // in[0] is Jacobians with shape [dim, ncomp=space_dim, Q] + // in[1] is quadrature weights, size (Q) + DivDivContext *bc = (DivDivContext *)ctx; + const CeedScalar coeff = bc->coeff; + const CeedScalar *J = in[0], *qw = in[1]; + CeedScalar *qd = out[0]; + switch (10 * bc->space_dim + bc->dim) + { + case 11: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qd[i] = qw[i] * coeff / J[i]; + } + break; + case 21: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qd[i] = qw[i] * coeff / DetJ21(J + i, Q); + } + break; + case 22: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qd[i] = qw[i] * coeff / DetJ22(J + i, Q); + } + break; + case 32: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qd[i] = qw[i] * coeff / DetJ32(J + i, Q); + } + break; + case 33: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qd[i] = qw[i] * coeff / DetJ33(J + i, Q); + } + break; + } + return 0; +} + +// libCEED QFunction for building quadrature data for a div-div operator with a coefficient +// evaluated at quadrature points. +CEED_QFUNCTION(f_build_divdiv_quad)(void *ctx, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + // At every quadrature point, compute and store qw * c / det(J). 
+ // in[0] is coefficients, size (Q) + // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] + // in[2] is quadrature weights, size (Q) + DivDivContext *bc = (DivDivContext *)ctx; + const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; + CeedScalar *qd = out[0]; + switch (10 * bc->space_dim + bc->dim) + { + case 11: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qd[i] = qw[i] * c[i] / J[i]; + } + break; + case 21: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qd[i] = qw[i] * c[i] / DetJ21(J + i, Q); + } + break; + case 22: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qd[i] = qw[i] * c[i] / DetJ22(J + i, Q); + } + break; + case 32: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qd[i] = qw[i] * c[i] / DetJ32(J + i, Q); + } + break; + case 33: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qd[i] = qw[i] * c[i] / DetJ33(J + i, Q); + } + break; + } + return 0; +} + +// libCEED QFunction for applying a div-div operator. +CEED_QFUNCTION(f_apply_divdiv)(void *ctx, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + // in[0], out[0] have shape [ncomp=1, Q] + const CeedScalar *ud = in[0], *qd = in[1]; + CeedScalar *vd = out[0]; + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + vd[i] = qd[i] * ud[i]; + } + return 0; +} + +#endif // PALACE_LIBCEED_DIVDIV_QF_H diff --git a/palace/fem/qfunctions/divdivmass_qf.h b/palace/fem/qfunctions/divdivmass_qf.h index 30e025fa70..21e0a234e3 100644 --- a/palace/fem/qfunctions/divdivmass_qf.h +++ b/palace/fem/qfunctions/divdivmass_qf.h @@ -1,252 +1,252 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LIBCEED_DIVDIV_MASS_QF_H -#define PALACE_LIBCEED_DIVDIV_MASS_QF_H - -#include "utils_qf.h" - -struct DivDivMassContext -{ - CeedInt dim, space_dim; -}; - -// libCEED QFunction for building quadrature data for a div-div + mass operator with a -// scalar coefficient evaluated at quadrature points. -CEED_QFUNCTION(f_build_divdiv_mass_quad_scalar)(void *ctx, CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw * c / det(J) and qw / det(J) J^T C J and store - // the result. 
- // in[0] is div-div coefficients with shape [ncomp=1, Q] - // in[1] is mass coefficients with shape [ncomp=1, Q] - // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[3] is quadrature weights, size (Q) - DivDivMassContext *bc = (DivDivMassContext *)ctx; - const CeedScalar *cd = in[0], *cm = in[1], *J = in[2], *qw = in[3]; - CeedScalar *qdd = out[0], *qdm = out[0] + Q; - switch (10 * bc->space_dim + bc->dim) - { - case 11: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qdd[i] = qw[i] * cd[i] / J[i]; - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qdm[i] = qw[i] * cm[i] * J[i]; - } - break; - case 21: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qdd[i] = qw[i] * cd[i] / DetJ21(J + i, Q); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ21(J + i, Q, cm + i, Q, 1, qw[i], Q, qdm + i); - } - break; - case 22: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qdd[i] = qw[i] * cd[i] / DetJ22(J + i, Q); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ22(J + i, Q, cm + i, Q, 1, qw[i], Q, qdm + i); - } - break; - case 32: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qdd[i] = qw[i] * cd[i] / DetJ32(J + i, Q); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ32(J + i, Q, cm + i, Q, 1, qw[i], Q, qdm + i); - } - break; - case 33: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qdd[i] = qw[i] * cd[i] / DetJ33(J + i, Q); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ33(J + i, Q, cm + i, Q, 1, qw[i], Q, qdm + i); - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for a div-div + mass operator with a -// vector coefficient evaluated at quadrature points. -CEED_QFUNCTION(f_build_divdiv_mass_quad_vector)(void *ctx, CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw * c / det(J) and qw / det(J) J^T C J and store - // the result. - // in[0] is div-div coefficients with shape [ncomp=1, Q] - // in[1] is mass coefficients with shape [ncomp=space_dim, Q] - // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[3] is quadrature weights, size (Q) - DivDivMassContext *bc = (DivDivMassContext *)ctx; - const CeedScalar *cd = in[0], *cm = in[1], *J = in[2], *qw = in[3]; - CeedScalar *qdd = out[0], *qdm = out[0] + Q; - switch (10 * bc->space_dim + bc->dim) - { - case 21: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qdd[i] = qw[i] * cd[i] / DetJ21(J + i, Q); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ21(J + i, Q, cm + i, Q, 2, qw[i], Q, qdm + i); - } - break; - case 22: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qdd[i] = qw[i] * cd[i] / DetJ22(J + i, Q); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ22(J + i, Q, cm + i, Q, 2, qw[i], Q, qdm + i); - } - break; - case 32: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qdd[i] = qw[i] * cd[i] / DetJ32(J + i, Q); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ32(J + i, Q, cm + i, Q, 3, qw[i], Q, qdm + i); - } - break; - case 33: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qdd[i] = qw[i] * cd[i] / DetJ33(J + i, Q); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ33(J + i, Q, cm + i, Q, 3, qw[i], Q, qdm + i); - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for a div-div + mass operator with a -// matrix coefficient evaluated at quadrature points. 
-CEED_QFUNCTION(f_build_divdiv_mass_quad_matrix)(void *ctx, CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw * c / det(J) and qw / det(J) J^T C J and store - // the result. - // in[0] is div-div coefficients with shape [ncomp=1, Q] - // in[1] is mass coefficients with shape [ncomp=space_dim*(space_dim+1)/2, Q] - // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[3] is quadrature weights, size (Q) - DivDivMassContext *bc = (DivDivMassContext *)ctx; - const CeedScalar *cd = in[0], *cm = in[1], *J = in[2], *qw = in[3]; - CeedScalar *qdd = out[0], *qdm = out[0] + Q; - switch (10 * bc->space_dim + bc->dim) - { - case 21: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qdd[i] = qw[i] * cd[i] / DetJ21(J + i, Q); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ21(J + i, Q, cm + i, Q, 3, qw[i], Q, qdm + i); - } - break; - case 22: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qdd[i] = qw[i] * cd[i] / DetJ22(J + i, Q); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ22(J + i, Q, cm + i, Q, 3, qw[i], Q, qdm + i); - } - break; - case 32: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qdd[i] = qw[i] * cd[i] / DetJ32(J + i, Q); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ32(J + i, Q, cm + i, Q, 6, qw[i], Q, qdm + i); - } - break; - case 33: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qdd[i] = qw[i] * cd[i] / DetJ33(J + i, Q); - } - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ33(J + i, Q, cm + i, Q, 6, qw[i], Q, qdm + i); - } - break; - } - return 0; -} - -// libCEED QFunction for applying a div-div + mass operator. -CEED_QFUNCTION(f_apply_divdiv_mass)(void *ctx, CeedInt Q, const CeedScalar *const *in, - CeedScalar *const *out) -{ - // in[0], out[0] have shape [dim, ncomp=1, Q] - // in[1], out[1] have shape [ncomp=1, Q] - DivDivMassContext *bc = (DivDivMassContext *)ctx; - const CeedScalar *u = in[0], *ud = in[1], *qdd = in[2], *qdm = in[2] + Q; - CeedScalar *v = out[0], *vd = out[1]; - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - vd[i] = qdd[i] * ud[i]; - } - switch (bc->dim) - { - case 1: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - v[i] = qdm[i] * u[i]; - } - break; - case 2: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - const CeedScalar u0 = u[i + Q * 0]; - const CeedScalar u1 = u[i + Q * 1]; - v[i + Q * 0] = qdm[i + Q * 0] * u0 + qdm[i + Q * 1] * u1; - v[i + Q * 1] = qdm[i + Q * 1] * u0 + qdm[i + Q * 2] * u1; - } - break; - case 3: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - const CeedScalar u0 = u[i + Q * 0]; - const CeedScalar u1 = u[i + Q * 1]; - const CeedScalar u2 = u[i + Q * 2]; - v[i + Q * 0] = qdm[i + Q * 0] * u0 + qdm[i + Q * 1] * u1 + qdm[i + Q * 2] * u2; - v[i + Q * 1] = qdm[i + Q * 1] * u0 + qdm[i + Q * 3] * u1 + qdm[i + Q * 4] * u2; - v[i + Q * 2] = qdm[i + Q * 2] * u0 + qdm[i + Q * 4] * u1 + qdm[i + Q * 5] * u2; - } - break; - } - return 0; -} - -#endif // MFEM_LIBCEED_DIVDIV_MASS_QF_H +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_DIVDIV_MASS_QF_H +#define PALACE_LIBCEED_DIVDIV_MASS_QF_H + +#include "utils_qf.h" + +struct DivDivMassContext +{ + CeedInt dim, space_dim; +}; + +// libCEED QFunction for building quadrature data for a div-div + mass operator with a +// scalar coefficient evaluated at quadrature points. 
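The apply QFunction above reads the mass quadrature data as a symmetric matrix stored in packed form with stride Q between entries; for dim = 3 the six values map to [q0 q1 q2; q1 q3 q4; q2 q4 q5]. A standalone sketch of the same matrix-vector product, with the Q stride dropped for clarity (assumption: the packed ordering matches the indexing shown above):

/* Apply a symmetric 3x3 matrix stored as 6 packed entries
 * [q0 q1 q2; q1 q3 q4; q2 q4 q5] to a vector u, mirroring the dim == 3 case above. */
static void ApplyPackedSym3(const double qd[6], const double u[3], double v[3])
{
  v[0] = qd[0] * u[0] + qd[1] * u[1] + qd[2] * u[2];
  v[1] = qd[1] * u[0] + qd[3] * u[1] + qd[4] * u[2];
  v[2] = qd[2] * u[0] + qd[4] * u[1] + qd[5] * u[2];
}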
+CEED_QFUNCTION(f_build_divdiv_mass_quad_scalar)(void *ctx, CeedInt Q, + const CeedScalar *const *in, + CeedScalar *const *out) +{ + // At every quadrature point, compute qw * c / det(J) and qw / det(J) J^T C J and store + // the result. + // in[0] is div-div coefficients with shape [ncomp=1, Q] + // in[1] is mass coefficients with shape [ncomp=1, Q] + // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q] + // in[3] is quadrature weights, size (Q) + DivDivMassContext *bc = (DivDivMassContext *)ctx; + const CeedScalar *cd = in[0], *cm = in[1], *J = in[2], *qw = in[3]; + CeedScalar *qdd = out[0], *qdm = out[0] + Q; + switch (10 * bc->space_dim + bc->dim) + { + case 11: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qdd[i] = qw[i] * cd[i] / J[i]; + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qdm[i] = qw[i] * cm[i] * J[i]; + } + break; + case 21: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qdd[i] = qw[i] * cd[i] / DetJ21(J + i, Q); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultJtCJ21(J + i, Q, cm + i, Q, 1, qw[i], Q, qdm + i); + } + break; + case 22: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qdd[i] = qw[i] * cd[i] / DetJ22(J + i, Q); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultJtCJ22(J + i, Q, cm + i, Q, 1, qw[i], Q, qdm + i); + } + break; + case 32: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qdd[i] = qw[i] * cd[i] / DetJ32(J + i, Q); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultJtCJ32(J + i, Q, cm + i, Q, 1, qw[i], Q, qdm + i); + } + break; + case 33: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qdd[i] = qw[i] * cd[i] / DetJ33(J + i, Q); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultJtCJ33(J + i, Q, cm + i, Q, 1, qw[i], Q, qdm + i); + } + break; + } + return 0; +} + +// libCEED QFunction for building quadrature data for a div-div + mass operator with a +// vector coefficient evaluated at quadrature points. +CEED_QFUNCTION(f_build_divdiv_mass_quad_vector)(void *ctx, CeedInt Q, + const CeedScalar *const *in, + CeedScalar *const *out) +{ + // At every quadrature point, compute qw * c / det(J) and qw / det(J) J^T C J and store + // the result. 
+ // in[0] is div-div coefficients with shape [ncomp=1, Q] + // in[1] is mass coefficients with shape [ncomp=space_dim, Q] + // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q] + // in[3] is quadrature weights, size (Q) + DivDivMassContext *bc = (DivDivMassContext *)ctx; + const CeedScalar *cd = in[0], *cm = in[1], *J = in[2], *qw = in[3]; + CeedScalar *qdd = out[0], *qdm = out[0] + Q; + switch (10 * bc->space_dim + bc->dim) + { + case 21: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qdd[i] = qw[i] * cd[i] / DetJ21(J + i, Q); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultJtCJ21(J + i, Q, cm + i, Q, 2, qw[i], Q, qdm + i); + } + break; + case 22: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qdd[i] = qw[i] * cd[i] / DetJ22(J + i, Q); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultJtCJ22(J + i, Q, cm + i, Q, 2, qw[i], Q, qdm + i); + } + break; + case 32: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qdd[i] = qw[i] * cd[i] / DetJ32(J + i, Q); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultJtCJ32(J + i, Q, cm + i, Q, 3, qw[i], Q, qdm + i); + } + break; + case 33: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qdd[i] = qw[i] * cd[i] / DetJ33(J + i, Q); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultJtCJ33(J + i, Q, cm + i, Q, 3, qw[i], Q, qdm + i); + } + break; + } + return 0; +} + +// libCEED QFunction for building quadrature data for a div-div + mass operator with a +// matrix coefficient evaluated at quadrature points. +CEED_QFUNCTION(f_build_divdiv_mass_quad_matrix)(void *ctx, CeedInt Q, + const CeedScalar *const *in, + CeedScalar *const *out) +{ + // At every quadrature point, compute qw * c / det(J) and qw / det(J) J^T C J and store + // the result. + // in[0] is div-div coefficients with shape [ncomp=1, Q] + // in[1] is mass coefficients with shape [ncomp=space_dim*(space_dim+1)/2, Q] + // in[2] is Jacobians with shape [dim, ncomp=space_dim, Q] + // in[3] is quadrature weights, size (Q) + DivDivMassContext *bc = (DivDivMassContext *)ctx; + const CeedScalar *cd = in[0], *cm = in[1], *J = in[2], *qw = in[3]; + CeedScalar *qdd = out[0], *qdm = out[0] + Q; + switch (10 * bc->space_dim + bc->dim) + { + case 21: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qdd[i] = qw[i] * cd[i] / DetJ21(J + i, Q); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultJtCJ21(J + i, Q, cm + i, Q, 3, qw[i], Q, qdm + i); + } + break; + case 22: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qdd[i] = qw[i] * cd[i] / DetJ22(J + i, Q); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultJtCJ22(J + i, Q, cm + i, Q, 3, qw[i], Q, qdm + i); + } + break; + case 32: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qdd[i] = qw[i] * cd[i] / DetJ32(J + i, Q); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultJtCJ32(J + i, Q, cm + i, Q, 6, qw[i], Q, qdm + i); + } + break; + case 33: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qdd[i] = qw[i] * cd[i] / DetJ33(J + i, Q); + } + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultJtCJ33(J + i, Q, cm + i, Q, 6, qw[i], Q, qdm + i); + } + break; + } + return 0; +} + +// libCEED QFunction for applying a div-div + mass operator. 
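In the build QFunctions above, a single output array carries both pieces of quadrature data: the div-div part occupies the first Q entries (qdd = out[0]) and the mass part starts at offset Q (qdm = out[0] + Q). The ncomp argument handed to the MultJtCJ* helpers is the number of stored coefficient components: 1 for a scalar, space_dim for a vector coefficient (typically interpreted as a diagonal matrix), and space_dim*(space_dim+1)/2 for a packed symmetric matrix. A small sketch of that bookkeeping (helper names are illustrative only):

/* Number of stored components for a packed symmetric space_dim x space_dim coefficient. */
static inline int SymPackedSize(int space_dim)
{
  return space_dim * (space_dim + 1) / 2;  /* 1 -> 1, 2 -> 3, 3 -> 6 */
}

/* Start of the mass quadrature data, assuming the layout
 * out[0] = [qdd (Q entries) | qdm (Q * SymPackedSize(dim) entries)] used above. */
static inline const double *MassQuadData(const double *out0, int Q)
{
  return out0 + Q;
}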
+CEED_QFUNCTION(f_apply_divdiv_mass)(void *ctx, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + // in[0], out[0] have shape [dim, ncomp=1, Q] + // in[1], out[1] have shape [ncomp=1, Q] + DivDivMassContext *bc = (DivDivMassContext *)ctx; + const CeedScalar *u = in[0], *ud = in[1], *qdd = in[2], *qdm = in[2] + Q; + CeedScalar *v = out[0], *vd = out[1]; + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + vd[i] = qdd[i] * ud[i]; + } + switch (bc->dim) + { + case 1: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + v[i] = qdm[i] * u[i]; + } + break; + case 2: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar u0 = u[i + Q * 0]; + const CeedScalar u1 = u[i + Q * 1]; + v[i + Q * 0] = qdm[i + Q * 0] * u0 + qdm[i + Q * 1] * u1; + v[i + Q * 1] = qdm[i + Q * 1] * u0 + qdm[i + Q * 2] * u1; + } + break; + case 3: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar u0 = u[i + Q * 0]; + const CeedScalar u1 = u[i + Q * 1]; + const CeedScalar u2 = u[i + Q * 2]; + v[i + Q * 0] = qdm[i + Q * 0] * u0 + qdm[i + Q * 1] * u1 + qdm[i + Q * 2] * u2; + v[i + Q * 1] = qdm[i + Q * 1] * u0 + qdm[i + Q * 3] * u1 + qdm[i + Q * 4] * u2; + v[i + Q * 2] = qdm[i + Q * 2] * u0 + qdm[i + Q * 4] * u1 + qdm[i + Q * 5] * u2; + } + break; + } + return 0; +} + +#endif // MFEM_LIBCEED_DIVDIV_MASS_QF_H diff --git a/palace/fem/qfunctions/geom_qf.h b/palace/fem/qfunctions/geom_qf.h new file mode 100644 index 0000000000..c60bf8bb64 --- /dev/null +++ b/palace/fem/qfunctions/geom_qf.h @@ -0,0 +1,20 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_GEOM_QF_H +#define PALACE_LIBCEED_GEOM_QF_H + +// libCEED QFunction for building geometry factors for integration and transformations. +// At every quadrature point, compute qw * det(J) and adj(J)^T / |J| and store the result. +// in[0] is element attributes, shape [Q] +// in[1] is quadrature weights, shape [Q] +// in[2] is Jacobians, shape [qcomp=dim, ncomp=space_dim, Q] +// out[0] is quadrature data, stored as {attribute, Jacobian determinant, (transpose) +// adjugate Jacobian} quadrature data, shape [ncomp=2+space_dim*dim, Q] + +#include "21/geom_21_qf.h" +#include "22/geom_22_qf.h" +#include "32/geom_32_qf.h" +#include "33/geom_33_qf.h" + +#endif // PALACE_LIBCEED_GEOM_QF_H diff --git a/palace/fem/qfunctions/grad_qf.h b/palace/fem/qfunctions/grad_qf.h index 5a0dfab915..a9ac1c1e07 100644 --- a/palace/fem/qfunctions/grad_qf.h +++ b/palace/fem/qfunctions/grad_qf.h @@ -1,254 +1,254 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LIBCEED_GRAD_QF_H -#define PALACE_LIBCEED_GRAD_QF_H - -#include "utils_qf.h" - -struct GradContext -{ - CeedInt dim, space_dim; - CeedScalar coeff; -}; - -// libCEED QFunction for building quadrature data for a gradient operator with a scalar -// constant coefficient. -CEED_QFUNCTION(f_build_grad_const_scalar)(void *ctx, CeedInt Q, const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw C adj(J)^T and store the result. 
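The geometry factors stored by the new geom_qf.h above are the weighted determinant qw * det(J) and adj(J)^T / det(J), which for a square Jacobian is simply the inverse transpose J^-T. A minimal 2x2 sketch of that second quantity (standalone, not taken from utils_qf.h):

/* Compute K = adj(J)^T / det(J) = J^{-T} for a 2x2 Jacobian J (row-major).
 * adj(J) = [ J11  -J01; -J10  J00 ], so adj(J)^T = [ J11  -J10; -J01  J00 ]. */
static void AdjJt2(const double J[2][2], double K[2][2])
{
  const double detJ = J[0][0] * J[1][1] - J[0][1] * J[1][0];
  K[0][0] = J[1][1] / detJ;
  K[0][1] = -J[1][0] / detJ;
  K[1][0] = -J[0][1] / detJ;
  K[1][1] = J[0][0] / detJ;
}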
- // in[0] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[1] is quadrature weights, size (Q) - GradContext *bc = (GradContext *)ctx; - const CeedScalar coeff = bc->coeff; - const CeedScalar *J = in[0], *qw = in[1]; - CeedScalar *qd = out[0]; - switch (10 * bc->space_dim + bc->dim) - { - case 11: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qd[i] = qw[i] * coeff; - } - break; - case 21: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultCAdjJt21(J + i, Q, &coeff, 1, 1, qw[i], Q, qd + i); - } - break; - case 22: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultCAdjJt22(J + i, Q, &coeff, 1, 1, qw[i], Q, qd + i); - } - break; - case 32: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultCAdjJt32(J + i, Q, &coeff, 1, 1, qw[i], Q, qd + i); - } - break; - case 33: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultCAdjJt33(J + i, Q, &coeff, 1, 1, qw[i], Q, qd + i); - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for a gradient operator with a scalar -// coefficient evaluated at quadrature points. -CEED_QFUNCTION(f_build_grad_quad_scalar)(void *ctx, CeedInt Q, const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw C adj(J)^T and store the result. - // in[0] is coefficients, size (Q) - // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[2] is quadrature weights, size (Q) - GradContext *bc = (GradContext *)ctx; - const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; - CeedScalar *qd = out[0]; - switch (10 * bc->space_dim + bc->dim) - { - case 11: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qd[i] = qw[i] * c[i]; - } - break; - case 21: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultCAdjJt21(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i); - } - break; - case 22: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultCAdjJt22(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i); - } - break; - case 32: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultCAdjJt32(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i); - } - break; - case 33: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultCAdjJt33(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i); - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for a gradient operator with a vector -// coefficient evaluated at quadrature points. -CEED_QFUNCTION(f_build_grad_quad_vector)(void *ctx, CeedInt Q, const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw C adj(J)^T and store the result. 
- // in[0] is coefficients with shape [ncomp=vdim, Q] - // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[2] is quadrature weights, size (Q) - GradContext *bc = (GradContext *)ctx; - const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; - CeedScalar *qd = out[0]; - switch (10 * bc->space_dim + bc->dim) - { - case 21: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultCAdjJt21(J + i, Q, c + i, Q, 2, qw[i], Q, qd + i); - } - break; - case 22: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultCAdjJt22(J + i, Q, c + i, Q, 2, qw[i], Q, qd + i); - } - break; - case 32: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultCAdjJt32(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i); - } - break; - case 33: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultCAdjJt33(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i); - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for a gradient operator with a matrix -// coefficient evaluated at quadrature points. -CEED_QFUNCTION(f_build_grad_quad_matrix)(void *ctx, CeedInt Q, const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw C adj(J)^T and store the result. - // in[0] is coefficients with shape [ncomp=vdim*(vdim+1)/2, Q] - // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[2] is quadrature weights, size (Q) - GradContext *bc = (GradContext *)ctx; - const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; - CeedScalar *qd = out[0]; - switch (10 * bc->space_dim + bc->dim) - { - case 21: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultCAdjJt21(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i); - } - break; - case 22: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultCAdjJt22(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i); - } - break; - case 32: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultCAdjJt32(J + i, Q, c + i, Q, 6, qw[i], Q, qd + i); - } - break; - case 33: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultCAdjJt33(J + i, Q, c + i, Q, 6, qw[i], Q, qd + i); - } - break; - } - return 0; -} - -// libCEED QFunction for applying a gradient operator. 
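For the gradient QFunctions in this hunk, the stored data qd = qw C adj(J)^T is exactly what turns a reference-space gradient into the weighted physical-space quantity: since grad u = J^-T grad_ref u and adj(J)^T = det(J) J^-T, applying qd to grad_ref u gives qw det(J) C grad u. A scalar 2D sketch of that identity (standalone, illustrative only):

/* Given a 2x2 Jacobian J (row-major), a scalar coefficient c, a quadrature weight qw,
 * and a reference gradient gref, apply qd = qw * c * adj(J)^T so that the result equals
 * qw * det(J) * c * (physical gradient). */
static void ApplyGradQuadData2(const double J[2][2], double c, double qw,
                               const double gref[2], double v[2])
{
  /* adj(J)^T = [ J11 -J10; -J01 J00 ] for J = [ J00 J01; J10 J11 ]. */
  v[0] = qw * c * (J[1][1] * gref[0] - J[1][0] * gref[1]);
  v[1] = qw * c * (-J[0][1] * gref[0] + J[0][0] * gref[1]);
}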
-CEED_QFUNCTION(f_apply_grad)(void *ctx, CeedInt Q, const CeedScalar *const *in, - CeedScalar *const *out) -{ - // in[0] has shape [dim, ncomp=1, Q] - // out[0] has shape [ncomp=space_dim, Q] - GradContext *bc = (GradContext *)ctx; - const CeedScalar *ug = in[0], *qd = in[1]; - CeedScalar *v = out[0]; - switch (10 * bc->space_dim + bc->dim) - { - case 11: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - v[i] = qd[i] * ug[i]; - } - break; - case 21: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - const CeedScalar ug0 = ug[i + Q * 0]; - v[i + Q * 0] = qd[i + Q * 0] * ug0; - v[i + Q * 1] = qd[i + Q * 1] * ug0; - } - break; - case 22: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - const CeedScalar ug0 = ug[i + Q * 0]; - const CeedScalar ug1 = ug[i + Q * 1]; - v[i + Q * 0] = qd[i + Q * 0] * ug0 + qd[i + Q * 2] * ug1; - v[i + Q * 1] = qd[i + Q * 1] * ug0 + qd[i + Q * 3] * ug1; - } - break; - case 32: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - const CeedScalar ug0 = ug[i + Q * 0]; - const CeedScalar ug1 = ug[i + Q * 1]; - v[i + Q * 0] = qd[i + Q * 0] * ug0 + qd[i + Q * 3] * ug1; - v[i + Q * 1] = qd[i + Q * 1] * ug0 + qd[i + Q * 4] * ug1; - v[i + Q * 2] = qd[i + Q * 2] * ug0 + qd[i + Q * 5] * ug1; - } - break; - case 33: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - const CeedScalar ug0 = ug[i + Q * 0]; - const CeedScalar ug1 = ug[i + Q * 1]; - const CeedScalar ug2 = ug[i + Q * 2]; - v[i + Q * 0] = qd[i + Q * 0] * ug0 + qd[i + Q * 3] * ug1 + qd[i + Q * 6] * ug2; - v[i + Q * 1] = qd[i + Q * 1] * ug0 + qd[i + Q * 4] * ug1 + qd[i + Q * 7] * ug2; - v[i + Q * 2] = qd[i + Q * 2] * ug0 + qd[i + Q * 5] * ug1 + qd[i + Q * 8] * ug2; - } - break; - } - return 0; -} - -#endif // PALACE_LIBCEED_GRAD_QF_H +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_GRAD_QF_H +#define PALACE_LIBCEED_GRAD_QF_H + +#include "utils_qf.h" + +struct GradContext +{ + CeedInt dim, space_dim; + CeedScalar coeff; +}; + +// libCEED QFunction for building quadrature data for a gradient operator with a scalar +// constant coefficient. +CEED_QFUNCTION(f_build_grad_const_scalar)(void *ctx, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + // At every quadrature point, compute qw C adj(J)^T and store the result. + // in[0] is Jacobians with shape [dim, ncomp=space_dim, Q] + // in[1] is quadrature weights, size (Q) + GradContext *bc = (GradContext *)ctx; + const CeedScalar coeff = bc->coeff; + const CeedScalar *J = in[0], *qw = in[1]; + CeedScalar *qd = out[0]; + switch (10 * bc->space_dim + bc->dim) + { + case 11: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qd[i] = qw[i] * coeff; + } + break; + case 21: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultCAdjJt21(J + i, Q, &coeff, 1, 1, qw[i], Q, qd + i); + } + break; + case 22: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultCAdjJt22(J + i, Q, &coeff, 1, 1, qw[i], Q, qd + i); + } + break; + case 32: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultCAdjJt32(J + i, Q, &coeff, 1, 1, qw[i], Q, qd + i); + } + break; + case 33: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultCAdjJt33(J + i, Q, &coeff, 1, 1, qw[i], Q, qd + i); + } + break; + } + return 0; +} + +// libCEED QFunction for building quadrature data for a gradient operator with a scalar +// coefficient evaluated at quadrature points. 
+CEED_QFUNCTION(f_build_grad_quad_scalar)(void *ctx, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + // At every quadrature point, compute qw C adj(J)^T and store the result. + // in[0] is coefficients, size (Q) + // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] + // in[2] is quadrature weights, size (Q) + GradContext *bc = (GradContext *)ctx; + const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; + CeedScalar *qd = out[0]; + switch (10 * bc->space_dim + bc->dim) + { + case 11: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qd[i] = qw[i] * c[i]; + } + break; + case 21: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultCAdjJt21(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i); + } + break; + case 22: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultCAdjJt22(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i); + } + break; + case 32: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultCAdjJt32(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i); + } + break; + case 33: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultCAdjJt33(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i); + } + break; + } + return 0; +} + +// libCEED QFunction for building quadrature data for a gradient operator with a vector +// coefficient evaluated at quadrature points. +CEED_QFUNCTION(f_build_grad_quad_vector)(void *ctx, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + // At every quadrature point, compute qw C adj(J)^T and store the result. + // in[0] is coefficients with shape [ncomp=vdim, Q] + // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] + // in[2] is quadrature weights, size (Q) + GradContext *bc = (GradContext *)ctx; + const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; + CeedScalar *qd = out[0]; + switch (10 * bc->space_dim + bc->dim) + { + case 21: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultCAdjJt21(J + i, Q, c + i, Q, 2, qw[i], Q, qd + i); + } + break; + case 22: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultCAdjJt22(J + i, Q, c + i, Q, 2, qw[i], Q, qd + i); + } + break; + case 32: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultCAdjJt32(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i); + } + break; + case 33: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultCAdjJt33(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i); + } + break; + } + return 0; +} + +// libCEED QFunction for building quadrature data for a gradient operator with a matrix +// coefficient evaluated at quadrature points. +CEED_QFUNCTION(f_build_grad_quad_matrix)(void *ctx, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + // At every quadrature point, compute qw C adj(J)^T and store the result. 
+ // in[0] is coefficients with shape [ncomp=vdim*(vdim+1)/2, Q] + // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] + // in[2] is quadrature weights, size (Q) + GradContext *bc = (GradContext *)ctx; + const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; + CeedScalar *qd = out[0]; + switch (10 * bc->space_dim + bc->dim) + { + case 21: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultCAdjJt21(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i); + } + break; + case 22: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultCAdjJt22(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i); + } + break; + case 32: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultCAdjJt32(J + i, Q, c + i, Q, 6, qw[i], Q, qd + i); + } + break; + case 33: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + MultCAdjJt33(J + i, Q, c + i, Q, 6, qw[i], Q, qd + i); + } + break; + } + return 0; +} + +// libCEED QFunction for applying a gradient operator. +CEED_QFUNCTION(f_apply_grad)(void *ctx, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + // in[0] has shape [dim, ncomp=1, Q] + // out[0] has shape [ncomp=space_dim, Q] + GradContext *bc = (GradContext *)ctx; + const CeedScalar *ug = in[0], *qd = in[1]; + CeedScalar *v = out[0]; + switch (10 * bc->space_dim + bc->dim) + { + case 11: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + v[i] = qd[i] * ug[i]; + } + break; + case 21: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar ug0 = ug[i + Q * 0]; + v[i + Q * 0] = qd[i + Q * 0] * ug0; + v[i + Q * 1] = qd[i + Q * 1] * ug0; + } + break; + case 22: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar ug0 = ug[i + Q * 0]; + const CeedScalar ug1 = ug[i + Q * 1]; + v[i + Q * 0] = qd[i + Q * 0] * ug0 + qd[i + Q * 2] * ug1; + v[i + Q * 1] = qd[i + Q * 1] * ug0 + qd[i + Q * 3] * ug1; + } + break; + case 32: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar ug0 = ug[i + Q * 0]; + const CeedScalar ug1 = ug[i + Q * 1]; + v[i + Q * 0] = qd[i + Q * 0] * ug0 + qd[i + Q * 3] * ug1; + v[i + Q * 1] = qd[i + Q * 1] * ug0 + qd[i + Q * 4] * ug1; + v[i + Q * 2] = qd[i + Q * 2] * ug0 + qd[i + Q * 5] * ug1; + } + break; + case 33: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar ug0 = ug[i + Q * 0]; + const CeedScalar ug1 = ug[i + Q * 1]; + const CeedScalar ug2 = ug[i + Q * 2]; + v[i + Q * 0] = qd[i + Q * 0] * ug0 + qd[i + Q * 3] * ug1 + qd[i + Q * 6] * ug2; + v[i + Q * 1] = qd[i + Q * 1] * ug0 + qd[i + Q * 4] * ug1 + qd[i + Q * 7] * ug2; + v[i + Q * 2] = qd[i + Q * 2] * ug0 + qd[i + Q * 5] * ug1 + qd[i + Q * 8] * ug2; + } + break; + } + return 0; +} + +#endif // PALACE_LIBCEED_GRAD_QF_H diff --git a/palace/fem/qfunctions/h1_qf.h b/palace/fem/qfunctions/h1_qf.h new file mode 100644 index 0000000000..fbfd9b49c8 --- /dev/null +++ b/palace/fem/qfunctions/h1_qf.h @@ -0,0 +1,21 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_H1_QF_H +#define PALACE_LIBCEED_H1_QF_H + +// libCEED QFunctions for H1 operators (Piola transformation u = ̂u). +// in[0] is geometry quadrature data, shape [ncomp=2+space_dim*dim, Q] +// in[1] is active vector, shape [ncomp=vdim, Q] +// out[0] is active vector, shape [ncomp=vdim, Q] + +// Build functions assemble the quadrature point data. 
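For the H1 operators declared above, the transformation is the identity (u = ̂u), so a mass-type kernel reduces to scaling every one of the vdim components by the same scalar quadrature datum qw * c * det(J). The sketch below shows only the general shape of such a kernel under the stride-Q storage assumption used elsewhere in these files; it is not the contents of the included 1D/2D/3D headers:

/* Apply a scalar quadrature datum qd[i] (e.g. qw * c * det(J)) to each of the vdim
 * components of u, stored component-major with stride Q. */
static void ApplyH1MassSketch(int Q, int vdim, const double *qd, const double *u, double *v)
{
  for (int k = 0; k < vdim; k++)
  {
    for (int i = 0; i < Q; i++)
    {
      v[i + Q * k] = qd[i] * u[i + Q * k];
    }
  }
}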
+ +#include "1/h1_1_qf.h" +#include "1/h1_build_1_qf.h" +#include "2/h1_2_qf.h" +#include "2/h1_build_2_qf.h" +#include "3/h1_3_qf.h" +#include "3/h1_build_3_qf.h" + +#endif // PALACE_LIBCEED_H1_QF_H diff --git a/palace/fem/qfunctions/hcurl_qf.h b/palace/fem/qfunctions/hcurl_qf.h index f133bc9b7b..cb891f0818 100644 --- a/palace/fem/qfunctions/hcurl_qf.h +++ b/palace/fem/qfunctions/hcurl_qf.h @@ -1,195 +1,23 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LIBCEED_HCURL_QF_H -#define PALACE_LIBCEED_HCURL_QF_H - -#include "utils_qf.h" -#include "vecfemass_qf.h" - -// libCEED QFunction for building quadrature data for an H(curl) mass operator with a scalar -// constant coefficient. -CEED_QFUNCTION(f_build_hcurl_const_scalar)(void *ctx, CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw / det(J) adj(J) C adj(J)^T and store the - // symmetric part of the result. - // in[0] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[1] is quadrature weights, size (Q) - VectorFEMassContext *bc = (VectorFEMassContext *)ctx; - const CeedScalar coeff = bc->coeff; - const CeedScalar *J = in[0], *qw = in[1]; - CeedScalar *qd = out[0]; - switch (10 * bc->space_dim + bc->dim) - { - case 11: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qd[i] = qw[i] * coeff / J[i]; - } - break; - case 21: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt21(J + i, Q, &coeff, 1, 1, qw[i], Q, qd + i); - } - break; - case 22: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt22(J + i, Q, &coeff, 1, 1, qw[i], Q, qd + i); - } - break; - case 32: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt32(J + i, Q, &coeff, 1, 1, qw[i], Q, qd + i); - } - break; - case 33: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt33(J + i, Q, &coeff, 1, 1, qw[i], Q, qd + i); - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for an H(curl) mass operator with a scalar -// coefficient evaluated at quadrature points. -CEED_QFUNCTION(f_build_hcurl_quad_scalar)(void *ctx, CeedInt Q, const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw / det(J) adj(J) C adj(J)^T and store the - // symmetric part of the result. 
- // in[0] is coefficients with shape [ncomp=1, Q] - // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[2] is quadrature weights, size (Q) - VectorFEMassContext *bc = (VectorFEMassContext *)ctx; - const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; - CeedScalar *qd = out[0]; - switch (10 * bc->space_dim + bc->dim) - { - case 11: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qd[i] = qw[i] * c[i] / J[i]; - } - break; - case 21: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt21(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i); - } - break; - case 22: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt22(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i); - } - break; - case 32: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt32(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i); - } - break; - case 33: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt33(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i); - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for an H(curl) mass operator with a vector -// coefficient evaluated at quadrature points. -CEED_QFUNCTION(f_build_hcurl_quad_vector)(void *ctx, CeedInt Q, const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw / det(J) adj(J) C adj(J)^T and store the - // symmetric part of the result. - // in[0] is coefficients with shape [ncomp=space_dim, Q] - // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[2] is quadrature weights, size (Q) - VectorFEMassContext *bc = (VectorFEMassContext *)ctx; - const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; - CeedScalar *qd = out[0]; - switch (10 * bc->space_dim + bc->dim) - { - case 21: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt21(J + i, Q, c + i, Q, 2, qw[i], Q, qd + i); - } - break; - case 22: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt22(J + i, Q, c + i, Q, 2, qw[i], Q, qd + i); - } - break; - case 32: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt32(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i); - } - break; - case 33: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt33(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i); - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for an H(curl) mass operator with a matrix -// coefficient evaluated at quadrature points. -CEED_QFUNCTION(f_build_hcurl_quad_matrix)(void *ctx, CeedInt Q, const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw / det(J) adj(J) C adj(J)^T and store the - // symmetric part of the result. 
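The factor qw / det(J) adj(J) C adj(J)^T assembled by the H(curl) builds above follows directly from the covariant Piola map: with u = adj(J)^T ̂u / det(J) and v mapped the same way, u^T C v dx equals ̂u^T [adj(J) C adj(J)^T / det(J)] ̂v qw at each quadrature point. A 2x2 sketch with a scalar coefficient, showing why only the three unique entries of the symmetric result need to be stored:

/* qd = qw * c / det(J) * adj(J) * adj(J)^T for a 2x2 Jacobian (row-major), returned in
 * packed symmetric form [qd00, qd01, qd11], mirroring the 3-entry storage used above. */
static void HcurlMassQuadData2(const double J[2][2], double c, double qw, double qd[3])
{
  const double detJ = J[0][0] * J[1][1] - J[0][1] * J[1][0];
  /* Rows of adj(J): a0 = (J11, -J01), a1 = (-J10, J00). */
  const double a00 = J[1][1], a01 = -J[0][1];
  const double a10 = -J[1][0], a11 = J[0][0];
  const double w = qw * c / detJ;
  qd[0] = w * (a00 * a00 + a01 * a01);  /* (adj(J) adj(J)^T)_00 */
  qd[1] = w * (a10 * a00 + a11 * a01);  /* (adj(J) adj(J)^T)_01 = _10 */
  qd[2] = w * (a10 * a10 + a11 * a11);  /* (adj(J) adj(J)^T)_11 */
}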
- // in[0] is coefficients with shape [ncomp=space_dim*(space_dim+1)/2, Q] - // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[2] is quadrature weights, size (Q) - VectorFEMassContext *bc = (VectorFEMassContext *)ctx; - const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; - CeedScalar *qd = out[0]; - switch (10 * bc->space_dim + bc->dim) - { - case 21: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt21(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i); - } - break; - case 22: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt22(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i); - } - break; - case 32: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt32(J + i, Q, c + i, Q, 6, qw[i], Q, qd + i); - } - break; - case 33: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultAdjJCAdjJt33(J + i, Q, c + i, Q, 6, qw[i], Q, qd + i); - } - break; - } - return 0; -} - -#endif // PALACE_LIBCEED_HCURL_QF_H +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_QF_H +#define PALACE_LIBCEED_HCURL_QF_H + +// libCEED QFunctions for H(curl) operators (Piola transformation u = adj(J)^T / det(J) ̂u). +// in[0] is geometry quadrature data, shape [ncomp=2+space_dim*dim, Q] +// in[1] is active vector, shape [qcomp=dim, ncomp=1, Q] +// out[0] is active vector, shape [qcomp=dim, ncomp=1, Q] + +// Build functions assemble the quadrature point data. + +#include "21/hcurl_21_qf.h" +#include "21/hcurl_build_21_qf.h" +#include "22/hcurl_22_qf.h" +#include "22/hcurl_build_22_qf.h" +#include "32/hcurl_32_qf.h" +#include "32/hcurl_build_32_qf.h" +#include "33/hcurl_33_qf.h" +#include "33/hcurl_build_33_qf.h" + +#endif // PALACE_LIBCEED_HCURL_QF_H diff --git a/palace/fem/qfunctions/hcurlh1d_error_qf.h b/palace/fem/qfunctions/hcurlh1d_error_qf.h new file mode 100644 index 0000000000..b71d442522 --- /dev/null +++ b/palace/fem/qfunctions/hcurlh1d_error_qf.h @@ -0,0 +1,19 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_H1D_ERROR_QF_H +#define PALACE_LIBCEED_HCURL_H1D_ERROR_QF_H + +// libCEED QFunctions for computing errors between two functions, one in H(curl) and one in +// (H1)ᵈ (Piola transformations u = adj(J)^T / det(J) ̂u and u = ̂u). +// in[0] is geometry quadrature data, shape [ncomp=2+space_dim*dim, Q] +// in[1] is active vector 1, shape [qcomp=dim, ncomp=1, Q] or [ncomp=dim, Q] +// in[2] is active vector 2, shape [ncomp=dim, Q] or [qcomp=dim, ncomp=1, Q] +// out[0] is active vector, shape [ncomp=1, Q] + +// Only for the square Jacobian case where dim = space_dim. + +#include "22/hcurlh1d_error_22_qf.h" +#include "33/hcurlh1d_error_33_qf.h" + +#endif // PALACE_LIBCEED_HCURL_H1D_ERROR_QF_H diff --git a/palace/fem/qfunctions/hcurlh1d_qf.h b/palace/fem/qfunctions/hcurlh1d_qf.h new file mode 100644 index 0000000000..d13e383a8c --- /dev/null +++ b/palace/fem/qfunctions/hcurlh1d_qf.h @@ -0,0 +1,24 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_H1D_QF_H +#define PALACE_LIBCEED_HCURL_H1D_QF_H + +// libCEED QFunctions for mixed H(curl)-(H1)ᵈ operators (Piola transformation u = +// adj(J)^T / det(J) ̂u and u = ̂u) +// in[0] is geometry quadrature data, shape [ncomp=2+space_dim*dim, Q] +// in[1] is active vector, shape [qcomp=dim, ncomp=1, Q] +// out[0] is active vector, shape [ncomp=space_dim, Q] + +// Build functions assemble the quadrature point data. + +#include "21/hcurlh1d_21_qf.h" +#include "21/hcurlh1d_build_21_qf.h" +#include "22/hcurlh1d_22_qf.h" +#include "22/hcurlh1d_build_22_qf.h" +#include "32/hcurlh1d_32_qf.h" +#include "32/hcurlh1d_build_32_qf.h" +#include "33/hcurlh1d_33_qf.h" +#include "33/hcurlh1d_build_33_qf.h" + +#endif // PALACE_LIBCEED_HCURL_H1D_QF_H diff --git a/palace/fem/qfunctions/hcurlhdiv_error_qf.h b/palace/fem/qfunctions/hcurlhdiv_error_qf.h new file mode 100644 index 0000000000..9e97fbb7a7 --- /dev/null +++ b/palace/fem/qfunctions/hcurlhdiv_error_qf.h @@ -0,0 +1,20 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_HDIV_ERROR_QF_H +#define PALACE_LIBCEED_HCURL_HDIV_ERROR_QF_H + +// libCEED QFunctions for computing errors between two functions, one in H(curl) and one in +// H(div) (Piola transformations u = adj(J)^T / det(J) ̂u and u = J / det(J) ̂u). +// Note: J / det(J) = adj(adj(J)^T / det(J))^T +// in[0] is geometry quadrature data, shape [ncomp=2+space_dim*dim, Q] +// in[1] is active vector 1, shape [qcomp=dim, ncomp=1, Q] +// in[2] is active vector 2, shape [qcomp=dim, ncomp=1, Q] +// out[0] is active vector, shape [ncomp=1, Q] + +// Only for the square Jacobian case where dim = space_dim. + +#include "22/hcurlhdiv_error_22_qf.h" +#include "33/hcurlhdiv_error_33_qf.h" + +#endif // PALACE_LIBCEED_HCURL_HDIV_ERROR_QF_H diff --git a/palace/fem/qfunctions/hcurlhdiv_qf.h b/palace/fem/qfunctions/hcurlhdiv_qf.h index 65272f6654..186031fea8 100644 --- a/palace/fem/qfunctions/hcurlhdiv_qf.h +++ b/palace/fem/qfunctions/hcurlhdiv_qf.h @@ -1,386 +1,25 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LIBCEED_HCURLHDIV_QF_H -#define PALACE_LIBCEED_HCURLHDIV_QF_H - -#include "utils_qf.h" -#include "vecfemass_qf.h" - -// libCEED QFunction for building quadrature data for a mixed H(curl)-H(div) mass operator -// with a scalar constant coefficient. -CEED_QFUNCTION(f_build_hcurlhdiv_const_scalar)(void *ctx, CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw / det(J) adj(J) C J and store the - // result. 
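The note repeated in several of the new headers, J / det(J) = adj(adj(J)^T / det(J))^T, is what lets the H(div) Piola map be recovered from the stored adj(J)^T / det(J) geometry factor. For a square invertible J it follows from adj(K) = det(K) K^-1 applied to K = J^-T. A tiny numerical spot check, standalone and 2x2 only:

#include <assert.h>
#include <math.h>

/* Check J / det(J) == adj(adj(J)^T / det(J))^T for one 2x2 example. */
static void CheckHdivIdentity2(void)
{
  const double J[2][2] = {{2.0, 1.0}, {0.5, 3.0}};
  const double detJ = J[0][0] * J[1][1] - J[0][1] * J[1][0];
  /* K = adj(J)^T / det(J). */
  const double K[2][2] = {{J[1][1] / detJ, -J[1][0] / detJ},
                          {-J[0][1] / detJ, J[0][0] / detJ}};
  /* adj(K)^T for K = [k00 k01; k10 k11] is [k11 -k10; -k01 k00]. */
  const double A[2][2] = {{K[1][1], -K[1][0]}, {-K[0][1], K[0][0]}};
  for (int r = 0; r < 2; r++)
  {
    for (int s = 0; s < 2; s++)
    {
      assert(fabs(A[r][s] - J[r][s] / detJ) < 1e-14);
    }
  }
}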
- // in[0] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[1] is quadrature weights, size (Q) - VectorFEMassContext *bc = (VectorFEMassContext *)ctx; - const CeedScalar coeff = bc->coeff; - const CeedScalar *J = in[0], *qw = in[1]; - CeedScalar *qd = out[0]; - switch (10 * bc->space_dim + bc->dim) - { - case 11: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qd[i] = qw[i] * coeff; - } - break; - case 21: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCAdjJt21(J + i, Q, &coeff, 1, 1, qw[i], Q, qd + i); - } - break; - case 22: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCAdjJt22(J + i, Q, &coeff, 1, 1, qw[i], Q, qd + i); - } - break; - case 32: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCAdjJt32(J + i, Q, &coeff, 1, 1, qw[i], Q, qd + i); - } - break; - case 33: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCAdjJt33(J + i, Q, &coeff, 1, 1, qw[i], Q, qd + i); - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for a mixed H(curl)-H(div) mass operator -// with a scalar coefficient evaluated at quadrature points. -CEED_QFUNCTION(f_build_hcurlhdiv_quad_scalar)(void *ctx, CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw / det(J) adj(J) C J and store the - // result. - // in[0] is coefficients with shape [ncomp=1, Q] - // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[2] is quadrature weights, size (Q) - VectorFEMassContext *bc = (VectorFEMassContext *)ctx; - const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; - CeedScalar *qd = out[0]; - switch (10 * bc->space_dim + bc->dim) - { - case 11: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qd[i] = qw[i] * c[i]; - } - break; - case 21: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCAdjJt21(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i); - } - break; - case 22: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCAdjJt22(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i); - } - break; - case 32: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCAdjJt32(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i); - } - break; - case 33: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCAdjJt33(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i); - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for a mixed H(curl)-H(div) mass operator -// with a vector coefficient evaluated at quadrature points. -CEED_QFUNCTION(f_build_hcurlhdiv_quad_vector)(void *ctx, CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw / det(J) adj(J) C J and store the - // result. 
- // in[0] is coefficients with shape [ncomp=space_dim, Q] - // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[2] is quadrature weights, size (Q) - VectorFEMassContext *bc = (VectorFEMassContext *)ctx; - const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; - CeedScalar *qd = out[0]; - switch (10 * bc->space_dim + bc->dim) - { - case 21: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCAdjJt21(J + i, Q, c + i, Q, 2, qw[i], Q, qd + i); - } - break; - case 22: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCAdjJt22(J + i, Q, c + i, Q, 2, qw[i], Q, qd + i); - } - break; - case 32: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCAdjJt32(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i); - } - break; - case 33: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCAdjJt33(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i); - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for a mixed H(curl)-H(div) mass operator -// with a matrix coefficient evaluated at quadrature points. -CEED_QFUNCTION(f_build_hcurlhdiv_quad_matrix)(void *ctx, CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw / det(J) adj(J) C J and store the - // result. - // in[0] is coefficients with shape [ncomp=space_dim*(space_dim+1)/2, Q] - // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[2] is quadrature weights, size (Q) - VectorFEMassContext *bc = (VectorFEMassContext *)ctx; - const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; - CeedScalar *qd = out[0]; - switch (10 * bc->space_dim + bc->dim) - { - case 21: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCAdjJt21(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i); - } - break; - case 22: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCAdjJt22(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i); - } - break; - case 32: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCAdjJt32(J + i, Q, c + i, Q, 6, qw[i], Q, qd + i); - } - break; - case 33: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCAdjJt33(J + i, Q, c + i, Q, 6, qw[i], Q, qd + i); - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for a mixed H(div)-H(curl) mass operator -// with a scalar constant coefficient. -CEED_QFUNCTION(f_build_hdivhcurl_const_scalar)(void *ctx, CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw / det(J) adj(J) C J and store the - // result. 
- // in[0] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[1] is quadrature weights, size (Q) - VectorFEMassContext *bc = (VectorFEMassContext *)ctx; - const CeedScalar coeff = bc->coeff; - const CeedScalar *J = in[0], *qw = in[1]; - CeedScalar *qd = out[0]; - switch (10 * bc->space_dim + bc->dim) - { - case 11: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qd[i] = qw[i] * coeff; - } - break; - case 21: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCAdjJt21(J + i, Q, &coeff, 1, 1, qw[i], Q, qd + i); - } - break; - case 22: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCAdjJt22(J + i, Q, &coeff, 1, 1, qw[i], Q, qd + i); - } - break; - case 32: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCAdjJt32(J + i, Q, &coeff, 1, 1, qw[i], Q, qd + i); - } - break; - case 33: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCAdjJt33(J + i, Q, &coeff, 1, 1, qw[i], Q, qd + i); - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for a mixed H(div)-H(curl) mass operator -// with a scalar coefficient evaluated at quadrature points. -CEED_QFUNCTION(f_build_hdivhcurl_quad_scalar)(void *ctx, CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw / det(J) adj(J) C J and store the - // result. - // in[0] is coefficients with shape [ncomp=1, Q] - // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[2] is quadrature weights, size (Q) - VectorFEMassContext *bc = (VectorFEMassContext *)ctx; - const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; - CeedScalar *qd = out[0]; - switch (10 * bc->space_dim + bc->dim) - { - case 11: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qd[i] = qw[i] * c[i]; - } - break; - case 21: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCAdjJt21(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i); - } - break; - case 22: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCAdjJt22(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i); - } - break; - case 32: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCAdjJt32(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i); - } - break; - case 33: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCAdjJt33(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i); - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for a mixed H(div)-H(curl) mass operator -// with a vector coefficient evaluated at quadrature points. -CEED_QFUNCTION(f_build_hdivhcurl_quad_vector)(void *ctx, CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw / det(J) adj(J) C J and store the - // result. 
- // in[0] is coefficients with shape [ncomp=space_dim, Q] - // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[2] is quadrature weights, size (Q) - VectorFEMassContext *bc = (VectorFEMassContext *)ctx; - const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; - CeedScalar *qd = out[0]; - switch (10 * bc->space_dim + bc->dim) - { - case 21: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCAdjJt21(J + i, Q, c + i, Q, 2, qw[i], Q, qd + i); - } - break; - case 22: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCAdjJt22(J + i, Q, c + i, Q, 2, qw[i], Q, qd + i); - } - break; - case 32: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCAdjJt32(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i); - } - break; - case 33: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCAdjJt33(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i); - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for a mixed H(div)-H(curl) mass operator -// with a matrix coefficient evaluated at quadrature points. -CEED_QFUNCTION(f_build_hdivhcurl_quad_matrix)(void *ctx, CeedInt Q, - const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw / det(J) adj(J) C J and store the - // result. - // in[0] is coefficients with shape [ncomp=space_dim*(space_dim+1)/2, Q] - // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[2] is quadrature weights, size (Q) - VectorFEMassContext *bc = (VectorFEMassContext *)ctx; - const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; - CeedScalar *qd = out[0]; - switch (10 * bc->space_dim + bc->dim) - { - case 21: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCAdjJt21(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i); - } - break; - case 22: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCAdjJt22(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i); - } - break; - case 32: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCAdjJt32(J + i, Q, c + i, Q, 6, qw[i], Q, qd + i); - } - break; - case 33: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCAdjJt33(J + i, Q, c + i, Q, 6, qw[i], Q, qd + i); - } - break; - } - return 0; -} - -#endif // PALACE_LIBCEED_HCURLHDIV_QF_H +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_HDIV_QF_H +#define PALACE_LIBCEED_HCURL_HDIV_QF_H + +// libCEED QFunctions for mixed H(curl)-H(div) operators (Piola transformations u = +// adj(J)^T / det(J) ̂u and u = J / det(J) ̂u). +// Note: J / det(J) = adj(adj(J)^T / det(J))^T +// in[0] is geometry quadrature data, shape [ncomp=2+space_dim*dim, Q] +// in[1] is active vector, shape [qcomp=dim, ncomp=1, Q] +// out[0] is active vector, shape [qcomp=dim, ncomp=1, Q] + +// Build functions assemble the quadrature point data. + +#include "21/hcurlhdiv_21_qf.h" +#include "21/hcurlhdiv_build_21_qf.h" +#include "22/hcurlhdiv_22_qf.h" +#include "22/hcurlhdiv_build_22_qf.h" +#include "32/hcurlhdiv_32_qf.h" +#include "32/hcurlhdiv_build_32_qf.h" +#include "33/hcurlhdiv_33_qf.h" +#include "33/hcurlhdiv_build_33_qf.h" + +#endif // PALACE_LIBCEED_HCURL_HDIV_QF_H diff --git a/palace/fem/qfunctions/hcurlmass_qf.h b/palace/fem/qfunctions/hcurlmass_qf.h new file mode 100644 index 0000000000..7b64d24bd4 --- /dev/null +++ b/palace/fem/qfunctions/hcurlmass_qf.h @@ -0,0 +1,26 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HCURL_MASS_QF_H +#define PALACE_LIBCEED_HCURL_MASS_QF_H + +// libCEED QFunctions for H(curl) + H1 mass operators (Piola transformation u = +// adj(J)^T / det(J) ̂u and u = ̂u). +// in[0] is geometry quadrature data, shape [ncomp=2+space_dim*dim, Q] +// in[1] is active vector, shape [ncomp=1, Q] +// in[2] is active vector gradient, shape [qcomp=dim, ncomp=1, Q] +// out[0] is active vector, shape [ncomp=1, Q] +// out[1] is active vector gradient, shape [qcomp=dim, ncomp=1, Q] + +// Build functions assemble the quadrature point data. + +#include "21/hcurlmass_21_qf.h" +#include "21/hcurlmass_build_21_qf.h" +#include "22/hcurlmass_22_qf.h" +#include "22/hcurlmass_build_22_qf.h" +#include "32/hcurlmass_32_qf.h" +#include "32/hcurlmass_build_32_qf.h" +#include "33/hcurlmass_33_qf.h" +#include "33/hcurlmass_build_33_qf.h" + +#endif // PALACE_LIBCEED_HCURL_MASS_QF_H diff --git a/palace/fem/qfunctions/hdiv_qf.h b/palace/fem/qfunctions/hdiv_qf.h index 927fd67fbc..fd7e91b0d1 100644 --- a/palace/fem/qfunctions/hdiv_qf.h +++ b/palace/fem/qfunctions/hdiv_qf.h @@ -1,194 +1,24 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LIBCEED_HDIV_QF_H -#define PALACE_LIBCEED_HDIV_QF_H - -#include "utils_qf.h" -#include "vecfemass_qf.h" - -// libCEED QFunction for building quadrature data for an H(div) mass operator with a scalar -// constant coefficient. -CEED_QFUNCTION(f_build_hdiv_const_scalar)(void *ctx, CeedInt Q, const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw / det(J) J^T C J and store the symmetric part of - // the result. - // in[0] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[1] is quadrature weights, size (Q) - VectorFEMassContext *bc = (VectorFEMassContext *)ctx; - const CeedScalar coeff = bc->coeff; - const CeedScalar *J = in[0], *qw = in[1]; - CeedScalar *qd = out[0]; - switch (10 * bc->space_dim + bc->dim) - { - case 11: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qd[i] = qw[i] * coeff * J[i]; - } - break; - case 21: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ21(J + i, Q, &coeff, 1, 1, qw[i], Q, qd + i); - } - break; - case 22: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ22(J + i, Q, &coeff, 1, 1, qw[i], Q, qd + i); - } - break; - case 32: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ32(J + i, Q, &coeff, 1, 1, qw[i], Q, qd + i); - } - break; - case 33: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ33(J + i, Q, &coeff, 1, 1, qw[i], Q, qd + i); - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for an H(div) mass operator with a scalar -// coefficient evaluated at quadrature points. -CEED_QFUNCTION(f_build_hdiv_quad_scalar)(void *ctx, CeedInt Q, const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw / det(J) J^T C J and store the symmetric part of - // the result. 
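The H(div) counterpart uses the contravariant Piola map u = J ̂u / det(J), so u^T C v dx becomes ̂u^T [qw / det(J) J^T C J] ̂v at each point, which is the symmetric matrix assembled above. A 2x2 sketch with a scalar coefficient, using the same packed [00, 01, 11] storage as the other symmetric data:

/* qd = qw * c / det(J) * J^T J for a 2x2 Jacobian (row-major, scalar coefficient c),
 * returned in packed symmetric form [qd00, qd01, qd11]. */
static void HdivMassQuadData2(const double J[2][2], double c, double qw, double qd[3])
{
  const double detJ = J[0][0] * J[1][1] - J[0][1] * J[1][0];
  const double w = qw * c / detJ;
  /* Columns of J: c0 = (J00, J10), c1 = (J01, J11); (J^T J)_ij = ci . cj. */
  qd[0] = w * (J[0][0] * J[0][0] + J[1][0] * J[1][0]);
  qd[1] = w * (J[0][0] * J[0][1] + J[1][0] * J[1][1]);
  qd[2] = w * (J[0][1] * J[0][1] + J[1][1] * J[1][1]);
}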
- // in[0] is coefficients with shape [ncomp=1, Q] - // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[2] is quadrature weights, size (Q) - VectorFEMassContext *bc = (VectorFEMassContext *)ctx; - const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; - CeedScalar *qd = out[0]; - switch (10 * bc->space_dim + bc->dim) - { - case 11: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qd[i] = qw[i] * c[i] * J[i]; - } - break; - case 21: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ21(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i); - } - break; - case 22: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ22(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i); - } - break; - case 32: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ32(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i); - } - break; - case 33: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ33(J + i, Q, c + i, Q, 1, qw[i], Q, qd + i); - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for an H(div) mass operator with a vector -// coefficient evaluated at quadrature points. -CEED_QFUNCTION(f_build_hdiv_quad_vector)(void *ctx, CeedInt Q, const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw / det(J) J^T C J and store the symmetric part of - // the result. - // in[0] is coefficients with shape [ncomp=space_dim, Q] - // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[2] is quadrature weights, size (Q) - VectorFEMassContext *bc = (VectorFEMassContext *)ctx; - const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; - CeedScalar *qd = out[0]; - switch (10 * bc->space_dim + bc->dim) - { - case 21: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ21(J + i, Q, c + i, Q, 2, qw[i], Q, qd + i); - } - break; - case 22: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ22(J + i, Q, c + i, Q, 2, qw[i], Q, qd + i); - } - break; - case 32: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ32(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i); - } - break; - case 33: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ33(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i); - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for an H(div) mass operator with a matrix -// coefficient evaluated at quadrature points. -CEED_QFUNCTION(f_build_hdiv_quad_matrix)(void *ctx, CeedInt Q, const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute qw / det(J) J^T C J and store the symmetric part of - // the result. 
- // in[0] is coefficients with shape [ncomp=space_dim*(space_dim+1)/2, Q] - // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[2] is quadrature weights, size (Q) - VectorFEMassContext *bc = (VectorFEMassContext *)ctx; - const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; - CeedScalar *qd = out[0]; - switch (10 * bc->space_dim + bc->dim) - { - case 21: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ21(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i); - } - break; - case 22: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ22(J + i, Q, c + i, Q, 3, qw[i], Q, qd + i); - } - break; - case 32: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ32(J + i, Q, c + i, Q, 6, qw[i], Q, qd + i); - } - break; - case 33: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - MultJtCJ33(J + i, Q, c + i, Q, 6, qw[i], Q, qd + i); - } - break; - } - return 0; -} - -#endif // PALACE_LIBCEED_HDIV_QF_H +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HDIV_QF_H +#define PALACE_LIBCEED_HDIV_QF_H + +// libCEED QFunctions for H(div) operators (Piola transformation u = J / det(J) ̂u). +// Note: J / det(J) = adj(adj(J)^T / det(J))^T +// in[0] is geometry quadrature data, shape [ncomp=2+space_dim*dim, Q] +// in[1] is active vector, shape [qcomp=dim, ncomp=1, Q] +// out[0] is active vector, shape [qcomp=dim, ncomp=1, Q] + +// Build functions assemble the quadrature point data. + +#include "21/hdiv_21_qf.h" +#include "21/hdiv_build_21_qf.h" +#include "22/hdiv_22_qf.h" +#include "22/hdiv_build_22_qf.h" +#include "32/hdiv_32_qf.h" +#include "32/hdiv_build_32_qf.h" +#include "33/hdiv_33_qf.h" +#include "33/hdiv_build_33_qf.h" + +#endif // PALACE_LIBCEED_HDIV_QF_H diff --git a/palace/fem/qfunctions/hdivmass_qf.h b/palace/fem/qfunctions/hdivmass_qf.h new file mode 100644 index 0000000000..f91441a446 --- /dev/null +++ b/palace/fem/qfunctions/hdivmass_qf.h @@ -0,0 +1,34 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_HDIV_MASS_QF_H +#define PALACE_LIBCEED_HDIV_MASS_QF_H + +// libCEED QFunctions for H(div) + H(curl) mass operators in 3D (Piola transformations u = +// J / det(J) ̂u and u = adj(J)^T / det(J) ̂u). +// Note: J / det(J) = adj(adj(J)^T / det(J))^T +// in[0] is geometry quadrature data, shape [ncomp=2+space_dim*dim, Q] +// in[1] is active vector, shape [qcomp=dim, ncomp=1, Q] +// in[2] is active vector curl, shape [qcomp=dim, ncomp=1, Q] +// out[0] is active vector, shape [qcomp=dim, ncomp=1, Q] +// out[1] is active vector curl, shape [qcomp=dim, ncomp=1, Q] + +// In 2D, this actually uses the L2 Piola transformation on the curl (u = 1 / det(J) ̂u) and +// the curl is has qcomp=1. There is no boundary integrator support in 2D. +// in[0] is geometry quadrature data, shape [ncomp=2+space_dim*dim, Q] +// in[1] is quadrature weights, shape [Q] +// in[2] is active vector, shape [qcomp=dim, ncomp=1, Q] +// in[3] is active vector curl, shape [ncomp=1, Q] +// out[0] is active vector, shape [qcomp=dim, ncomp=1, Q] +// out[1] is active vector curl, shape [ncomp=1, Q] + +// Build functions assemble the quadrature point data. 
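The 2D remark in the header comment above reflects the standard fact that the scalar curl of an H(curl) function transforms like an L2 quantity, curl u = curl_ref ̂u / det(J), so the curl-curl part of the quadrature data collapses to the scalar qw * c / det(J), the same form as the div-div data earlier in this patch. A minimal sketch of that per-point contribution (illustrative only):

/* 2D curl-curl datum at one quadrature point: with curl u = curlref / det(J), the
 * integrand c * (curl u)^2 * det(J) * qw reduces to (qw * c / det(J)) * curlref^2,
 * so the stored value is qw * c / det(J). */
static double CurlCurl2DQuadData(double c, double qw, double detJ)
{
  return qw * c / detJ;
}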
+ +#include "22/hdivmass_22_qf.h" +#include "22/hdivmass_build_22_qf.h" +#include "32/hdivmass_32_qf.h" +#include "32/hdivmass_build_32_qf.h" +#include "33/hdivmass_33_qf.h" +#include "33/hdivmass_build_33_qf.h" + +#endif // PALACE_LIBCEED_HDIV_MASS_QF_H diff --git a/palace/fem/qfunctions/l2_qf.h b/palace/fem/qfunctions/l2_qf.h new file mode 100644 index 0000000000..b1974311a5 --- /dev/null +++ b/palace/fem/qfunctions/l2_qf.h @@ -0,0 +1,22 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_L2_QF_H +#define PALACE_LIBCEED_L2_QF_H + +// libCEED QFunctions for L2 operators (Piola transformation u = 1 / det(J) ̂u). +// in[0] is geometry quadrature data, shape [ncomp=2+space_dim*dim, Q] +// in[1] is quadrature weights, shape [Q] +// in[2] is active vector, shape [ncomp=vdim, Q] +// out[0] is active vector, shape [ncomp=vdim, Q] + +// Build functions assemble the quadrature point data. + +#include "1/l2_1_qf.h" +#include "1/l2_build_1_qf.h" +#include "2/l2_2_qf.h" +#include "2/l2_build_2_qf.h" +#include "3/l2_3_qf.h" +#include "3/l2_build_3_qf.h" + +#endif // PALACE_LIBCEED_L2_QF_H diff --git a/palace/fem/qfunctions/l2mass_qf.h b/palace/fem/qfunctions/l2mass_qf.h new file mode 100644 index 0000000000..024798aabe --- /dev/null +++ b/palace/fem/qfunctions/l2mass_qf.h @@ -0,0 +1,28 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_L2_MASS_QF_H +#define PALACE_LIBCEED_L2_MASS_QF_H + +// libCEED QFunctions for L2 + H(div) mass operators (Piola transformations u = 1 / det(J) ̂u +// and u = J / det(J) ̂u). +// Note: J / det(J) = adj(adj(J)^T / det(J))^T +// in[0] is geometry quadrature data, shape [ncomp=2+space_dim*dim, Q] +// in[1] is quadrature weights, shape [Q] +// in[2] is active vector, shape [qcomp=dim, ncomp=1, Q] +// in[3] is active vector divergence, shape [ncomp=1, Q] +// out[0] is active vector, shape [qcomp=dim, ncomp=1, Q] +// out[1] is active vector divergence, shape [ncomp=1, Q] + +// Build functions assemble the quadrature point data. + +#include "21/l2mass_21_qf.h" +#include "21/l2mass_build_21_qf.h" +#include "22/l2mass_22_qf.h" +#include "22/l2mass_build_22_qf.h" +#include "32/l2mass_32_qf.h" +#include "32/l2mass_build_32_qf.h" +#include "33/l2mass_33_qf.h" +#include "33/l2mass_build_33_qf.h" + +#endif // PALACE_LIBCEED_L2_MASS_QF_H diff --git a/palace/fem/qfunctions/mass_qf.h b/palace/fem/qfunctions/mass_qf.h index a764b229d5..bae2aed0ae 100644 --- a/palace/fem/qfunctions/mass_qf.h +++ b/palace/fem/qfunctions/mass_qf.h @@ -1,333 +1,333 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LIBCEED_MASS_QF_H -#define PALACE_LIBCEED_MASS_QF_H - -#include "utils_qf.h" - -struct MassContext -{ - CeedInt dim, space_dim, vdim; - CeedScalar coeff; -}; - -// libCEED QFunction for building quadrature data for a mass operator with a scalar constant -// coefficient. -CEED_QFUNCTION(f_build_mass_const_scalar)(void *ctx, CeedInt Q, const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute and store qw * c * det(J). 
- // in[0] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[1] is quadrature weights, size (Q) - MassContext *bc = (MassContext *)ctx; - const CeedScalar coeff = bc->coeff; - const CeedScalar *J = in[0], *qw = in[1]; - CeedScalar *qd = out[0]; - switch (10 * bc->space_dim + bc->dim) - { - case 11: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qd[i] = qw[i] * coeff * J[i]; - } - break; - case 21: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qd[i] = qw[i] * coeff * DetJ21(J + i, Q); - } - break; - case 22: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qd[i] = qw[i] * coeff * DetJ22(J + i, Q); - } - break; - case 32: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qd[i] = qw[i] * coeff * DetJ32(J + i, Q); - } - break; - case 33: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qd[i] = qw[i] * coeff * DetJ33(J + i, Q); - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for a mass operator with a scalar -// coefficient evaluated at quadrature points. -CEED_QFUNCTION(f_build_mass_quad_scalar)(void *ctx, CeedInt Q, const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute and store qw * c * det(J). - // in[0] is coefficients, size (Q) - // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[2] is quadrature weights, size (Q) - MassContext *bc = (MassContext *)ctx; - const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; - CeedScalar *qd = out[0]; - switch (10 * bc->space_dim + bc->dim) - { - case 11: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qd[i] = qw[i] * c[i] * J[i]; - } - break; - case 21: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qd[i] = qw[i] * c[i] * DetJ21(J + i, Q); - } - break; - case 22: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qd[i] = qw[i] * c[i] * DetJ22(J + i, Q); - } - break; - case 32: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qd[i] = qw[i] * c[i] * DetJ32(J + i, Q); - } - break; - case 33: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - qd[i] = qw[i] * c[i] * DetJ33(J + i, Q); - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for a mass operator with a vector -// coefficient evaluated at quadrature points. -CEED_QFUNCTION(f_build_mass_quad_vector)(void *ctx, CeedInt Q, const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute and store qw * det(J) C. 
- // in[0] is coefficients with shape [ncomp=vdim, Q] - // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[2] is quadrature weights, size (Q) - MassContext *bc = (MassContext *)ctx; - const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; - CeedScalar *qd = out[0]; - switch (100 * bc->space_dim + 10 * bc->dim + bc->vdim) - { - case 212: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - const CeedScalar wdetJi = qw[i] * DetJ21(J + i, Q); - CeedPragmaSIMD for (CeedInt d = 0; d < 2; d++) - { - qd[i + Q * d] = wdetJi * c[i + Q * d]; - } - } - break; - case 222: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - const CeedScalar wdetJi = qw[i] * DetJ22(J + i, Q); - CeedPragmaSIMD for (CeedInt d = 0; d < 2; d++) - { - qd[i + Q * d] = wdetJi * c[i + Q * d]; - } - } - break; - case 323: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - const CeedScalar wdetJi = qw[i] * DetJ32(J + i, Q); - CeedPragmaSIMD for (CeedInt d = 0; d < 3; d++) - { - qd[i + Q * d] = wdetJi * c[i + Q * d]; - } - } - break; - case 333: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - const CeedScalar wdetJi = qw[i] * DetJ33(J + i, Q); - CeedPragmaSIMD for (CeedInt d = 0; d < 3; d++) - { - qd[i + Q * d] = wdetJi * c[i + Q * d]; - } - } - break; - } - return 0; -} - -// libCEED QFunction for building quadrature data for a mass operator with a matrix -// coefficient evaluated at quadrature points. -CEED_QFUNCTION(f_build_mass_quad_matrix)(void *ctx, CeedInt Q, const CeedScalar *const *in, - CeedScalar *const *out) -{ - // At every quadrature point, compute and store qw * det(J) C. - // in[0] is coefficients with shape [ncomp=vdim*(vdim+1)/2, Q] - // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] - // in[2] is quadrature weights, size (Q) - MassContext *bc = (MassContext *)ctx; - const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; - CeedScalar *qd = out[0]; - switch (100 * bc->space_dim + 10 * bc->dim + bc->vdim) - { - case 212: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - const CeedScalar wdetJi = qw[i] * DetJ21(J + i, Q); - CeedPragmaSIMD for (CeedInt d = 0; d < 3; d++) - { - qd[i + Q * d] = wdetJi * c[i + Q * d]; - } - } - break; - case 222: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - const CeedScalar wdetJi = qw[i] * DetJ22(J + i, Q); - CeedPragmaSIMD for (CeedInt d = 0; d < 3; d++) - { - qd[i + Q * d] = wdetJi * c[i + Q * d]; - } - } - break; - case 323: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - const CeedScalar wdetJi = qw[i] * DetJ32(J + i, Q); - CeedPragmaSIMD for (CeedInt d = 0; d < 6; d++) - { - qd[i + Q * d] = wdetJi * c[i + Q * d]; - } - } - break; - case 333: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - const CeedScalar wdetJi = qw[i] * DetJ33(J + i, Q); - CeedPragmaSIMD for (CeedInt d = 0; d < 6; d++) - { - qd[i + Q * d] = wdetJi * c[i + Q * d]; - } - } - break; - } - return 0; -} - -// libCEED QFunction for applying a mass operator with a scalar coefficient. 
-CEED_QFUNCTION(f_apply_mass_scalar)(void *ctx, CeedInt Q, const CeedScalar *const *in, - CeedScalar *const *out) -{ - // in[0], out[0] have shape [ncomp=vdim, Q] - MassContext *bc = (MassContext *)ctx; - const CeedScalar *u = in[0], *qd = in[1]; - CeedScalar *v = out[0]; - switch (bc->vdim) - { - case 1: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - v[i] = qd[i] * u[i]; - } - break; - case 2: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - const CeedScalar qdi = qd[i]; - CeedPragmaSIMD for (CeedInt d = 0; d < 2; d++) - { - v[i + Q * d] = qdi * u[i + Q * d]; - } - } - break; - case 3: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - const CeedScalar qdi = qd[i]; - CeedPragmaSIMD for (CeedInt d = 0; d < 3; d++) - { - v[i + Q * d] = qdi * u[i + Q * d]; - } - } - break; - } - return 0; -} - -// libCEED QFunction for applying a mass operator with a vector coefficient. -CEED_QFUNCTION(f_apply_mass_vector)(void *ctx, CeedInt Q, const CeedScalar *const *in, - CeedScalar *const *out) -{ - // in[0], out[0] have shape [ncomp=vdim, Q] - MassContext *bc = (MassContext *)ctx; - const CeedScalar *u = in[0], *qd = in[1]; - CeedScalar *v = out[0]; - switch (bc->vdim) - { - case 2: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - CeedPragmaSIMD for (CeedInt d = 0; d < 2; d++) - { - v[i + Q * d] = qd[i + Q * d] * u[i + Q * d]; - } - } - break; - case 3: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - CeedPragmaSIMD for (CeedInt d = 0; d < 3; d++) - { - v[i + Q * d] = qd[i + Q * d] * u[i + Q * d]; - } - } - break; - } - return 0; -} - -// libCEED QFunction for applying a mass operator with a matrix coefficient. -CEED_QFUNCTION(f_apply_mass_matrix)(void *ctx, CeedInt Q, const CeedScalar *const *in, - CeedScalar *const *out) -{ - // in[0], out[0] have shape [ncomp=vdim, Q] - MassContext *bc = (MassContext *)ctx; - const CeedScalar *u = in[0], *qd = in[1]; - CeedScalar *v = out[0]; - switch (bc->vdim) - { - case 2: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - const CeedScalar u0 = u[i + Q * 0]; - const CeedScalar u1 = u[i + Q * 1]; - v[i + Q * 0] = qd[i + Q * 0] * u0 + qd[i + Q * 1] * u1; - v[i + Q * 1] = qd[i + Q * 1] * u0 + qd[i + Q * 2] * u1; - } - break; - case 3: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - const CeedScalar u0 = u[i + Q * 0]; - const CeedScalar u1 = u[i + Q * 1]; - const CeedScalar u2 = u[i + Q * 2]; - v[i + Q * 0] = qd[i + Q * 0] * u0 + qd[i + Q * 1] * u1 + qd[i + Q * 2] * u2; - v[i + Q * 1] = qd[i + Q * 1] * u0 + qd[i + Q * 3] * u1 + qd[i + Q * 4] * u2; - v[i + Q * 2] = qd[i + Q * 2] * u0 + qd[i + Q * 4] * u1 + qd[i + Q * 5] * u2; - } - break; - } - return 0; -} - -#endif // PALACE_LIBCEED_MASS_QF_H +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_MASS_QF_H +#define PALACE_LIBCEED_MASS_QF_H + +#include "utils_qf.h" + +struct MassContext +{ + CeedInt dim, space_dim, vdim; + CeedScalar coeff; +}; + +// libCEED QFunction for building quadrature data for a mass operator with a scalar constant +// coefficient. +CEED_QFUNCTION(f_build_mass_const_scalar)(void *ctx, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + // At every quadrature point, compute and store qw * c * det(J). 
+ // in[0] is Jacobians with shape [dim, ncomp=space_dim, Q] + // in[1] is quadrature weights, size (Q) + MassContext *bc = (MassContext *)ctx; + const CeedScalar coeff = bc->coeff; + const CeedScalar *J = in[0], *qw = in[1]; + CeedScalar *qd = out[0]; + switch (10 * bc->space_dim + bc->dim) + { + case 11: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qd[i] = qw[i] * coeff * J[i]; + } + break; + case 21: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qd[i] = qw[i] * coeff * DetJ21(J + i, Q); + } + break; + case 22: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qd[i] = qw[i] * coeff * DetJ22(J + i, Q); + } + break; + case 32: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qd[i] = qw[i] * coeff * DetJ32(J + i, Q); + } + break; + case 33: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qd[i] = qw[i] * coeff * DetJ33(J + i, Q); + } + break; + } + return 0; +} + +// libCEED QFunction for building quadrature data for a mass operator with a scalar +// coefficient evaluated at quadrature points. +CEED_QFUNCTION(f_build_mass_quad_scalar)(void *ctx, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + // At every quadrature point, compute and store qw * c * det(J). + // in[0] is coefficients, size (Q) + // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] + // in[2] is quadrature weights, size (Q) + MassContext *bc = (MassContext *)ctx; + const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; + CeedScalar *qd = out[0]; + switch (10 * bc->space_dim + bc->dim) + { + case 11: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qd[i] = qw[i] * c[i] * J[i]; + } + break; + case 21: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qd[i] = qw[i] * c[i] * DetJ21(J + i, Q); + } + break; + case 22: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qd[i] = qw[i] * c[i] * DetJ22(J + i, Q); + } + break; + case 32: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qd[i] = qw[i] * c[i] * DetJ32(J + i, Q); + } + break; + case 33: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + qd[i] = qw[i] * c[i] * DetJ33(J + i, Q); + } + break; + } + return 0; +} + +// libCEED QFunction for building quadrature data for a mass operator with a vector +// coefficient evaluated at quadrature points. +CEED_QFUNCTION(f_build_mass_quad_vector)(void *ctx, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + // At every quadrature point, compute and store qw * det(J) C. 
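// Illustrative sketch only (not part of the patch): the matrix-coefficient variants below
// store just the vdim * (vdim + 1) / 2 independent entries of the symmetric matrix
// qw * det(J) C. For vdim = 3 the packed layout is [q0 q1 q2; q1 q3 q4; q2 q4 q5], applied
// at one quadrature point as follows (f_apply_mass_matrix does the same with a stride of Q
// between components); the helper name and plain-double signature are assumptions.
static void apply_packed_sym_33(const double qd[6], const double u[3], double v[3])
{
  v[0] = qd[0] * u[0] + qd[1] * u[1] + qd[2] * u[2];
  v[1] = qd[1] * u[0] + qd[3] * u[1] + qd[4] * u[2];
  v[2] = qd[2] * u[0] + qd[4] * u[1] + qd[5] * u[2];
}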
+ // in[0] is coefficients with shape [ncomp=vdim, Q] + // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] + // in[2] is quadrature weights, size (Q) + MassContext *bc = (MassContext *)ctx; + const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; + CeedScalar *qd = out[0]; + switch (100 * bc->space_dim + 10 * bc->dim + bc->vdim) + { + case 212: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar wdetJi = qw[i] * DetJ21(J + i, Q); + CeedPragmaSIMD for (CeedInt d = 0; d < 2; d++) + { + qd[i + Q * d] = wdetJi * c[i + Q * d]; + } + } + break; + case 222: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar wdetJi = qw[i] * DetJ22(J + i, Q); + CeedPragmaSIMD for (CeedInt d = 0; d < 2; d++) + { + qd[i + Q * d] = wdetJi * c[i + Q * d]; + } + } + break; + case 323: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar wdetJi = qw[i] * DetJ32(J + i, Q); + CeedPragmaSIMD for (CeedInt d = 0; d < 3; d++) + { + qd[i + Q * d] = wdetJi * c[i + Q * d]; + } + } + break; + case 333: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar wdetJi = qw[i] * DetJ33(J + i, Q); + CeedPragmaSIMD for (CeedInt d = 0; d < 3; d++) + { + qd[i + Q * d] = wdetJi * c[i + Q * d]; + } + } + break; + } + return 0; +} + +// libCEED QFunction for building quadrature data for a mass operator with a matrix +// coefficient evaluated at quadrature points. +CEED_QFUNCTION(f_build_mass_quad_matrix)(void *ctx, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + // At every quadrature point, compute and store qw * det(J) C. + // in[0] is coefficients with shape [ncomp=vdim*(vdim+1)/2, Q] + // in[1] is Jacobians with shape [dim, ncomp=space_dim, Q] + // in[2] is quadrature weights, size (Q) + MassContext *bc = (MassContext *)ctx; + const CeedScalar *c = in[0], *J = in[1], *qw = in[2]; + CeedScalar *qd = out[0]; + switch (100 * bc->space_dim + 10 * bc->dim + bc->vdim) + { + case 212: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar wdetJi = qw[i] * DetJ21(J + i, Q); + CeedPragmaSIMD for (CeedInt d = 0; d < 3; d++) + { + qd[i + Q * d] = wdetJi * c[i + Q * d]; + } + } + break; + case 222: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar wdetJi = qw[i] * DetJ22(J + i, Q); + CeedPragmaSIMD for (CeedInt d = 0; d < 3; d++) + { + qd[i + Q * d] = wdetJi * c[i + Q * d]; + } + } + break; + case 323: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar wdetJi = qw[i] * DetJ32(J + i, Q); + CeedPragmaSIMD for (CeedInt d = 0; d < 6; d++) + { + qd[i + Q * d] = wdetJi * c[i + Q * d]; + } + } + break; + case 333: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar wdetJi = qw[i] * DetJ33(J + i, Q); + CeedPragmaSIMD for (CeedInt d = 0; d < 6; d++) + { + qd[i + Q * d] = wdetJi * c[i + Q * d]; + } + } + break; + } + return 0; +} + +// libCEED QFunction for applying a mass operator with a scalar coefficient. 
+CEED_QFUNCTION(f_apply_mass_scalar)(void *ctx, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + // in[0], out[0] have shape [ncomp=vdim, Q] + MassContext *bc = (MassContext *)ctx; + const CeedScalar *u = in[0], *qd = in[1]; + CeedScalar *v = out[0]; + switch (bc->vdim) + { + case 1: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + v[i] = qd[i] * u[i]; + } + break; + case 2: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar qdi = qd[i]; + CeedPragmaSIMD for (CeedInt d = 0; d < 2; d++) + { + v[i + Q * d] = qdi * u[i + Q * d]; + } + } + break; + case 3: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar qdi = qd[i]; + CeedPragmaSIMD for (CeedInt d = 0; d < 3; d++) + { + v[i + Q * d] = qdi * u[i + Q * d]; + } + } + break; + } + return 0; +} + +// libCEED QFunction for applying a mass operator with a vector coefficient. +CEED_QFUNCTION(f_apply_mass_vector)(void *ctx, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + // in[0], out[0] have shape [ncomp=vdim, Q] + MassContext *bc = (MassContext *)ctx; + const CeedScalar *u = in[0], *qd = in[1]; + CeedScalar *v = out[0]; + switch (bc->vdim) + { + case 2: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedPragmaSIMD for (CeedInt d = 0; d < 2; d++) + { + v[i + Q * d] = qd[i + Q * d] * u[i + Q * d]; + } + } + break; + case 3: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + CeedPragmaSIMD for (CeedInt d = 0; d < 3; d++) + { + v[i + Q * d] = qd[i + Q * d] * u[i + Q * d]; + } + } + break; + } + return 0; +} + +// libCEED QFunction for applying a mass operator with a matrix coefficient. +CEED_QFUNCTION(f_apply_mass_matrix)(void *ctx, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + // in[0], out[0] have shape [ncomp=vdim, Q] + MassContext *bc = (MassContext *)ctx; + const CeedScalar *u = in[0], *qd = in[1]; + CeedScalar *v = out[0]; + switch (bc->vdim) + { + case 2: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar u0 = u[i + Q * 0]; + const CeedScalar u1 = u[i + Q * 1]; + v[i + Q * 0] = qd[i + Q * 0] * u0 + qd[i + Q * 1] * u1; + v[i + Q * 1] = qd[i + Q * 1] * u0 + qd[i + Q * 2] * u1; + } + break; + case 3: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar u0 = u[i + Q * 0]; + const CeedScalar u1 = u[i + Q * 1]; + const CeedScalar u2 = u[i + Q * 2]; + v[i + Q * 0] = qd[i + Q * 0] * u0 + qd[i + Q * 1] * u1 + qd[i + Q * 2] * u2; + v[i + Q * 1] = qd[i + Q * 1] * u0 + qd[i + Q * 3] * u1 + qd[i + Q * 4] * u2; + v[i + Q * 2] = qd[i + Q * 2] * u0 + qd[i + Q * 4] * u1 + qd[i + Q * 5] * u2; + } + break; + } + return 0; +} + +#endif // PALACE_LIBCEED_MASS_QF_H diff --git a/palace/fem/qfunctions/utils_qf.h b/palace/fem/qfunctions/utils_qf.h index 62b0d6afcf..b9bdb75323 100644 --- a/palace/fem/qfunctions/utils_qf.h +++ b/palace/fem/qfunctions/utils_qf.h @@ -1,1036 +1,1036 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LIBCEED_UTILS_QF_H -#define PALACE_LIBCEED_UTILS_QF_H - -#include - -CEED_QFUNCTION_HELPER CeedScalar DetJ22(const CeedScalar *J, const CeedInt J_stride) -{ - // J: 0 2 - // 1 3 - return J[J_stride * 0] * J[J_stride * 3] - J[J_stride * 1] * J[J_stride * 2]; -} - -CEED_QFUNCTION_HELPER CeedScalar DetJ21(const CeedScalar *J, const CeedInt J_stride) -{ - // J: 0 - // 1 - return sqrt(J[J_stride * 0] * J[J_stride * 0] + J[J_stride * 1] * J[J_stride * 1]); -} - -CEED_QFUNCTION_HELPER CeedScalar DetJ33(const CeedScalar *J, const CeedInt J_stride) -{ - // J: 0 3 6 - // 1 4 7 - // 2 5 8 - return J[J_stride * 0] * - (J[J_stride * 4] * J[J_stride * 8] - J[J_stride * 5] * J[J_stride * 7]) - - J[J_stride * 1] * - (J[J_stride * 3] * J[J_stride * 8] - J[J_stride * 5] * J[J_stride * 6]) + - J[J_stride * 2] * - (J[J_stride * 3] * J[J_stride * 7] - J[J_stride * 4] * J[J_stride * 6]); -} - -CEED_QFUNCTION_HELPER CeedScalar DetJ32(const CeedScalar *J, const CeedInt J_stride) -{ - // J: 0 3 - // 1 4 - // 2 5 - const CeedScalar E = J[J_stride * 0] * J[J_stride * 0] + - J[J_stride * 1] * J[J_stride * 1] + - J[J_stride * 2] * J[J_stride * 2]; - const CeedScalar G = J[J_stride * 3] * J[J_stride * 3] + - J[J_stride * 4] * J[J_stride * 4] + - J[J_stride * 5] * J[J_stride * 5]; - const CeedScalar F = J[J_stride * 0] * J[J_stride * 3] + - J[J_stride * 1] * J[J_stride * 4] + - J[J_stride * 2] * J[J_stride * 5]; - return sqrt(E * G - F * F); -} - -CEED_QFUNCTION_HELPER void MultAdjJCAdjJt22(const CeedScalar *J, const CeedInt J_stride, - const CeedScalar *c, const CeedInt c_stride, - const CeedInt c_comp, const CeedScalar qw, - const CeedInt qd_stride, CeedScalar *qd) -{ - // Compute qw / det(J) adj(J) C adj(J)^T and store the symmetric part of the result. - // J: 0 2 adj(J): J22 -J12 qd: 0 1 - // 1 3 -J21 J11 1 2 - const CeedScalar J11 = J[J_stride * 0]; - const CeedScalar J21 = J[J_stride * 1]; - const CeedScalar J12 = J[J_stride * 2]; - const CeedScalar J22 = J[J_stride * 3]; - const CeedScalar w = qw / (J11 * J22 - J21 * J12); - if (c_comp == 3) // Matrix coefficient (symmetric) - { - // First compute entries of R = C adj(J)^T. 
- // c: 0 1 - // 1 2 - const CeedScalar R11 = c[c_stride * 0] * J22 - c[c_stride * 1] * J12; - const CeedScalar R21 = c[c_stride * 1] * J22 - c[c_stride * 2] * J12; - const CeedScalar R12 = -c[c_stride * 0] * J21 + c[c_stride * 1] * J11; - const CeedScalar R22 = -c[c_stride * 1] * J21 + c[c_stride * 2] * J11; - qd[qd_stride * 0] = w * (J22 * R11 - J12 * R21); - qd[qd_stride * 1] = w * (J11 * R21 - J21 * R11); - qd[qd_stride * 2] = w * (J11 * R22 - J21 * R12); - } - else if (c_comp == 2) // Vector coefficient - { - // c: 0 - // 1 - qd[qd_stride * 0] = w * (c[c_stride * 1] * J12 * J12 + c[c_stride * 0] * J22 * J22); - qd[qd_stride * 1] = -w * (c[c_stride * 1] * J11 * J12 + c[c_stride * 0] * J21 * J22); - qd[qd_stride * 2] = w * (c[c_stride * 1] * J11 * J11 + c[c_stride * 0] * J21 * J21); - } - else // Scalar coefficient - { - qd[qd_stride * 0] = w * c[c_stride * 0] * (J12 * J12 + J22 * J22); - qd[qd_stride * 1] = -w * c[c_stride * 0] * (J11 * J12 + J21 * J22); - qd[qd_stride * 2] = w * c[c_stride * 0] * (J11 * J11 + J21 * J21); - } -} - -CEED_QFUNCTION_HELPER void MultAdjJCAdjJt21(const CeedScalar *J, const CeedInt J_stride, - const CeedScalar *c, const CeedInt c_stride, - const CeedInt c_comp, const CeedScalar qw, - const CeedInt qd_stride, CeedScalar *qd) -{ - // Compute qw / det(J) adj(J) C adj(J)^T and store the symmetric part of the result. - // J: 0 adj(J): 1/sqrt(J^T J) J^T qd: 0 - // 1 - const CeedScalar J11 = J[J_stride * 0]; - const CeedScalar J21 = J[J_stride * 1]; - const CeedScalar d = J11 * J11 + J21 * J21; - const CeedScalar w = qw / sqrt(d); - if (c_comp == 3) // Matrix coefficient (symmetric) - { - // First compute entries of R = C adj(J)^T. - // c: 0 1 - // 1 2 - const CeedScalar R11 = c[c_stride * 0] * J11 + c[c_stride * 1] * J21; - const CeedScalar R21 = c[c_stride * 1] * J11 + c[c_stride * 2] * J21; - qd[qd_stride * 0] = w * (J11 * R11 + J21 * R21) / d; - } - else if (c_comp == 2) // Vector coefficient - { - // c: 0 - // 1 - qd[qd_stride * 0] = w * (c[c_stride * 0] * J11 * J11 + c[c_stride * 1] * J21 * J21) / d; - } - else // Scalar coefficient - { - qd[qd_stride * 0] = w * c[c_stride * 0]; - } -} - -CEED_QFUNCTION_HELPER void MultAdjJCAdjJt33(const CeedScalar *J, const CeedInt J_stride, - const CeedScalar *c, const CeedInt c_stride, - const CeedInt c_comp, const CeedScalar qw, - const CeedInt qd_stride, CeedScalar *qd) -{ - // Compute qw / det(J) adj(J) C adj(J)^T and store the symmetric part of the result. - // J: 0 3 6 qd: 0 1 2 - // 1 4 7 1 3 4 - // 2 5 8 2 4 5 - const CeedScalar J11 = J[J_stride * 0]; - const CeedScalar J21 = J[J_stride * 1]; - const CeedScalar J31 = J[J_stride * 2]; - const CeedScalar J12 = J[J_stride * 3]; - const CeedScalar J22 = J[J_stride * 4]; - const CeedScalar J32 = J[J_stride * 5]; - const CeedScalar J13 = J[J_stride * 6]; - const CeedScalar J23 = J[J_stride * 7]; - const CeedScalar J33 = J[J_stride * 8]; - const CeedScalar A11 = J22 * J33 - J23 * J32; - const CeedScalar A21 = J23 * J31 - J21 * J33; - const CeedScalar A31 = J21 * J32 - J22 * J31; - const CeedScalar A12 = J13 * J32 - J12 * J33; - const CeedScalar A22 = J11 * J33 - J13 * J31; - const CeedScalar A32 = J12 * J31 - J11 * J32; - const CeedScalar A13 = J12 * J23 - J13 * J22; - const CeedScalar A23 = J13 * J21 - J11 * J23; - const CeedScalar A33 = J11 * J22 - J12 * J21; - const CeedScalar w = qw / (J11 * A11 + J21 * A12 + J31 * A13); - if (c_comp == 6) // Matrix coefficient (symmetric) - { - // First compute entries of R = C adj(J)^T. 
- // c: 0 1 2 - // 1 3 4 - // 2 4 5 - const CeedScalar R11 = - c[c_stride * 0] * A11 + c[c_stride * 1] * A12 + c[c_stride * 2] * A13; - const CeedScalar R21 = - c[c_stride * 1] * A11 + c[c_stride * 3] * A12 + c[c_stride * 4] * A13; - const CeedScalar R31 = - c[c_stride * 2] * A11 + c[c_stride * 4] * A12 + c[c_stride * 5] * A13; - const CeedScalar R12 = - c[c_stride * 0] * A21 + c[c_stride * 1] * A22 + c[c_stride * 2] * A23; - const CeedScalar R22 = - c[c_stride * 1] * A21 + c[c_stride * 3] * A22 + c[c_stride * 4] * A23; - const CeedScalar R32 = - c[c_stride * 2] * A21 + c[c_stride * 4] * A22 + c[c_stride * 5] * A23; - const CeedScalar R13 = - c[c_stride * 0] * A31 + c[c_stride * 1] * A32 + c[c_stride * 2] * A33; - const CeedScalar R23 = - c[c_stride * 1] * A31 + c[c_stride * 3] * A32 + c[c_stride * 4] * A33; - const CeedScalar R33 = - c[c_stride * 2] * A31 + c[c_stride * 4] * A32 + c[c_stride * 5] * A33; - qd[qd_stride * 0] = w * (A11 * R11 + A12 * R21 + A13 * R31); - qd[qd_stride * 1] = w * (A11 * R12 + A12 * R22 + A13 * R32); - qd[qd_stride * 2] = w * (A11 * R13 + A12 * R23 + A13 * R33); - qd[qd_stride * 3] = w * (A21 * R12 + A22 * R22 + A23 * R32); - qd[qd_stride * 4] = w * (A21 * R13 + A22 * R23 + A23 * R33); - qd[qd_stride * 5] = w * (A31 * R13 + A32 * R23 + A33 * R33); - } - else if (c_comp == 3) // Vector coefficient - { - // c: 0 - // 1 - // 2 - qd[qd_stride * 0] = w * (c[c_stride * 0] * A11 * A11 + c[c_stride * 1] * A12 * A12 + - c[c_stride * 2] * A13 * A13); - qd[qd_stride * 1] = w * (c[c_stride * 0] * A11 * A21 + c[c_stride * 1] * A12 * A22 + - c[c_stride * 2] * A13 * A23); - qd[qd_stride * 2] = w * (c[c_stride * 0] * A11 * A31 + c[c_stride * 1] * A12 * A32 + - c[c_stride * 2] * A13 * A33); - qd[qd_stride * 3] = w * (c[c_stride * 0] * A21 * A21 + c[c_stride * 1] * A22 * A22 + - c[c_stride * 2] * A23 * A23); - qd[qd_stride * 4] = w * (c[c_stride * 0] * A21 * A31 + c[c_stride * 1] * A22 * A32 + - c[c_stride * 2] * A23 * A33); - qd[qd_stride * 5] = w * (c[c_stride * 0] * A31 * A31 + c[c_stride * 1] * A32 * A32 + - c[c_stride * 2] * A33 * A33); - } - else // Scalar coefficient - { - qd[qd_stride * 0] = w * c[c_stride * 0] * (A11 * A11 + A12 * A12 + A13 * A13); - qd[qd_stride * 1] = w * c[c_stride * 0] * (A11 * A21 + A12 * A22 + A13 * A23); - qd[qd_stride * 2] = w * c[c_stride * 0] * (A11 * A31 + A12 * A32 + A13 * A33); - qd[qd_stride * 3] = w * c[c_stride * 0] * (A21 * A21 + A22 * A22 + A23 * A23); - qd[qd_stride * 4] = w * c[c_stride * 0] * (A21 * A31 + A22 * A32 + A23 * A33); - qd[qd_stride * 5] = w * c[c_stride * 0] * (A31 * A31 + A32 * A32 + A33 * A33); - } -} - -CEED_QFUNCTION_HELPER void MultAdjJCAdjJt32(const CeedScalar *J, const CeedInt J_stride, - const CeedScalar *c, const CeedInt c_stride, - const CeedInt c_comp, const CeedScalar qw, - const CeedInt qd_stride, CeedScalar *qd) -{ - // Compute qw / det(J) adj(J) C adj(J)^T and store the symmetric part of the result. 
- // J: 0 3 qd: 0 1 - // 1 4 1 2 - // 2 5 - const CeedScalar J11 = J[J_stride * 0]; - const CeedScalar J21 = J[J_stride * 1]; - const CeedScalar J31 = J[J_stride * 2]; - const CeedScalar J12 = J[J_stride * 3]; - const CeedScalar J22 = J[J_stride * 4]; - const CeedScalar J32 = J[J_stride * 5]; - const CeedScalar E = J11 * J11 + J21 * J21 + J31 * J31; - const CeedScalar G = J12 * J12 + J22 * J22 + J32 * J32; - const CeedScalar F = J11 * J12 + J21 * J22 + J31 * J32; - const CeedScalar d = E * G - F * F; - const CeedScalar w = qw / sqrt(d); - if (c_comp == 6) // Matrix coefficient (symmetric) - { - // First compute entries of R = C adj(J)^T. - // c: 0 1 2 - // 1 3 4 - // 2 4 5 - const CeedScalar R11 = - G * (c[c_stride * 0] * J11 + c[c_stride * 1] * J21 + c[c_stride * 2] * J31) - - F * (c[c_stride * 0] * J12 + c[c_stride * 1] * J22 + c[c_stride * 2] * J32); - const CeedScalar R21 = - G * (c[c_stride * 1] * J11 + c[c_stride * 3] * J21 + c[c_stride * 4] * J31) - - F * (c[c_stride * 1] * J12 + c[c_stride * 3] * J22 + c[c_stride * 4] * J32); - const CeedScalar R31 = - G * (c[c_stride * 2] * J11 + c[c_stride * 4] * J21 + c[c_stride * 5] * J31) - - F * (c[c_stride * 2] * J12 + c[c_stride * 4] * J22 + c[c_stride * 5] * J32); - const CeedScalar R12 = - E * (c[c_stride * 0] * J12 + c[c_stride * 1] * J22 + c[c_stride * 2] * J32) - - F * (c[c_stride * 0] * J11 + c[c_stride * 1] * J21 + c[c_stride * 2] * J31); - const CeedScalar R22 = - E * (c[c_stride * 1] * J12 + c[c_stride * 3] * J22 + c[c_stride * 4] * J32) - - F * (c[c_stride * 1] * J11 + c[c_stride * 3] * J21 + c[c_stride * 4] * J31); - const CeedScalar R32 = - E * (c[c_stride * 2] * J12 + c[c_stride * 4] * J22 + c[c_stride * 5] * J32) - - F * (c[c_stride * 2] * J11 + c[c_stride * 4] * J21 + c[c_stride * 5] * J31); - qd[qd_stride * 0] = w * - (G * (J11 * R11 + J21 * R21 + J31 * R31) - - F * (J12 * R11 + J22 * R21 + J32 * R31)) / - d; - qd[qd_stride * 1] = w * - (G * (J11 * R12 + J21 * R22 + J31 * R32) - - F * (J12 * R12 + J22 * R22 + J32 * R32)) / - d; - qd[qd_stride * 2] = w * - (E * (J12 * R12 + J22 * R22 + J32 * R32) - - F * (J11 * R12 + J21 * R22 + J31 * R32)) / - d; - } - else if (c_comp == 3) // Vector coefficient - { - // First compute entries of R = C adj(J)^T. - // c: 0 - // 1 - // 2 - const CeedScalar R11 = c[c_stride * 0] * (G * J11 - F * J12); - const CeedScalar R21 = c[c_stride * 1] * (G * J21 - F * J22); - const CeedScalar R31 = c[c_stride * 2] * (G * J31 - F * J32); - const CeedScalar R12 = c[c_stride * 0] * (E * J12 - F * J11); - const CeedScalar R22 = c[c_stride * 1] * (E * J22 - F * J21); - const CeedScalar R32 = c[c_stride * 2] * (E * J32 - F * J31); - qd[qd_stride * 0] = w * - (G * (J11 * R11 + J21 * R21 + J31 * R31) - - F * (J12 * R11 + J22 * R21 + J32 * R31)) / - d; - qd[qd_stride * 1] = w * - (G * (J11 * R12 + J21 * R22 + J31 * R32) - - F * (J12 * R12 + J22 * R22 + J32 * R32)) / - d; - qd[qd_stride * 2] = w * - (E * (J12 * R12 + J22 * R22 + J32 * R32) - - F * (J11 * R12 + J21 * R22 + J31 * R32)) / - d; - } - else // Scalar coefficient - { - qd[qd_stride * 0] = w * c[c_stride * 0] * G; - qd[qd_stride * 1] = -w * c[c_stride * 0] * F; - qd[qd_stride * 2] = w * c[c_stride * 0] * E; - } -} - -CEED_QFUNCTION_HELPER void MultJtCJ22(const CeedScalar *J, const CeedInt J_stride, - const CeedScalar *c, const CeedInt c_stride, - const CeedInt c_comp, const CeedScalar qw, - const CeedInt qd_stride, CeedScalar *qd) -{ - // Compute qw / det(J) J^T C J and store the symmetric part of the result. 
- // J: 0 2 qd: 0 1 - // 1 3 1 2 - const CeedScalar J11 = J[J_stride * 0]; - const CeedScalar J21 = J[J_stride * 1]; - const CeedScalar J12 = J[J_stride * 2]; - const CeedScalar J22 = J[J_stride * 3]; - const CeedScalar w = qw / (J11 * J22 - J21 * J12); - if (c_comp == 3) // Matrix coefficient (symmetric) - { - // First compute entries of R = C J. - // c: 0 1 - // 1 2 - const CeedScalar R11 = c[c_stride * 0] * J11 + c[c_stride * 1] * J21; - const CeedScalar R21 = c[c_stride * 1] * J11 + c[c_stride * 2] * J21; - const CeedScalar R12 = c[c_stride * 0] * J12 + c[c_stride * 1] * J22; - const CeedScalar R22 = c[c_stride * 1] * J12 + c[c_stride * 2] * J22; - qd[qd_stride * 0] = w * (J11 * R11 + J21 * R21); - qd[qd_stride * 1] = w * (J11 * R12 + J21 * R22); - qd[qd_stride * 2] = w * (J12 * R12 + J22 * R22); - } - else if (c_comp == 2) // Vector coefficient - { - // c: 0 - // 1 - qd[qd_stride * 0] = w * (c[c_stride * 0] * J11 * J11 + c[c_stride * 1] * J21 * J21); - qd[qd_stride * 1] = w * (c[c_stride * 0] * J11 * J12 + c[c_stride * 1] * J21 * J22); - qd[qd_stride * 2] = w * (c[c_stride * 0] * J12 * J12 + c[c_stride * 1] * J22 * J22); - } - else // Scalar coefficient - { - qd[qd_stride * 0] = w * c[c_stride * 0] * (J11 * J11 + J21 * J21); - qd[qd_stride * 1] = w * c[c_stride * 0] * (J11 * J12 + J21 * J22); - qd[qd_stride * 2] = w * c[c_stride * 0] * (J12 * J12 + J22 * J22); - } -} - -CEED_QFUNCTION_HELPER void MultJtCJ21(const CeedScalar *J, const CeedInt J_stride, - const CeedScalar *c, const CeedInt c_stride, - const CeedInt c_comp, const CeedScalar qw, - const CeedInt qd_stride, CeedScalar *qd) -{ - // Compute qw / det(J) J^T C J and store the symmetric part of the result. - // J: 0 qd: 0 - // 1 - const CeedScalar J11 = J[J_stride * 0]; - const CeedScalar J21 = J[J_stride * 1]; - if (c_comp == 3) // Matrix coefficient (symmetric) - { - // First compute entries of R = C J. - // c: 0 1 - // 1 2 - const CeedScalar w = qw / sqrt(J11 * J11 + J21 * J21); - const CeedScalar R11 = c[c_stride * 0] * J11 + c[c_stride * 1] * J21; - const CeedScalar R21 = c[c_stride * 1] * J11 + c[c_stride * 2] * J21; - qd[qd_stride * 0] = w * (J11 * R11 + J21 * R21); - } - else if (c_comp == 2) // Vector coefficient - { - // c: 0 - // 1 - const CeedScalar w = qw / sqrt(J11 * J11 + J21 * J21); - qd[qd_stride * 0] = w * (c[c_stride * 0] * J11 * J11 + c[c_stride * 1] * J21 * J21); - } - else // Scalar coefficient - { - qd[qd_stride * 0] = qw * c[c_stride * 0] * sqrt(J11 * J11 + J21 * J21); - } -} - -CEED_QFUNCTION_HELPER void MultJtCJ33(const CeedScalar *J, const CeedInt J_stride, - const CeedScalar *c, const CeedInt c_stride, - const CeedInt c_comp, const CeedScalar qw, - const CeedInt qd_stride, CeedScalar *qd) -{ - // Compute qw / det(J) J^T C J and store the symmetric part of the result. - // J: 0 3 6 qd: 0 1 2 - // 1 4 7 1 3 4 - // 2 5 8 2 4 5 - const CeedScalar J11 = J[J_stride * 0]; - const CeedScalar J21 = J[J_stride * 1]; - const CeedScalar J31 = J[J_stride * 2]; - const CeedScalar J12 = J[J_stride * 3]; - const CeedScalar J22 = J[J_stride * 4]; - const CeedScalar J32 = J[J_stride * 5]; - const CeedScalar J13 = J[J_stride * 6]; - const CeedScalar J23 = J[J_stride * 7]; - const CeedScalar J33 = J[J_stride * 8]; - const CeedScalar w = qw / (J11 * (J22 * J33 - J23 * J32) + J21 * (J13 * J32 - J12 * J33) + - J31 * (J12 * J23 - J13 * J22)); - if (c_comp == 6) // Matrix coefficient (symmetric) - { - // First compute entries of R = C J. 
- // c: 0 1 2 - // 1 3 4 - // 2 4 5 - const CeedScalar R11 = - c[c_stride * 0] * J11 + c[c_stride * 1] * J21 + c[c_stride * 2] * J31; - const CeedScalar R21 = - c[c_stride * 1] * J11 + c[c_stride * 3] * J21 + c[c_stride * 4] * J31; - const CeedScalar R31 = - c[c_stride * 2] * J11 + c[c_stride * 4] * J21 + c[c_stride * 5] * J31; - const CeedScalar R12 = - c[c_stride * 0] * J12 + c[c_stride * 1] * J22 + c[c_stride * 2] * J32; - const CeedScalar R22 = - c[c_stride * 1] * J12 + c[c_stride * 3] * J22 + c[c_stride * 4] * J32; - const CeedScalar R32 = - c[c_stride * 2] * J12 + c[c_stride * 4] * J22 + c[c_stride * 5] * J32; - const CeedScalar R13 = - c[c_stride * 0] * J13 + c[c_stride * 1] * J23 + c[c_stride * 2] * J33; - const CeedScalar R23 = - c[c_stride * 1] * J13 + c[c_stride * 3] * J23 + c[c_stride * 4] * J33; - const CeedScalar R33 = - c[c_stride * 2] * J13 + c[c_stride * 4] * J23 + c[c_stride * 5] * J33; - qd[qd_stride * 0] = w * (J11 * R11 + J21 * R21 + J31 * R31); - qd[qd_stride * 1] = w * (J11 * R12 + J21 * R22 + J31 * R32); - qd[qd_stride * 2] = w * (J11 * R13 + J21 * R23 + J31 * R33); - qd[qd_stride * 3] = w * (J12 * R12 + J22 * R22 + J32 * R32); - qd[qd_stride * 4] = w * (J12 * R13 + J22 * R23 + J32 * R33); - qd[qd_stride * 5] = w * (J13 * R13 + J23 * R23 + J33 * R33); - } - else if (c_comp == 3) // Vector coefficient - { - // c: 0 - // 1 - // 2 - qd[qd_stride * 0] = w * (c[c_stride * 0] * J11 * J11 + c[c_stride * 1] * J21 * J21 + - c[c_stride * 2] * J31 * J31); - qd[qd_stride * 1] = w * (c[c_stride * 0] * J11 * J12 + c[c_stride * 1] * J21 * J22 + - c[c_stride * 2] * J31 * J32); - qd[qd_stride * 2] = w * (c[c_stride * 0] * J11 * J13 + c[c_stride * 1] * J21 * J23 + - c[c_stride * 2] * J31 * J33); - qd[qd_stride * 3] = w * (c[c_stride * 0] * J12 * J12 + c[c_stride * 1] * J22 * J22 + - c[c_stride * 2] * J32 * J32); - qd[qd_stride * 4] = w * (c[c_stride * 0] * J12 * J13 + c[c_stride * 1] * J22 * J23 + - c[c_stride * 2] * J32 * J33); - qd[qd_stride * 5] = w * (c[c_stride * 0] * J13 * J13 + c[c_stride * 1] * J23 * J23 + - c[c_stride * 2] * J33 * J33); - } - else // Scalar coefficient - { - qd[qd_stride * 0] = w * c[c_stride * 0] * (J11 * J11 + J21 * J21 + J31 * J31); - qd[qd_stride * 1] = w * c[c_stride * 0] * (J11 * J12 + J21 * J22 + J31 * J32); - qd[qd_stride * 2] = w * c[c_stride * 0] * (J11 * J13 + J21 * J23 + J31 * J33); - qd[qd_stride * 3] = w * c[c_stride * 0] * (J12 * J12 + J22 * J22 + J32 * J32); - qd[qd_stride * 4] = w * c[c_stride * 0] * (J12 * J13 + J22 * J23 + J32 * J33); - qd[qd_stride * 5] = w * c[c_stride * 0] * (J13 * J13 + J23 * J23 + J33 * J33); - } -} - -CEED_QFUNCTION_HELPER void MultJtCJ32(const CeedScalar *J, const CeedInt J_stride, - const CeedScalar *c, const CeedInt c_stride, - const CeedInt c_comp, const CeedScalar qw, - const CeedInt qd_stride, CeedScalar *qd) -{ - // Compute qw / det(J) J^T C J and store the symmetric part of the result. - // J: 0 3 qd: 0 1 - // 1 4 1 2 - // 2 5 - const CeedScalar J11 = J[J_stride * 0]; - const CeedScalar J21 = J[J_stride * 1]; - const CeedScalar J31 = J[J_stride * 2]; - const CeedScalar J12 = J[J_stride * 3]; - const CeedScalar J22 = J[J_stride * 4]; - const CeedScalar J32 = J[J_stride * 5]; - const CeedScalar E = J11 * J11 + J21 * J21 + J31 * J31; - const CeedScalar G = J12 * J12 + J22 * J22 + J32 * J32; - const CeedScalar F = J11 * J12 + J21 * J22 + J31 * J32; - const CeedScalar w = qw / sqrt(E * G - F * F); - if (c_comp == 6) // Matrix coefficient (symmetric) - { - // First compute entries of R = C J. 
- // c: 0 1 2 - // 1 3 4 - // 2 4 5 - const CeedScalar R11 = - c[c_stride * 0] * J11 + c[c_stride * 1] * J21 + c[c_stride * 2] * J31; - const CeedScalar R21 = - c[c_stride * 1] * J11 + c[c_stride * 3] * J21 + c[c_stride * 4] * J31; - const CeedScalar R31 = - c[c_stride * 2] * J11 + c[c_stride * 4] * J21 + c[c_stride * 5] * J31; - const CeedScalar R12 = - c[c_stride * 0] * J12 + c[c_stride * 1] * J22 + c[c_stride * 2] * J32; - const CeedScalar R22 = - c[c_stride * 1] * J12 + c[c_stride * 3] * J22 + c[c_stride * 4] * J32; - const CeedScalar R32 = - c[c_stride * 2] * J12 + c[c_stride * 4] * J22 + c[c_stride * 5] * J32; - qd[qd_stride * 0] = w * (J11 * R11 + J21 * R21 + J31 * R31); - qd[qd_stride * 1] = w * (J11 * R12 + J21 * R22 + J31 * R32); - qd[qd_stride * 2] = w * (J12 * R12 + J22 * R22 + J32 * R32); - } - else if (c_comp == 3) // Vector coefficient - { - // c: 0 - // 1 - // 2 - qd[qd_stride * 0] = w * (c[c_stride * 0] * J11 * J11 + c[c_stride * 1] * J21 * J21 + - c[c_stride * 2] * J31 * J31); - qd[qd_stride * 1] = w * (c[c_stride * 0] * J11 * J12 + c[c_stride * 1] * J21 * J22 + - c[c_stride * 2] * J31 * J32); - qd[qd_stride * 2] = w * (c[c_stride * 0] * J12 * J12 + c[c_stride * 1] * J22 * J22 + - c[c_stride * 2] * J32 * J32); - } - else // Scalar coefficient - { - qd[qd_stride * 0] = w * c[c_stride * 0] * E; - qd[qd_stride * 1] = w * c[c_stride * 0] * F; - qd[qd_stride * 2] = w * c[c_stride * 0] * G; - } -} - -template -CEED_QFUNCTION_HELPER void MultJtCAdjJt22(const CeedScalar *J, const CeedInt J_stride, - const CeedScalar *c, const CeedInt c_stride, - const CeedInt c_comp, const CeedScalar qw, - const CeedInt qd_stride, CeedScalar *qd) -{ - // Compute qw / det(J) J^T C adj(J)^T and store the result. - // J: 0 2 adj(J): J22 -J12 qd: 0 2 - // 1 3 -J21 J11 1 3 - const CeedScalar J11 = J[J_stride * 0]; - const CeedScalar J21 = J[J_stride * 1]; - const CeedScalar J12 = J[J_stride * 2]; - const CeedScalar J22 = J[J_stride * 3]; - const CeedScalar w = qw / (J11 * J22 - J21 * J12); - if (c_comp == 3) // Matrix coefficient (symmetric) - { - // First compute entries of R = C adj(J)^T. 
- // c: 0 1 - // 1 2 - const CeedScalar R11 = c[c_stride * 0] * J22 - c[c_stride * 1] * J12; - const CeedScalar R21 = c[c_stride * 1] * J22 - c[c_stride * 2] * J12; - const CeedScalar R12 = -c[c_stride * 0] * J21 + c[c_stride * 1] * J11; - const CeedScalar R22 = -c[c_stride * 1] * J21 + c[c_stride * 2] * J11; - qd[qd_stride * 0] = w * (J11 * R11 + J21 * R21); - qd[qd_stride * 1] = w * (J12 * R11 + J22 * R21); - qd[qd_stride * 2] = w * (J11 * R12 + J21 * R22); - qd[qd_stride * 3] = w * (J12 * R12 + J22 * R22); - } - else if (c_comp == 2) // Vector coefficient - { - // c: 0 - // 1 - qd[qd_stride * 0] = w * (c[c_stride * 0] * J11 * J22 - c[c_stride * 1] * J12 * J21); - qd[qd_stride * 1] = w * (c[c_stride * 0] * J12 * J22 - c[c_stride * 1] * J12 * J22); - qd[qd_stride * 2] = w * (-c[c_stride * 0] * J11 * J21 + c[c_stride * 1] * J11 * J21); - qd[qd_stride * 3] = w * (-c[c_stride * 0] * J12 * J21 + c[c_stride * 1] * J11 * J22); - } - else // Scalar coefficient - { - qd[qd_stride * 0] = qw * c[c_stride * 0]; - qd[qd_stride * 1] = 0.0; - qd[qd_stride * 2] = 0.0; - qd[qd_stride * 3] = qw * c[c_stride * 0]; - } - if (Transpose && c_comp > 1) - { - const CeedScalar qd21 = qd[qd_stride * 1]; - qd[qd_stride * 1] = qd[qd_stride * 2]; - qd[qd_stride * 2] = qd21; - } -} - -template -CEED_QFUNCTION_HELPER void MultJtCAdjJt21(const CeedScalar *J, const CeedInt J_stride, - const CeedScalar *c, const CeedInt c_stride, - const CeedInt c_comp, const CeedScalar qw, - const CeedInt qd_stride, CeedScalar *qd) -{ - // Compute qw / det(J) J^T C adj(J)^T and store the result. - // J: 0 adj(J): 1/sqrt(J^T J) J^T qd: 0 - // 1 - const CeedScalar J11 = J[J_stride * 0]; - const CeedScalar J21 = J[J_stride * 1]; - const CeedScalar w = qw / (J11 * J11 + J21 * J21); - if (c_comp == 3) // Matrix coefficient (symmetric) - { - // First compute entries of R = C adj(J)^T. - // c: 0 1 - // 1 2 - const CeedScalar R11 = c[c_stride * 0] * J11 + c[c_stride * 1] * J21; - const CeedScalar R21 = c[c_stride * 1] * J11 + c[c_stride * 2] * J21; - qd[qd_stride * 0] = w * (J11 * R11 + J21 * R21); - } - else if (c_comp == 2) // Vector coefficient - { - // c: 0 - // 1 - qd[qd_stride * 0] = w * (c[c_stride * 0] * J11 * J11 + c[c_stride * 1] * J21 * J21); - } - else // Scalar coefficient - { - qd[qd_stride * 0] = qw * c[c_stride * 0]; - } -} - -template -CEED_QFUNCTION_HELPER void MultJtCAdjJt33(const CeedScalar *J, const CeedInt J_stride, - const CeedScalar *c, const CeedInt c_stride, - const CeedInt c_comp, const CeedScalar qw, - const CeedInt qd_stride, CeedScalar *qd) -{ - // Compute qw / det(J) J^T C adj(J)^T and store the result. 
- // J: 0 3 6 qd: 0 3 6 - // 1 4 7 1 4 7 - // 2 5 8 2 5 8 - const CeedScalar J11 = J[J_stride * 0]; - const CeedScalar J21 = J[J_stride * 1]; - const CeedScalar J31 = J[J_stride * 2]; - const CeedScalar J12 = J[J_stride * 3]; - const CeedScalar J22 = J[J_stride * 4]; - const CeedScalar J32 = J[J_stride * 5]; - const CeedScalar J13 = J[J_stride * 6]; - const CeedScalar J23 = J[J_stride * 7]; - const CeedScalar J33 = J[J_stride * 8]; - const CeedScalar A11 = J22 * J33 - J23 * J32; - const CeedScalar A21 = J23 * J31 - J21 * J33; - const CeedScalar A31 = J21 * J32 - J22 * J31; - const CeedScalar A12 = J13 * J32 - J12 * J33; - const CeedScalar A22 = J11 * J33 - J13 * J31; - const CeedScalar A32 = J12 * J31 - J11 * J32; - const CeedScalar A13 = J12 * J23 - J13 * J22; - const CeedScalar A23 = J13 * J21 - J11 * J23; - const CeedScalar A33 = J11 * J22 - J12 * J21; - const CeedScalar w = qw / (J11 * A11 + J21 * A12 + J31 * A13); - if (c_comp == 6) // Matrix coefficient (symmetric) - { - // First compute entries of R = C adj(J)^T. - // c: 0 1 2 - // 1 3 4 - // 2 4 5 - const CeedScalar R11 = - c[c_stride * 0] * A11 + c[c_stride * 1] * A12 + c[c_stride * 2] * A13; - const CeedScalar R21 = - c[c_stride * 1] * A11 + c[c_stride * 3] * A12 + c[c_stride * 4] * A13; - const CeedScalar R31 = - c[c_stride * 2] * A11 + c[c_stride * 4] * A12 + c[c_stride * 5] * A13; - const CeedScalar R12 = - c[c_stride * 0] * A21 + c[c_stride * 1] * A22 + c[c_stride * 2] * A23; - const CeedScalar R22 = - c[c_stride * 1] * A21 + c[c_stride * 3] * A22 + c[c_stride * 4] * A23; - const CeedScalar R32 = - c[c_stride * 2] * A21 + c[c_stride * 4] * A22 + c[c_stride * 5] * A23; - const CeedScalar R13 = - c[c_stride * 0] * A31 + c[c_stride * 1] * A32 + c[c_stride * 2] * A33; - const CeedScalar R23 = - c[c_stride * 1] * A31 + c[c_stride * 3] * A32 + c[c_stride * 4] * A33; - const CeedScalar R33 = - c[c_stride * 2] * A31 + c[c_stride * 4] * A32 + c[c_stride * 5] * A33; - qd[qd_stride * 0] = w * (J11 * R11 + J21 * R21 + J31 * R31); - qd[qd_stride * 1] = w * (J12 * R11 + J22 * R21 + J32 * R31); - qd[qd_stride * 2] = w * (J13 * R11 + J23 * R21 + J33 * R31); - qd[qd_stride * 3] = w * (J11 * R12 + J21 * R22 + J31 * R32); - qd[qd_stride * 4] = w * (J12 * R12 + J22 * R22 + J32 * R32); - qd[qd_stride * 5] = w * (J13 * R12 + J23 * R22 + J33 * R32); - qd[qd_stride * 6] = w * (J11 * R13 + J21 * R23 + J31 * R33); - qd[qd_stride * 7] = w * (J12 * R13 + J22 * R23 + J32 * R33); - qd[qd_stride * 8] = w * (J13 * R13 + J23 * R23 + J33 * R33); - } - else if (c_comp == 3) // Vector coefficient - { - // c: 0 - // 1 - // 2 - qd[qd_stride * 0] = w * (c[c_stride * 0] * A11 * J11 + c[c_stride * 1] * A12 * J21 + - c[c_stride * 2] * A13 * J31); - qd[qd_stride * 1] = w * (c[c_stride * 0] * A11 * J12 + c[c_stride * 1] * A12 * J22 + - c[c_stride * 2] * A13 * J32); - qd[qd_stride * 2] = w * (c[c_stride * 0] * A11 * J13 + c[c_stride * 1] * A12 * J23 + - c[c_stride * 2] * A13 * J33); - qd[qd_stride * 3] = w * (c[c_stride * 0] * A21 * J11 + c[c_stride * 1] * A22 * J21 + - c[c_stride * 2] * A23 * J31); - qd[qd_stride * 4] = w * (c[c_stride * 0] * A21 * J12 + c[c_stride * 1] * A22 * J22 + - c[c_stride * 2] * A23 * J32); - qd[qd_stride * 5] = w * (c[c_stride * 0] * A21 * J13 + c[c_stride * 1] * A22 * J23 + - c[c_stride * 2] * A23 * J33); - qd[qd_stride * 6] = w * (c[c_stride * 0] * A31 * J11 + c[c_stride * 1] * A32 * J21 + - c[c_stride * 2] * A33 * J31); - qd[qd_stride * 7] = w * (c[c_stride * 0] * A31 * J12 + c[c_stride * 1] * A32 * J22 + - c[c_stride * 2] * A33 * J32); - 
qd[qd_stride * 8] = w * (c[c_stride * 0] * A31 * J13 + c[c_stride * 1] * A32 * J23 + - c[c_stride * 2] * A33 * J33); - } - else // Scalar coefficient - { - qd[qd_stride * 0] = qw * c[c_stride * 0]; - qd[qd_stride * 1] = 0.0; - qd[qd_stride * 2] = 0.0; - qd[qd_stride * 3] = 0.0; - qd[qd_stride * 4] = qw * c[c_stride * 0]; - qd[qd_stride * 5] = 0.0; - qd[qd_stride * 6] = 0.0; - qd[qd_stride * 7] = 0.0; - qd[qd_stride * 8] = qw * c[c_stride * 0]; - } - if (Transpose && c_comp > 1) - { - { - const CeedScalar qd21 = qd[qd_stride * 1]; - qd[qd_stride * 1] = qd[qd_stride * 3]; - qd[qd_stride * 3] = qd21; - } - { - const CeedScalar qd31 = qd[qd_stride * 2]; - qd[qd_stride * 2] = qd[qd_stride * 6]; - qd[qd_stride * 6] = qd31; - } - { - const CeedScalar qd32 = qd[qd_stride * 5]; - qd[qd_stride * 5] = qd[qd_stride * 7]; - qd[qd_stride * 7] = qd32; - } - } -} - -template -CEED_QFUNCTION_HELPER void MultJtCAdjJt32(const CeedScalar *J, const CeedInt J_stride, - const CeedScalar *c, const CeedInt c_stride, - const CeedInt c_comp, const CeedScalar qw, - const CeedInt qd_stride, CeedScalar *qd) -{ - // Compute qw / det(J) J^T C adj(J)^T and store the result. - // J: 0 3 qd: 0 2 - // 1 4 1 3 - // 2 5 - const CeedScalar J11 = J[J_stride * 0]; - const CeedScalar J21 = J[J_stride * 1]; - const CeedScalar J31 = J[J_stride * 2]; - const CeedScalar J12 = J[J_stride * 3]; - const CeedScalar J22 = J[J_stride * 4]; - const CeedScalar J32 = J[J_stride * 5]; - const CeedScalar E = J11 * J11 + J21 * J21 + J31 * J31; - const CeedScalar G = J12 * J12 + J22 * J22 + J32 * J32; - const CeedScalar F = J11 * J12 + J21 * J22 + J31 * J32; - const CeedScalar w = qw / (E * G - F * F); - if (c_comp == 6) // Matrix coefficient (symmetric) - { - // First compute entries of R = C adj(J)^T. - // c: 0 1 2 - // 1 3 4 - // 2 4 5 - const CeedScalar R11 = - G * (c[c_stride * 0] * J11 + c[c_stride * 1] * J21 + c[c_stride * 2] * J31) - - F * (c[c_stride * 0] * J12 + c[c_stride * 1] * J22 + c[c_stride * 2] * J32); - const CeedScalar R21 = - G * (c[c_stride * 1] * J11 + c[c_stride * 3] * J21 + c[c_stride * 4] * J31) - - F * (c[c_stride * 1] * J12 + c[c_stride * 3] * J22 + c[c_stride * 4] * J32); - const CeedScalar R31 = - G * (c[c_stride * 2] * J11 + c[c_stride * 4] * J21 + c[c_stride * 5] * J31) - - F * (c[c_stride * 2] * J12 + c[c_stride * 4] * J22 + c[c_stride * 5] * J32); - const CeedScalar R12 = - E * (c[c_stride * 0] * J12 + c[c_stride * 1] * J22 + c[c_stride * 2] * J32) - - F * (c[c_stride * 0] * J11 + c[c_stride * 1] * J21 + c[c_stride * 2] * J31); - const CeedScalar R22 = - E * (c[c_stride * 1] * J12 + c[c_stride * 3] * J22 + c[c_stride * 4] * J32) - - F * (c[c_stride * 1] * J11 + c[c_stride * 3] * J21 + c[c_stride * 4] * J31); - const CeedScalar R32 = - E * (c[c_stride * 2] * J12 + c[c_stride * 4] * J22 + c[c_stride * 5] * J32) - - F * (c[c_stride * 2] * J11 + c[c_stride * 4] * J21 + c[c_stride * 5] * J31); - qd[qd_stride * 0] = w * (J11 * R11 + J21 * R21 + J31 * R31); - qd[qd_stride * 1] = w * (J12 * R11 + J22 * R21 + J32 * R31); - qd[qd_stride * 2] = w * (J11 * R12 + J21 * R22 + J31 * R32); - qd[qd_stride * 3] = w * (J12 * R12 + J22 * R22 + J32 * R32); - } - else if (c_comp == 3) // Vector coefficient - { - // First compute entries of R = C adj(J)^T. 
- // c: 0 - // 1 - // 2 - const CeedScalar R11 = c[c_stride * 0] * (G * J11 - F * J12); - const CeedScalar R21 = c[c_stride * 1] * (G * J21 - F * J22); - const CeedScalar R31 = c[c_stride * 2] * (G * J31 - F * J32); - const CeedScalar R12 = c[c_stride * 0] * (E * J12 - F * J11); - const CeedScalar R22 = c[c_stride * 1] * (E * J22 - F * J21); - const CeedScalar R32 = c[c_stride * 2] * (E * J32 - F * J31); - qd[qd_stride * 0] = w * (J11 * R11 + J21 * R21 + J31 * R31); - qd[qd_stride * 1] = w * (J12 * R11 + J22 * R21 + J32 * R31); - qd[qd_stride * 2] = w * (J11 * R12 + J21 * R22 + J31 * R32); - qd[qd_stride * 3] = w * (J12 * R12 + J22 * R22 + J32 * R32); - } - else // Scalar coefficient - { - qd[qd_stride * 0] = qw * c[c_stride * 0]; - qd[qd_stride * 1] = 0.0; - qd[qd_stride * 2] = 0.0; - qd[qd_stride * 3] = qw * c[c_stride * 0]; - } - if (Transpose && c_comp > 1) - { - const CeedScalar qd21 = qd[qd_stride * 1]; - qd[qd_stride * 1] = qd[qd_stride * 2]; - qd[qd_stride * 2] = qd21; - } -} - -CEED_QFUNCTION_HELPER void MultCAdjJt22(const CeedScalar *J, const CeedInt J_stride, - const CeedScalar *c, const CeedInt c_stride, - const CeedInt c_comp, const CeedScalar qw, - const CeedInt qd_stride, CeedScalar *qd) -{ - // Compute qw C adj(J)^T and store the result. - // J: 0 2 adj(J): J22 -J12 qd: 0 2 - // 1 3 -J21 J11 1 3 - const CeedScalar J11 = J[J_stride * 0]; - const CeedScalar J21 = J[J_stride * 1]; - const CeedScalar J12 = J[J_stride * 2]; - const CeedScalar J22 = J[J_stride * 3]; - if (c_comp == 3) // Matrix coefficient (symmetric) - { - // c: 0 1 - // 1 2 - qd[qd_stride * 0] = qw * (c[c_stride * 0] * J22 - c[c_stride * 1] * J12); - qd[qd_stride * 1] = qw * (c[c_stride * 1] * J22 - c[c_stride * 2] * J12); - qd[qd_stride * 2] = qw * (-c[c_stride * 0] * J21 + c[c_stride * 1] * J11); - qd[qd_stride * 3] = qw * (-c[c_stride * 1] * J21 + c[c_stride * 2] * J11); - } - else if (c_comp == 2) // Vector coefficient - { - // c: 0 - // 1 - const CeedScalar wc0 = qw * c[c_stride * 0]; - const CeedScalar wc1 = qw * c[c_stride * 1]; - qd[qd_stride * 0] = wc0 * J22; - qd[qd_stride * 1] = -wc1 * J12; - qd[qd_stride * 2] = -wc0 * J21; - qd[qd_stride * 3] = wc1 * J11; - } - else // Scalar coefficient - { - const CeedScalar wc = qw * c[c_stride * 0]; - qd[qd_stride * 0] = wc * J22; - qd[qd_stride * 1] = -wc * J12; - qd[qd_stride * 2] = -wc * J21; - qd[qd_stride * 3] = wc * J11; - } -} - -CEED_QFUNCTION_HELPER void MultCAdjJt21(const CeedScalar *J, const CeedInt J_stride, - const CeedScalar *c, const CeedInt c_stride, - const CeedInt c_comp, const CeedScalar qw, - const CeedInt qd_stride, CeedScalar *qd) -{ - // Compute qw C adj(J)^T and store the result. 
- // J: 0 adj(J): 1/sqrt(J^T J) J^T qd: 0 - // 1 1 - const CeedScalar J11 = J[J_stride * 0]; - const CeedScalar J21 = J[J_stride * 1]; - const CeedScalar w = qw / sqrt(J11 * J11 + J21 * J21); - if (c_comp == 3) // Matrix coefficient (symmetric) - { - // c: 0 1 - // 1 2 - qd[qd_stride * 0] = w * (c[c_stride * 0] * J11 + c[c_stride * 1] * J21); - qd[qd_stride * 1] = w * (c[c_stride * 1] * J11 + c[c_stride * 2] * J21); - } - else if (c_comp == 2) // Vector coefficient - { - // c: 0 - // 1 - qd[qd_stride * 0] = w * c[c_stride * 0] * J11; - qd[qd_stride * 1] = w * c[c_stride * 1] * J21; - } - else // Scalar coefficient - { - const CeedScalar wc = w * c[c_stride * 0]; - qd[qd_stride * 0] = wc * J11; - qd[qd_stride * 1] = wc * J21; - } -} - -CEED_QFUNCTION_HELPER void MultCAdjJt33(const CeedScalar *J, const CeedInt J_stride, - const CeedScalar *c, const CeedInt c_stride, - const CeedInt c_comp, const CeedScalar qw, - const CeedInt qd_stride, CeedScalar *qd) -{ - // Compute qw C adj(J)^T and store the result. - // J: 0 3 6 qd: 0 3 6 - // 1 4 7 1 4 7 - // 2 5 8 2 5 8 - const CeedScalar J11 = J[J_stride * 0]; - const CeedScalar J21 = J[J_stride * 1]; - const CeedScalar J31 = J[J_stride * 2]; - const CeedScalar J12 = J[J_stride * 3]; - const CeedScalar J22 = J[J_stride * 4]; - const CeedScalar J32 = J[J_stride * 5]; - const CeedScalar J13 = J[J_stride * 6]; - const CeedScalar J23 = J[J_stride * 7]; - const CeedScalar J33 = J[J_stride * 8]; - const CeedScalar A11 = J22 * J33 - J23 * J32; - const CeedScalar A21 = J23 * J31 - J21 * J33; - const CeedScalar A31 = J21 * J32 - J22 * J31; - const CeedScalar A12 = J13 * J32 - J12 * J33; - const CeedScalar A22 = J11 * J33 - J13 * J31; - const CeedScalar A32 = J12 * J31 - J11 * J32; - const CeedScalar A13 = J12 * J23 - J13 * J22; - const CeedScalar A23 = J13 * J21 - J11 * J23; - const CeedScalar A33 = J11 * J22 - J12 * J21; - if (c_comp == 6) // Matrix coefficient (symmetric) - { - // c: 0 1 2 - // 1 3 4 - // 2 4 5 - qd[qd_stride * 0] = - qw * (c[c_stride * 0] * A11 + c[c_stride * 1] * A12 + c[c_stride * 2] * A13); - qd[qd_stride * 1] = - qw * (c[c_stride * 1] * A11 + c[c_stride * 3] * A12 + c[c_stride * 4] * A13); - qd[qd_stride * 2] = - qw * (c[c_stride * 2] * A11 + c[c_stride * 4] * A12 + c[c_stride * 5] * A13); - qd[qd_stride * 3] = - qw * (c[c_stride * 0] * A21 + c[c_stride * 1] * A22 + c[c_stride * 2] * A23); - qd[qd_stride * 4] = - qw * (c[c_stride * 1] * A21 + c[c_stride * 3] * A22 + c[c_stride * 4] * A23); - qd[qd_stride * 5] = - qw * (c[c_stride * 2] * A21 + c[c_stride * 4] * A22 + c[c_stride * 5] * A23); - qd[qd_stride * 6] = - qw * (c[c_stride * 0] * A31 + c[c_stride * 1] * A32 + c[c_stride * 2] * A33); - qd[qd_stride * 7] = - qw * (c[c_stride * 1] * A31 + c[c_stride * 3] * A32 + c[c_stride * 4] * A33); - qd[qd_stride * 8] = - qw * (c[c_stride * 2] * A31 + c[c_stride * 4] * A32 + c[c_stride * 5] * A33); - } - else if (c_comp == 3) // Vector coefficient - { - // c: 0 - // 1 - // 2 - const CeedScalar wc0 = qw * c[c_stride * 0]; - const CeedScalar wc1 = qw * c[c_stride * 1]; - const CeedScalar wc2 = qw * c[c_stride * 2]; - qd[qd_stride * 0] = wc0 * A11; - qd[qd_stride * 1] = wc1 * A12; - qd[qd_stride * 2] = wc2 * A13; - qd[qd_stride * 3] = wc0 * A21; - qd[qd_stride * 4] = wc1 * A22; - qd[qd_stride * 5] = wc2 * A23; - qd[qd_stride * 6] = wc0 * A31; - qd[qd_stride * 7] = wc1 * A32; - qd[qd_stride * 8] = wc2 * A33; - } - else // Scalar coefficient - { - const CeedScalar wc = qw * c[c_stride * 0]; - qd[qd_stride * 0] = wc * A11; - qd[qd_stride * 1] = wc * 
A12; - qd[qd_stride * 2] = wc * A13; - qd[qd_stride * 3] = wc * A21; - qd[qd_stride * 4] = wc * A22; - qd[qd_stride * 5] = wc * A23; - qd[qd_stride * 6] = wc * A31; - qd[qd_stride * 7] = wc * A32; - qd[qd_stride * 8] = wc * A33; - } -} - -CEED_QFUNCTION_HELPER void MultCAdjJt32(const CeedScalar *J, const CeedInt J_stride, - const CeedScalar *c, const CeedInt c_stride, - const CeedInt c_comp, const CeedScalar qw, - const CeedInt qd_stride, CeedScalar *qd) -{ - // Compute qw C adj(J)^T and store the result. - // J: 0 3 qd: 0 3 - // 1 4 1 4 - // 2 5 2 5 - const CeedScalar J11 = J[J_stride * 0]; - const CeedScalar J21 = J[J_stride * 1]; - const CeedScalar J31 = J[J_stride * 2]; - const CeedScalar J12 = J[J_stride * 3]; - const CeedScalar J22 = J[J_stride * 4]; - const CeedScalar J32 = J[J_stride * 5]; - const CeedScalar E = J11 * J11 + J21 * J21 + J31 * J31; - const CeedScalar G = J12 * J12 + J22 * J22 + J32 * J32; - const CeedScalar F = J11 * J12 + J21 * J22 + J31 * J32; - const CeedScalar w = qw / sqrt(E * G - F * F); - if (c_comp == 6) // Matrix coefficient (symmetric) - { - // c: 0 1 2 - // 1 3 4 - // 2 4 5 - qd[qd_stride * 0] = - w * (G * (c[c_stride * 0] * J11 + c[c_stride * 1] * J21 + c[c_stride * 2] * J31) - - F * (c[c_stride * 0] * J12 + c[c_stride * 1] * J22 + c[c_stride * 2] * J32)); - qd[qd_stride * 1] = - w * (G * (c[c_stride * 1] * J11 + c[c_stride * 3] * J21 + c[c_stride * 4] * J31) - - F * (c[c_stride * 1] * J12 + c[c_stride * 3] * J22 + c[c_stride * 4] * J32)); - qd[qd_stride * 2] = - w * (G * (c[c_stride * 2] * J11 + c[c_stride * 4] * J21 + c[c_stride * 5] * J31) - - F * (c[c_stride * 2] * J12 + c[c_stride * 4] * J22 + c[c_stride * 5] * J32)); - qd[qd_stride * 3] = - w * (E * (c[c_stride * 0] * J12 + c[c_stride * 1] * J22 + c[c_stride * 2] * J32) - - F * (c[c_stride * 0] * J11 + c[c_stride * 1] * J21 + c[c_stride * 2] * J31)); - qd[qd_stride * 4] = - w * (E * (c[c_stride * 1] * J12 + c[c_stride * 3] * J22 + c[c_stride * 4] * J32) - - F * (c[c_stride * 1] * J11 + c[c_stride * 3] * J21 + c[c_stride * 4] * J31)); - qd[qd_stride * 5] = - w * (E * (c[c_stride * 2] * J12 + c[c_stride * 4] * J22 + c[c_stride * 5] * J32) - - F * (c[c_stride * 2] * J11 + c[c_stride * 4] * J21 + c[c_stride * 5] * J31)); - } - else if (c_comp == 3) // Vector coefficient - { - // c: 0 - // 1 - // 2 - const CeedScalar wc0 = w * c[c_stride * 0]; - const CeedScalar wc1 = w * c[c_stride * 1]; - const CeedScalar wc2 = w * c[c_stride * 2]; - qd[qd_stride * 0] = wc0 * (G * J11 - F * J12); - qd[qd_stride * 1] = wc1 * (G * J21 - F * J22); - qd[qd_stride * 2] = wc2 * (G * J31 - F * J32); - qd[qd_stride * 3] = wc0 * (E * J12 - F * J11); - qd[qd_stride * 4] = wc1 * (E * J22 - F * J21); - qd[qd_stride * 5] = wc2 * (E * J32 - F * J31); - } - else // Scalar coefficient - { - const CeedScalar wc = w * c[c_stride * 0]; - qd[qd_stride * 0] = wc * (G * J11 - F * J12); - qd[qd_stride * 1] = wc * (G * J21 - F * J22); - qd[qd_stride * 2] = wc * (G * J31 - F * J32); - qd[qd_stride * 3] = wc * (E * J12 - F * J11); - qd[qd_stride * 4] = wc * (E * J22 - F * J21); - qd[qd_stride * 5] = wc * (E * J32 - F * J31); - } -} - -#endif // PALACE_LIBCEED_UTILS_QF_H +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_UTILS_QF_H +#define PALACE_LIBCEED_UTILS_QF_H + +#include + +CEED_QFUNCTION_HELPER CeedScalar DetJ22(const CeedScalar *J, const CeedInt J_stride) +{ + // J: 0 2 + // 1 3 + return J[J_stride * 0] * J[J_stride * 3] - J[J_stride * 1] * J[J_stride * 2]; +} + +CEED_QFUNCTION_HELPER CeedScalar DetJ21(const CeedScalar *J, const CeedInt J_stride) +{ + // J: 0 + // 1 + return sqrt(J[J_stride * 0] * J[J_stride * 0] + J[J_stride * 1] * J[J_stride * 1]); +} + +CEED_QFUNCTION_HELPER CeedScalar DetJ33(const CeedScalar *J, const CeedInt J_stride) +{ + // J: 0 3 6 + // 1 4 7 + // 2 5 8 + return J[J_stride * 0] * + (J[J_stride * 4] * J[J_stride * 8] - J[J_stride * 5] * J[J_stride * 7]) - + J[J_stride * 1] * + (J[J_stride * 3] * J[J_stride * 8] - J[J_stride * 5] * J[J_stride * 6]) + + J[J_stride * 2] * + (J[J_stride * 3] * J[J_stride * 7] - J[J_stride * 4] * J[J_stride * 6]); +} + +CEED_QFUNCTION_HELPER CeedScalar DetJ32(const CeedScalar *J, const CeedInt J_stride) +{ + // J: 0 3 + // 1 4 + // 2 5 + const CeedScalar E = J[J_stride * 0] * J[J_stride * 0] + + J[J_stride * 1] * J[J_stride * 1] + + J[J_stride * 2] * J[J_stride * 2]; + const CeedScalar G = J[J_stride * 3] * J[J_stride * 3] + + J[J_stride * 4] * J[J_stride * 4] + + J[J_stride * 5] * J[J_stride * 5]; + const CeedScalar F = J[J_stride * 0] * J[J_stride * 3] + + J[J_stride * 1] * J[J_stride * 4] + + J[J_stride * 2] * J[J_stride * 5]; + return sqrt(E * G - F * F); +} + +CEED_QFUNCTION_HELPER void MultAdjJCAdjJt22(const CeedScalar *J, const CeedInt J_stride, + const CeedScalar *c, const CeedInt c_stride, + const CeedInt c_comp, const CeedScalar qw, + const CeedInt qd_stride, CeedScalar *qd) +{ + // Compute qw / det(J) adj(J) C adj(J)^T and store the symmetric part of the result. + // J: 0 2 adj(J): J22 -J12 qd: 0 1 + // 1 3 -J21 J11 1 2 + const CeedScalar J11 = J[J_stride * 0]; + const CeedScalar J21 = J[J_stride * 1]; + const CeedScalar J12 = J[J_stride * 2]; + const CeedScalar J22 = J[J_stride * 3]; + const CeedScalar w = qw / (J11 * J22 - J21 * J12); + if (c_comp == 3) // Matrix coefficient (symmetric) + { + // First compute entries of R = C adj(J)^T. 
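Illustrative aside (not part of the patch): the non-square determinant helpers DetJ21 and DetJ32 above return sqrt(det(J^T J)), the length/area measure of the reference-to-physical map. A minimal standalone sketch in plain C++ (plain doubles instead of CeedScalar, sample values chosen arbitrarily) checks the E*G - F^2 form used by DetJ32 against the equivalent cross-product magnitude of the two Jacobian columns:

#include <cmath>
#include <cstdio>

int main()
{
  // Sample 3x2 Jacobian, stored by columns as in the helpers above: J = [c0 | c1].
  const double c0[3] = {1.0, 2.0, 0.5};
  const double c1[3] = {0.0, 1.0, 3.0};
  // Gram matrix entries E = c0.c0, G = c1.c1, F = c0.c1 (same quantities as in DetJ32).
  const double E = c0[0] * c0[0] + c0[1] * c0[1] + c0[2] * c0[2];
  const double G = c1[0] * c1[0] + c1[1] * c1[1] + c1[2] * c1[2];
  const double F = c0[0] * c1[0] + c0[1] * c1[1] + c0[2] * c1[2];
  // Cross product of the columns; |c0 x c1| = sqrt(E*G - F^2) by Lagrange's identity.
  const double cx = c0[1] * c1[2] - c0[2] * c1[1];
  const double cy = c0[2] * c1[0] - c0[0] * c1[2];
  const double cz = c0[0] * c1[1] - c0[1] * c1[0];
  std::printf("sqrt(E*G - F^2) = %.12f\n", std::sqrt(E * G - F * F));
  std::printf("|c0 x c1|       = %.12f\n", std::sqrt(cx * cx + cy * cy + cz * cz));
  return 0;
}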
+ // c: 0 1 + // 1 2 + const CeedScalar R11 = c[c_stride * 0] * J22 - c[c_stride * 1] * J12; + const CeedScalar R21 = c[c_stride * 1] * J22 - c[c_stride * 2] * J12; + const CeedScalar R12 = -c[c_stride * 0] * J21 + c[c_stride * 1] * J11; + const CeedScalar R22 = -c[c_stride * 1] * J21 + c[c_stride * 2] * J11; + qd[qd_stride * 0] = w * (J22 * R11 - J12 * R21); + qd[qd_stride * 1] = w * (J11 * R21 - J21 * R11); + qd[qd_stride * 2] = w * (J11 * R22 - J21 * R12); + } + else if (c_comp == 2) // Vector coefficient + { + // c: 0 + // 1 + qd[qd_stride * 0] = w * (c[c_stride * 1] * J12 * J12 + c[c_stride * 0] * J22 * J22); + qd[qd_stride * 1] = -w * (c[c_stride * 1] * J11 * J12 + c[c_stride * 0] * J21 * J22); + qd[qd_stride * 2] = w * (c[c_stride * 1] * J11 * J11 + c[c_stride * 0] * J21 * J21); + } + else // Scalar coefficient + { + qd[qd_stride * 0] = w * c[c_stride * 0] * (J12 * J12 + J22 * J22); + qd[qd_stride * 1] = -w * c[c_stride * 0] * (J11 * J12 + J21 * J22); + qd[qd_stride * 2] = w * c[c_stride * 0] * (J11 * J11 + J21 * J21); + } +} + +CEED_QFUNCTION_HELPER void MultAdjJCAdjJt21(const CeedScalar *J, const CeedInt J_stride, + const CeedScalar *c, const CeedInt c_stride, + const CeedInt c_comp, const CeedScalar qw, + const CeedInt qd_stride, CeedScalar *qd) +{ + // Compute qw / det(J) adj(J) C adj(J)^T and store the symmetric part of the result. + // J: 0 adj(J): 1/sqrt(J^T J) J^T qd: 0 + // 1 + const CeedScalar J11 = J[J_stride * 0]; + const CeedScalar J21 = J[J_stride * 1]; + const CeedScalar d = J11 * J11 + J21 * J21; + const CeedScalar w = qw / sqrt(d); + if (c_comp == 3) // Matrix coefficient (symmetric) + { + // First compute entries of R = C adj(J)^T. + // c: 0 1 + // 1 2 + const CeedScalar R11 = c[c_stride * 0] * J11 + c[c_stride * 1] * J21; + const CeedScalar R21 = c[c_stride * 1] * J11 + c[c_stride * 2] * J21; + qd[qd_stride * 0] = w * (J11 * R11 + J21 * R21) / d; + } + else if (c_comp == 2) // Vector coefficient + { + // c: 0 + // 1 + qd[qd_stride * 0] = w * (c[c_stride * 0] * J11 * J11 + c[c_stride * 1] * J21 * J21) / d; + } + else // Scalar coefficient + { + qd[qd_stride * 0] = w * c[c_stride * 0]; + } +} + +CEED_QFUNCTION_HELPER void MultAdjJCAdjJt33(const CeedScalar *J, const CeedInt J_stride, + const CeedScalar *c, const CeedInt c_stride, + const CeedInt c_comp, const CeedScalar qw, + const CeedInt qd_stride, CeedScalar *qd) +{ + // Compute qw / det(J) adj(J) C adj(J)^T and store the symmetric part of the result. + // J: 0 3 6 qd: 0 1 2 + // 1 4 7 1 3 4 + // 2 5 8 2 4 5 + const CeedScalar J11 = J[J_stride * 0]; + const CeedScalar J21 = J[J_stride * 1]; + const CeedScalar J31 = J[J_stride * 2]; + const CeedScalar J12 = J[J_stride * 3]; + const CeedScalar J22 = J[J_stride * 4]; + const CeedScalar J32 = J[J_stride * 5]; + const CeedScalar J13 = J[J_stride * 6]; + const CeedScalar J23 = J[J_stride * 7]; + const CeedScalar J33 = J[J_stride * 8]; + const CeedScalar A11 = J22 * J33 - J23 * J32; + const CeedScalar A21 = J23 * J31 - J21 * J33; + const CeedScalar A31 = J21 * J32 - J22 * J31; + const CeedScalar A12 = J13 * J32 - J12 * J33; + const CeedScalar A22 = J11 * J33 - J13 * J31; + const CeedScalar A32 = J12 * J31 - J11 * J32; + const CeedScalar A13 = J12 * J23 - J13 * J22; + const CeedScalar A23 = J13 * J21 - J11 * J23; + const CeedScalar A33 = J11 * J22 - J12 * J21; + const CeedScalar w = qw / (J11 * A11 + J21 * A12 + J31 * A13); + if (c_comp == 6) // Matrix coefficient (symmetric) + { + // First compute entries of R = C adj(J)^T. 
+ // c: 0 1 2 + // 1 3 4 + // 2 4 5 + const CeedScalar R11 = + c[c_stride * 0] * A11 + c[c_stride * 1] * A12 + c[c_stride * 2] * A13; + const CeedScalar R21 = + c[c_stride * 1] * A11 + c[c_stride * 3] * A12 + c[c_stride * 4] * A13; + const CeedScalar R31 = + c[c_stride * 2] * A11 + c[c_stride * 4] * A12 + c[c_stride * 5] * A13; + const CeedScalar R12 = + c[c_stride * 0] * A21 + c[c_stride * 1] * A22 + c[c_stride * 2] * A23; + const CeedScalar R22 = + c[c_stride * 1] * A21 + c[c_stride * 3] * A22 + c[c_stride * 4] * A23; + const CeedScalar R32 = + c[c_stride * 2] * A21 + c[c_stride * 4] * A22 + c[c_stride * 5] * A23; + const CeedScalar R13 = + c[c_stride * 0] * A31 + c[c_stride * 1] * A32 + c[c_stride * 2] * A33; + const CeedScalar R23 = + c[c_stride * 1] * A31 + c[c_stride * 3] * A32 + c[c_stride * 4] * A33; + const CeedScalar R33 = + c[c_stride * 2] * A31 + c[c_stride * 4] * A32 + c[c_stride * 5] * A33; + qd[qd_stride * 0] = w * (A11 * R11 + A12 * R21 + A13 * R31); + qd[qd_stride * 1] = w * (A11 * R12 + A12 * R22 + A13 * R32); + qd[qd_stride * 2] = w * (A11 * R13 + A12 * R23 + A13 * R33); + qd[qd_stride * 3] = w * (A21 * R12 + A22 * R22 + A23 * R32); + qd[qd_stride * 4] = w * (A21 * R13 + A22 * R23 + A23 * R33); + qd[qd_stride * 5] = w * (A31 * R13 + A32 * R23 + A33 * R33); + } + else if (c_comp == 3) // Vector coefficient + { + // c: 0 + // 1 + // 2 + qd[qd_stride * 0] = w * (c[c_stride * 0] * A11 * A11 + c[c_stride * 1] * A12 * A12 + + c[c_stride * 2] * A13 * A13); + qd[qd_stride * 1] = w * (c[c_stride * 0] * A11 * A21 + c[c_stride * 1] * A12 * A22 + + c[c_stride * 2] * A13 * A23); + qd[qd_stride * 2] = w * (c[c_stride * 0] * A11 * A31 + c[c_stride * 1] * A12 * A32 + + c[c_stride * 2] * A13 * A33); + qd[qd_stride * 3] = w * (c[c_stride * 0] * A21 * A21 + c[c_stride * 1] * A22 * A22 + + c[c_stride * 2] * A23 * A23); + qd[qd_stride * 4] = w * (c[c_stride * 0] * A21 * A31 + c[c_stride * 1] * A22 * A32 + + c[c_stride * 2] * A23 * A33); + qd[qd_stride * 5] = w * (c[c_stride * 0] * A31 * A31 + c[c_stride * 1] * A32 * A32 + + c[c_stride * 2] * A33 * A33); + } + else // Scalar coefficient + { + qd[qd_stride * 0] = w * c[c_stride * 0] * (A11 * A11 + A12 * A12 + A13 * A13); + qd[qd_stride * 1] = w * c[c_stride * 0] * (A11 * A21 + A12 * A22 + A13 * A23); + qd[qd_stride * 2] = w * c[c_stride * 0] * (A11 * A31 + A12 * A32 + A13 * A33); + qd[qd_stride * 3] = w * c[c_stride * 0] * (A21 * A21 + A22 * A22 + A23 * A23); + qd[qd_stride * 4] = w * c[c_stride * 0] * (A21 * A31 + A22 * A32 + A23 * A33); + qd[qd_stride * 5] = w * c[c_stride * 0] * (A31 * A31 + A32 * A32 + A33 * A33); + } +} + +CEED_QFUNCTION_HELPER void MultAdjJCAdjJt32(const CeedScalar *J, const CeedInt J_stride, + const CeedScalar *c, const CeedInt c_stride, + const CeedInt c_comp, const CeedScalar qw, + const CeedInt qd_stride, CeedScalar *qd) +{ + // Compute qw / det(J) adj(J) C adj(J)^T and store the symmetric part of the result. 
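Illustrative aside (not part of the patch): the 3D helpers above write only the symmetric part of the quadrature data, six entries laid out as qd = [0 1 2; 1 3 4; 2 4 5] per the layout comments. A minimal sketch of how a caller could expand the packed storage back into a full row-major 3x3 (UnpackSym33 is a hypothetical name, plain doubles instead of CeedScalar):

#include <array>

std::array<double, 9> UnpackSym33(const double *qd, int qd_stride)
{
  // Full symmetric matrix, row-major; mirrors the layout comment qd = [0 1 2; 1 3 4; 2 4 5].
  return {qd[qd_stride * 0], qd[qd_stride * 1], qd[qd_stride * 2],
          qd[qd_stride * 1], qd[qd_stride * 3], qd[qd_stride * 4],
          qd[qd_stride * 2], qd[qd_stride * 4], qd[qd_stride * 5]};
}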
+ // J: 0 3 qd: 0 1 + // 1 4 1 2 + // 2 5 + const CeedScalar J11 = J[J_stride * 0]; + const CeedScalar J21 = J[J_stride * 1]; + const CeedScalar J31 = J[J_stride * 2]; + const CeedScalar J12 = J[J_stride * 3]; + const CeedScalar J22 = J[J_stride * 4]; + const CeedScalar J32 = J[J_stride * 5]; + const CeedScalar E = J11 * J11 + J21 * J21 + J31 * J31; + const CeedScalar G = J12 * J12 + J22 * J22 + J32 * J32; + const CeedScalar F = J11 * J12 + J21 * J22 + J31 * J32; + const CeedScalar d = E * G - F * F; + const CeedScalar w = qw / sqrt(d); + if (c_comp == 6) // Matrix coefficient (symmetric) + { + // First compute entries of R = C adj(J)^T. + // c: 0 1 2 + // 1 3 4 + // 2 4 5 + const CeedScalar R11 = + G * (c[c_stride * 0] * J11 + c[c_stride * 1] * J21 + c[c_stride * 2] * J31) - + F * (c[c_stride * 0] * J12 + c[c_stride * 1] * J22 + c[c_stride * 2] * J32); + const CeedScalar R21 = + G * (c[c_stride * 1] * J11 + c[c_stride * 3] * J21 + c[c_stride * 4] * J31) - + F * (c[c_stride * 1] * J12 + c[c_stride * 3] * J22 + c[c_stride * 4] * J32); + const CeedScalar R31 = + G * (c[c_stride * 2] * J11 + c[c_stride * 4] * J21 + c[c_stride * 5] * J31) - + F * (c[c_stride * 2] * J12 + c[c_stride * 4] * J22 + c[c_stride * 5] * J32); + const CeedScalar R12 = + E * (c[c_stride * 0] * J12 + c[c_stride * 1] * J22 + c[c_stride * 2] * J32) - + F * (c[c_stride * 0] * J11 + c[c_stride * 1] * J21 + c[c_stride * 2] * J31); + const CeedScalar R22 = + E * (c[c_stride * 1] * J12 + c[c_stride * 3] * J22 + c[c_stride * 4] * J32) - + F * (c[c_stride * 1] * J11 + c[c_stride * 3] * J21 + c[c_stride * 4] * J31); + const CeedScalar R32 = + E * (c[c_stride * 2] * J12 + c[c_stride * 4] * J22 + c[c_stride * 5] * J32) - + F * (c[c_stride * 2] * J11 + c[c_stride * 4] * J21 + c[c_stride * 5] * J31); + qd[qd_stride * 0] = w * + (G * (J11 * R11 + J21 * R21 + J31 * R31) - + F * (J12 * R11 + J22 * R21 + J32 * R31)) / + d; + qd[qd_stride * 1] = w * + (G * (J11 * R12 + J21 * R22 + J31 * R32) - + F * (J12 * R12 + J22 * R22 + J32 * R32)) / + d; + qd[qd_stride * 2] = w * + (E * (J12 * R12 + J22 * R22 + J32 * R32) - + F * (J11 * R12 + J21 * R22 + J31 * R32)) / + d; + } + else if (c_comp == 3) // Vector coefficient + { + // First compute entries of R = C adj(J)^T. + // c: 0 + // 1 + // 2 + const CeedScalar R11 = c[c_stride * 0] * (G * J11 - F * J12); + const CeedScalar R21 = c[c_stride * 1] * (G * J21 - F * J22); + const CeedScalar R31 = c[c_stride * 2] * (G * J31 - F * J32); + const CeedScalar R12 = c[c_stride * 0] * (E * J12 - F * J11); + const CeedScalar R22 = c[c_stride * 1] * (E * J22 - F * J21); + const CeedScalar R32 = c[c_stride * 2] * (E * J32 - F * J31); + qd[qd_stride * 0] = w * + (G * (J11 * R11 + J21 * R21 + J31 * R31) - + F * (J12 * R11 + J22 * R21 + J32 * R31)) / + d; + qd[qd_stride * 1] = w * + (G * (J11 * R12 + J21 * R22 + J31 * R32) - + F * (J12 * R12 + J22 * R22 + J32 * R32)) / + d; + qd[qd_stride * 2] = w * + (E * (J12 * R12 + J22 * R22 + J32 * R32) - + F * (J11 * R12 + J21 * R22 + J31 * R32)) / + d; + } + else // Scalar coefficient + { + qd[qd_stride * 0] = w * c[c_stride * 0] * G; + qd[qd_stride * 1] = -w * c[c_stride * 0] * F; + qd[qd_stride * 2] = w * c[c_stride * 0] * E; + } +} + +CEED_QFUNCTION_HELPER void MultJtCJ22(const CeedScalar *J, const CeedInt J_stride, + const CeedScalar *c, const CeedInt c_stride, + const CeedInt c_comp, const CeedScalar qw, + const CeedInt qd_stride, CeedScalar *qd) +{ + // Compute qw / det(J) J^T C J and store the symmetric part of the result. 
+ // J: 0 2 qd: 0 1 + // 1 3 1 2 + const CeedScalar J11 = J[J_stride * 0]; + const CeedScalar J21 = J[J_stride * 1]; + const CeedScalar J12 = J[J_stride * 2]; + const CeedScalar J22 = J[J_stride * 3]; + const CeedScalar w = qw / (J11 * J22 - J21 * J12); + if (c_comp == 3) // Matrix coefficient (symmetric) + { + // First compute entries of R = C J. + // c: 0 1 + // 1 2 + const CeedScalar R11 = c[c_stride * 0] * J11 + c[c_stride * 1] * J21; + const CeedScalar R21 = c[c_stride * 1] * J11 + c[c_stride * 2] * J21; + const CeedScalar R12 = c[c_stride * 0] * J12 + c[c_stride * 1] * J22; + const CeedScalar R22 = c[c_stride * 1] * J12 + c[c_stride * 2] * J22; + qd[qd_stride * 0] = w * (J11 * R11 + J21 * R21); + qd[qd_stride * 1] = w * (J11 * R12 + J21 * R22); + qd[qd_stride * 2] = w * (J12 * R12 + J22 * R22); + } + else if (c_comp == 2) // Vector coefficient + { + // c: 0 + // 1 + qd[qd_stride * 0] = w * (c[c_stride * 0] * J11 * J11 + c[c_stride * 1] * J21 * J21); + qd[qd_stride * 1] = w * (c[c_stride * 0] * J11 * J12 + c[c_stride * 1] * J21 * J22); + qd[qd_stride * 2] = w * (c[c_stride * 0] * J12 * J12 + c[c_stride * 1] * J22 * J22); + } + else // Scalar coefficient + { + qd[qd_stride * 0] = w * c[c_stride * 0] * (J11 * J11 + J21 * J21); + qd[qd_stride * 1] = w * c[c_stride * 0] * (J11 * J12 + J21 * J22); + qd[qd_stride * 2] = w * c[c_stride * 0] * (J12 * J12 + J22 * J22); + } +} + +CEED_QFUNCTION_HELPER void MultJtCJ21(const CeedScalar *J, const CeedInt J_stride, + const CeedScalar *c, const CeedInt c_stride, + const CeedInt c_comp, const CeedScalar qw, + const CeedInt qd_stride, CeedScalar *qd) +{ + // Compute qw / det(J) J^T C J and store the symmetric part of the result. + // J: 0 qd: 0 + // 1 + const CeedScalar J11 = J[J_stride * 0]; + const CeedScalar J21 = J[J_stride * 1]; + if (c_comp == 3) // Matrix coefficient (symmetric) + { + // First compute entries of R = C J. + // c: 0 1 + // 1 2 + const CeedScalar w = qw / sqrt(J11 * J11 + J21 * J21); + const CeedScalar R11 = c[c_stride * 0] * J11 + c[c_stride * 1] * J21; + const CeedScalar R21 = c[c_stride * 1] * J11 + c[c_stride * 2] * J21; + qd[qd_stride * 0] = w * (J11 * R11 + J21 * R21); + } + else if (c_comp == 2) // Vector coefficient + { + // c: 0 + // 1 + const CeedScalar w = qw / sqrt(J11 * J11 + J21 * J21); + qd[qd_stride * 0] = w * (c[c_stride * 0] * J11 * J11 + c[c_stride * 1] * J21 * J21); + } + else // Scalar coefficient + { + qd[qd_stride * 0] = qw * c[c_stride * 0] * sqrt(J11 * J11 + J21 * J21); + } +} + +CEED_QFUNCTION_HELPER void MultJtCJ33(const CeedScalar *J, const CeedInt J_stride, + const CeedScalar *c, const CeedInt c_stride, + const CeedInt c_comp, const CeedScalar qw, + const CeedInt qd_stride, CeedScalar *qd) +{ + // Compute qw / det(J) J^T C J and store the symmetric part of the result. + // J: 0 3 6 qd: 0 1 2 + // 1 4 7 1 3 4 + // 2 5 8 2 4 5 + const CeedScalar J11 = J[J_stride * 0]; + const CeedScalar J21 = J[J_stride * 1]; + const CeedScalar J31 = J[J_stride * 2]; + const CeedScalar J12 = J[J_stride * 3]; + const CeedScalar J22 = J[J_stride * 4]; + const CeedScalar J32 = J[J_stride * 5]; + const CeedScalar J13 = J[J_stride * 6]; + const CeedScalar J23 = J[J_stride * 7]; + const CeedScalar J33 = J[J_stride * 8]; + const CeedScalar w = qw / (J11 * (J22 * J33 - J23 * J32) + J21 * (J13 * J32 - J12 * J33) + + J31 * (J12 * J23 - J13 * J22)); + if (c_comp == 6) // Matrix coefficient (symmetric) + { + // First compute entries of R = C J. 
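Illustrative aside (not part of the patch): in its matrix-coefficient branch, MultJtCJ22 above first forms R = C J and then contracts with J^T, i.e. it stores the triple product (qw / det(J)) J^T C J in packed symmetric form [qd0 qd1; qd1 qd2]. A standalone check in plain C++ (arbitrary sample values) compares the packed formulas against an explicit triple product:

#include <cstdio>

int main()
{
  // Sample 2x2 Jacobian (column-major, as in the helpers) and symmetric C = [c0 c1; c1 c2].
  const double J11 = 2.0, J21 = 0.5, J12 = -1.0, J22 = 3.0;
  const double c0 = 1.5, c1 = 0.2, c2 = 0.7, qw = 0.25;
  const double w = qw / (J11 * J22 - J21 * J12);
  // Packed result, following the matrix-coefficient branch of MultJtCJ22.
  const double R11 = c0 * J11 + c1 * J21, R21 = c1 * J11 + c2 * J21;
  const double R12 = c0 * J12 + c1 * J22, R22 = c1 * J12 + c2 * J22;
  const double qd0 = w * (J11 * R11 + J21 * R21);
  const double qd1 = w * (J11 * R12 + J21 * R22);
  const double qd2 = w * (J12 * R12 + J22 * R22);
  // Explicit (qw / det(J)) J^T C J for comparison.
  const double C[2][2] = {{c0, c1}, {c1, c2}}, Jm[2][2] = {{J11, J12}, {J21, J22}};
  double M[2][2] = {{0.0, 0.0}, {0.0, 0.0}};
  for (int i = 0; i < 2; i++)
    for (int j = 0; j < 2; j++)
      for (int k = 0; k < 2; k++)
        for (int l = 0; l < 2; l++)
          M[i][j] += w * Jm[k][i] * C[k][l] * Jm[l][j];
  std::printf("packed: %.12f %.12f %.12f\n", qd0, qd1, qd2);
  std::printf("full:   %.12f %.12f %.12f\n", M[0][0], M[0][1], M[1][1]);
  return 0;
}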
+ // c: 0 1 2 + // 1 3 4 + // 2 4 5 + const CeedScalar R11 = + c[c_stride * 0] * J11 + c[c_stride * 1] * J21 + c[c_stride * 2] * J31; + const CeedScalar R21 = + c[c_stride * 1] * J11 + c[c_stride * 3] * J21 + c[c_stride * 4] * J31; + const CeedScalar R31 = + c[c_stride * 2] * J11 + c[c_stride * 4] * J21 + c[c_stride * 5] * J31; + const CeedScalar R12 = + c[c_stride * 0] * J12 + c[c_stride * 1] * J22 + c[c_stride * 2] * J32; + const CeedScalar R22 = + c[c_stride * 1] * J12 + c[c_stride * 3] * J22 + c[c_stride * 4] * J32; + const CeedScalar R32 = + c[c_stride * 2] * J12 + c[c_stride * 4] * J22 + c[c_stride * 5] * J32; + const CeedScalar R13 = + c[c_stride * 0] * J13 + c[c_stride * 1] * J23 + c[c_stride * 2] * J33; + const CeedScalar R23 = + c[c_stride * 1] * J13 + c[c_stride * 3] * J23 + c[c_stride * 4] * J33; + const CeedScalar R33 = + c[c_stride * 2] * J13 + c[c_stride * 4] * J23 + c[c_stride * 5] * J33; + qd[qd_stride * 0] = w * (J11 * R11 + J21 * R21 + J31 * R31); + qd[qd_stride * 1] = w * (J11 * R12 + J21 * R22 + J31 * R32); + qd[qd_stride * 2] = w * (J11 * R13 + J21 * R23 + J31 * R33); + qd[qd_stride * 3] = w * (J12 * R12 + J22 * R22 + J32 * R32); + qd[qd_stride * 4] = w * (J12 * R13 + J22 * R23 + J32 * R33); + qd[qd_stride * 5] = w * (J13 * R13 + J23 * R23 + J33 * R33); + } + else if (c_comp == 3) // Vector coefficient + { + // c: 0 + // 1 + // 2 + qd[qd_stride * 0] = w * (c[c_stride * 0] * J11 * J11 + c[c_stride * 1] * J21 * J21 + + c[c_stride * 2] * J31 * J31); + qd[qd_stride * 1] = w * (c[c_stride * 0] * J11 * J12 + c[c_stride * 1] * J21 * J22 + + c[c_stride * 2] * J31 * J32); + qd[qd_stride * 2] = w * (c[c_stride * 0] * J11 * J13 + c[c_stride * 1] * J21 * J23 + + c[c_stride * 2] * J31 * J33); + qd[qd_stride * 3] = w * (c[c_stride * 0] * J12 * J12 + c[c_stride * 1] * J22 * J22 + + c[c_stride * 2] * J32 * J32); + qd[qd_stride * 4] = w * (c[c_stride * 0] * J12 * J13 + c[c_stride * 1] * J22 * J23 + + c[c_stride * 2] * J32 * J33); + qd[qd_stride * 5] = w * (c[c_stride * 0] * J13 * J13 + c[c_stride * 1] * J23 * J23 + + c[c_stride * 2] * J33 * J33); + } + else // Scalar coefficient + { + qd[qd_stride * 0] = w * c[c_stride * 0] * (J11 * J11 + J21 * J21 + J31 * J31); + qd[qd_stride * 1] = w * c[c_stride * 0] * (J11 * J12 + J21 * J22 + J31 * J32); + qd[qd_stride * 2] = w * c[c_stride * 0] * (J11 * J13 + J21 * J23 + J31 * J33); + qd[qd_stride * 3] = w * c[c_stride * 0] * (J12 * J12 + J22 * J22 + J32 * J32); + qd[qd_stride * 4] = w * c[c_stride * 0] * (J12 * J13 + J22 * J23 + J32 * J33); + qd[qd_stride * 5] = w * c[c_stride * 0] * (J13 * J13 + J23 * J23 + J33 * J33); + } +} + +CEED_QFUNCTION_HELPER void MultJtCJ32(const CeedScalar *J, const CeedInt J_stride, + const CeedScalar *c, const CeedInt c_stride, + const CeedInt c_comp, const CeedScalar qw, + const CeedInt qd_stride, CeedScalar *qd) +{ + // Compute qw / det(J) J^T C J and store the symmetric part of the result. + // J: 0 3 qd: 0 1 + // 1 4 1 2 + // 2 5 + const CeedScalar J11 = J[J_stride * 0]; + const CeedScalar J21 = J[J_stride * 1]; + const CeedScalar J31 = J[J_stride * 2]; + const CeedScalar J12 = J[J_stride * 3]; + const CeedScalar J22 = J[J_stride * 4]; + const CeedScalar J32 = J[J_stride * 5]; + const CeedScalar E = J11 * J11 + J21 * J21 + J31 * J31; + const CeedScalar G = J12 * J12 + J22 * J22 + J32 * J32; + const CeedScalar F = J11 * J12 + J21 * J22 + J31 * J32; + const CeedScalar w = qw / sqrt(E * G - F * F); + if (c_comp == 6) // Matrix coefficient (symmetric) + { + // First compute entries of R = C J. 
+ // c: 0 1 2 + // 1 3 4 + // 2 4 5 + const CeedScalar R11 = + c[c_stride * 0] * J11 + c[c_stride * 1] * J21 + c[c_stride * 2] * J31; + const CeedScalar R21 = + c[c_stride * 1] * J11 + c[c_stride * 3] * J21 + c[c_stride * 4] * J31; + const CeedScalar R31 = + c[c_stride * 2] * J11 + c[c_stride * 4] * J21 + c[c_stride * 5] * J31; + const CeedScalar R12 = + c[c_stride * 0] * J12 + c[c_stride * 1] * J22 + c[c_stride * 2] * J32; + const CeedScalar R22 = + c[c_stride * 1] * J12 + c[c_stride * 3] * J22 + c[c_stride * 4] * J32; + const CeedScalar R32 = + c[c_stride * 2] * J12 + c[c_stride * 4] * J22 + c[c_stride * 5] * J32; + qd[qd_stride * 0] = w * (J11 * R11 + J21 * R21 + J31 * R31); + qd[qd_stride * 1] = w * (J11 * R12 + J21 * R22 + J31 * R32); + qd[qd_stride * 2] = w * (J12 * R12 + J22 * R22 + J32 * R32); + } + else if (c_comp == 3) // Vector coefficient + { + // c: 0 + // 1 + // 2 + qd[qd_stride * 0] = w * (c[c_stride * 0] * J11 * J11 + c[c_stride * 1] * J21 * J21 + + c[c_stride * 2] * J31 * J31); + qd[qd_stride * 1] = w * (c[c_stride * 0] * J11 * J12 + c[c_stride * 1] * J21 * J22 + + c[c_stride * 2] * J31 * J32); + qd[qd_stride * 2] = w * (c[c_stride * 0] * J12 * J12 + c[c_stride * 1] * J22 * J22 + + c[c_stride * 2] * J32 * J32); + } + else // Scalar coefficient + { + qd[qd_stride * 0] = w * c[c_stride * 0] * E; + qd[qd_stride * 1] = w * c[c_stride * 0] * F; + qd[qd_stride * 2] = w * c[c_stride * 0] * G; + } +} + +template +CEED_QFUNCTION_HELPER void MultJtCAdjJt22(const CeedScalar *J, const CeedInt J_stride, + const CeedScalar *c, const CeedInt c_stride, + const CeedInt c_comp, const CeedScalar qw, + const CeedInt qd_stride, CeedScalar *qd) +{ + // Compute qw / det(J) J^T C adj(J)^T and store the result. + // J: 0 2 adj(J): J22 -J12 qd: 0 2 + // 1 3 -J21 J11 1 3 + const CeedScalar J11 = J[J_stride * 0]; + const CeedScalar J21 = J[J_stride * 1]; + const CeedScalar J12 = J[J_stride * 2]; + const CeedScalar J22 = J[J_stride * 3]; + const CeedScalar w = qw / (J11 * J22 - J21 * J12); + if (c_comp == 3) // Matrix coefficient (symmetric) + { + // First compute entries of R = C adj(J)^T. 
+ // c: 0 1 + // 1 2 + const CeedScalar R11 = c[c_stride * 0] * J22 - c[c_stride * 1] * J12; + const CeedScalar R21 = c[c_stride * 1] * J22 - c[c_stride * 2] * J12; + const CeedScalar R12 = -c[c_stride * 0] * J21 + c[c_stride * 1] * J11; + const CeedScalar R22 = -c[c_stride * 1] * J21 + c[c_stride * 2] * J11; + qd[qd_stride * 0] = w * (J11 * R11 + J21 * R21); + qd[qd_stride * 1] = w * (J12 * R11 + J22 * R21); + qd[qd_stride * 2] = w * (J11 * R12 + J21 * R22); + qd[qd_stride * 3] = w * (J12 * R12 + J22 * R22); + } + else if (c_comp == 2) // Vector coefficient + { + // c: 0 + // 1 + qd[qd_stride * 0] = w * (c[c_stride * 0] * J11 * J22 - c[c_stride * 1] * J12 * J21); + qd[qd_stride * 1] = w * (c[c_stride * 0] * J12 * J22 - c[c_stride * 1] * J12 * J22); + qd[qd_stride * 2] = w * (-c[c_stride * 0] * J11 * J21 + c[c_stride * 1] * J11 * J21); + qd[qd_stride * 3] = w * (-c[c_stride * 0] * J12 * J21 + c[c_stride * 1] * J11 * J22); + } + else // Scalar coefficient + { + qd[qd_stride * 0] = qw * c[c_stride * 0]; + qd[qd_stride * 1] = 0.0; + qd[qd_stride * 2] = 0.0; + qd[qd_stride * 3] = qw * c[c_stride * 0]; + } + if (Transpose && c_comp > 1) + { + const CeedScalar qd21 = qd[qd_stride * 1]; + qd[qd_stride * 1] = qd[qd_stride * 2]; + qd[qd_stride * 2] = qd21; + } +} + +template +CEED_QFUNCTION_HELPER void MultJtCAdjJt21(const CeedScalar *J, const CeedInt J_stride, + const CeedScalar *c, const CeedInt c_stride, + const CeedInt c_comp, const CeedScalar qw, + const CeedInt qd_stride, CeedScalar *qd) +{ + // Compute qw / det(J) J^T C adj(J)^T and store the result. + // J: 0 adj(J): 1/sqrt(J^T J) J^T qd: 0 + // 1 + const CeedScalar J11 = J[J_stride * 0]; + const CeedScalar J21 = J[J_stride * 1]; + const CeedScalar w = qw / (J11 * J11 + J21 * J21); + if (c_comp == 3) // Matrix coefficient (symmetric) + { + // First compute entries of R = C adj(J)^T. + // c: 0 1 + // 1 2 + const CeedScalar R11 = c[c_stride * 0] * J11 + c[c_stride * 1] * J21; + const CeedScalar R21 = c[c_stride * 1] * J11 + c[c_stride * 2] * J21; + qd[qd_stride * 0] = w * (J11 * R11 + J21 * R21); + } + else if (c_comp == 2) // Vector coefficient + { + // c: 0 + // 1 + qd[qd_stride * 0] = w * (c[c_stride * 0] * J11 * J11 + c[c_stride * 1] * J21 * J21); + } + else // Scalar coefficient + { + qd[qd_stride * 0] = qw * c[c_stride * 0]; + } +} + +template +CEED_QFUNCTION_HELPER void MultJtCAdjJt33(const CeedScalar *J, const CeedInt J_stride, + const CeedScalar *c, const CeedInt c_stride, + const CeedInt c_comp, const CeedScalar qw, + const CeedInt qd_stride, CeedScalar *qd) +{ + // Compute qw / det(J) J^T C adj(J)^T and store the result. 
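Illustrative aside (not part of the patch): the scalar-coefficient branches of the J^T C adj(J)^T helpers (MultJtCAdjJt22 above, and the 33/32 variants below) simply write qw * c on the diagonal, because J^T adj(J)^T = (adj(J) J)^T = det(J) I cancels the 1/det(J) factor. A standalone check of that identity for a sample 2x2 Jacobian, in plain C++:

#include <cstdio>

int main()
{
  const double J11 = 2.0, J21 = 0.5, J12 = -1.0, J22 = 3.0;
  const double det = J11 * J22 - J21 * J12;
  // adj(J) = [J22 -J12; -J21 J11], hence adj(J)^T = [J22 -J21; -J12 J11].
  const double A11 = J22, A12 = -J21, A21 = -J12, A22 = J11;
  // P = J^T adj(J)^T should equal det(J) * I.
  const double P11 = J11 * A11 + J21 * A21;
  const double P12 = J11 * A12 + J21 * A22;
  const double P21 = J12 * A11 + J22 * A21;
  const double P22 = J12 * A12 + J22 * A22;
  std::printf("det(J) = %g\n", det);
  std::printf("J^T adj(J)^T = [%g %g; %g %g]\n", P11, P12, P21, P22);
  return 0;
}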
+ // J: 0 3 6 qd: 0 3 6 + // 1 4 7 1 4 7 + // 2 5 8 2 5 8 + const CeedScalar J11 = J[J_stride * 0]; + const CeedScalar J21 = J[J_stride * 1]; + const CeedScalar J31 = J[J_stride * 2]; + const CeedScalar J12 = J[J_stride * 3]; + const CeedScalar J22 = J[J_stride * 4]; + const CeedScalar J32 = J[J_stride * 5]; + const CeedScalar J13 = J[J_stride * 6]; + const CeedScalar J23 = J[J_stride * 7]; + const CeedScalar J33 = J[J_stride * 8]; + const CeedScalar A11 = J22 * J33 - J23 * J32; + const CeedScalar A21 = J23 * J31 - J21 * J33; + const CeedScalar A31 = J21 * J32 - J22 * J31; + const CeedScalar A12 = J13 * J32 - J12 * J33; + const CeedScalar A22 = J11 * J33 - J13 * J31; + const CeedScalar A32 = J12 * J31 - J11 * J32; + const CeedScalar A13 = J12 * J23 - J13 * J22; + const CeedScalar A23 = J13 * J21 - J11 * J23; + const CeedScalar A33 = J11 * J22 - J12 * J21; + const CeedScalar w = qw / (J11 * A11 + J21 * A12 + J31 * A13); + if (c_comp == 6) // Matrix coefficient (symmetric) + { + // First compute entries of R = C adj(J)^T. + // c: 0 1 2 + // 1 3 4 + // 2 4 5 + const CeedScalar R11 = + c[c_stride * 0] * A11 + c[c_stride * 1] * A12 + c[c_stride * 2] * A13; + const CeedScalar R21 = + c[c_stride * 1] * A11 + c[c_stride * 3] * A12 + c[c_stride * 4] * A13; + const CeedScalar R31 = + c[c_stride * 2] * A11 + c[c_stride * 4] * A12 + c[c_stride * 5] * A13; + const CeedScalar R12 = + c[c_stride * 0] * A21 + c[c_stride * 1] * A22 + c[c_stride * 2] * A23; + const CeedScalar R22 = + c[c_stride * 1] * A21 + c[c_stride * 3] * A22 + c[c_stride * 4] * A23; + const CeedScalar R32 = + c[c_stride * 2] * A21 + c[c_stride * 4] * A22 + c[c_stride * 5] * A23; + const CeedScalar R13 = + c[c_stride * 0] * A31 + c[c_stride * 1] * A32 + c[c_stride * 2] * A33; + const CeedScalar R23 = + c[c_stride * 1] * A31 + c[c_stride * 3] * A32 + c[c_stride * 4] * A33; + const CeedScalar R33 = + c[c_stride * 2] * A31 + c[c_stride * 4] * A32 + c[c_stride * 5] * A33; + qd[qd_stride * 0] = w * (J11 * R11 + J21 * R21 + J31 * R31); + qd[qd_stride * 1] = w * (J12 * R11 + J22 * R21 + J32 * R31); + qd[qd_stride * 2] = w * (J13 * R11 + J23 * R21 + J33 * R31); + qd[qd_stride * 3] = w * (J11 * R12 + J21 * R22 + J31 * R32); + qd[qd_stride * 4] = w * (J12 * R12 + J22 * R22 + J32 * R32); + qd[qd_stride * 5] = w * (J13 * R12 + J23 * R22 + J33 * R32); + qd[qd_stride * 6] = w * (J11 * R13 + J21 * R23 + J31 * R33); + qd[qd_stride * 7] = w * (J12 * R13 + J22 * R23 + J32 * R33); + qd[qd_stride * 8] = w * (J13 * R13 + J23 * R23 + J33 * R33); + } + else if (c_comp == 3) // Vector coefficient + { + // c: 0 + // 1 + // 2 + qd[qd_stride * 0] = w * (c[c_stride * 0] * A11 * J11 + c[c_stride * 1] * A12 * J21 + + c[c_stride * 2] * A13 * J31); + qd[qd_stride * 1] = w * (c[c_stride * 0] * A11 * J12 + c[c_stride * 1] * A12 * J22 + + c[c_stride * 2] * A13 * J32); + qd[qd_stride * 2] = w * (c[c_stride * 0] * A11 * J13 + c[c_stride * 1] * A12 * J23 + + c[c_stride * 2] * A13 * J33); + qd[qd_stride * 3] = w * (c[c_stride * 0] * A21 * J11 + c[c_stride * 1] * A22 * J21 + + c[c_stride * 2] * A23 * J31); + qd[qd_stride * 4] = w * (c[c_stride * 0] * A21 * J12 + c[c_stride * 1] * A22 * J22 + + c[c_stride * 2] * A23 * J32); + qd[qd_stride * 5] = w * (c[c_stride * 0] * A21 * J13 + c[c_stride * 1] * A22 * J23 + + c[c_stride * 2] * A23 * J33); + qd[qd_stride * 6] = w * (c[c_stride * 0] * A31 * J11 + c[c_stride * 1] * A32 * J21 + + c[c_stride * 2] * A33 * J31); + qd[qd_stride * 7] = w * (c[c_stride * 0] * A31 * J12 + c[c_stride * 1] * A32 * J22 + + c[c_stride * 2] * A33 * J32); + 
qd[qd_stride * 8] = w * (c[c_stride * 0] * A31 * J13 + c[c_stride * 1] * A32 * J23 + + c[c_stride * 2] * A33 * J33); + } + else // Scalar coefficient + { + qd[qd_stride * 0] = qw * c[c_stride * 0]; + qd[qd_stride * 1] = 0.0; + qd[qd_stride * 2] = 0.0; + qd[qd_stride * 3] = 0.0; + qd[qd_stride * 4] = qw * c[c_stride * 0]; + qd[qd_stride * 5] = 0.0; + qd[qd_stride * 6] = 0.0; + qd[qd_stride * 7] = 0.0; + qd[qd_stride * 8] = qw * c[c_stride * 0]; + } + if (Transpose && c_comp > 1) + { + { + const CeedScalar qd21 = qd[qd_stride * 1]; + qd[qd_stride * 1] = qd[qd_stride * 3]; + qd[qd_stride * 3] = qd21; + } + { + const CeedScalar qd31 = qd[qd_stride * 2]; + qd[qd_stride * 2] = qd[qd_stride * 6]; + qd[qd_stride * 6] = qd31; + } + { + const CeedScalar qd32 = qd[qd_stride * 5]; + qd[qd_stride * 5] = qd[qd_stride * 7]; + qd[qd_stride * 7] = qd32; + } + } +} + +template +CEED_QFUNCTION_HELPER void MultJtCAdjJt32(const CeedScalar *J, const CeedInt J_stride, + const CeedScalar *c, const CeedInt c_stride, + const CeedInt c_comp, const CeedScalar qw, + const CeedInt qd_stride, CeedScalar *qd) +{ + // Compute qw / det(J) J^T C adj(J)^T and store the result. + // J: 0 3 qd: 0 2 + // 1 4 1 3 + // 2 5 + const CeedScalar J11 = J[J_stride * 0]; + const CeedScalar J21 = J[J_stride * 1]; + const CeedScalar J31 = J[J_stride * 2]; + const CeedScalar J12 = J[J_stride * 3]; + const CeedScalar J22 = J[J_stride * 4]; + const CeedScalar J32 = J[J_stride * 5]; + const CeedScalar E = J11 * J11 + J21 * J21 + J31 * J31; + const CeedScalar G = J12 * J12 + J22 * J22 + J32 * J32; + const CeedScalar F = J11 * J12 + J21 * J22 + J31 * J32; + const CeedScalar w = qw / (E * G - F * F); + if (c_comp == 6) // Matrix coefficient (symmetric) + { + // First compute entries of R = C adj(J)^T. + // c: 0 1 2 + // 1 3 4 + // 2 4 5 + const CeedScalar R11 = + G * (c[c_stride * 0] * J11 + c[c_stride * 1] * J21 + c[c_stride * 2] * J31) - + F * (c[c_stride * 0] * J12 + c[c_stride * 1] * J22 + c[c_stride * 2] * J32); + const CeedScalar R21 = + G * (c[c_stride * 1] * J11 + c[c_stride * 3] * J21 + c[c_stride * 4] * J31) - + F * (c[c_stride * 1] * J12 + c[c_stride * 3] * J22 + c[c_stride * 4] * J32); + const CeedScalar R31 = + G * (c[c_stride * 2] * J11 + c[c_stride * 4] * J21 + c[c_stride * 5] * J31) - + F * (c[c_stride * 2] * J12 + c[c_stride * 4] * J22 + c[c_stride * 5] * J32); + const CeedScalar R12 = + E * (c[c_stride * 0] * J12 + c[c_stride * 1] * J22 + c[c_stride * 2] * J32) - + F * (c[c_stride * 0] * J11 + c[c_stride * 1] * J21 + c[c_stride * 2] * J31); + const CeedScalar R22 = + E * (c[c_stride * 1] * J12 + c[c_stride * 3] * J22 + c[c_stride * 4] * J32) - + F * (c[c_stride * 1] * J11 + c[c_stride * 3] * J21 + c[c_stride * 4] * J31); + const CeedScalar R32 = + E * (c[c_stride * 2] * J12 + c[c_stride * 4] * J22 + c[c_stride * 5] * J32) - + F * (c[c_stride * 2] * J11 + c[c_stride * 4] * J21 + c[c_stride * 5] * J31); + qd[qd_stride * 0] = w * (J11 * R11 + J21 * R21 + J31 * R31); + qd[qd_stride * 1] = w * (J12 * R11 + J22 * R21 + J32 * R31); + qd[qd_stride * 2] = w * (J11 * R12 + J21 * R22 + J31 * R32); + qd[qd_stride * 3] = w * (J12 * R12 + J22 * R22 + J32 * R32); + } + else if (c_comp == 3) // Vector coefficient + { + // First compute entries of R = C adj(J)^T. 
+ // c: 0 + // 1 + // 2 + const CeedScalar R11 = c[c_stride * 0] * (G * J11 - F * J12); + const CeedScalar R21 = c[c_stride * 1] * (G * J21 - F * J22); + const CeedScalar R31 = c[c_stride * 2] * (G * J31 - F * J32); + const CeedScalar R12 = c[c_stride * 0] * (E * J12 - F * J11); + const CeedScalar R22 = c[c_stride * 1] * (E * J22 - F * J21); + const CeedScalar R32 = c[c_stride * 2] * (E * J32 - F * J31); + qd[qd_stride * 0] = w * (J11 * R11 + J21 * R21 + J31 * R31); + qd[qd_stride * 1] = w * (J12 * R11 + J22 * R21 + J32 * R31); + qd[qd_stride * 2] = w * (J11 * R12 + J21 * R22 + J31 * R32); + qd[qd_stride * 3] = w * (J12 * R12 + J22 * R22 + J32 * R32); + } + else // Scalar coefficient + { + qd[qd_stride * 0] = qw * c[c_stride * 0]; + qd[qd_stride * 1] = 0.0; + qd[qd_stride * 2] = 0.0; + qd[qd_stride * 3] = qw * c[c_stride * 0]; + } + if (Transpose && c_comp > 1) + { + const CeedScalar qd21 = qd[qd_stride * 1]; + qd[qd_stride * 1] = qd[qd_stride * 2]; + qd[qd_stride * 2] = qd21; + } +} + +CEED_QFUNCTION_HELPER void MultCAdjJt22(const CeedScalar *J, const CeedInt J_stride, + const CeedScalar *c, const CeedInt c_stride, + const CeedInt c_comp, const CeedScalar qw, + const CeedInt qd_stride, CeedScalar *qd) +{ + // Compute qw C adj(J)^T and store the result. + // J: 0 2 adj(J): J22 -J12 qd: 0 2 + // 1 3 -J21 J11 1 3 + const CeedScalar J11 = J[J_stride * 0]; + const CeedScalar J21 = J[J_stride * 1]; + const CeedScalar J12 = J[J_stride * 2]; + const CeedScalar J22 = J[J_stride * 3]; + if (c_comp == 3) // Matrix coefficient (symmetric) + { + // c: 0 1 + // 1 2 + qd[qd_stride * 0] = qw * (c[c_stride * 0] * J22 - c[c_stride * 1] * J12); + qd[qd_stride * 1] = qw * (c[c_stride * 1] * J22 - c[c_stride * 2] * J12); + qd[qd_stride * 2] = qw * (-c[c_stride * 0] * J21 + c[c_stride * 1] * J11); + qd[qd_stride * 3] = qw * (-c[c_stride * 1] * J21 + c[c_stride * 2] * J11); + } + else if (c_comp == 2) // Vector coefficient + { + // c: 0 + // 1 + const CeedScalar wc0 = qw * c[c_stride * 0]; + const CeedScalar wc1 = qw * c[c_stride * 1]; + qd[qd_stride * 0] = wc0 * J22; + qd[qd_stride * 1] = -wc1 * J12; + qd[qd_stride * 2] = -wc0 * J21; + qd[qd_stride * 3] = wc1 * J11; + } + else // Scalar coefficient + { + const CeedScalar wc = qw * c[c_stride * 0]; + qd[qd_stride * 0] = wc * J22; + qd[qd_stride * 1] = -wc * J12; + qd[qd_stride * 2] = -wc * J21; + qd[qd_stride * 3] = wc * J11; + } +} + +CEED_QFUNCTION_HELPER void MultCAdjJt21(const CeedScalar *J, const CeedInt J_stride, + const CeedScalar *c, const CeedInt c_stride, + const CeedInt c_comp, const CeedScalar qw, + const CeedInt qd_stride, CeedScalar *qd) +{ + // Compute qw C adj(J)^T and store the result. 
+ // J: 0 adj(J): 1/sqrt(J^T J) J^T qd: 0 + // 1 1 + const CeedScalar J11 = J[J_stride * 0]; + const CeedScalar J21 = J[J_stride * 1]; + const CeedScalar w = qw / sqrt(J11 * J11 + J21 * J21); + if (c_comp == 3) // Matrix coefficient (symmetric) + { + // c: 0 1 + // 1 2 + qd[qd_stride * 0] = w * (c[c_stride * 0] * J11 + c[c_stride * 1] * J21); + qd[qd_stride * 1] = w * (c[c_stride * 1] * J11 + c[c_stride * 2] * J21); + } + else if (c_comp == 2) // Vector coefficient + { + // c: 0 + // 1 + qd[qd_stride * 0] = w * c[c_stride * 0] * J11; + qd[qd_stride * 1] = w * c[c_stride * 1] * J21; + } + else // Scalar coefficient + { + const CeedScalar wc = w * c[c_stride * 0]; + qd[qd_stride * 0] = wc * J11; + qd[qd_stride * 1] = wc * J21; + } +} + +CEED_QFUNCTION_HELPER void MultCAdjJt33(const CeedScalar *J, const CeedInt J_stride, + const CeedScalar *c, const CeedInt c_stride, + const CeedInt c_comp, const CeedScalar qw, + const CeedInt qd_stride, CeedScalar *qd) +{ + // Compute qw C adj(J)^T and store the result. + // J: 0 3 6 qd: 0 3 6 + // 1 4 7 1 4 7 + // 2 5 8 2 5 8 + const CeedScalar J11 = J[J_stride * 0]; + const CeedScalar J21 = J[J_stride * 1]; + const CeedScalar J31 = J[J_stride * 2]; + const CeedScalar J12 = J[J_stride * 3]; + const CeedScalar J22 = J[J_stride * 4]; + const CeedScalar J32 = J[J_stride * 5]; + const CeedScalar J13 = J[J_stride * 6]; + const CeedScalar J23 = J[J_stride * 7]; + const CeedScalar J33 = J[J_stride * 8]; + const CeedScalar A11 = J22 * J33 - J23 * J32; + const CeedScalar A21 = J23 * J31 - J21 * J33; + const CeedScalar A31 = J21 * J32 - J22 * J31; + const CeedScalar A12 = J13 * J32 - J12 * J33; + const CeedScalar A22 = J11 * J33 - J13 * J31; + const CeedScalar A32 = J12 * J31 - J11 * J32; + const CeedScalar A13 = J12 * J23 - J13 * J22; + const CeedScalar A23 = J13 * J21 - J11 * J23; + const CeedScalar A33 = J11 * J22 - J12 * J21; + if (c_comp == 6) // Matrix coefficient (symmetric) + { + // c: 0 1 2 + // 1 3 4 + // 2 4 5 + qd[qd_stride * 0] = + qw * (c[c_stride * 0] * A11 + c[c_stride * 1] * A12 + c[c_stride * 2] * A13); + qd[qd_stride * 1] = + qw * (c[c_stride * 1] * A11 + c[c_stride * 3] * A12 + c[c_stride * 4] * A13); + qd[qd_stride * 2] = + qw * (c[c_stride * 2] * A11 + c[c_stride * 4] * A12 + c[c_stride * 5] * A13); + qd[qd_stride * 3] = + qw * (c[c_stride * 0] * A21 + c[c_stride * 1] * A22 + c[c_stride * 2] * A23); + qd[qd_stride * 4] = + qw * (c[c_stride * 1] * A21 + c[c_stride * 3] * A22 + c[c_stride * 4] * A23); + qd[qd_stride * 5] = + qw * (c[c_stride * 2] * A21 + c[c_stride * 4] * A22 + c[c_stride * 5] * A23); + qd[qd_stride * 6] = + qw * (c[c_stride * 0] * A31 + c[c_stride * 1] * A32 + c[c_stride * 2] * A33); + qd[qd_stride * 7] = + qw * (c[c_stride * 1] * A31 + c[c_stride * 3] * A32 + c[c_stride * 4] * A33); + qd[qd_stride * 8] = + qw * (c[c_stride * 2] * A31 + c[c_stride * 4] * A32 + c[c_stride * 5] * A33); + } + else if (c_comp == 3) // Vector coefficient + { + // c: 0 + // 1 + // 2 + const CeedScalar wc0 = qw * c[c_stride * 0]; + const CeedScalar wc1 = qw * c[c_stride * 1]; + const CeedScalar wc2 = qw * c[c_stride * 2]; + qd[qd_stride * 0] = wc0 * A11; + qd[qd_stride * 1] = wc1 * A12; + qd[qd_stride * 2] = wc2 * A13; + qd[qd_stride * 3] = wc0 * A21; + qd[qd_stride * 4] = wc1 * A22; + qd[qd_stride * 5] = wc2 * A23; + qd[qd_stride * 6] = wc0 * A31; + qd[qd_stride * 7] = wc1 * A32; + qd[qd_stride * 8] = wc2 * A33; + } + else // Scalar coefficient + { + const CeedScalar wc = qw * c[c_stride * 0]; + qd[qd_stride * 0] = wc * A11; + qd[qd_stride * 1] = wc * 
A12; + qd[qd_stride * 2] = wc * A13; + qd[qd_stride * 3] = wc * A21; + qd[qd_stride * 4] = wc * A22; + qd[qd_stride * 5] = wc * A23; + qd[qd_stride * 6] = wc * A31; + qd[qd_stride * 7] = wc * A32; + qd[qd_stride * 8] = wc * A33; + } +} + +CEED_QFUNCTION_HELPER void MultCAdjJt32(const CeedScalar *J, const CeedInt J_stride, + const CeedScalar *c, const CeedInt c_stride, + const CeedInt c_comp, const CeedScalar qw, + const CeedInt qd_stride, CeedScalar *qd) +{ + // Compute qw C adj(J)^T and store the result. + // J: 0 3 qd: 0 3 + // 1 4 1 4 + // 2 5 2 5 + const CeedScalar J11 = J[J_stride * 0]; + const CeedScalar J21 = J[J_stride * 1]; + const CeedScalar J31 = J[J_stride * 2]; + const CeedScalar J12 = J[J_stride * 3]; + const CeedScalar J22 = J[J_stride * 4]; + const CeedScalar J32 = J[J_stride * 5]; + const CeedScalar E = J11 * J11 + J21 * J21 + J31 * J31; + const CeedScalar G = J12 * J12 + J22 * J22 + J32 * J32; + const CeedScalar F = J11 * J12 + J21 * J22 + J31 * J32; + const CeedScalar w = qw / sqrt(E * G - F * F); + if (c_comp == 6) // Matrix coefficient (symmetric) + { + // c: 0 1 2 + // 1 3 4 + // 2 4 5 + qd[qd_stride * 0] = + w * (G * (c[c_stride * 0] * J11 + c[c_stride * 1] * J21 + c[c_stride * 2] * J31) - + F * (c[c_stride * 0] * J12 + c[c_stride * 1] * J22 + c[c_stride * 2] * J32)); + qd[qd_stride * 1] = + w * (G * (c[c_stride * 1] * J11 + c[c_stride * 3] * J21 + c[c_stride * 4] * J31) - + F * (c[c_stride * 1] * J12 + c[c_stride * 3] * J22 + c[c_stride * 4] * J32)); + qd[qd_stride * 2] = + w * (G * (c[c_stride * 2] * J11 + c[c_stride * 4] * J21 + c[c_stride * 5] * J31) - + F * (c[c_stride * 2] * J12 + c[c_stride * 4] * J22 + c[c_stride * 5] * J32)); + qd[qd_stride * 3] = + w * (E * (c[c_stride * 0] * J12 + c[c_stride * 1] * J22 + c[c_stride * 2] * J32) - + F * (c[c_stride * 0] * J11 + c[c_stride * 1] * J21 + c[c_stride * 2] * J31)); + qd[qd_stride * 4] = + w * (E * (c[c_stride * 1] * J12 + c[c_stride * 3] * J22 + c[c_stride * 4] * J32) - + F * (c[c_stride * 1] * J11 + c[c_stride * 3] * J21 + c[c_stride * 4] * J31)); + qd[qd_stride * 5] = + w * (E * (c[c_stride * 2] * J12 + c[c_stride * 4] * J22 + c[c_stride * 5] * J32) - + F * (c[c_stride * 2] * J11 + c[c_stride * 4] * J21 + c[c_stride * 5] * J31)); + } + else if (c_comp == 3) // Vector coefficient + { + // c: 0 + // 1 + // 2 + const CeedScalar wc0 = w * c[c_stride * 0]; + const CeedScalar wc1 = w * c[c_stride * 1]; + const CeedScalar wc2 = w * c[c_stride * 2]; + qd[qd_stride * 0] = wc0 * (G * J11 - F * J12); + qd[qd_stride * 1] = wc1 * (G * J21 - F * J22); + qd[qd_stride * 2] = wc2 * (G * J31 - F * J32); + qd[qd_stride * 3] = wc0 * (E * J12 - F * J11); + qd[qd_stride * 4] = wc1 * (E * J22 - F * J21); + qd[qd_stride * 5] = wc2 * (E * J32 - F * J31); + } + else // Scalar coefficient + { + const CeedScalar wc = w * c[c_stride * 0]; + qd[qd_stride * 0] = wc * (G * J11 - F * J12); + qd[qd_stride * 1] = wc * (G * J21 - F * J22); + qd[qd_stride * 2] = wc * (G * J31 - F * J32); + qd[qd_stride * 3] = wc * (E * J12 - F * J11); + qd[qd_stride * 4] = wc * (E * J22 - F * J21); + qd[qd_stride * 5] = wc * (E * J32 - F * J31); + } +} + +#endif // PALACE_LIBCEED_UTILS_QF_H diff --git a/palace/fem/qfunctions/vecfemass_qf.h b/palace/fem/qfunctions/vecfemass_qf.h index b99ba9c887..c7a672d32e 100644 --- a/palace/fem/qfunctions/vecfemass_qf.h +++ b/palace/fem/qfunctions/vecfemass_qf.h @@ -1,82 +1,82 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LIBCEED_VECFEMASS_QF_H -#define PALACE_LIBCEED_VECFEMASS_QF_H - -struct VectorFEMassContext -{ - CeedInt dim, space_dim; - bool sym; - CeedScalar coeff; -}; - -// libCEED QFunction for applying a symmetric or nonsymmetric vector FE mass operator. -CEED_QFUNCTION(f_apply_vecfemass)(void *ctx, CeedInt Q, const CeedScalar *const *in, - CeedScalar *const *out) -{ - // in[0], out[0] have shape [dim, ncomp=1, Q] - VectorFEMassContext *bc = (VectorFEMassContext *)ctx; - const CeedScalar *u = in[0], *qd = in[1]; - CeedScalar *v = out[0]; - switch (bc->dim) - { - case 1: - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - v[i] = qd[i] * u[i]; - } - break; - case 2: - if (bc->sym) - { - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - const CeedScalar u0 = u[i + Q * 0]; - const CeedScalar u1 = u[i + Q * 1]; - v[i + Q * 0] = qd[i + Q * 0] * u0 + qd[i + Q * 1] * u1; - v[i + Q * 1] = qd[i + Q * 1] * u0 + qd[i + Q * 2] * u1; - } - } - else - { - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - const CeedScalar u0 = u[i + Q * 0]; - const CeedScalar u1 = u[i + Q * 1]; - v[i + Q * 0] = qd[i + Q * 0] * u0 + qd[i + Q * 2] * u1; - v[i + Q * 1] = qd[i + Q * 1] * u0 + qd[i + Q * 3] * u1; - } - } - break; - case 3: - if (bc->sym) - { - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - const CeedScalar u0 = u[i + Q * 0]; - const CeedScalar u1 = u[i + Q * 1]; - const CeedScalar u2 = u[i + Q * 2]; - v[i + Q * 0] = qd[i + Q * 0] * u0 + qd[i + Q * 1] * u1 + qd[i + Q * 2] * u2; - v[i + Q * 1] = qd[i + Q * 1] * u0 + qd[i + Q * 3] * u1 + qd[i + Q * 4] * u2; - v[i + Q * 2] = qd[i + Q * 2] * u0 + qd[i + Q * 4] * u1 + qd[i + Q * 5] * u2; - } - } - else - { - CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) - { - const CeedScalar u0 = u[i + Q * 0]; - const CeedScalar u1 = u[i + Q * 1]; - const CeedScalar u2 = u[i + Q * 2]; - v[i + Q * 0] = qd[i + Q * 0] * u0 + qd[i + Q * 3] * u1 + qd[i + Q * 6] * u2; - v[i + Q * 1] = qd[i + Q * 1] * u0 + qd[i + Q * 4] * u1 + qd[i + Q * 7] * u2; - v[i + Q * 2] = qd[i + Q * 2] * u0 + qd[i + Q * 5] * u1 + qd[i + Q * 8] * u2; - } - } - break; - } - return 0; -} - -#endif // PALACE_LIBCEED_VECFEMASS_QF_H +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LIBCEED_VECFEMASS_QF_H +#define PALACE_LIBCEED_VECFEMASS_QF_H + +struct VectorFEMassContext +{ + CeedInt dim, space_dim; + bool sym; + CeedScalar coeff; +}; + +// libCEED QFunction for applying a symmetric or nonsymmetric vector FE mass operator. 
+CEED_QFUNCTION(f_apply_vecfemass)(void *ctx, CeedInt Q, const CeedScalar *const *in, + CeedScalar *const *out) +{ + // in[0], out[0] have shape [dim, ncomp=1, Q] + VectorFEMassContext *bc = (VectorFEMassContext *)ctx; + const CeedScalar *u = in[0], *qd = in[1]; + CeedScalar *v = out[0]; + switch (bc->dim) + { + case 1: + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + v[i] = qd[i] * u[i]; + } + break; + case 2: + if (bc->sym) + { + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar u0 = u[i + Q * 0]; + const CeedScalar u1 = u[i + Q * 1]; + v[i + Q * 0] = qd[i + Q * 0] * u0 + qd[i + Q * 1] * u1; + v[i + Q * 1] = qd[i + Q * 1] * u0 + qd[i + Q * 2] * u1; + } + } + else + { + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar u0 = u[i + Q * 0]; + const CeedScalar u1 = u[i + Q * 1]; + v[i + Q * 0] = qd[i + Q * 0] * u0 + qd[i + Q * 2] * u1; + v[i + Q * 1] = qd[i + Q * 1] * u0 + qd[i + Q * 3] * u1; + } + } + break; + case 3: + if (bc->sym) + { + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar u0 = u[i + Q * 0]; + const CeedScalar u1 = u[i + Q * 1]; + const CeedScalar u2 = u[i + Q * 2]; + v[i + Q * 0] = qd[i + Q * 0] * u0 + qd[i + Q * 1] * u1 + qd[i + Q * 2] * u2; + v[i + Q * 1] = qd[i + Q * 1] * u0 + qd[i + Q * 3] * u1 + qd[i + Q * 4] * u2; + v[i + Q * 2] = qd[i + Q * 2] * u0 + qd[i + Q * 4] * u1 + qd[i + Q * 5] * u2; + } + } + else + { + CeedPragmaSIMD for (CeedInt i = 0; i < Q; i++) + { + const CeedScalar u0 = u[i + Q * 0]; + const CeedScalar u1 = u[i + Q * 1]; + const CeedScalar u2 = u[i + Q * 2]; + v[i + Q * 0] = qd[i + Q * 0] * u0 + qd[i + Q * 3] * u1 + qd[i + Q * 6] * u2; + v[i + Q * 1] = qd[i + Q * 1] * u0 + qd[i + Q * 4] * u1 + qd[i + Q * 7] * u2; + v[i + Q * 2] = qd[i + Q * 2] * u0 + qd[i + Q * 5] * u1 + qd[i + Q * 8] * u2; + } + } + break; + } + return 0; +} + +#endif // PALACE_LIBCEED_VECFEMASS_QF_H diff --git a/palace/libpalace.props b/palace/libpalace.props index f244aa0cae..ba1661107e 100644 --- a/palace/libpalace.props +++ b/palace/libpalace.props @@ -3,8 +3,10 @@ - $(MSMPI_INC);.\;$(MSMPI_INC)\x64;$(WELSIM_LIBPACK)\include\palace\mfem;$(WELSIM_LIBPACK)\include\palace\arpack;$(WELSIM_LIBPACK)\include\palace\libCEED;$(WELSIM_LIBPACK)\include\palace;$(WELSIM_LIBPACK)\include\zlib;$(WELSIM_LIBPACK)\include\hypre;$(WELSIM_LIBPACK)\include\petsc;$(WELSIM_LIBPACK)\include\SLEPc;$(WELSIM_LIBPACK)\include\fmt;$(WELSIM_LIBPACK)\include;$(INTEL_MKL)\include;$(WELSIM_LIBPACK)\include\eigen3;$(WELSIM_LIBPACK)\include\mumps;$(CUDA_PATH)\include;$(ExternalIncludePath) + $(MSMPI_INC);.\;$(MSMPI_INC)\x64;$(WELSIM_LIBPACK)\include\palace\mfem;$(WELSIM_LIBPACK)\include\zlib;$(WELSIM_LIBPACK)\include\hypre;$(WELSIM_LIBPACK)\include\petsc;$(WELSIM_LIBPACK)\include\SLEPc;$(WELSIM_LIBPACK)\include\fmt;$(WELSIM_LIBPACK)\include\palace;$(WELSIM_LIBPACK)\include\palace\libCEED;$(WELSIM_LIBPACK)\include;$(INTEL_MKL)\include;$(WELSIM_LIBPACK)\include\eigen3;$(WELSIM_LIBPACK)\include\mumps;$(WELSIM_LIBPACK)\include\palace\arpack;$(ExternalIncludePath) $(Platform)\$(Configuration)\ + $(CUDA_PATH)/include;$(IncludePath) + $(Platform)\$(Configuration)\ @@ -13,6 +15,7 @@ true _UNICODE;UNICODE;CEED_SKIP_VISIBILITY;PALACE_WITH_ARPACK;PALACE_WITH_MUMPS;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions) + /utf-8 %(AdditionalOptions) diff --git a/palace/libpalace.vcxproj b/palace/libpalace.vcxproj index c7ac12ec1e..09dbcfa952 100644 --- a/palace/libpalace.vcxproj +++ b/palace/libpalace.vcxproj @@ -21,14 +21,18 @@ + + + - + + - + @@ -41,21 +45,24 @@ - + 
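Illustrative aside (not part of the patch): at each quadrature point i, the symmetric 3D branch of f_apply_vecfemass above applies a packed symmetric 3x3 matrix to u, with one array of length Q per packed entry and the layout [0 1 2; 1 3 4; 2 4 5]; the non-symmetric branch instead reads nine entries in column-major order [0 3 6; 1 4 7; 2 5 8]. A standalone sketch of the symmetric apply at a single point (plain doubles instead of CeedScalar, arbitrary sample data):

#include <cstdio>

int main()
{
  const double qd[6] = {2.0, 0.1, 0.0, 3.0, 0.2, 4.0};  // packed symmetric matrix
  const double u[3] = {1.0, -1.0, 0.5};
  double v[3];
  v[0] = qd[0] * u[0] + qd[1] * u[1] + qd[2] * u[2];
  v[1] = qd[1] * u[0] + qd[3] * u[1] + qd[4] * u[2];
  v[2] = qd[2] * u[0] + qd[4] * u[1] + qd[5] * u[2];
  std::printf("v = [%g %g %g]\n", v[0], v[1], v[2]);
  return 0;
}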
+ + + @@ -71,9 +78,12 @@ + + + @@ -83,6 +93,7 @@ + @@ -92,6 +103,7 @@ + @@ -106,8 +118,12 @@ + + + + @@ -123,20 +139,24 @@ - + + + + + $(IntDir)/linalg/operator.cpp.obj @@ -152,9 +172,12 @@ + + + @@ -162,10 +185,14 @@ + + + + 17.0 @@ -204,6 +231,7 @@ $(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ $(Platform)\$(Configuration)\ diff --git a/palace/libpalace.vcxproj.filters b/palace/libpalace.vcxproj.filters index ad1b8b1209..73c33ab118 100644 --- a/palace/libpalace.vcxproj.filters +++ b/palace/libpalace.vcxproj.filters @@ -52,15 +52,9 @@ Source Files - - Source Files - Source Files - - Source Files - Source Files @@ -94,9 +88,6 @@ Source Files - - Source Files - Source Files @@ -256,6 +247,43 @@ Source Files + + Source Files + + + Source Files + + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + + + Source Files + + + Source Files + + + + Source Files + + + Source Files + @@ -288,9 +316,6 @@ Source Files - - Source Files - Source Files @@ -333,9 +358,6 @@ Source Files - - Source Files - Source Files @@ -462,5 +484,56 @@ Source Files + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + + + Source Files + \ No newline at end of file diff --git a/palace/libpalace_d.props b/palace/libpalace_d.props index 3e17e504b3..c8047f394e 100644 --- a/palace/libpalace_d.props +++ b/palace/libpalace_d.props @@ -3,8 +3,10 @@ - $(MSMPI_INC);.\;$(MSMPI_INC)\x64;$(WELSIM_LIBPACK)\include\palace\mfem;$(WELSIM_LIBPACK)\include\zlib;$(WELSIM_LIBPACK)\include\hypre;$(WELSIM_LIBPACK)\include\petsc;$(WELSIM_LIBPACK)\include\SLEPc;$(WELSIM_LIBPACK)\include\fmt;$(WELSIM_LIBPACK)\include\palace;$(WELSIM_LIBPACK)\include\palace\libCEED;$(WELSIM_LIBPACK)\include;$(INTEL_MKL)\include;$(WELSIM_LIBPACK)\include\eigen3;$(WELSIM_LIBPACK)\include\mumps;$(WELSIM_LIBPACK)\include\palace\arpack;$(CUDA_PATH)\include;$(ExternalIncludePath) + $(MSMPI_INC);.\;$(MSMPI_INC)\x64;$(WELSIM_LIBPACK)\include\zlib;$(WELSIM_LIBPACK)\include\hypre;$(WELSIM_LIBPACK)\include\petsc;$(WELSIM_LIBPACK)\include\SLEPc;$(WELSIM_LIBPACK)\include\fmt;$(WELSIM_LIBPACK)\include\palace;$(WELSIM_LIBPACK)\include\palace\mfem;$(WELSIM_LIBPACK)\include\palace\libCEED;$(WELSIM_LIBPACK)\include\palace\arpack;$(WELSIM_LIBPACK)\include;$(INTEL_MKL)\include;$(WELSIM_LIBPACK)\include\eigen3;$(WELSIM_LIBPACK)\include\mumps;$(ExternalIncludePath) $(Platform)\$(Configuration)\ + $(CUDA_PATH)/include;$(IncludePath) + $(Platform)\$(Configuration)\ @@ -12,7 +14,8 @@ true - _UNICODE;UNICODE;CEED_SKIP_VISIBILITY;PALACE_WITH_ARPACK;_CRT_SECURE_NO_WARNINGS;_SILENCE_STDEXT_ARR_ITERS_DEPRECATION_WARNING;%(PreprocessorDefinitions) + _UNICODE;UNICODE;CEED_SKIP_VISIBILITY;PALACE_WITH_ARPACK;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions) + /utf-8 %(AdditionalOptions) diff --git a/palace/linalg/CMakeLists.txt b/palace/linalg/CMakeLists.txt index 5e0bbf052b..ba4aac9530 100644 --- a/palace/linalg/CMakeLists.txt +++ b/palace/linalg/CMakeLists.txt @@ -1,30 +1,45 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -# -# Add source files and subdirectories. 
-# - -target_sources(${LIB_TARGET_NAME} - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/amg.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/ams.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/arpack.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/chebyshev.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/distrelaxation.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/divfree.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/errorestimator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/gmg.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/hcurl.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/jacobi.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/ksp.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/iterative.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/mumps.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/operator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/rap.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/slepc.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/solver.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/strumpack.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/superlu.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/vector.cpp -) +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + +# +# Add source files and subdirectories. +# + +target_sources(${LIB_TARGET_NAME} + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/amg.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/ams.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/arpack.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/chebyshev.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/densematrix.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/distrelaxation.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/divfree.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/errorestimator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/floquetcorrection.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/gmg.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/hcurl.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/hypre.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/jacobi.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/ksp.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/iterative.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/mumps.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/nleps.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/operator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/rap.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/slepc.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/solver.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/strumpack.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/superlu.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/vector.cpp +) + +# Handle device source code +set(TARGET_SOURCES_DEVICE + ${TARGET_SOURCES_DEVICE} + ${CMAKE_CURRENT_SOURCE_DIR}/chebyshev.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/densematrix.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/jacobi.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/operator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/vector.cpp + PARENT_SCOPE +) diff --git a/palace/linalg/amg.cpp b/palace/linalg/amg.cpp index f34ef25b78..0f6aa043c3 100644 --- a/palace/linalg/amg.cpp +++ b/palace/linalg/amg.cpp @@ -1,43 +1,36 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "amg.hpp" - -#include "linalg/rap.hpp" - -namespace palace -{ - -BoomerAmgSolver::BoomerAmgSolver(int cycle_it, int smooth_it, int print) - : mfem::HypreBoomerAMG() -{ - SetPrintLevel((print > 1) ? print - 1 : 0); - SetMaxIter(cycle_it); - SetTol(0.0); - - // Set additional BoomerAMG options. - double theta = 0.5; // AMG strength parameter = 0.25 is 2D optimal (0.5-0.8 for 3D) - int agg_levels = 1; // Number of aggressive coarsening levels - - SetStrengthThresh(theta); - SetAggressiveCoarsening(agg_levels); - HYPRE_BoomerAMGSetNumSweeps(*this, smooth_it); - - // int coarse_relax_type = 8; // l1-symm. 
GS (inexact coarse solve) - // HYPRE_BoomerAMGSetCycleRelaxType(*this, coarse_relax_type, 3); -} - -void BoomerAmgSolver::SetOperator(const Operator &op) -{ - const auto *PtAP = dynamic_cast(&op); - if (PtAP) - { - mfem::HypreBoomerAMG::SetOperator(PtAP->ParallelAssemble()); - } - else - { - mfem::HypreBoomerAMG::SetOperator(op); - } -} - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "amg.hpp" + +namespace palace +{ + +BoomerAmgSolver::BoomerAmgSolver(int cycle_it, int smooth_it, bool agg_coarsen, int print) + : mfem::HypreBoomerAMG() +{ + HYPRE_BoomerAMGSetPrintLevel(*this, (print > 1) ? print - 1 : 0); + HYPRE_BoomerAMGSetMaxIter(*this, cycle_it); + HYPRE_BoomerAMGSetTol(*this, 0.0); + + // Set additional BoomerAMG options. + int agg_levels = agg_coarsen ? 1 : 0; // Number of aggressive coarsening levels + double theta = 0.5; // AMG strength parameter = 0.25 is 2D optimal (0.5-0.8 for 3D) + int relax_type = 8; // 8 = l1-symm. GS, 13 = l1-GS, 18 = l1-Jacobi, 16 = Chebyshev + if (mfem::Device::Allows(mfem::Backend::DEVICE_MASK)) + { + // Modify options for GPU-supported features. + agg_levels = 0; + relax_type = 18; + } + + HYPRE_BoomerAMGSetAggNumLevels(*this, agg_levels); + HYPRE_BoomerAMGSetStrongThreshold(*this, theta); + HYPRE_BoomerAMGSetRelaxType(*this, relax_type); + HYPRE_BoomerAMGSetNumSweeps(*this, smooth_it); + + // int coarse_relax_type = 8; // l1-symm. GS (inexact coarse solve) + // HYPRE_BoomerAMGSetCycleRelaxType(*this, coarse_relax_type, 3); +} + +} // namespace palace diff --git a/palace/linalg/amg.hpp b/palace/linalg/amg.hpp index 1806c77027..f1e06b387f 100644 --- a/palace/linalg/amg.hpp +++ b/palace/linalg/amg.hpp @@ -1,32 +1,31 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LINALG_AMG_HPP -#define PALACE_LINALG_AMG_HPP - -#include -#include "linalg/operator.hpp" -#include "utils/iodata.hpp" - -namespace palace -{ - -// -// A wrapper for Hypre's BoomerAMG solver. -// -class BoomerAmgSolver : public mfem::HypreBoomerAMG -{ -public: - BoomerAmgSolver(int cycle_it = 1, int smooth_it = 1, int print = 0); - BoomerAmgSolver(const IoData &iodata, bool coarse_solver, int print) - : BoomerAmgSolver(coarse_solver ? 1 : iodata.solver.linear.mg_cycle_it, - iodata.solver.linear.mg_smooth_it, print) - { - } - - void SetOperator(const Operator &op) override; -}; - -} // namespace palace - -#endif // PALACE_LINALG_AMG_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LINALG_AMG_HPP +#define PALACE_LINALG_AMG_HPP + +#include +#include "utils/iodata.hpp" + +namespace palace +{ + +// +// A wrapper for Hypre's BoomerAMG solver. +// +class BoomerAmgSolver : public mfem::HypreBoomerAMG +{ +public: + BoomerAmgSolver(int cycle_it = 1, int smooth_it = 1, bool agg_coarsen = true, + int print = 0); + BoomerAmgSolver(const IoData &iodata, bool coarse_solver, int print) + : BoomerAmgSolver(coarse_solver ? 1 : iodata.solver.linear.mg_cycle_it, + iodata.solver.linear.mg_smooth_it, + iodata.solver.linear.amg_agg_coarsen, print) + { + } +}; + +} // namespace palace + +#endif // PALACE_LINALG_AMG_HPP diff --git a/palace/linalg/ams.cpp b/palace/linalg/ams.cpp index 21925df153..74494555d6 100644 --- a/palace/linalg/ams.cpp +++ b/palace/linalg/ams.cpp @@ -1,239 +1,225 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
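Usage sketch (illustrative, not part of the patch), based on the updated constructor signature shown in amg.hpp above; the header path "linalg/amg.hpp" and the helper name ExampleAmgSetup are assumptions for illustration only. Note that when a device backend is active, the new amg.cpp constructor itself switches to GPU-friendly options (no aggressive coarsening, l1-Jacobi relaxation):

#include "linalg/amg.hpp"

void ExampleAmgSetup()
{
  // One V-cycle with one smoothing sweep, aggressive coarsening enabled, quiet output.
  palace::BoomerAmgSolver amg(/*cycle_it=*/1, /*smooth_it=*/1, /*agg_coarsen=*/true,
                              /*print=*/0);
  // The matrix is supplied afterwards through the mfem::HypreBoomerAMG base class
  // (SetOperator), since the ParOperator-specific override was removed in this patch.
}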
-// SPDX-License-Identifier: Apache-2.0 - -#include "ams.hpp" - -#include "fem/bilinearform.hpp" -#include "fem/fespace.hpp" -#include "fem/integrator.hpp" -#include "linalg/rap.hpp" - -namespace palace -{ - -HypreAmsSolver::HypreAmsSolver(const FiniteElementSpace &nd_fespace, - const AuxiliaryFiniteElementSpace &h1_fespace, int cycle_it, - int smooth_it, int agg_coarsen, bool vector_interp, - bool op_singular, int print) - : mfem::HypreSolver(), - // From the Hypre docs for AMS: cycles 1, 5, 8, 11, 13 are fastest, 7 yields fewest its - // (MFEM default is 13). 14 is similar to 11/13 but is cheaper in that is uses additive - // scalar Pi-space corrections. - cycle_type(vector_interp ? 1 : 14), - space_dim(nd_fespace.GetParMesh()->SpaceDimension()), - // When used as the coarse solver of geometric multigrid, always do only a single - // V-cycle. - ams_it(cycle_it), ams_smooth_it(smooth_it), - // Use no aggressive coarsening for frequency domain problems when the preconditioner - // matrix is not SPD. - amg_agg_levels(agg_coarsen), - // If we know the operator is singular (no mass matrix, for magnetostatic problems), - // internally the AMS solver will avoid G-space corrections. - ams_singular(op_singular), print((print > 1) ? print - 1 : 0) -{ - // From MFEM: The AMS preconditioner may sometimes require inverting singular matrices - // with BoomerAMG, which are handled correctly in Hypre's Solve method, but can produce - // Hypre errors in the Setup (specifically in the row l1-norm computation). See the - // documentation of MFEM's SetErrorMode() for more details. - error_mode = IGNORE_HYPRE_ERRORS; - - // Set up the AMS solver. - ConstructAuxiliaryMatrices(nd_fespace, h1_fespace); - InitializeSolver(); -} - -HypreAmsSolver::~HypreAmsSolver() -{ - HYPRE_AMSDestroy(ams); -} - -void HypreAmsSolver::ConstructAuxiliaryMatrices( - const FiniteElementSpace &nd_fespace, const AuxiliaryFiniteElementSpace &h1_fespace) -{ - // Set up the auxiliary space objects for the preconditioner. Mostly the same as MFEM's - // HypreAMS:Init. Start with the discrete gradient matrix. - { - constexpr bool skip_zeros_interp = true; - const auto *PtGP = - dynamic_cast(&h1_fespace.GetDiscreteInterpolator()); - MFEM_VERIFY( - PtGP, - "HypreAmsSolver requires the discrete gradient matrix as a ParOperator operator!"); - G = &PtGP->ParallelAssemble(skip_zeros_interp); - } - - // Vertex coordinates for the lowest order case, or Nedelec interpolation matrix or - // matrices for order > 1. 
- mfem::ParMesh &mesh = *h1_fespace.GetParMesh(); - if (h1_fespace.GetMaxElementOrder() == 1) - { - mfem::ParGridFunction x_coord(const_cast(&h1_fespace)), - y_coord(const_cast(&h1_fespace)), - z_coord(const_cast(&h1_fespace)); - if (mesh.GetNodes()) - { - mesh.GetNodes()->GetNodalValues(x_coord, 1); - MFEM_VERIFY(x_coord.Size() == h1_fespace.GetVSize(), - "Unexpected size for vertex coordinates in AMS setup!"); - if (space_dim > 1) - { - mesh.GetNodes()->GetNodalValues(y_coord, 2); - } - if (space_dim > 2) - { - mesh.GetNodes()->GetNodalValues(z_coord, 3); - } - } - else - { - MFEM_VERIFY(x_coord.Size() == mesh.GetNV(), - "Unexpected size for vertex coordinates in AMS setup!"); - for (int i = 0; i < mesh.GetNV(); i++) - { - x_coord(i) = mesh.GetVertex(i)[0]; - if (space_dim > 1) - { - y_coord(i) = mesh.GetVertex(i)[1]; - } - if (space_dim > 2) - { - z_coord(i) = mesh.GetVertex(i)[2]; - } - } - } - x.reset(x_coord.ParallelProject()); - x->HypreReadWrite(); - if (space_dim > 1) - { - y.reset(y_coord.ParallelProject()); - y->HypreReadWrite(); - } - if (space_dim > 2) - { - z.reset(z_coord.ParallelProject()); - z->HypreReadWrite(); - } - } - else - { - // Fall back to MFEM legacy assembly for identity interpolator. - mfem::ParFiniteElementSpace h1d_fespace(&mesh, h1_fespace.FEColl(), space_dim, - mfem::Ordering::byVDIM); - mfem::DiscreteLinearOperator pi(&h1d_fespace, - const_cast(&nd_fespace)); - pi.AddDomainInterpolator(new mfem::IdentityInterpolator); - pi.SetAssemblyLevel(mfem::AssemblyLevel::LEGACY); - pi.Assemble(); - pi.Finalize(); - ParOperator RAP_Pi(std::unique_ptr(pi.LoseMat()), h1d_fespace, - nd_fespace, true); - Pi = RAP_Pi.StealParallelAssemble(); - if (cycle_type >= 10) - { - // Get blocks of Pi corresponding to each component, and free Pi. - mfem::Array2D Pi_blocks(1, space_dim); - Pi->GetBlocks(Pi_blocks, false, true); - Pix.reset(Pi_blocks(0, 0)); - if (space_dim > 1) - { - Piy.reset(Pi_blocks(0, 1)); - } - if (space_dim > 2) - { - Piz.reset(Pi_blocks(0, 2)); - } - Pi.reset(); - } - } -} - -void HypreAmsSolver::InitializeSolver() -{ - // Create the Hypre solver object. - HYPRE_AMSCreate(&ams); - HYPRE_AMSSetDimension(ams, space_dim); - HYPRE_AMSSetCycleType(ams, cycle_type); - - // Control printing and number of iterations for use as a preconditioner. - HYPRE_AMSSetPrintLevel(ams, print); - HYPRE_AMSSetMaxIter(ams, ams_it); - // HYPRE_AMSSetTol(ams, 1.0e-16); // Avoid issues with zero RHS - - // Set this option when solving a curl-curl problem with zero mass term. - if (ams_singular) - { - HYPRE_AMSSetBetaPoissonMatrix(ams, nullptr); - } - - // Set additional AMS options. - int coarsen_type = 10; // 10 = HMIS, 8 = PMIS, 6 = Falgout, 0 = CLJP - double theta = 0.5; // AMG strength parameter = 0.25 is 2D optimal (0.5-0.8 for 3D) - int amg_relax_type = 8; // 3 = GS, 6 = symm. GS, 8 = l1-symm. GS, 13 = l1-GS, - // 18 = l1-Jacobi, 16 = Chebyshev - int interp_type = 6; // 6 = Extended+i, 0 = Classical, 13 = FF1 - int Pmax = 4; // Interpolation width - int relax_type = 2; // 2 = l1-SSOR, 4 = trunc. l1-SSOR, 1 = l1-Jacobi, 16 = Chebyshev - double weight = 1.0; - double omega = 1.0; - - HYPRE_AMSSetSmoothingOptions(ams, relax_type, ams_smooth_it, weight, omega); - HYPRE_AMSSetAlphaAMGOptions(ams, coarsen_type, amg_agg_levels, amg_relax_type, theta, - interp_type, Pmax); - HYPRE_AMSSetBetaAMGOptions(ams, coarsen_type, amg_agg_levels, amg_relax_type, theta, - interp_type, Pmax); - - // int coarse_relax_type = 8; // Default, l1-symm. 
GS - // HYPRE_AMSSetAlphaAMGCoarseRelaxType(ams, coarse_relax_type); - // HYPRE_AMSSetBetaAMGCoarseRelaxType(ams, coarse_relax_type); - - // Set the discrete gradient matrix. - HYPRE_AMSSetDiscreteGradient(ams, (HYPRE_ParCSRMatrix)*G); - - // Set the mesh vertex coordinates or Nedelec interpolation matrix or matrices. - HYPRE_ParVector HY_X = (x) ? (HYPRE_ParVector)*x : nullptr; - HYPRE_ParVector HY_Y = (y) ? (HYPRE_ParVector)*y : nullptr; - HYPRE_ParVector HY_Z = (z) ? (HYPRE_ParVector)*z : nullptr; - HYPRE_AMSSetCoordinateVectors(ams, HY_X, HY_Y, HY_Z); - - HYPRE_ParCSRMatrix HY_Pi = (Pi) ? (HYPRE_ParCSRMatrix)*Pi : nullptr; - HYPRE_ParCSRMatrix HY_Pix = (Pix) ? (HYPRE_ParCSRMatrix)*Pix : nullptr; - HYPRE_ParCSRMatrix HY_Piy = (Piy) ? (HYPRE_ParCSRMatrix)*Piy : nullptr; - HYPRE_ParCSRMatrix HY_Piz = (Piz) ? (HYPRE_ParCSRMatrix)*Piz : nullptr; - HYPRE_AMSSetInterpolations(ams, HY_Pi, HY_Pix, HY_Piy, HY_Piz); -} - -void HypreAmsSolver::SetOperator(const Operator &op) -{ - // When the operator changes, we need to rebuild the AMS solver but can use the unchanged - // auxiliary space matrices. - if (A) - { - HYPRE_AMSDestroy(ams); - InitializeSolver(); - } - - const auto *PtAP = dynamic_cast(&op); - if (PtAP) - { - A = &PtAP->ParallelAssemble(); - } - else - { - A = dynamic_cast(const_cast(&op)); - } - MFEM_VERIFY(A, "HypreAmsSolver requires a HypreParMatrix operator!"); - height = A->Height(); - width = A->Width(); - - // From mfem::HypreAMS: Update HypreSolver base class. - setup_called = 0; - delete X; - delete B; - B = X = nullptr; - auxB.Delete(); - auxB.Reset(); - auxX.Delete(); - auxX.Reset(); -} - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "ams.hpp" + +#include "fem/bilinearform.hpp" +#include "fem/fespace.hpp" +#include "fem/integrator.hpp" +#include "linalg/hypre.hpp" +#include "linalg/rap.hpp" +#include "utils/omp.hpp" + +namespace palace +{ + +HypreAmsSolver::HypreAmsSolver(FiniteElementSpace &nd_fespace, + FiniteElementSpace &h1_fespace, int cycle_it, int smooth_it, + bool vector_interp, bool singular_op, bool agg_coarsen, + int print) + : mfem::HypreSolver(), + // From the Hypre docs for AMS: cycles 1, 5, 8, 11, 13 are fastest, 7 yields fewest its + // (MFEM default is 13). 14 is similar to 11/13 but is cheaper in that is uses additive + // scalar Pi-space corrections. + cycle_type(vector_interp ? 1 : 14), space_dim(nd_fespace.SpaceDimension()), + // When used as the coarse solver of geometric multigrid, always do only a single + // V-cycle. + ams_it(cycle_it), ams_smooth_it(smooth_it), + // If we know the operator is singular (no mass matrix, for magnetostatic problems), + // internally the AMS solver will avoid G-space corrections. + ams_singular(singular_op), + // For positive (SPD) operators, we will use aggressive coarsening but not for frequency + // domain problems when the preconditioner matrix is not SPD. + agg_coarsen(agg_coarsen), print((print > 1) ? print - 1 : 0) +{ + // From MFEM: The AMS preconditioner may sometimes require inverting singular matrices + // with BoomerAMG, which are handled correctly in Hypre's Solve method, but can produce + // Hypre errors in the Setup (specifically in the row l1-norm computation). See the + // documentation of MFEM's SetErrorMode() for more details. + error_mode = IGNORE_HYPRE_ERRORS; + + // Set up the AMS solver. 
+ ConstructAuxiliaryMatrices(nd_fespace, h1_fespace); + InitializeSolver(); +} + +HypreAmsSolver::~HypreAmsSolver() +{ + HYPRE_AMSDestroy(ams); +} + +void HypreAmsSolver::ConstructAuxiliaryMatrices(FiniteElementSpace &nd_fespace, + FiniteElementSpace &h1_fespace) +{ + // Set up the auxiliary space objects for the preconditioner. Mostly the same as MFEM's + // HypreAMS:Init. Start with the discrete gradient matrix. We don't skip zeros for the + // full assembly to accelerate things on GPU and since they shouldn't affect the sparsity + // pattern of the parallel G^T A G matrix (computed by Hypre). + const bool skip_zeros_interp = !mfem::Device::Allows(mfem::Backend::DEVICE_MASK); + { + const auto *PtGP = + dynamic_cast(&nd_fespace.GetDiscreteInterpolator(h1_fespace)); + MFEM_VERIFY( + PtGP, + "HypreAmsSolver requires the discrete gradient matrix as a ParOperator operator!"); + G = &PtGP->ParallelAssemble(skip_zeros_interp); + } + + // Vertex coordinates for the lowest order case, or Nedelec interpolation matrix or + // matrices for order > 1. Expects that Mesh::SetVerticesFromNodes has been called at some + // point to avoid calling GridFunction::GetNodalValues here. + mfem::ParMesh &mesh = h1_fespace.GetParMesh(); + if (h1_fespace.GetMaxElementOrder() == 1) + { + mfem::ParGridFunction x_coord(&h1_fespace.Get()), y_coord(&h1_fespace.Get()), + z_coord(&h1_fespace.Get()); + MFEM_VERIFY(x_coord.Size() == mesh.GetNV(), + "Unexpected size for vertex coordinates in AMS setup!"); + PalacePragmaOmp(parallel for schedule(static)) + for (int i = 0; i < mesh.GetNV(); i++) + { + x_coord(i) = mesh.GetVertex(i)[0]; + if (space_dim > 1) + { + y_coord(i) = mesh.GetVertex(i)[1]; + } + if (space_dim > 2) + { + z_coord(i) = mesh.GetVertex(i)[2]; + } + } + x.reset(x_coord.ParallelProject()); + x->HypreReadWrite(); + if (space_dim > 1) + { + y.reset(y_coord.ParallelProject()); + y->HypreReadWrite(); + } + if (space_dim > 2) + { + z.reset(z_coord.ParallelProject()); + z->HypreReadWrite(); + } + } + else + { + // Fall back to MFEM legacy assembly for identity interpolator. + FiniteElementSpace h1d_fespace(h1_fespace.GetMesh(), &h1_fespace.GetFEColl(), space_dim, + mfem::Ordering::byVDIM); + mfem::DiscreteLinearOperator pi(&h1d_fespace.Get(), &nd_fespace.Get()); + pi.AddDomainInterpolator(new mfem::IdentityInterpolator); + pi.SetAssemblyLevel(mfem::AssemblyLevel::LEGACY); + pi.Assemble(skip_zeros_interp); + pi.Finalize(skip_zeros_interp); + ParOperator RAP_Pi(std::make_unique(pi.SpMat()), h1d_fespace, + nd_fespace, true); + Pi = RAP_Pi.StealParallelAssemble(); + if (cycle_type >= 10) + { + // Get blocks of Pi corresponding to each component, and free Pi. + mfem::Array2D Pi_blocks(1, space_dim); + Pi->GetBlocks(Pi_blocks, false, true); + Pix.reset(Pi_blocks(0, 0)); + if (space_dim > 1) + { + Piy.reset(Pi_blocks(0, 1)); + } + if (space_dim > 2) + { + Piz.reset(Pi_blocks(0, 2)); + } + Pi.reset(); + } + } +} + +void HypreAmsSolver::InitializeSolver() +{ + // Create the Hypre solver object. + HYPRE_AMSCreate(&ams); + HYPRE_AMSSetDimension(ams, space_dim); + HYPRE_AMSSetCycleType(ams, cycle_type); + + // Control printing and number of iterations for use as a preconditioner. + HYPRE_AMSSetPrintLevel(ams, print); + HYPRE_AMSSetMaxIter(ams, ams_it); + // HYPRE_AMSSetTol(ams, 1.0e-16); // Avoid issues with zero RHS + + // Set this option when solving a curl-curl problem with zero mass term. + if (ams_singular) + { + HYPRE_AMSSetBetaPoissonMatrix(ams, nullptr); + } + + // Set additional AMS options. 
+ int coarsen_type = 10; // 10 = HMIS, 8 = PMIS, 6 = Falgout, 0 = CLJP + int amg_agg_levels = agg_coarsen ? 1 : 0; // Number of aggressive coarsening levels + double theta = 0.5; // AMG strength parameter = 0.25 is 2D optimal (0.5-0.8 for 3D) + int amg_relax_type = 8; // 3 = GS, 6 = symm. GS, 8 = l1-symm. GS, 13 = l1-GS, + // 18 = l1-Jacobi, 16 = Chebyshev + int interp_type = 6; // 6 = Extended+i, 0 = Classical, 13 = FF1 + int Pmax = 4; // Interpolation width + int relax_type = 2; // 2 = l1-SSOR, 4 = trunc. l1-SSOR, 1 = l1-Jacobi, 16 = Chebyshev + double weight = 1.0; + double omega = 1.0; + if (mfem::Device::Allows(mfem::Backend::DEVICE_MASK)) + { + // Modify options for GPU-supported features. + coarsen_type = 8; + amg_agg_levels = 0; + amg_relax_type = 18; + relax_type = 1; + } + + HYPRE_AMSSetSmoothingOptions(ams, relax_type, ams_smooth_it, weight, omega); + HYPRE_AMSSetAlphaAMGOptions(ams, coarsen_type, amg_agg_levels, amg_relax_type, theta, + interp_type, Pmax); + HYPRE_AMSSetBetaAMGOptions(ams, coarsen_type, amg_agg_levels, amg_relax_type, theta, + interp_type, Pmax); + + // int coarse_relax_type = 8; // Default, l1-symm. GS + // HYPRE_AMSSetAlphaAMGCoarseRelaxType(ams, coarse_relax_type); + // HYPRE_AMSSetBetaAMGCoarseRelaxType(ams, coarse_relax_type); + + // Set the discrete gradient matrix. + HYPRE_AMSSetDiscreteGradient(ams, (HYPRE_ParCSRMatrix)*G); + + // Set the mesh vertex coordinates or Nedelec interpolation matrix or matrices. + HYPRE_ParVector HY_X = (x) ? (HYPRE_ParVector)*x : nullptr; + HYPRE_ParVector HY_Y = (y) ? (HYPRE_ParVector)*y : nullptr; + HYPRE_ParVector HY_Z = (z) ? (HYPRE_ParVector)*z : nullptr; + HYPRE_AMSSetCoordinateVectors(ams, HY_X, HY_Y, HY_Z); + + HYPRE_ParCSRMatrix HY_Pi = (Pi) ? (HYPRE_ParCSRMatrix)*Pi : nullptr; + HYPRE_ParCSRMatrix HY_Pix = (Pix) ? (HYPRE_ParCSRMatrix)*Pix : nullptr; + HYPRE_ParCSRMatrix HY_Piy = (Piy) ? (HYPRE_ParCSRMatrix)*Piy : nullptr; + HYPRE_ParCSRMatrix HY_Piz = (Piz) ? (HYPRE_ParCSRMatrix)*Piz : nullptr; + HYPRE_AMSSetInterpolations(ams, HY_Pi, HY_Pix, HY_Piy, HY_Piz); +} + +void HypreAmsSolver::SetOperator(const Operator &op) +{ + // When the operator changes, we need to rebuild the AMS solver but can use the unchanged + // auxiliary space matrices. + if (A) + { + HYPRE_AMSDestroy(ams); + InitializeSolver(); + } + A = const_cast(dynamic_cast(&op)); + MFEM_VERIFY(A, "HypreAmsSolver requires a HypreParMatrix operator!"); + height = A->Height(); + width = A->Width(); + + // From mfem::HypreAMS: Update HypreSolver base class. + setup_called = 0; + delete X; + delete B; + B = X = nullptr; + auxB.Delete(); + auxB.Reset(); + auxX.Delete(); + auxX.Reset(); +} + +} // namespace palace diff --git a/palace/linalg/ams.hpp b/palace/linalg/ams.hpp index 50386bd2a7..4c5d624b78 100644 --- a/palace/linalg/ams.hpp +++ b/palace/linalg/ams.hpp @@ -1,88 +1,81 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LINALG_AMS_HPP -#define PALACE_LINALG_AMS_HPP - -#include -#include -#include "linalg/operator.hpp" -#include "utils/iodata.hpp" - -namespace palace -{ - -class AuxiliaryFiniteElementSpace; -class FiniteElementSpace; - -// -// A wrapper for Hypre's AMS solver. -// -class HypreAmsSolver : public mfem::HypreSolver -{ -private: - // The Hypre solver object. - HYPRE_Solver ams; - - // Parameters used for preconditioner construction. 
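A hypothetical caller-side sketch (not part of this patch) for the updated HypreAmsSolver::SetOperator above: the parallel-assembled mfem::HypreParMatrix is now passed in directly, since the solver no longer accepts a palace::ParOperator and assembles it internally.

#include "linalg/ams.hpp"
#include "linalg/rap.hpp"

// Illustrative names only (ams, pt_ap); the assembly that SetOperator used to perform
// internally is now the caller's responsibility.
void SetAmsOperatorSketch(palace::HypreAmsSolver &ams, palace::ParOperator &pt_ap)
{
  const mfem::HypreParMatrix &A = pt_ap.ParallelAssemble();
  ams.SetOperator(A);  // Anything that is not a HypreParMatrix now fails the MFEM_VERIFY.
}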
- const int cycle_type, space_dim, ams_it, ams_smooth_it, amg_agg_levels; - const bool ams_singular; - - // Control print level for debugging. - const int print; - - // Discrete gradient matrix (not owned). - const mfem::HypreParMatrix *G; - - // Nedelec interpolation matrix and its components, or, for p = 1, the mesh vertex - // coordinates. - std::unique_ptr Pi, Pix, Piy, Piz; - std::unique_ptr x, y, z; - - // Helper function to set up the auxiliary objects required by the AMS solver. - void ConstructAuxiliaryMatrices(const FiniteElementSpace &nd_fespace, - const AuxiliaryFiniteElementSpace &h1_fespace); - - // Helper function to construct and configure the AMS solver. - void InitializeSolver(); - -public: - // Constructor requires the ND space, but will construct the H1 and (H1)ᵈ spaces - // internally as needed. - HypreAmsSolver(const FiniteElementSpace &nd_fespace, - const AuxiliaryFiniteElementSpace &h1_fespace, int cycle_it, int smooth_it, - int agg_coarsen, bool vector_interp, bool op_singular, int print); - HypreAmsSolver(const IoData &iodata, bool coarse_solver, - const FiniteElementSpace &nd_fespace, - const AuxiliaryFiniteElementSpace &h1_fespace, int print) - : HypreAmsSolver( - nd_fespace, h1_fespace, coarse_solver ? 1 : iodata.solver.linear.mg_cycle_it, - iodata.solver.linear.mg_smooth_it, - (iodata.problem.type == config::ProblemData::Type::TRANSIENT || - iodata.problem.type == config::ProblemData::Type::MAGNETOSTATIC) - ? 1 - : 0, - iodata.solver.linear.ams_vector, - (iodata.problem.type == config::ProblemData::Type::MAGNETOSTATIC), print) - { - } - ~HypreAmsSolver() override; - - void SetOperator(const Operator &op) override; - - operator HYPRE_Solver() const override { return ams; } - - HYPRE_PtrToParSolverFcn SetupFcn() const override - { - return (HYPRE_PtrToParSolverFcn)HYPRE_AMSSetup; - } - - HYPRE_PtrToParSolverFcn SolveFcn() const override - { - return (HYPRE_PtrToParSolverFcn)HYPRE_AMSSolve; - } -}; - -} // namespace palace - -#endif // PALACE_LINALG_AMS_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LINALG_AMS_HPP +#define PALACE_LINALG_AMS_HPP + +#include +#include +#include "linalg/operator.hpp" +#include "utils/iodata.hpp" + +namespace palace +{ + +class FiniteElementSpace; + +// +// A wrapper for Hypre's AMS solver. +// +class HypreAmsSolver : public mfem::HypreSolver +{ +private: + // The Hypre solver object. + HYPRE_Solver ams; + + // Parameters used for preconditioner construction. + const int cycle_type, space_dim, ams_it, ams_smooth_it; + const bool ams_singular, agg_coarsen; + + // Control print level for debugging. + const int print; + + // Discrete gradient matrix (not owned). + const mfem::HypreParMatrix *G; + + // Nedelec interpolation matrix and its components, or, for p = 1, the mesh vertex + // coordinates. + std::unique_ptr Pi, Pix, Piy, Piz; + std::unique_ptr x, y, z; + + // Helper function to set up the auxiliary objects required by the AMS solver. + void ConstructAuxiliaryMatrices(FiniteElementSpace &nd_fespace, + FiniteElementSpace &h1_fespace); + + // Helper function to construct and configure the AMS solver. + void InitializeSolver(); + +public: + // Constructor requires the ND space, but will construct the H1 and (H1)ᵈ spaces + // internally as needed. 
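A simplified sketch (not MFEM's actual implementation) of how the SetupFcn()/SolveFcn() hooks declared below in this header are consumed: mfem::HypreSolver invokes the returned C function pointers on the handle obtained from operator HYPRE_Solver(), so the wrapper only needs to hand back HYPRE_AMSSetup/HYPRE_AMSSolve together with its internal AMS object.

#include "linalg/ams.hpp"

// Rough equivalent of one preconditioner application through the mfem::HypreSolver base
// class; the explicit casts mirror the ones used elsewhere in ams.cpp.
void RoughAmsApply(const palace::HypreAmsSolver &ams, const mfem::HypreParMatrix &A,
                   mfem::HypreParVector &b, mfem::HypreParVector &x)
{
  HYPRE_Solver solver = ams;  // operator HYPRE_Solver() returns the AMS handle.
  ams.SetupFcn()(solver, (HYPRE_ParCSRMatrix)A, (HYPRE_ParVector)b, (HYPRE_ParVector)x);
  ams.SolveFcn()(solver, (HYPRE_ParCSRMatrix)A, (HYPRE_ParVector)b, (HYPRE_ParVector)x);
}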
+ HypreAmsSolver(FiniteElementSpace &nd_fespace, FiniteElementSpace &h1_fespace, + int cycle_it, int smooth_it, bool vector_interp, bool singular_op, + bool agg_coarsen, int print); + HypreAmsSolver(const IoData &iodata, bool coarse_solver, FiniteElementSpace &nd_fespace, + FiniteElementSpace &h1_fespace, int print) + : HypreAmsSolver( + nd_fespace, h1_fespace, coarse_solver ? 1 : iodata.solver.linear.mg_cycle_it, + iodata.solver.linear.mg_smooth_it, iodata.solver.linear.ams_vector_interp, + iodata.solver.linear.ams_singular_op, iodata.solver.linear.amg_agg_coarsen, print) + { + } + ~HypreAmsSolver() override; + + void SetOperator(const Operator &op) override; + + operator HYPRE_Solver() const override { return ams; } + + HYPRE_PtrToParSolverFcn SetupFcn() const override + { + return (HYPRE_PtrToParSolverFcn)HYPRE_AMSSetup; + } + + HYPRE_PtrToParSolverFcn SolveFcn() const override + { + return (HYPRE_PtrToParSolverFcn)HYPRE_AMSSolve; + } +}; + +} // namespace palace + +#endif // PALACE_LINALG_AMS_HPP diff --git a/palace/linalg/arpack.cpp b/palace/linalg/arpack.cpp index feae05190b..9f74b293c0 100644 --- a/palace/linalg/arpack.cpp +++ b/palace/linalg/arpack.cpp @@ -1,828 +1,851 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "arpack.hpp" - -#if defined(PALACE_WITH_ARPACK) - -#include -#include -#include -// clang-format off -#include // ARPACK headers -#include -#include -// clang-format on -#include "linalg/divfree.hpp" -#include "utils/communication.hpp" - -namespace -{ - -void CheckInfoAUPD(a_int info) -{ - if (info != 0) - { - std::string msg = "ARPACK pznaupd error: "; - switch (info) - { - case 1: - msg += "Maximum number of iterations taken, all possible eigenvalues " - "have been found!"; - break; - case 2: - msg += "No longer an informational error (deprecated starting with " - "release 2 of ARPACK)!"; - break; - case 3: - msg += "No shifts could be applied during a cycle of the Implicitly " - "restarted Arnoldi iteration!"; - break; - case -1: - msg += "N must be positive!"; - break; - case -2: - msg += "NEV must be positive!"; - break; - case -3: - msg += "NCV-NEV >= 2 and less than or equal to N!"; - break; - case -4: - msg += "The maximum number of Arnoldi update iterations allowed must " - "be greater than zero!"; - break; - case -5: - msg += "WHICH must be one of 'LM', 'SM', 'LR', 'SR', 'LI', 'SI'"; - break; - case -6: - msg += "BMAT must be one of 'I' or 'G'!"; - break; - case -7: - msg += "Length of private work array WORKL is not sufficient!"; - break; - case -8: - msg += "Error return from LAPACK eigenvalue calculation!"; - break; - case -9: - msg += "Starting vector is zero!"; - break; - case -10: - msg += "IPARAM(7) must be 1, 2, or 3!"; - break; - case -11: - msg += "IPARAM(7) = 1 and BMAT = 'G' are incompatible!"; - break; - case -12: - msg += "IPARAM(1) must be equal to 0 or 1!"; - break; - case -9999: - msg += "Could not build an Arnoldi factorization!"; - break; - default: - msg += "Unknown ARPACK error message!"; - break; - } - MFEM_ABORT(msg.c_str()); - } -} - -void CheckInfoEUPD(a_int info) -{ - if (info != 0) - { - std::string msg = "ARPACK pzneupd error: "; - switch (info) - { - case 1: - msg += "The Schur form computed by LAPACK routine csheqr could not " - "be reordered by LAPACK routine ztrsen!"; - break; - case -1: - msg += "N must be positive!"; - break; - case -2: - msg += "NEV must be positive!"; - break; - case -3: - msg += "NCV-NEV >= 2 and less than or equal to N!"; - 
break; - case -4: - msg += "The maximum number of Arnoldi update iterations allowed must " - "be greater than zero!"; - break; - case -5: - msg += "WHICH must be one of 'LM', 'SM', 'LR', 'SR', 'LI', 'SI'"; - break; - case -6: - msg += "BMAT must be one of 'I' or 'G'!"; - break; - case -7: - msg += "Length of private work array WORKL is not sufficient!"; - break; - case -8: - msg += "Error return from LAPACK eigenvalue calculation!"; - break; - case -9: - msg += "Error return from calculation of eigenvectors!"; - break; - case -10: - msg += "IPARAM(7) must be 1, 2, or 3!"; - break; - case -11: - msg += "IPARAM(7) = 1 and BMAT = 'G' are incompatible!"; - break; - case -12: - msg += "HOWMNY = 'S' not yet implemented!"; - break; - case -13: - msg += "HOWMNY must be one of 'A' or 'P' if RVEC = true!"; - break; - case -14: - msg += "PZNAUPD did not find any eigenvalues to sufficient accuracy!"; - break; - case -15: - msg += "ZNEUPD got a different count of the number of converged Ritz " - "values than ZNAUPD got!"; - break; - default: - msg += "Unknown ARPACK error message!"; - break; - } - MFEM_ABORT(msg.c_str()); - } -} - -} // namespace - -namespace palace::arpack -{ - -// Base class methods - -ArpackEigenvalueSolver::ArpackEigenvalueSolver(MPI_Comm comm, int print) - : comm(comm), print(print) -{ - // Initialization. - info = 0; - nev = ncv = n = 0; - rtol = 0.0; - arpack_it = 0; - which_type = WhichType::LARGEST_MAGNITUDE; - gamma = delta = 1.0; - sinvert = false; - sigma = 0.0; - - opInv = nullptr; - opProj = nullptr; - opB = nullptr; - - // Configure debugging output. - a_int logfill = 6, ndigit = -6, mgetv0 = 0; - a_int _aupd = (print > 2) ? 1 : 0, _aup2 = (print > 2) ? 2 : ((print > 0) ? 1 : 0), - _aitr = 0, _eigh = 0, _gets = 0, _apps = 0, _eupd = 0; - debug_c(logfill, ndigit, mgetv0, _aupd, _aup2, _aitr, _eigh, _gets, _apps, _eupd, _aupd, - _aup2, _aitr, _eigh, _gets, _apps, _eupd, _aupd, _aup2, _aitr, _eigh, _gets, - _apps, _eupd); - cstatn_c(); -} - -void ArpackEigenvalueSolver::SetOperators(const ComplexOperator &K, - const ComplexOperator &M, - EigenvalueSolver::ScaleType type) -{ - MFEM_ABORT("SetOperators not defined for base class ArpackEigenvalueSolver!"); -} - -void ArpackEigenvalueSolver::SetOperators(const ComplexOperator &K, - const ComplexOperator &C, - const ComplexOperator &M, - EigenvalueSolver::ScaleType type) -{ - MFEM_ABORT("SetOperators not defined for base class ArpackEigenvalueSolver!"); -} - -void ArpackEigenvalueSolver::SetLinearSolver(const ComplexKspSolver &ksp) -{ - opInv = &ksp; -} - -void ArpackEigenvalueSolver::SetDivFreeProjector(const DivFreeSolver &divfree) -{ - opProj = &divfree; -} - -void ArpackEigenvalueSolver::SetBMat(const Operator &B) -{ - MFEM_VERIFY(!opB || opB->Height() == B.Height(), - "Invalid modification of eigenvalue problem size!"); - opB = &B; -} - -void ArpackEigenvalueSolver::SetNumModes(int num_eig, int num_vec) -{ - if (nev > 0 && num_eig != nev) - { - eig.reset(); - perm.reset(); - res.reset(); - } - if (ncv > 0 && num_vec != ncv) - { - V.reset(); - } - nev = num_eig; - ncv = (num_vec > 0) ? 
num_vec : std::max(20, 2 * nev + 1); // Default from SLEPc -} - -void ArpackEigenvalueSolver::SetTol(double tol) -{ - rtol = tol; -} - -void ArpackEigenvalueSolver::SetMaxIter(int max_it) -{ - arpack_it = max_it; -} - -void ArpackEigenvalueSolver::SetWhichEigenpairs(EigenvalueSolver::WhichType type) -{ - which_type = type; -} - -void ArpackEigenvalueSolver::SetShiftInvert(std::complex s, bool precond) -{ - MFEM_VERIFY(!precond, "ARPACK eigenvalue solver does not support preconditioned " - "spectral transformation option!"); - sigma = s; - sinvert = true; -} - -void ArpackEigenvalueSolver::SetInitialSpace(const ComplexVector &v) -{ - MFEM_VERIFY( - n > 0, - "Must call SetOperators before using SetInitialSpace for ARPACK eigenvalue solver!"); - if (!r) - { - r = std::make_unique[]>(n); - } - MFEM_VERIFY(v.Size() == n, "Invalid size mismatch for provided initial space vector!"); - v.Get(r.get(), n); - info = 1; -} - -int ArpackEigenvalueSolver::SolveInternal(int n, std::complex *r, - std::complex *V, - std::complex *eig, int *perm) -{ - MPI_Fint fcomm = MPI_Comm_c2f(comm); - a_int iparam[11] = {0}; - iparam[0] = 1; // Exact shifts - iparam[2] = (a_int)arpack_it; // Maximum number of Arnoldi iterations - iparam[3] = 1; // Block size - iparam[4] = 0; // Number of converged Ritz values - iparam[6] = sinvert ? 3 : 1; // Problem mode - - ::arpack::bmat bmat_option = - (opB) ? ::arpack::bmat::generalized : ::arpack::bmat::identity; - - ::arpack::which which_option = ::arpack::which::largest_magnitude; - switch (which_type) - { - case WhichType::LARGEST_MAGNITUDE: - case WhichType::TARGET_MAGNITUDE: - which_option = ::arpack::which::largest_magnitude; - break; - case WhichType::SMALLEST_MAGNITUDE: - which_option = ::arpack::which::smallest_magnitude; - break; - case WhichType::LARGEST_REAL: - which_option = ::arpack::which::largest_real; - break; - case WhichType::SMALLEST_REAL: - which_option = ::arpack::which::smallest_real; - break; - case WhichType::LARGEST_IMAGINARY: - which_option = ::arpack::which::largest_imaginary; - break; - case WhichType::SMALLEST_IMAGINARY: - which_option = ::arpack::which::smallest_imaginary; - break; - case WhichType::TARGET_REAL: - case WhichType::TARGET_IMAGINARY: - MFEM_ABORT("ARPACK eigenvalue solver does not implement TARGET_REAL or " - "TARGET_IMAGINARY for SetWhichEigenpairs!"); - break; - } - - // Allocate work arrays. - a_int lworkl = 3 * ncv * ncv + 5 * ncv; - auto workd = std::make_unique[]>(3 * n); - auto workl = std::make_unique[]>(lworkl); - auto rwork = std::make_unique(ncv); - - // Begin RCI loop. - a_int ido = 0, ainfo = (a_int)info, ipntr[14] = {0}; - while (true) - { - // Call complex problem driver. - naupd(fcomm, ido, bmat_option, (a_int)n, which_option, (a_int)nev, rtol, r, (a_int)ncv, - V, (a_int)n, iparam, ipntr, workd.get(), workl.get(), lworkl, rwork.get(), ainfo); - CheckInfoAUPD(ainfo); - - // We never use pre-computed B * x in workd[ipntr[2] - 1]. - if (ido == 1 || ido == -1) - { - ApplyOp(&workd.get()[ipntr[0] - 1], &workd.get()[ipntr[1] - 1]); - } - else if (ido == 2) - { - ApplyOpB(&workd.get()[ipntr[0] - 1], &workd.get()[ipntr[1] - 1]); - } - else if (ido == 99) - { - break; - } - else - { - MFEM_ABORT("Internal error in ARPACK RCI interface!"); - } - } - - // Print some log information. 
- int num_it = (int)iparam[2]; - int num_conv = (int)iparam[4]; - if (print > 0) - { - Mpi::Print(comm, - "\n ARPACK {} eigensolve {} ({:d} eigenpairs); iterations {:d}\n" - " Total number of linear systems solved: {:d}\n" - " Total number of linear solver iterations: {:d}\n", - GetName(), (num_conv >= nev) ? "converged" : "finished", num_conv, num_it, - opInv->NumTotalMult(), opInv->NumTotalMultIterations()); - } - if (num_conv < nev) - { - Mpi::Warning( - comm, "ARPACK eigenvalue solver found only {:d} of requested {:d} eigenvalues!\n", - num_conv, nev); - } - - // Postprocess eigenvalues and eigenvectors. - a_int rvec = 1; - ::arpack::howmny howmny_option = ::arpack::howmny::ritz_vectors; - - // Allocate eigenvalue storage and work arrays. - auto select = std::make_unique(ncv); - auto workev = std::make_unique[]>(2 * ncv); - - // Call complex problem driver. - neupd(fcomm, rvec, howmny_option, select.get(), eig, V, (a_int)n, sigma / gamma, - workev.get(), bmat_option, (a_int)n, which_option, (a_int)nev, rtol, r, (a_int)ncv, - V, (a_int)n, iparam, ipntr, workd.get(), workl.get(), lworkl, rwork.get(), ainfo); - CheckInfoEUPD(ainfo); - - // Unscale and properly sort the eigenvalues. - auto CompareReal = [&eig](const int &l, const int &r) - { return eig[l].real() < eig[r].real(); }; - auto CompareImag = [&eig](const int &l, const int &r) - { return eig[l].imag() < eig[r].imag(); }; - auto CompareAbs = [&eig](const int &l, const int &r) - { return std::abs(eig[l]) < std::abs(eig[r]); }; - for (int i = 0; i < nev; i++) - { - eig[i] = eig[i] * gamma; - perm[i] = i; - } - if (which_option == ::arpack::which::largest_real || - which_option == ::arpack::which::smallest_real) - { - std::sort(perm, perm + nev, CompareReal); - } - else if (which_option == ::arpack::which::largest_imaginary || - which_option == ::arpack::which::smallest_imaginary) - { - std::sort(perm, perm + nev, CompareImag); - } - else - { - std::sort(perm, perm + nev, CompareAbs); - } - - return num_conv; -} - -void ArpackEigenvalueSolver::CheckParameters() const -{ - MFEM_VERIFY(n > 0, "Operators are not set for ARPACK eigenvalue solver!"); - MFEM_VERIFY(nev > 0, "Number of requested modes is not positive!"); - MFEM_VERIFY(rtol > 0.0, "Eigensolver tolerance is not positive!"); - MFEM_VERIFY(opInv, "No linear solver provided for operator!"); -} - -std::complex ArpackEigenvalueSolver::GetEigenvalue(int i) const -{ - MFEM_VERIFY(eig && i >= 0 && i < nev, - "Out of range eigenpair requested (i = " << i << ", nev = " << nev << ")!"); - const int &j = perm.get()[i]; - return eig.get()[j]; -} - -void ArpackEigenvalueSolver::GetEigenvector(int i, ComplexVector &x) const -{ - MFEM_VERIFY(eig && i >= 0 && i < nev, - "Out of range eigenpair requested (i = " << i << ", nev = " << nev << ")!"); - MFEM_VERIFY(x.Size() == n, "Invalid size mismatch for provided eigenvector!"); - const int &j = perm.get()[i]; - x.Set(V.get() + j * n, n); -} - -double ArpackEigenvalueSolver::GetError(int i, EigenvalueSolver::ErrorType type) const -{ - MFEM_VERIFY(eig && i >= 0 && i < nev, - "Out of range eigenpair requested (i = " << i << ", nev = " << nev << ")!"); - const int &j = perm.get()[i]; - switch (type) - { - case ErrorType::ABSOLUTE: - return res.get()[j]; - case ErrorType::RELATIVE: - return res.get()[j] / std::abs(eig.get()[j]); - case ErrorType::BACKWARD: - return res.get()[j] / GetBackwardScaling(eig.get()[j]); - } - return 0.0; -} - -// EPS specific methods - -ArpackEPSSolver::ArpackEPSSolver(MPI_Comm comm, int print) - : 
ArpackEigenvalueSolver(comm, print) -{ - opK = opM = nullptr; - normK = normM = 0.0; -} - -void ArpackEPSSolver::SetOperators(const ComplexOperator &K, const ComplexOperator &M, - EigenvalueSolver::ScaleType type) -{ - MFEM_VERIFY(!opK || opK->Height() == K.Height(), - "Invalid modification of eigenvalue problem size!"); - bool first = (opK == nullptr); - opK = &K; - opM = &M; - if (first && type != ScaleType::NONE) - { - normK = linalg::SpectralNorm(comm, *opK, opK->IsReal()); - normM = linalg::SpectralNorm(comm, *opM, opM->IsReal()); - MFEM_VERIFY(normK >= 0.0 && normM >= 0.0, "Invalid matrix norms for EPS scaling!"); - if (normK > 0 && normM > 0.0) - { - gamma = normK / normM; // Store γ² for linear problem - delta = 2.0 / normK; - } - } - - // Set up workspace. - x.SetSize(opK->Height()); - y.SetSize(opK->Height()); - z.SetSize(opK->Height()); - n = opK->Height(); -} - -int ArpackEPSSolver::Solve() -{ - // Set some defaults (default maximum iterations from SLEPc). - CheckParameters(); - HYPRE_BigInt N = linalg::GlobalSize(comm, z); - if (ncv > N) - { - ncv = mfem::internal::to_int(N); - } - if (arpack_it <= 0) - { - arpack_it = std::max(300, mfem::internal::to_int(2 * N / ncv)); - } - - // Initialize if user did not provide an initial space. - if (!r) - { - r = std::make_unique[]>(n); - info = 0; - } - if (!info) - { - std::fill(r.get(), r.get() + n, 0.0); - } - - // Allocate Arnoldi basis for the problem. - if (!V) - { - V = std::make_unique[]>(n * ncv); - } - - // Allocate storage for eigenvalues and residual norms. - if (!eig) - { - eig = std::make_unique[]>(nev + 1); - perm = std::make_unique(nev); - res = std::make_unique(nev); - } - - // Solve the generalized eigenvalue problem. - int num_conv = SolveInternal(n, r.get(), V.get(), eig.get(), perm.get()); - - // Compute the eigenpair residuals: || (K - λ M) x ||₂ for eigenvalue λ. - for (int i = 0; i < nev; i++) - { - const std::complex l = eig.get()[i]; - x.Set(V.get() + i * n, n); - opK->Mult(x, y); - opM->AddMult(x, y, -l); - res.get()[i] = linalg::Norml2(comm, y); - } - - // Reset for next solve. - info = 0; - return num_conv; -} - -void ArpackEPSSolver::ApplyOp(const std::complex *px, - std::complex *py) const -{ - // Case 1: No spectral transformation (opInv = M⁻¹) - // y = M⁻¹ K x . - // Case 2: Shift-and-invert spectral transformation (opInv = (K - σ M)⁻¹) - // y = (K - σ M)⁻¹ M x . - x.Set(px, n); - if (!sinvert) - { - opK->Mult(x, z); - opInv->Mult(z, y); - y *= 1.0 / gamma; - } - else - { - opM->Mult(x, z); - opInv->Mult(z, y); - y *= gamma; - } - if (opProj) - { - // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(comm, y)); - opProj->Mult(y); - // Mpi::Print(" After projection: {:e}\n", linalg::Norml2(comm, y)); - } - y.Get(py, n); -} - -void ArpackEPSSolver::ApplyOpB(const std::complex *px, - std::complex *py) const -{ - MFEM_VERIFY(opB, "No B operator for weighted inner product in ARPACK solve!"); - x.Set(px, n); - opB->Mult(x.Real(), y.Real()); - opB->Mult(x.Imag(), y.Imag()); - y *= delta * gamma; - y.Get(py, n); -} - -double ArpackEPSSolver::GetBackwardScaling(std::complex l) const -{ - // Make sure not to use norms from scaling as this can be confusing if they are different. - // Note that SLEPc uses ||.||∞, not the 2-norm. 
- if (normK <= 0.0) - { - normK = linalg::SpectralNorm(comm, *opK, opK->IsReal()); - } - if (normM <= 0.0) - { - normM = linalg::SpectralNorm(comm, *opM, opM->IsReal()); - } - return normK + std::abs(l) * normM; -} - -// PEP specific methods - -ArpackPEPSolver::ArpackPEPSolver(MPI_Comm comm, int print) - : ArpackEigenvalueSolver(comm, print) -{ - opK = opC = opM = nullptr; - normK = normC = normM = 0.0; -} - -void ArpackPEPSolver::SetOperators(const ComplexOperator &K, const ComplexOperator &C, - const ComplexOperator &M, - EigenvalueSolver::ScaleType type) -{ - MFEM_VERIFY(!opK || opK->Height() == K.Height(), - "Invalid modification of eigenvalue problem size!"); - bool first = (opK == nullptr); - opK = &K; - opC = &C; - opM = &M; - if (first && type != ScaleType::NONE) - { - normK = linalg::SpectralNorm(comm, *opK, opK->IsReal()); - normC = linalg::SpectralNorm(comm, *opC, opC->IsReal()); - normM = linalg::SpectralNorm(comm, *opM, opM->IsReal()); - MFEM_VERIFY(normK >= 0.0 && normC >= 0.0 && normM >= 0.0, - "Invalid matrix norms for PEP scaling!"); - if (normK > 0 && normC > 0.0 && normM > 0.0) - { - gamma = std::sqrt(normK / normM); - delta = 2.0 / (normK + gamma * normC); - } - } - - // Set up workspace. - x1.SetSize(opK->Height()); - x2.SetSize(opK->Height()); - y1.SetSize(opK->Height()); - y2.SetSize(opK->Height()); - z.SetSize(opK->Height()); - n = opK->Height(); -} - -int ArpackPEPSolver::Solve() -{ - // Set some defaults (from SLEPc ARPACK interface). The problem size is the size of the - // 2x2 block linearized problem. - CheckParameters(); - HYPRE_BigInt N = linalg::GlobalSize(comm, z); - if (ncv > 2 * N) - { - ncv = mfem::internal::to_int(2 * N); - } - if (arpack_it <= 0) - { - arpack_it = std::max(300, mfem::internal::to_int(4 * N / ncv)); - } - - // Initialize if user did not provide an initial space. - if (!r) - { - r = std::make_unique[]>(n); - info = 0; - } - if (!info) - { - std::fill(r.get(), r.get() + n, 0.0); - } - auto s = std::make_unique[]>(2 * n); - std::copy(r.get(), r.get() + n, s.get()); - std::fill(s.get() + n, s.get() + 2 * n, 0.0); - - // Allocate Arnoldi basis for original and linearized problem. - if (!V) - { - V = std::make_unique[]>(n * ncv); - } - auto W = std::make_unique[]>(2 * n * ncv); - - // Allocate storage for eigenvalues and residual norms. - if (!eig) - { - eig = std::make_unique[]>(nev + 1); - perm = std::make_unique(nev + 1); - res = std::make_unique(nev + 1); - } - - // Solve the linearized eigenvalue problem. - int num_conv = SolveInternal(2 * n, s.get(), W.get(), eig.get(), perm.get()); - - // Extract the eigenvector from the linearized problem and compute the eigenpair - // residuals: || P(λ) x ||₂ = || (K + λ C + λ² M) x ||₂ for eigenvalue λ. - for (int i = 0; i < nev; i++) - { - const std::complex &l = eig.get()[i]; - ExtractEigenvector(l, W.get() + i * 2 * n, V.get() + i * n); - x1.Set(V.get() + i * n, n); - opK->Mult(x1, y1); - opC->AddMult(x1, y1, l); - opM->AddMult(x1, y1, l * l); - res.get()[i] = linalg::Norml2(comm, y1); - } - - // Reset for next solve. - info = 0; - return num_conv; -} - -void ArpackPEPSolver::ApplyOp(const std::complex *px, - std::complex *py) const -{ - // Case 1: No spectral transformation (opInv = M⁻¹) - // y = L₁⁻¹ L₀ x . - // Case 2: Shift-and-invert spectral transformation (opInv = P(σ)⁻¹) - // y = (L₀ - σ L₁)⁻¹ L₁ x . - // With: - // L₀ = [ -K 0 ] L₁ = [ C M ] - // [ 0 M ] , [ M 0 ] . 
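For reference, the L₀/L₁ block structure in the comment above is the standard companion linearization of the quadratic pencil, stated here without the γ scaling that the solver applies on top. With the auxiliary variable y = λx,

L_0 \begin{bmatrix} x \\ y \end{bmatrix} = \lambda \, L_1 \begin{bmatrix} x \\ y \end{bmatrix},
\qquad
L_0 = \begin{bmatrix} -K & 0 \\ 0 & M \end{bmatrix}, \quad
L_1 = \begin{bmatrix} C & M \\ M & 0 \end{bmatrix},

whose first block row gives -Kx = λCx + λ²Mx, i.e. (K + λC + λ²M)x = 0, and whose second block row reduces to My = λMx, i.e. y = λx.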
- x1.Set(px, n); - x2.Set(px + n, n); - if (!sinvert) - { - y1 = x2; - if (opProj) - { - // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(comm, y1)); - opProj->Mult(y1); - // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(comm, y1)); - } - - opK->Mult(x1, z); - opC->AddMult(x2, z, std::complex(gamma, 0.0)); - opInv->Mult(z, y2); - y2 *= -1.0 / (gamma * gamma); - if (opProj) - { - // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(comm, y2)); - opProj->Mult(y2); - // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(comm, y2)); - } - } - else - { - y2.AXPBYPCZ(sigma, x1, gamma, x2, 0.0); // Just temporarily - opM->Mult(y2, z); - opC->AddMult(x1, z, std::complex(1.0, 0.0)); - opInv->Mult(z, y1); - y1 *= -gamma; - if (opProj) - { - // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(comm, y1)); - opProj->Mult(y1); - // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(comm, y1)); - } - - y2.AXPBYPCZ(sigma / gamma, y1, 1.0, x1, 0.0); - if (opProj) - { - // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(comm, y2)); - opProj->Mult(y2); - // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(comm, y2)); - } - } - y1.Get(py, n); - y2.Get(py + n, n); -} - -void ArpackPEPSolver::ApplyOpB(const std::complex *px, - std::complex *py) const -{ - MFEM_VERIFY(opB, "No B operator for weighted inner product in ARPACK solve!"); - x1.Set(px, n); - x2.Set(px + n, n); - opB->Mult(x1.Real(), y1.Real()); - opB->Mult(x1.Imag(), y1.Imag()); - opB->Mult(x2.Real(), y2.Real()); - opB->Mult(x2.Imag(), y2.Imag()); - y1 *= delta * gamma * gamma; - y2 *= delta * gamma * gamma; - y1.Get(py, n); - y2.Get(py + n, n); -} - -double ArpackPEPSolver::GetBackwardScaling(std::complex l) const -{ - // Make sure not to use norms from scaling as this can be confusing if they are different. - // Note that SLEPc uses ||.||∞, not the 2-norm. - if (normK <= 0.0) - { - normK = linalg::SpectralNorm(comm, *opK, opK->IsReal()); - } - if (normC <= 0.0) - { - normC = linalg::SpectralNorm(comm, *opC, opC->IsReal()); - } - if (normM <= 0.0) - { - normM = linalg::SpectralNorm(comm, *opM, opM->IsReal()); - } - double t = std::abs(l); - return normK + t * normC + t * t * normM; -} - -void ArpackPEPSolver::ExtractEigenvector(std::complex l, - const std::complex *py, - std::complex *px) const -{ - // Select the most accurate x for y = [x₁; x₂] from the linearized eigenvalue problem. Or, - // just take x = x₁. - x1.Set(py, n); - if (opB) - { - linalg::Normalize(comm, x1, *opB, y1); - } - else - { - linalg::Normalize(comm, x1); - } - x1.Get(px, n); -} - -} // namespace palace::arpack - - -#endif +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#include "arpack.hpp" + +#if defined(PALACE_WITH_ARPACK) + +#include +#include +#include +// clang-format off +#include // ARPACK headers +#include +#include +// clang-format on +#include "linalg/divfree.hpp" +#include "utils/communication.hpp" + +using namespace std::complex_literals; + +namespace +{ + +void CheckInfoAUPD(a_int info) +{ + if (info != 0) + { + std::string msg = "ARPACK pznaupd error: "; + switch (info) + { + case 1: + msg += "Maximum number of iterations taken, all possible eigenvalues " + "have been found!"; + break; + case 2: + msg += "No longer an informational error (deprecated starting with " + "release 2 of ARPACK)!"; + break; + case 3: + msg += "No shifts could be applied during a cycle of the Implicitly " + "restarted Arnoldi iteration!"; + break; + case -1: + msg += "N must be positive!"; + break; + case -2: + msg += "NEV must be positive!"; + break; + case -3: + msg += "NCV <= NEV + 1 or NCV > N!"; + break; + case -4: + msg += "The maximum number of Arnoldi update iterations allowed must " + "be greater than zero!"; + break; + case -5: + msg += "WHICH must be one of 'LM', 'SM', 'LR', 'SR', 'LI', 'SI'"; + break; + case -6: + msg += "BMAT must be one of 'I' or 'G'!"; + break; + case -7: + msg += "Length of private work array WORKL is not sufficient!"; + break; + case -8: + msg += "Error return from LAPACK eigenvalue calculation!"; + break; + case -9: + msg += "Starting vector is zero!"; + break; + case -10: + msg += "IPARAM(7) must be 1, 2, or 3!"; + break; + case -11: + msg += "IPARAM(7) = 1 and BMAT = 'G' are incompatible!"; + break; + case -12: + msg += "IPARAM(1) must be equal to 0 or 1!"; + break; + case -9999: + msg += "Could not build an Arnoldi factorization!"; + break; + default: + msg += "Unknown ARPACK error message!"; + break; + } + MFEM_ABORT(msg.c_str()); + } +} + +void CheckInfoEUPD(a_int info) +{ + if (info != 0) + { + std::string msg = "ARPACK pzneupd error: "; + switch (info) + { + case 1: + msg += "The Schur form computed by LAPACK routine csheqr could not " + "be reordered by LAPACK routine ztrsen!"; + break; + case -1: + msg += "N must be positive!"; + break; + case -2: + msg += "NEV must be positive!"; + break; + case -3: + msg += "NCV <= NEV + 1 or NCV > N!"; + break; + case -4: + msg += "The maximum number of Arnoldi update iterations allowed must " + "be greater than zero!"; + break; + case -5: + msg += "WHICH must be one of 'LM', 'SM', 'LR', 'SR', 'LI', 'SI'"; + break; + case -6: + msg += "BMAT must be one of 'I' or 'G'!"; + break; + case -7: + msg += "Length of private work array WORKL is not sufficient!"; + break; + case -8: + msg += "Error return from LAPACK eigenvalue calculation!"; + break; + case -9: + msg += "Error return from calculation of eigenvectors!"; + break; + case -10: + msg += "IPARAM(7) must be 1, 2, or 3!"; + break; + case -11: + msg += "IPARAM(7) = 1 and BMAT = 'G' are incompatible!"; + break; + case -12: + msg += "HOWMNY = 'S' not yet implemented!"; + break; + case -13: + msg += "HOWMNY must be one of 'A' or 'P' if RVEC = true!"; + break; + case -14: + msg += "PZNAUPD did not find any eigenvalues to sufficient accuracy!"; + break; + case -15: + msg += "ZNEUPD got a different count of the number of converged Ritz " + "values than ZNAUPD got!"; + break; + default: + msg += "Unknown ARPACK error message!"; + break; + } + MFEM_ABORT(msg.c_str()); + } +} + +} // namespace + +namespace palace::arpack +{ + +// Base class methods. 
+ +ArpackEigenvalueSolver::ArpackEigenvalueSolver(MPI_Comm comm, int print) + : comm(comm), print(print) +{ + // Initialization. + info = 0; + nev = ncv = n = 0; + rtol = 0.0; + arpack_it = 0; + which_type = WhichType::LARGEST_MAGNITUDE; + gamma = delta = 1.0; + sinvert = false; + sigma = 0.0; + + opInv = nullptr; + opProj = nullptr; + opB = nullptr; + + // Configure debugging output. + a_int logfill = 6, ndigit = -6, mgetv0 = 0; + a_int _aupd = (print > 2) ? 1 : 0, _aup2 = (print > 2) ? 2 : ((print > 0) ? 1 : 0), + _aitr = 0, _eigh = 0, _gets = 0, _apps = 0, _eupd = 0; + debug_c(logfill, ndigit, mgetv0, _aupd, _aup2, _aitr, _eigh, _gets, _apps, _eupd, _aupd, + _aup2, _aitr, _eigh, _gets, _apps, _eupd, _aupd, _aup2, _aitr, _eigh, _gets, + _apps, _eupd); + cstatn_c(); +} + +void ArpackEigenvalueSolver::SetLinearSolver(ComplexKspSolver &ksp) +{ + opInv = &ksp; +} + +void ArpackEigenvalueSolver::SetDivFreeProjector( + const DivFreeSolver &divfree) +{ + opProj = &divfree; +} + +void ArpackEigenvalueSolver::SetBMat(const Operator &B) +{ + MFEM_VERIFY(!opB || opB->Height() == B.Height(), + "Invalid modification of eigenvalue problem size!"); + opB = &B; +} + +void ArpackEigenvalueSolver::SetNumModes(int num_eig, int num_vec) +{ + if (nev > 0 && num_eig != nev) + { + eig.reset(); + perm.reset(); + res.reset(); + xscale.reset(); + } + if (ncv > 0 && num_vec != ncv) + { + V.reset(); + } + nev = num_eig; + ncv = (num_vec > 0) ? num_vec : std::max(20, 2 * nev + 1); // Default from SLEPc +} + +void ArpackEigenvalueSolver::SetTol(double tol) +{ + rtol = tol; +} + +void ArpackEigenvalueSolver::SetMaxIter(int max_it) +{ + arpack_it = max_it; +} + +void ArpackEigenvalueSolver::SetWhichEigenpairs(EigenvalueSolver::WhichType type) +{ + which_type = type; +} + +void ArpackEigenvalueSolver::SetShiftInvert(std::complex s, bool precond) +{ + MFEM_VERIFY(!precond, "ARPACK eigenvalue solver does not support preconditioned " + "spectral transformation option!"); + sigma = s; + sinvert = true; +} + +void ArpackEigenvalueSolver::SetInitialSpace(const ComplexVector &v) +{ + MFEM_VERIFY( + n > 0, + "Must call SetOperators before using SetInitialSpace for ARPACK eigenvalue solver!"); + if (!r) + { + r = std::make_unique[]>(n); + } + MFEM_VERIFY(v.Size() == n, "Invalid size mismatch for provided initial space vector!"); + v.Get(r.get(), n, false); + info = 1; +} + +int ArpackEigenvalueSolver::SolveInternal(int n, std::complex *r, + std::complex *V, + std::complex *eig, int *perm) +{ + MPI_Fint fcomm = MPI_Comm_c2f(comm); + a_int iparam[11] = {0}; + iparam[0] = 1; // Exact shifts + iparam[2] = (a_int)arpack_it; // Maximum number of Arnoldi iterations + iparam[3] = 1; // Block size + iparam[4] = 0; // Number of converged Ritz values + iparam[6] = sinvert ? 3 : 1; // Problem mode + + ::arpack::bmat bmat_option = + (opB) ? 
::arpack::bmat::generalized : ::arpack::bmat::identity; + + ::arpack::which which_option = ::arpack::which::largest_magnitude; + switch (which_type) + { + case WhichType::LARGEST_MAGNITUDE: + case WhichType::TARGET_MAGNITUDE: + which_option = ::arpack::which::largest_magnitude; + break; + case WhichType::SMALLEST_MAGNITUDE: + which_option = ::arpack::which::smallest_magnitude; + break; + case WhichType::LARGEST_REAL: + which_option = ::arpack::which::largest_real; + break; + case WhichType::SMALLEST_REAL: + which_option = ::arpack::which::smallest_real; + break; + case WhichType::LARGEST_IMAGINARY: + which_option = ::arpack::which::largest_imaginary; + break; + case WhichType::SMALLEST_IMAGINARY: + which_option = ::arpack::which::smallest_imaginary; + break; + case WhichType::TARGET_REAL: + case WhichType::TARGET_IMAGINARY: + MFEM_ABORT("ARPACK eigenvalue solver does not implement TARGET_REAL or " + "TARGET_IMAGINARY for SetWhichEigenpairs!"); + break; + } + + // Allocate work arrays. + a_int lworkl = 3 * ncv * ncv + 5 * ncv; + auto workd = std::make_unique[]>(3 * n); + auto workl = std::make_unique[]>(lworkl); + auto rwork = std::make_unique(ncv); + + // Begin RCI loop. + a_int ido = 0, ainfo = (a_int)info, ipntr[14] = {0}; + while (true) + { + // Call complex problem driver. + naupd(fcomm, ido, bmat_option, (a_int)n, which_option, (a_int)nev, rtol, r, (a_int)ncv, + V, (a_int)n, iparam, ipntr, workd.get(), workl.get(), lworkl, rwork.get(), ainfo); + CheckInfoAUPD(ainfo); + + // We never use pre-computed B * x in workd[ipntr[2] - 1]. + if (ido == 1 || ido == -1) + { + ApplyOp(&workd.get()[ipntr[0] - 1], &workd.get()[ipntr[1] - 1]); + } + else if (ido == 2) + { + ApplyOpB(&workd.get()[ipntr[0] - 1], &workd.get()[ipntr[1] - 1]); + } + else if (ido == 99) + { + break; + } + else + { + MFEM_ABORT("Internal error in ARPACK RCI interface!"); + } + } + + // Print some log information. + int num_it = (int)iparam[2]; + int num_conv = (int)iparam[4]; + if (print > 0) + { + Mpi::Print(comm, + "\n ARPACK {} eigensolve {} ({:d} eigenpairs); iterations {:d}\n" + " Total number of linear systems solved: {:d}\n" + " Total number of linear solver iterations: {:d}\n", + GetName(), (num_conv >= nev) ? "converged" : "finished", num_conv, num_it, + opInv->NumTotalMult(), opInv->NumTotalMultIterations()); + } + if (num_conv < nev) + { + Mpi::Warning( + comm, "ARPACK eigenvalue solver found only {:d} of requested {:d} eigenvalues!\n", + num_conv, nev); + } + + // Postprocess eigenvalues and eigenvectors. + a_int rvec = 1; + ::arpack::howmny howmny_option = ::arpack::howmny::ritz_vectors; + + // Allocate eigenvalue storage and work arrays. + auto select = std::make_unique(ncv); + auto workev = std::make_unique[]>(2 * ncv); + + // Call complex problem driver. + neupd(fcomm, rvec, howmny_option, select.get(), eig, V, (a_int)n, sigma / gamma, + workev.get(), bmat_option, (a_int)n, which_option, (a_int)nev, rtol, r, (a_int)ncv, + V, (a_int)n, iparam, ipntr, workd.get(), workl.get(), lworkl, rwork.get(), ainfo); + CheckInfoEUPD(ainfo); + + // Unscale and properly sort the eigenvalues. 
+ auto CompareReal = [&eig](const int &l, const int &r) + { return eig[l].real() < eig[r].real(); }; + auto CompareImag = [&eig](const int &l, const int &r) + { return eig[l].imag() < eig[r].imag(); }; + auto CompareAbs = [&eig](const int &l, const int &r) + { return std::abs(eig[l]) < std::abs(eig[r]); }; + for (int i = 0; i < nev; i++) + { + eig[i] = eig[i] * gamma; + perm[i] = i; + } + if (which_option == ::arpack::which::largest_real || + which_option == ::arpack::which::smallest_real) + { + std::sort(perm, perm + nev, CompareReal); + } + else if (which_option == ::arpack::which::largest_imaginary || + which_option == ::arpack::which::smallest_imaginary) + { + std::sort(perm, perm + nev, CompareImag); + } + else + { + std::sort(perm, perm + nev, CompareAbs); + } + + return num_conv; +} + +void ArpackEigenvalueSolver::CheckParameters() const +{ + MFEM_VERIFY(n > 0, "Operators are not set for ARPACK eigenvalue solver!"); + MFEM_VERIFY(nev > 0, "Number of requested modes is not positive!"); + MFEM_VERIFY(rtol > 0.0, "Eigensolver tolerance is not positive!"); + MFEM_VERIFY(opInv, "No linear solver provided for operator!"); +} + +std::complex ArpackEigenvalueSolver::GetEigenvalue(int i) const +{ + MFEM_VERIFY(eig && i >= 0 && i < nev, + "Out of range eigenpair requested (i = " << i << ", nev = " << nev << ")!"); + const int &j = perm.get()[i]; + return eig.get()[j]; +} + +void ArpackEigenvalueSolver::GetEigenvector(int i, ComplexVector &x) const +{ + MFEM_VERIFY(eig && i >= 0 && i < nev, + "Out of range eigenpair requested (i = " << i << ", nev = " << nev << ")!"); + MFEM_VERIFY(x.Size() == n, "Invalid size mismatch for provided eigenvector!"); + const int &j = perm.get()[i]; + x.Set(V.get() + j * n, n, false); + if (xscale.get()[j] > 0.0) + { + x *= xscale.get()[j]; + } +} + +double ArpackEigenvalueSolver::GetEigenvectorNorm(const ComplexVector &x, + ComplexVector &Bx) const +{ + if (opB) + { + return linalg::Norml2(comm, x, *opB, Bx); + } + else + { + return linalg::Norml2(comm, x); + } +} + +double ArpackEigenvalueSolver::GetError(int i, EigenvalueSolver::ErrorType type) const +{ + MFEM_VERIFY(eig && i >= 0 && i < nev, + "Out of range eigenpair requested (i = " << i << ", nev = " << nev << ")!"); + const int &j = perm.get()[i]; + switch (type) + { + case ErrorType::ABSOLUTE: + return res.get()[j]; + case ErrorType::RELATIVE: + return res.get()[j] / std::abs(eig.get()[j]); + case ErrorType::BACKWARD: + return res.get()[j] / GetBackwardScaling(eig.get()[j]); + } + return 0.0; +} + +void ArpackEigenvalueSolver::RescaleEigenvectors(int num_eig) +{ + res = std::make_unique(num_eig); + xscale = std::make_unique(num_eig); + for (int i = 0; i < num_eig; i++) + { + x1.Set(V.get() + i * n, n, false); + xscale.get()[i] = 1.0 / GetEigenvectorNorm(x1, y1); + res.get()[i] = GetResidualNorm(eig.get()[i], x1, y1) / linalg::Norml2(comm, x1); + } +} + +// EPS specific methods. 
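For reference before the EPS-specific code, the shift-and-invert mode used here (ARPACK mode 3) relies on the standard identity relating Ritz values ν of the transformed operator to eigenvalues λ of the pencil K x = λ M x:

(K - \sigma M)^{-1} M \, x = \nu \, x
\quad\Longleftrightarrow\quad
K x = \lambda M x \ \ \text{with}\ \ \lambda = \sigma + \frac{1}{\nu}.

ARPACK's neupd applies this back-transformation internally using the shift it is given (σ/γ here, since the operators are scaled), which is why SolveInternal only multiplies the returned eigenvalues by γ afterwards.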
+ +ArpackEPSSolver::ArpackEPSSolver(MPI_Comm comm, int print) + : ArpackEigenvalueSolver(comm, print) +{ + opK = opM = nullptr; + normK = normM = 0.0; +} + +void ArpackEPSSolver::SetOperators(const ComplexOperator &K, const ComplexOperator &M, + EigenvalueSolver::ScaleType type) +{ + MFEM_VERIFY(!opK || K.Height() == n, "Invalid modification of eigenvalue problem size!"); + bool first = (opK == nullptr); + opK = &K; + opM = &M; + if (first && type != ScaleType::NONE) + { + normK = linalg::SpectralNorm(comm, *opK, opK->IsReal()); + normM = linalg::SpectralNorm(comm, *opM, opM->IsReal()); + MFEM_VERIFY(normK >= 0.0 && normM >= 0.0, "Invalid matrix norms for EPS scaling!"); + if (normK > 0 && normM > 0.0) + { + gamma = normK / normM; // Store γ² for linear problem + delta = 2.0 / normK; + } + } + + // Set up workspace. + x1.SetSize(opK->Height()); + y1.SetSize(opK->Height()); + z1.SetSize(opK->Height()); + x1.UseDevice(true); + y1.UseDevice(true); + z1.UseDevice(true); + n = opK->Height(); +} + +int ArpackEPSSolver::Solve() +{ + // Set some defaults (default maximum iterations from SLEPc). + CheckParameters(); + HYPRE_BigInt N = linalg::GlobalSize(comm, z1); + if (ncv > N) + { + ncv = mfem::internal::to_int(N); + } + if (arpack_it <= 0) + { + arpack_it = std::max(300, mfem::internal::to_int(2 * N / ncv)); + } + + // Initialize if user did not provide an initial space. + if (!r) + { + r = std::make_unique[]>(n); + info = 0; + } + if (!info) + { + std::fill(r.get(), r.get() + n, 0.0); + } + + // Allocate Arnoldi basis for the problem. + if (!V) + { + V = std::make_unique[]>(n * ncv); + } + + // Allocate storage for eigenvalues and residual norms. + if (!eig) + { + eig = std::make_unique[]>(nev + 1); + perm = std::make_unique(nev); + } + + // Solve the generalized eigenvalue problem. + int num_conv = SolveInternal(n, r.get(), V.get(), eig.get(), perm.get()); + + // Compute the eigenpair residuals: || (K - λ M) x ||₂ for eigenvalue λ. + RescaleEigenvectors(nev); + + // Reset for next solve. + info = 0; + return num_conv; +} + +void ArpackEPSSolver::ApplyOp(const std::complex *px, + std::complex *py) const +{ + // Case 1: No spectral transformation (opInv = M⁻¹) + // y = M⁻¹ K x . + // Case 2: Shift-and-invert spectral transformation (opInv = (K - σ M)⁻¹) + // y = (K - σ M)⁻¹ M x . + // The input pointers are always to host memory (ARPACK runs on host). + x1.Set(px, n, false); + if (!sinvert) + { + opK->Mult(x1, z1); + opInv->Mult(z1, y1); + y1 *= 1.0 / gamma; + } + else + { + opM->Mult(x1, z1); + opInv->Mult(z1, y1); + y1 *= gamma; + } + if (opProj) + { + // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(comm, y1)); + opProj->Mult(y1); + // Mpi::Print(" After projection: {:e}\n", linalg::Norml2(comm, y1)); + } + y1.Get(py, n, false); +} + +void ArpackEPSSolver::ApplyOpB(const std::complex *px, + std::complex *py) const +{ + MFEM_VERIFY(opB, "No B operator for weighted inner product in ARPACK solve!"); + x1.Set(px, n, false); + opB->Mult(x1.Real(), y1.Real()); + opB->Mult(x1.Imag(), y1.Imag()); + y1 *= delta * gamma; + y1.Get(py, n, false); +} + +double ArpackEPSSolver::GetResidualNorm(std::complex l, const ComplexVector &x, + ComplexVector &r) const +{ + // Compute the i-th eigenpair residual: || (K - λ M) x ||₂ for eigenvalue λ. + opK->Mult(x, r); + opM->AddMult(x, r, -l); + return linalg::Norml2(comm, r); +} + +double ArpackEPSSolver::GetBackwardScaling(std::complex l) const +{ + // Make sure not to use norms from scaling as this can be confusing if they are different. 
+ // Note that SLEPc uses ||.||∞, not the 2-norm. + if (normK <= 0.0) + { + normK = linalg::SpectralNorm(comm, *opK, opK->IsReal()); + } + if (normM <= 0.0) + { + normM = linalg::SpectralNorm(comm, *opM, opM->IsReal()); + } + return normK + std::abs(l) * normM; +} + +// PEP specific methods. + +ArpackPEPSolver::ArpackPEPSolver(MPI_Comm comm, int print) + : ArpackEigenvalueSolver(comm, print) +{ + opK = opC = opM = nullptr; + normK = normC = normM = 0.0; +} + +void ArpackPEPSolver::SetOperators(const ComplexOperator &K, const ComplexOperator &C, + const ComplexOperator &M, + EigenvalueSolver::ScaleType type) +{ + MFEM_VERIFY(!opK || K.Height() == n, "Invalid modification of eigenvalue problem size!"); + bool first = (opK == nullptr); + opK = &K; + opC = &C; + opM = &M; + if (first && type != ScaleType::NONE) + { + normK = linalg::SpectralNorm(comm, *opK, opK->IsReal()); + normC = linalg::SpectralNorm(comm, *opC, opC->IsReal()); + normM = linalg::SpectralNorm(comm, *opM, opM->IsReal()); + MFEM_VERIFY(normK >= 0.0 && normC >= 0.0 && normM >= 0.0, + "Invalid matrix norms for PEP scaling!"); + if (normK > 0 && normC >= 0.0 && normM > 0.0) + { + gamma = std::sqrt(normK / normM); + delta = 2.0 / (normK + gamma * normC); + } + } + + // Set up workspace. + x1.SetSize(opK->Height()); + x2.SetSize(opK->Height()); + y1.SetSize(opK->Height()); + y2.SetSize(opK->Height()); + z1.SetSize(opK->Height()); + x1.UseDevice(true); + x2.UseDevice(true); + y1.UseDevice(true); + y2.UseDevice(true); + z1.UseDevice(true); + n = opK->Height(); +} + +int ArpackPEPSolver::Solve() +{ + // Set some defaults (from SLEPc ARPACK interface). The problem size is the size of the + // 2x2 block linearized problem. + CheckParameters(); + HYPRE_BigInt N = linalg::GlobalSize(comm, z1); + if (ncv > 2 * N) + { + ncv = mfem::internal::to_int(2 * N); + } + if (arpack_it <= 0) + { + arpack_it = std::max(300, mfem::internal::to_int(4 * N / ncv)); + } + + // Initialize if user did not provide an initial space. + if (!r) + { + r = std::make_unique[]>(n); + info = 0; + } + if (!info) + { + std::fill(r.get(), r.get() + n, 0.0); + } + auto s = std::make_unique[]>(2 * n); + std::copy(r.get(), r.get() + n, s.get()); + std::fill(s.get() + n, s.get() + 2 * n, 0.0); + + // Allocate Arnoldi basis for original and linearized problem. + if (!V) + { + V = std::make_unique[]>(n * ncv); + } + auto W = std::make_unique[]>(2 * n * ncv); + + // Allocate storage for eigenvalues and residual norms. + if (!eig) + { + eig = std::make_unique[]>(nev + 1); + perm = std::make_unique(nev); + } + + // Solve the linearized eigenvalue problem. + int num_conv = SolveInternal(2 * n, s.get(), W.get(), eig.get(), perm.get()); + + // Extract the eigenvector from the linearized problem and compute the eigenpair + // residuals: || P(λ) x ||₂ = || (K + λ C + λ² M) x ||₂ for eigenvalue λ. For the + // linearized problem, select the most accurate x for y = [x₁; x₂]. Or, just take x = x₁. + for (int i = 0; i < nev; i++) + { + std::copy(W.get() + i * 2 * n, W.get() + (i * 2 + 1) * n, V.get() + i * n); + } + RescaleEigenvectors(nev); + + // Reset for next solve. + info = 0; + return num_conv; +} + +void ArpackPEPSolver::ApplyOp(const std::complex *px, + std::complex *py) const +{ + // Case 1: No spectral transformation (opInv = M⁻¹) + // y = L₁⁻¹ L₀ x . + // Case 2: Shift-and-invert spectral transformation (opInv = P(σ)⁻¹) + // y = (L₀ - σ L₁)⁻¹ L₁ x . + // With: + // L₀ = [ -K 0 ] L₁ = [ C M ] + // [ 0 M ] , [ M 0 ] . 
+ // The input pointers are always to host memory (ARPACK runs on host). + x1.Set(px, n, false); + x2.Set(px + n, n, false); + if (!sinvert) + { + y1 = x2; + if (opProj) + { + // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(comm, y1)); + opProj->Mult(y1); + // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(comm, y1)); + } + + opK->Mult(x1, z1); + if (opC) + { + opC->AddMult(x2, z1, gamma + 0.0i); + } + opInv->Mult(z1, y2); + y2 *= -1.0 / (gamma * gamma); + if (opProj) + { + // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(comm, y2)); + opProj->Mult(y2); + // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(comm, y2)); + } + } + else + { + y2.AXPBYPCZ(sigma, x1, gamma, x2, 0.0); // Just temporarily + opM->Mult(y2, z1); + if (opC) + { + opC->AddMult(x1, z1, 1.0 + 0.0i); + } + opInv->Mult(z1, y1); + y1 *= -gamma; + if (opProj) + { + // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(comm, y1)); + opProj->Mult(y1); + // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(comm, y1)); + } + + y2.AXPBYPCZ(sigma / gamma, y1, 1.0, x1, 0.0); + if (opProj) + { + // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(comm, y2)); + opProj->Mult(y2); + // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(comm, y2)); + } + } + y1.Get(py, n, false); + y2.Get(py + n, n, false); +} + +void ArpackPEPSolver::ApplyOpB(const std::complex *px, + std::complex *py) const +{ + MFEM_VERIFY(opB, "No B operator for weighted inner product in ARPACK solve!"); + x1.Set(px, n, false); + x2.Set(px + n, n, false); + opB->Mult(x1.Real(), y1.Real()); + opB->Mult(x1.Imag(), y1.Imag()); + opB->Mult(x2.Real(), y2.Real()); + opB->Mult(x2.Imag(), y2.Imag()); + y1 *= delta * gamma * gamma; + y2 *= delta * gamma * gamma; + y1.Get(py, n, false); + y2.Get(py + n, n, false); +} + +double ArpackPEPSolver::GetResidualNorm(std::complex l, const ComplexVector &x, + ComplexVector &r) const +{ + // Compute the i-th eigenpair residual: || P(λ) x ||₂ = || (K + λ C + λ² M) x ||₂ for + // eigenvalue λ. + opK->Mult(x, r); + if (opC) + { + opC->AddMult(x, r, l); + } + opM->AddMult(x, r, l * l); + return linalg::Norml2(comm, r); +} + +double ArpackPEPSolver::GetBackwardScaling(std::complex l) const +{ + // Make sure not to use norms from scaling as this can be confusing if they are different. + // Note that SLEPc uses ||.||∞, not the 2-norm. + if (normK <= 0.0) + { + normK = linalg::SpectralNorm(comm, *opK, opK->IsReal()); + } + if (normC <= 0.0 && opC) + { + normC = linalg::SpectralNorm(comm, *opC, opC->IsReal()); + } + if (normM <= 0.0) + { + normM = linalg::SpectralNorm(comm, *opM, opM->IsReal()); + } + double t = std::abs(l); + return normK + t * normC + t * t * normM; +} + +} // namespace palace::arpack + +#endif diff --git a/palace/linalg/arpack.hpp b/palace/linalg/arpack.hpp index 25771b7358..a63054ab9b 100644 --- a/palace/linalg/arpack.hpp +++ b/palace/linalg/arpack.hpp @@ -1,236 +1,244 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LINALG_ARPACK_HPP -#define PALACE_LINALG_ARPACK_HPP - -#if defined(PALACE_WITH_ARPACK) - -#include -#include -#include -#include "linalg/eps.hpp" -#include "linalg/ksp.hpp" -#include "linalg/operator.hpp" -#include "linalg/vector.hpp" - -namespace palace -{ - -class DivFreeSolver; - -namespace arpack -{ - -// -// A wrapper for the ARPACK/PARPACK library for generalized linear eigenvalue problems or -// quadratic polynomial eigenvalue problems. 
Shift-and-invert spectral transformations are -// used to compute interior eigenvalues. Currently only implemented for complex scalar -// interface. -// -class ArpackEigenvalueSolver : public EigenvalueSolver -{ -protected: - // MPI communicator for PARPACK. - MPI_Comm comm; - - // Control print level for debugging. - int print; - - // Status variable for ARPACK. - int info; - - // Number eigenvalues to be computed, subspace dimension, and problem size. - int nev, ncv, n; - - // Relative eigenvalue error convergence tolerance for the solver. - double rtol; - - // Maximum number of Arnoldi update iterations. - int arpack_it; - - // Specifies which part of the spectrum to search for. - EigenvalueSolver::WhichType which_type; - - // Variables for scaling, from Higham et al., IJNME 2008. - double gamma, delta; - - // Parameters defining the spectral transformation. - std::complex sigma; - bool sinvert; - - // Storage for computed eigenvalues. - std::unique_ptr[]> eig; - std::unique_ptr perm; - - // Storage for Arnoldi basis vectors. - std::unique_ptr[]> V; - - // Storage for computed residual norms. - std::unique_ptr res; - - // On input used to define optional initial guess, on output stores final residual - // vector. - std::unique_ptr[]> r; - - // Reference to linear solver used for operator action for M⁻¹ (with no spectral - // transformation) or (K - σ M)⁻¹ (generalized EVP with shift-and- invert) or P(σ)⁻¹ - // (polynomial with shift-and-invert) (not owned). - const ComplexKspSolver *opInv; - - // Reference to solver for projecting an intermediate vector onto a divergence-free space - // (not owned). - const DivFreeSolver *opProj; - - // Reference to matrix used for weighted inner products (not owned). May be nullptr, in - // which case identity is used. - const Operator *opB; - - // Perform the ARPACK RCI loop. - int SolveInternal(int n, std::complex *r, std::complex *V, - std::complex *eig, int *perm); - - // Helper routine for parameter checking. - void CheckParameters() const; - - // Helper routines for ARPACK RCI. - virtual void ApplyOp(const std::complex *px, std::complex *py) const = 0; - virtual void ApplyOpB(const std::complex *px, std::complex *py) const = 0; - - // Helper routine for computing the backward error. - virtual double GetBackwardScaling(std::complex l) const = 0; - - // Return problem type name. - virtual const char *GetName() const = 0; - -public: - ArpackEigenvalueSolver(MPI_Comm comm, int print); - - // Set operators for the generalized eigenvalue problem or for the quadratic polynomial - // eigenvalue problem. - void SetOperators(const ComplexOperator &K, const ComplexOperator &M, - ScaleType type) override; - void SetOperators(const ComplexOperator &K, const ComplexOperator &C, - const ComplexOperator &M, ScaleType type) override; - - // For the linear generalized case, the linear solver should be configured to compute the - // action of M⁻¹ (with no spectral transformation) or (K - σ M)⁻¹. For the quadratic - // case, the linear solver should be configured to compute the action of M⁻¹ (with no - // spectral transformation) or P(σ)⁻¹. - void SetLinearSolver(const ComplexKspSolver &ksp) override; - - // Set the projection operator for enforcing the divergence-free constraint. - void SetDivFreeProjector(const DivFreeSolver &divfree) override; - - // Set optional B matrix used for weighted inner products. This must be set explicitly - // even for generalized problems, otherwise the identity will be used. 
- void SetBMat(const Operator &B) override; - - // Get scaling factors used by the solver. - double GetScalingGamma() const override { return gamma; } - double GetScalingDelta() const override { return delta; } - - // Set the number of required eigenmodes. - void SetNumModes(int num_eig, int num_vec = 0) override; - - // Set solver tolerance. - void SetTol(double tol) override; - - // Set maximum number of Arnoldi update iterations. - void SetMaxIter(int max_it) override; - - // Set target spectrum for the eigensolver. When a spectral transformation is used, this - // applies to the spectrum of the shifted operator. - void SetWhichEigenpairs(WhichType type) override; - - // Set shift-and-invert spectral transformation. - void SetShiftInvert(std::complex s, bool precond = false) override; - - // Set an initial vector for the solution subspace. - void SetInitialSpace(const ComplexVector &v) override; - - // Solve the eigenvalue problem. Returns the number of converged eigenvalues. - int Solve() override = 0; - - // Get the corresponding eigenvalue. - std::complex GetEigenvalue(int i) const override; - - // Get the corresponding eigenvector. - void GetEigenvector(int i, ComplexVector &x) const override; - - // Get the corresponding eigenpair error. - double GetError(int i, ErrorType type) const override; -}; - -// Generalized eigenvalue problem solver: K x = λ M x . -class ArpackEPSSolver : public ArpackEigenvalueSolver -{ -private: - // References to matrices defining the generalized eigenvalue problem (not owned). - const ComplexOperator *opK, *opM; - - // Operator norms for scaling. - mutable double normK, normM; - - // Workspace vector for operator applications. - mutable ComplexVector x, y, z; - -protected: - void ApplyOp(const std::complex *px, std::complex *py) const override; - void ApplyOpB(const std::complex *px, std::complex *py) const override; - - double GetBackwardScaling(std::complex l) const override; - - const char *GetName() const override { return "EPS"; } - -public: - ArpackEPSSolver(MPI_Comm comm, int print); - - void SetOperators(const ComplexOperator &K, const ComplexOperator &M, - ScaleType type) override; - - int Solve() override; -}; - -// Quadratic eigenvalue problem solver: P(λ) x = (K + λ C + λ² M) x = 0 . -class ArpackPEPSolver : public ArpackEigenvalueSolver -{ -private: - // References to matrices defining the quadratic polynomial eigenvalue problem - // (not owned). - const ComplexOperator *opK, *opC, *opM; - - // Operator norms for scaling. - mutable double normK, normC, normM; - - // Workspace vectors for operator applications. - mutable ComplexVector x1, x2, y1, y2, z; - - // Do eigenvector extraction from the linearized problem to the actual eigenvectors. - void ExtractEigenvector(std::complex l, const std::complex *py, - std::complex *px) const; - -protected: - void ApplyOp(const std::complex *px, std::complex *py) const override; - void ApplyOpB(const std::complex *px, std::complex *py) const override; - - double GetBackwardScaling(std::complex l) const override; - - const char *GetName() const override { return "PEP"; } - -public: - ArpackPEPSolver(MPI_Comm comm, int print); - - void SetOperators(const ComplexOperator &K, const ComplexOperator &C, - const ComplexOperator &M, ScaleType type) override; - - int Solve() override; -}; - -} // namespace arpack - -} // namespace palace - -#endif - -#endif // PALACE_LINALG_ARPACK_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LINALG_ARPACK_HPP +#define PALACE_LINALG_ARPACK_HPP + +#if defined(PALACE_WITH_ARPACK) + +#include +#include +#include +#include "linalg/eps.hpp" +#include "linalg/ksp.hpp" +#include "linalg/operator.hpp" +#include "linalg/vector.hpp" + +namespace palace +{ + +namespace arpack +{ + +// +// A wrapper for the ARPACK/PARPACK library for generalized linear eigenvalue problems or +// quadratic polynomial eigenvalue problems. Shift-and-invert spectral transformations are +// used to compute interior eigenvalues. Currently only implemented for complex scalar +// interface. +// +class ArpackEigenvalueSolver : public EigenvalueSolver +{ +protected: + // MPI communicator for PARPACK. + MPI_Comm comm; + + // Control print level for debugging. + int print; + + // Status variable for ARPACK. + int info; + + // Number eigenvalues to be computed, subspace dimension, and problem size. + int nev, ncv, n; + + // Relative eigenvalue error convergence tolerance for the solver. + double rtol; + + // Maximum number of Arnoldi update iterations. + int arpack_it; + + // Specifies which part of the spectrum to search for. + EigenvalueSolver::WhichType which_type; + + // Variables for scaling, from Higham et al., IJNME 2008. + double gamma, delta; + + // Parameters defining the spectral transformation. + std::complex sigma; + bool sinvert; + + // Storage for computed eigenvalues. + std::unique_ptr[]> eig; + std::unique_ptr perm; + + // Storage for Arnoldi basis vectors. + std::unique_ptr[]> V; + + // Storage for computed residual norms and eigenvector scalings. + std::unique_ptr res, xscale; + + // On input used to define optional initial guess, on output stores final residual + // vector. + std::unique_ptr[]> r; + + // Reference to linear solver used for operator action for M⁻¹ (with no spectral + // transformation) or (K - σ M)⁻¹ (generalized EVP with shift-and- invert) or P(σ)⁻¹ + // (polynomial with shift-and-invert) (not owned). + const ComplexKspSolver *opInv; + + // Reference to solver for projecting an intermediate vector onto a divergence-free space + // (not owned). + const DivFreeSolver *opProj; + + // Reference to matrix used for weighted inner products (not owned). May be nullptr, in + // which case identity is used. + const Operator *opB; + + // Workspace vector for operator applications. + mutable ComplexVector x1, y1, z1; + + // Perform the ARPACK RCI loop. + int SolveInternal(int n, std::complex *r, std::complex *V, + std::complex *eig, int *perm); + + // Helper routine for parameter checking. + void CheckParameters() const; + + // Helper routines for ARPACK RCI. + virtual void ApplyOp(const std::complex *px, std::complex *py) const = 0; + virtual void ApplyOpB(const std::complex *px, std::complex *py) const = 0; + + // Helper routine for computing the eigenvector normalization. + double GetEigenvectorNorm(const ComplexVector &x, ComplexVector &Bx) const; + + // Helper routine for computing the eigenpair residual. + virtual double GetResidualNorm(std::complex l, const ComplexVector &x, + ComplexVector &r) const = 0; + + // Helper routine for computing the backward error. + virtual double GetBackwardScaling(std::complex l) const = 0; + + // Return problem type name. + virtual const char *GetName() const = 0; + +public: + ArpackEigenvalueSolver(MPI_Comm comm, int print); + + // For the linear generalized case, the linear solver should be configured to compute the + // action of M⁻¹ (with no spectral transformation) or (K - σ M)⁻¹. 
For the quadratic + // case, the linear solver should be configured to compute the action of M⁻¹ (with no + // spectral transformation) or P(σ)⁻¹. + void SetLinearSolver(ComplexKspSolver &ksp) override; + + // Set the projection operator for enforcing the divergence-free constraint. + void SetDivFreeProjector(const DivFreeSolver &divfree) override; + + // Set optional B matrix used for weighted inner products. This must be set explicitly + // even for generalized problems, otherwise the identity will be used. + void SetBMat(const Operator &B) override; + + // Get scaling factors used by the solver. + double GetScalingGamma() const override { return gamma; } + double GetScalingDelta() const override { return delta; } + + // Set the number of required eigenmodes. + void SetNumModes(int num_eig, int num_vec = 0) override; + + // Set solver tolerance. + void SetTol(double tol) override; + + // Set maximum number of Arnoldi update iterations. + void SetMaxIter(int max_it) override; + + // Set target spectrum for the eigensolver. When a spectral transformation is used, this + // applies to the spectrum of the shifted operator. + void SetWhichEigenpairs(WhichType type) override; + + // Set shift-and-invert spectral transformation. + void SetShiftInvert(std::complex s, bool precond = false) override; + + // Set an initial vector for the solution subspace. + void SetInitialSpace(const ComplexVector &v) override; + + // Solve the eigenvalue problem. Returns the number of converged eigenvalues. + int Solve() override = 0; + + // Get the corresponding eigenvalue. + std::complex GetEigenvalue(int i) const override; + + // Get the corresponding eigenvector. Eigenvectors are normalized such that ||x||₂ = 1, + // unless the B-matrix is set for weighted inner products. + void GetEigenvector(int i, ComplexVector &x) const override; + + // Get the corresponding eigenpair error. + double GetError(int i, ErrorType type) const override; + + // Re-normalize the given number of eigenvectors, for example if the matrix B for weighted + // inner products has changed. This does not perform re-orthogonalization with respect to + // the new matrix, only normalization. + void RescaleEigenvectors(int num_eig) override; +}; + +// Generalized eigenvalue problem solver: K x = λ M x . +class ArpackEPSSolver : public ArpackEigenvalueSolver +{ +private: + // References to matrices defining the generalized eigenvalue problem (not owned). + const ComplexOperator *opK, *opM; + + // Operator norms for scaling. + mutable double normK, normM; + +protected: + void ApplyOp(const std::complex *px, std::complex *py) const override; + void ApplyOpB(const std::complex *px, std::complex *py) const override; + + double GetResidualNorm(std::complex l, const ComplexVector &x, + ComplexVector &r) const override; + + double GetBackwardScaling(std::complex l) const override; + + const char *GetName() const override { return "EPS"; } + +public: + ArpackEPSSolver(MPI_Comm comm, int print); + + using ArpackEigenvalueSolver::SetOperators; + void SetOperators(const ComplexOperator &K, const ComplexOperator &M, + ScaleType type) override; + + int Solve() override; +}; + +// Quadratic eigenvalue problem solver: P(λ) x = (K + λ C + λ² M) x = 0 . +class ArpackPEPSolver : public ArpackEigenvalueSolver +{ +private: + // References to matrices defining the quadratic polynomial eigenvalue problem + // (not owned). + const ComplexOperator *opK, *opC, *opM; + + // Operator norms for scaling. 
+ mutable double normK, normC, normM; + + // Workspace vectors for operator applications. + mutable ComplexVector x2, y2; + +protected: + void ApplyOp(const std::complex *px, std::complex *py) const override; + void ApplyOpB(const std::complex *px, std::complex *py) const override; + + double GetResidualNorm(std::complex l, const ComplexVector &x, + ComplexVector &r) const override; + + double GetBackwardScaling(std::complex l) const override; + + const char *GetName() const override { return "PEP"; } + +public: + ArpackPEPSolver(MPI_Comm comm, int print); + + using ArpackEigenvalueSolver::SetOperators; + void SetOperators(const ComplexOperator &K, const ComplexOperator &C, + const ComplexOperator &M, ScaleType type) override; + + int Solve() override; +}; + +} // namespace arpack + +} // namespace palace + +#endif + +#endif // PALACE_LINALG_ARPACK_HPP diff --git a/palace/linalg/chebyshev.cpp b/palace/linalg/chebyshev.cpp index 63f90e1671..0d47f66441 100644 --- a/palace/linalg/chebyshev.cpp +++ b/palace/linalg/chebyshev.cpp @@ -1,302 +1,301 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "chebyshev.hpp" - -#include -#include "linalg/rap.hpp" - -namespace palace -{ - -namespace -{ - -double GetLambdaMax(MPI_Comm comm, const Operator &A, const Vector &dinv) -{ - DiagonalOperator Dinv(dinv); - ProductOperator DinvA(Dinv, A); - return linalg::SpectralNorm(comm, DinvA, false); -} - -double GetLambdaMax(MPI_Comm comm, const ComplexOperator &A, const ComplexVector &dinv) -{ - ComplexDiagonalOperator Dinv(dinv); - ComplexProductOperator DinvA(Dinv, A); - return linalg::SpectralNorm(comm, DinvA, false); -} - -template -inline void ApplyOp(const Operator &A, const Vector &x, Vector &y) -{ - A.Mult(x, y); -} - -template -inline void ApplyOp(const ComplexOperator &A, const ComplexVector &x, ComplexVector &y) -{ - if constexpr (!Transpose) - { - A.Mult(x, y); - } - else - { - A.MultHermitianTranspose(x, y); - } -} - -template -inline void ApplyOp(const Operator &A, const Vector &x, Vector &y, const double a) -{ - A.AddMult(x, y, a); -} - -template -inline void ApplyOp(const ComplexOperator &A, const ComplexVector &x, ComplexVector &y, - const double a) -{ - if constexpr (!Transpose) - { - A.AddMult(x, y, a); - } - else - { - A.AddMultHermitianTranspose(x, y, a); - } -} - -template -inline void ApplyOrder0(double sr, const Vector &dinv, const Vector &r, Vector &d) -{ - const int N = d.Size(); - const auto *DI = dinv.Read(); - const auto *R = r.Read(); - auto *D = d.Write(); - mfem::forall(N, [=] MFEM_HOST_DEVICE(int i) { D[i] = sr * DI[i] * R[i]; }); -} - -template -inline void ApplyOrder0(const double sr, const ComplexVector &dinv, const ComplexVector &r, - ComplexVector &d) -{ - const int N = dinv.Size(); - const auto *DIR = dinv.Real().Read(); - const auto *DII = dinv.Imag().Read(); - const auto *RR = r.Real().Read(); - const auto *RI = r.Imag().Read(); - auto *DR = d.Real().Write(); - auto *DI = d.Imag().Write(); - if constexpr (!Transpose) - { - mfem::forall(N, - [=] MFEM_HOST_DEVICE(int i) - { - DR[i] = sr * (DIR[i] * RR[i] - DII[i] * RI[i]); - DI[i] = sr * (DII[i] * RR[i] + DIR[i] * RI[i]); - }); - } - else - { - mfem::forall(N, - [=] MFEM_HOST_DEVICE(int i) - { - DR[i] = sr * (DIR[i] * RR[i] + DII[i] * RI[i]); - DI[i] = sr * (-DII[i] * RR[i] + DIR[i] * RI[i]); - }); - } -} - -template -inline void ApplyOrderK(const double sd, const double sr, const Vector &dinv, - const Vector &r, Vector &d) -{ - const int N = 
dinv.Size(); - const auto *DI = dinv.Read(); - const auto *R = r.Read(); - auto *D = d.ReadWrite(); - mfem::forall(N, [=] MFEM_HOST_DEVICE(int i) { D[i] = sd * D[i] + sr * DI[i] * R[i]; }); -} - -template -inline void ApplyOrderK(const double sd, const double sr, const ComplexVector &dinv, - const ComplexVector &r, ComplexVector &d) -{ - const int N = dinv.Size(); - const auto *DIR = dinv.Real().Read(); - const auto *DII = dinv.Imag().Read(); - const auto *RR = r.Real().Read(); - const auto *RI = r.Imag().Read(); - auto *DR = d.Real().ReadWrite(); - auto *DI = d.Imag().ReadWrite(); - if constexpr (!Transpose) - { - mfem::forall(N, - [=] MFEM_HOST_DEVICE(int i) - { - DR[i] = sd * DR[i] + sr * (DIR[i] * RR[i] - DII[i] * RI[i]); - DI[i] = sd * DI[i] + sr * (DII[i] * RR[i] + DIR[i] * RI[i]); - }); - } - else - { - mfem::forall(N, - [=] MFEM_HOST_DEVICE(int i) - { - DR[i] = sd * DR[i] + sr * (DIR[i] * RR[i] + DII[i] * RI[i]); - DI[i] = sd * DI[i] + sr * (-DII[i] * RR[i] + DIR[i] * RI[i]); - }); - } -} - -} // namespace - -template -ChebyshevSmoother::ChebyshevSmoother(int smooth_it, int poly_order, double sf_max) - : Solver(), pc_it(smooth_it), order(poly_order), A(nullptr), lambda_max(0.0), - sf_max(sf_max) -{ - MFEM_VERIFY(order > 0, "Polynomial order for Chebyshev smoothing must be positive!"); -} - -template -void ChebyshevSmoother::SetOperator(const OperType &op) -{ - using ParOperType = - typename std::conditional::value, - ComplexParOperator, ParOperator>::type; - - A = &op; - r.SetSize(op.Height()); - d.SetSize(op.Height()); - - const auto *PtAP = dynamic_cast(&op); - MFEM_VERIFY(PtAP, - "ChebyshevSmoother requires a ParOperator or ComplexParOperator operator!"); - dinv.SetSize(op.Height()); - PtAP->AssembleDiagonal(dinv); - dinv.Reciprocal(); - - // Set up Chebyshev coefficients using the computed maximum eigenvalue estimate. See - // mfem::OperatorChebyshevSmoother or Adams et al. (2003). - lambda_max = sf_max * GetLambdaMax(PtAP->GetComm(), *A, dinv); - - this->height = op.Height(); - this->width = op.Width(); -} - -template -void ChebyshevSmoother::Mult(const VecType &x, VecType &y) const -{ - // Apply smoother: y = y + p(A) (x - A y) . - for (int it = 0; it < pc_it; it++) - { - if (this->initial_guess || it > 0) - { - ApplyOp(*A, y, r); - linalg::AXPBY(1.0, x, -1.0, r); - } - else - { - r = x; - y = 0.0; - } - - // 4th-kind Chebyshev smoother, from Phillips and Fischer or Lottes (with k -> k + 1 - // shift due to 1-based indexing). 
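
The scalings applied in the loop that follows come from the 4th-kind Chebyshev construction cited above. As a standalone sanity check (illustrative only, independent of the smoother classes in this file), the snippet below reproduces the s_d and s_r coefficient schedule for an assumed lambda_max:

```cpp
// Illustrative only: the 4th-kind Chebyshev scalings used in the smoother loop, where the
// order-0 step uses 4 / (3 lambda_max) and subsequent steps use (sd, sr) below.
#include <cstdio>

int main()
{
  const double lambda_max = 2.0;  // Assumed D^-1 A eigenvalue estimate for this example.
  const int order = 4;
  std::printf("k = 0: sr = %.6f\n", 4.0 / (3.0 * lambda_max));
  for (int k = 1; k < order; k++)
  {
    const double sd = (2.0 * k - 1.0) / (2.0 * k + 3.0);
    const double sr = (8.0 * k + 4.0) / ((2.0 * k + 3.0) * lambda_max);
    std::printf("k = %d: sd = %.6f, sr = %.6f\n", k, sd, sr);
  }
  return 0;
}
```

The same schedule appears unchanged in the new Mult2 implementation further down; the patch only changes how the residual workspace is provided.
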
- ApplyOrder0(4.0 / (3.0 * lambda_max), dinv, r, d); - for (int k = 1; k < order; k++) - { - y += d; - ApplyOp(*A, d, r, -1.0); - const double sd = (2.0 * k - 1.0) / (2.0 * k + 3.0); - const double sr = (8.0 * k + 4.0) / ((2.0 * k + 3.0) * lambda_max); - ApplyOrderK(sd, sr, dinv, r, d); - } - y += d; - } -} - -template -ChebyshevSmoother1stKind::ChebyshevSmoother1stKind(int smooth_it, int poly_order, - double sf_max, double sf_min) - : Solver(), pc_it(smooth_it), order(poly_order), A(nullptr), theta(0.0), - sf_max(sf_max), sf_min(sf_min) -{ - MFEM_VERIFY(order > 0, "Polynomial order for Chebyshev smoothing must be positive!"); -} - -template -void ChebyshevSmoother1stKind::SetOperator(const OperType &op) -{ - using ParOperType = - typename std::conditional::value, - ComplexParOperator, ParOperator>::type; - - A = &op; - r.SetSize(op.Height()); - d.SetSize(op.Height()); - - const auto *PtAP = dynamic_cast(&op); - MFEM_VERIFY( - PtAP, - "ChebyshevSmoother1stKind requires a ParOperator or ComplexParOperator operator!"); - dinv.SetSize(op.Height()); - PtAP->AssembleDiagonal(dinv); - dinv.Reciprocal(); - - // Set up Chebyshev coefficients using the computed maximum eigenvalue estimate. The - // optimized estimate of lambda_min comes from (2.24) of Phillips and Fischer (2022). - if (sf_min <= 0.0) - { - sf_min = 1.69 / (std::pow(order, 1.68) + 2.11 * order + 1.98); - } - const double lambda_max = sf_max * GetLambdaMax(PtAP->GetComm(), *A, dinv); - const double lambda_min = sf_min * lambda_max; - theta = 0.5 * (lambda_max + lambda_min); - delta = 0.5 * (lambda_max - lambda_min); - - this->height = op.Height(); - this->width = op.Width(); -} - -template -void ChebyshevSmoother1stKind::Mult(const VecType &x, VecType &y) const -{ - // Apply smoother: y = y + p(A) (x - A y) . - for (int it = 0; it < pc_it; it++) - { - if (this->initial_guess || it > 0) - { - ApplyOp(*A, y, r); - linalg::AXPBY(1.0, x, -1.0, r); - } - else - { - r = x; - y = 0.0; - } - - // 1th-kind Chebyshev smoother, from Phillips and Fischer or Adams. - ApplyOrder0(1.0 / theta, dinv, r, d); - double rhop = delta / theta; - for (int k = 1; k < order; k++) - { - y += d; - ApplyOp(*A, d, r, -1.0); - const double rho = 1.0 / (2.0 * theta / delta - rhop); - const double sd = rho * rhop; - const double sr = 2.0 * rho / delta; - ApplyOrderK(sd, sr, dinv, r, d); - rhop = rho; - } - y += d; - } -} - -template class ChebyshevSmoother; -template class ChebyshevSmoother; - -template class ChebyshevSmoother1stKind; -template class ChebyshevSmoother1stKind; - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "chebyshev.hpp" + +#include + +namespace palace +{ + +namespace +{ + +double GetLambdaMax(MPI_Comm comm, const Operator &A, const Vector &dinv) +{ + // Assumes A SPD (diag(A) > 0) to use Hermitian eigenvalue solver. + DiagonalOperator Dinv(dinv); + ProductOperator DinvA(Dinv, A); + return linalg::SpectralNorm(comm, DinvA, true); +} + +double GetLambdaMax(MPI_Comm comm, const ComplexOperator &A, const ComplexVector &dinv) +{ + // Assumes A SPD (diag(A) > 0) to use Hermitian eigenvalue solver. 
+ ComplexDiagonalOperator Dinv(dinv); + ComplexProductOperator DinvA(Dinv, A); + return linalg::SpectralNorm(comm, DinvA, A.IsReal()); +} + +template +inline void ApplyOp(const Operator &A, const Vector &x, Vector &y) +{ + A.Mult(x, y); +} + +template +inline void ApplyOp(const ComplexOperator &A, const ComplexVector &x, ComplexVector &y) +{ + if constexpr (!Transpose) + { + A.Mult(x, y); + } + else + { + A.MultHermitianTranspose(x, y); + } +} + +template +inline void ApplyOp(const Operator &A, const Vector &x, Vector &y, const double a) +{ + A.AddMult(x, y, a); +} + +template +inline void ApplyOp(const ComplexOperator &A, const ComplexVector &x, ComplexVector &y, + const double a) +{ + if constexpr (!Transpose) + { + A.AddMult(x, y, a); + } + else + { + A.AddMultHermitianTranspose(x, y, a); + } +} + +template +inline void ApplyOrder0(double sr, const Vector &dinv, const Vector &r, Vector &d) +{ + const bool use_dev = dinv.UseDevice() || r.UseDevice() || d.UseDevice(); + const int N = d.Size(); + const auto *DI = dinv.Read(use_dev); + const auto *R = r.Read(use_dev); + auto *D = d.Write(use_dev); + mfem::forall_switch(use_dev, N, + [=] MFEM_HOST_DEVICE(int i) { D[i] = sr * DI[i] * R[i]; }); +} + +template +inline void ApplyOrder0(const double sr, const ComplexVector &dinv, const ComplexVector &r, + ComplexVector &d) +{ + const bool use_dev = dinv.UseDevice() || r.UseDevice() || d.UseDevice(); + const int N = dinv.Size(); + const auto *DIR = dinv.Real().Read(use_dev); + const auto *DII = dinv.Imag().Read(use_dev); + const auto *RR = r.Real().Read(use_dev); + const auto *RI = r.Imag().Read(use_dev); + auto *DR = d.Real().Write(use_dev); + auto *DI = d.Imag().Write(use_dev); + if constexpr (!Transpose) + { + mfem::forall_switch(use_dev, N, + [=] MFEM_HOST_DEVICE(int i) + { + DR[i] = sr * (DIR[i] * RR[i] - DII[i] * RI[i]); + DI[i] = sr * (DII[i] * RR[i] + DIR[i] * RI[i]); + }); + } + else + { + mfem::forall_switch(use_dev, N, + [=] MFEM_HOST_DEVICE(int i) + { + DR[i] = sr * (DIR[i] * RR[i] + DII[i] * RI[i]); + DI[i] = sr * (-DII[i] * RR[i] + DIR[i] * RI[i]); + }); + } +} + +template +inline void ApplyOrderK(const double sd, const double sr, const Vector &dinv, + const Vector &r, Vector &d) +{ + const bool use_dev = dinv.UseDevice() || r.UseDevice() || d.UseDevice(); + const int N = dinv.Size(); + const auto *DI = dinv.Read(use_dev); + const auto *R = r.Read(use_dev); + auto *D = d.ReadWrite(use_dev); + mfem::forall_switch(use_dev, N, [=] MFEM_HOST_DEVICE(int i) + { D[i] = sd * D[i] + sr * DI[i] * R[i]; }); +} + +template +inline void ApplyOrderK(const double sd, const double sr, const ComplexVector &dinv, + const ComplexVector &r, ComplexVector &d) +{ + const bool use_dev = dinv.UseDevice() || r.UseDevice() || d.UseDevice(); + const int N = dinv.Size(); + const auto *DIR = dinv.Real().Read(use_dev); + const auto *DII = dinv.Imag().Read(use_dev); + const auto *RR = r.Real().Read(use_dev); + const auto *RI = r.Imag().Read(use_dev); + auto *DR = d.Real().ReadWrite(use_dev); + auto *DI = d.Imag().ReadWrite(use_dev); + if constexpr (!Transpose) + { + mfem::forall_switch(use_dev, N, + [=] MFEM_HOST_DEVICE(int i) + { + DR[i] = sd * DR[i] + sr * (DIR[i] * RR[i] - DII[i] * RI[i]); + DI[i] = sd * DI[i] + sr * (DII[i] * RR[i] + DIR[i] * RI[i]); + }); + } + else + { + mfem::forall_switch(use_dev, N, + [=] MFEM_HOST_DEVICE(int i) + { + DR[i] = sd * DR[i] + sr * (DIR[i] * RR[i] + DII[i] * RI[i]); + DI[i] = sd * DI[i] + sr * (-DII[i] * RR[i] + DIR[i] * RI[i]); + }); + } +} + +} // namespace + +template 
+ChebyshevSmoother::ChebyshevSmoother(MPI_Comm comm, int smooth_it, int poly_order, + double sf_max) + : Solver(), comm(comm), pc_it(smooth_it), order(poly_order), A(nullptr), + lambda_max(0.0), sf_max(sf_max) +{ + MFEM_VERIFY(order > 0, "Polynomial order for Chebyshev smoothing must be positive!"); +} + +template +void ChebyshevSmoother::SetOperator(const OperType &op) +{ + A = &op; + d.SetSize(op.Height()); + dinv.SetSize(op.Height()); + d.UseDevice(true); + dinv.UseDevice(true); + op.AssembleDiagonal(dinv); + dinv.Reciprocal(); + + // Set up Chebyshev coefficients using the computed maximum eigenvalue estimate. See + // mfem::OperatorChebyshevSmoother or Adams et al. (2003). + lambda_max = sf_max * GetLambdaMax(comm, *A, dinv); + MFEM_VERIFY(lambda_max > 0.0, + "Encountered zero maximum eigenvalue in Chebyshev smoother!"); + + this->height = op.Height(); + this->width = op.Width(); +} + +template +void ChebyshevSmoother::Mult2(const VecType &x, VecType &y, VecType &r) const +{ + // Apply smoother: y = y + p(A) (x - A y) . + for (int it = 0; it < pc_it; it++) + { + if (this->initial_guess || it > 0) + { + ApplyOp(*A, y, r); + linalg::AXPBY(1.0, x, -1.0, r); + } + else + { + r = x; + y = 0.0; + } + + // 4th-kind Chebyshev smoother, from Phillips and Fischer or Lottes (with k -> k + 1 + // shift due to 1-based indexing). + ApplyOrder0(4.0 / (3.0 * lambda_max), dinv, r, d); + for (int k = 1; k < order; k++) + { + y += d; + ApplyOp(*A, d, r, -1.0); + const double sd = (2.0 * k - 1.0) / (2.0 * k + 3.0); + const double sr = (8.0 * k + 4.0) / ((2.0 * k + 3.0) * lambda_max); + ApplyOrderK(sd, sr, dinv, r, d); + } + y += d; + } +} + +template +ChebyshevSmoother1stKind::ChebyshevSmoother1stKind(MPI_Comm comm, int smooth_it, + int poly_order, double sf_max, + double sf_min) + : Solver(), comm(comm), pc_it(smooth_it), order(poly_order), A(nullptr), + theta(0.0), sf_max(sf_max), sf_min(sf_min) +{ + MFEM_VERIFY(order > 0, "Polynomial order for Chebyshev smoothing must be positive!"); +} + +template +void ChebyshevSmoother1stKind::SetOperator(const OperType &op) +{ + A = &op; + d.SetSize(op.Height()); + dinv.SetSize(op.Height()); + d.UseDevice(true); + dinv.UseDevice(true); + op.AssembleDiagonal(dinv); + dinv.Reciprocal(); + + // Set up Chebyshev coefficients using the computed maximum eigenvalue estimate. The + // optimized estimate of lambda_min comes from (2.24) of Phillips and Fischer (2022). + if (sf_min <= 0.0) + { + sf_min = 1.69 / (std::pow(order, 1.68) + 2.11 * order + 1.98); + } + const double lambda_max = sf_max * GetLambdaMax(comm, *A, dinv); + MFEM_VERIFY(lambda_max > 0.0, + "Encountered zero maximum eigenvalue in Chebyshev smoother!"); + const double lambda_min = sf_min * lambda_max; + theta = 0.5 * (lambda_max + lambda_min); + delta = 0.5 * (lambda_max - lambda_min); + + this->height = op.Height(); + this->width = op.Width(); +} + +template +void ChebyshevSmoother1stKind::Mult2(const VecType &x, VecType &y, + VecType &r) const +{ + // Apply smoother: y = y + p(A) (x - A y) . + for (int it = 0; it < pc_it; it++) + { + if (this->initial_guess || it > 0) + { + ApplyOp(*A, y, r); + linalg::AXPBY(1.0, x, -1.0, r); + } + else + { + r = x; + y = 0.0; + } + + // 1th-kind Chebyshev smoother, from Phillips and Fischer or Adams. 
+ ApplyOrder0(1.0 / theta, dinv, r, d); + double rhop = delta / theta; + for (int k = 1; k < order; k++) + { + y += d; + ApplyOp(*A, d, r, -1.0); + const double rho = 1.0 / (2.0 * theta / delta - rhop); + const double sd = rho * rhop; + const double sr = 2.0 * rho / delta; + ApplyOrderK(sd, sr, dinv, r, d); + rhop = rho; + } + y += d; + } +} + +template class ChebyshevSmoother; +template class ChebyshevSmoother; + +template class ChebyshevSmoother1stKind; +template class ChebyshevSmoother1stKind; + +} // namespace palace diff --git a/palace/linalg/chebyshev.hpp b/palace/linalg/chebyshev.hpp index 56df15a444..64fdcf436c 100644 --- a/palace/linalg/chebyshev.hpp +++ b/palace/linalg/chebyshev.hpp @@ -1,99 +1,146 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LINALG_CHEBYSHEV_SMOOTHER_HPP -#define PALACE_LINALG_CHEBYSHEV_SMOOTHER_HPP - -#include "linalg/operator.hpp" -#include "linalg/solver.hpp" -#include "linalg/vector.hpp" - -namespace palace -{ - -// -// Matrix-free diagonally-scaled Chebyshev smoothing. This is largely the same as -// mfem::OperatorChebyshevSmoother allows a nonzero initial guess and uses alternative -// methods to estimate the largest eigenvalue. We use a smoother based on Chebyshev -// polynomials of the 4th-kind as proposed in recent work. -// Reference: Phillips and Fischer, Optimal Chebyshev smoothers and one-sided V-cycles, -// arXiv:2210.03179v1 (2022). -// -template -class ChebyshevSmoother : public Solver -{ - using VecType = typename Solver::VecType; - -private: - // Number of smoother iterations and polynomial order. - const int pc_it, order; - - // System matrix (not owned). - const OperType *A; - - // Inverse diagonal scaling of the operator (real-valued for now). - VecType dinv; - - // Maximum operator eigenvalue for Chebyshev polynomial smoothing. - double lambda_max, sf_max; - - // Temporary vectors for smoother application. - mutable VecType r, d; - -public: - ChebyshevSmoother(int smooth_it, int poly_order, double sf_max); - - void SetOperator(const OperType &op) override; - - void Mult(const VecType &x, VecType &y) const override; - - void MultTranspose(const VecType &x, VecType &y) const override - { - Mult(x, y); // Assumes operator symmetry - } -}; - -// -// Matrix-free diagonally-scaled Chebyshev smoothing using standard 1st-kind Chebyshev -// polynomials. -// Reference: Adams et al., Parallel multigrid smoothing: polynomial versus Gauss–Seidel, -// JCP (2003). -// -template -class ChebyshevSmoother1stKind : public Solver -{ - using VecType = typename Solver::VecType; - -private: - // Number of smoother iterations and polynomial order. - const int pc_it, order; - - // System matrix (not owned). - const OperType *A; - - // Inverse diagonal scaling of the operator (real-valued for now). - VecType dinv; - - // Parameters depending on maximum and minimum operator eigenvalue estimates for Chebyshev - // polynomial smoothing. - double theta, delta, sf_max, sf_min; - - // Temporary vectors for smoother application. - mutable VecType r, d; - -public: - ChebyshevSmoother1stKind(int smooth_it, int poly_order, double sf_max, double sf_min); - - void SetOperator(const OperType &op) override; - - void Mult(const VecType &x, VecType &y) const override; - - void MultTranspose(const VecType &x, VecType &y) const override - { - Mult(x, y); // Assumes operator symmetry - } -}; - -} // namespace palace - -#endif // PALACE_LINALG_CHEBYSHEV_SMOOTHER_HPP +// Copyright Amazon.com, Inc. 
or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LINALG_CHEBYSHEV_SMOOTHER_HPP +#define PALACE_LINALG_CHEBYSHEV_SMOOTHER_HPP + +#include "linalg/operator.hpp" +#include "linalg/solver.hpp" +#include "linalg/vector.hpp" + +namespace palace +{ + +// +// Matrix-free diagonally-scaled Chebyshev smoothing. This is largely the same as +// mfem::OperatorChebyshevSmoother allows a nonzero initial guess and uses alternative +// methods to estimate the largest eigenvalue. We use a smoother based on Chebyshev +// polynomials of the 4th-kind as proposed in recent work. +// Reference: Phillips and Fischer, Optimal Chebyshev smoothers and one-sided V-cycles, +// arXiv:2210.03179v1 (2022). +// +template +class ChebyshevSmoother : public Solver +{ + using VecType = typename Solver::VecType; + +private: + // MPI communicator associated with the solver operator and vectors. + MPI_Comm comm; + + // Number of smoother iterations and polynomial order. + const int pc_it, order; + + // System matrix (not owned). + const OperType *A; + + // Inverse diagonal scaling of the operator (real-valued for now). + VecType dinv; + + // Maximum operator eigenvalue for Chebyshev polynomial smoothing. + double lambda_max, sf_max; + + // Temporary vector for smoother application. + mutable VecType d, r; + +public: + ChebyshevSmoother(MPI_Comm comm, int smooth_it, int poly_order, double sf_max); + + void SetOperator(const OperType &op) override; + + void Mult(const VecType &x, VecType &y) const override + { + if (r.Size() != y.Size()) + { + r.SetSize(y.Size()); + r.UseDevice(true); + } + Mult2(x, y, r); + } + + void MultTranspose(const VecType &x, VecType &y) const override + { + if (r.Size() != y.Size()) + { + r.SetSize(y.Size()); + r.UseDevice(true); + } + MultTranspose2(x, y, r); + } + + void Mult2(const VecType &x, VecType &y, VecType &r) const override; + + void MultTranspose2(const VecType &x, VecType &y, VecType &r) const override + { + Mult2(x, y, r); // Assumes operator symmetry + } +}; + +// +// Matrix-free diagonally-scaled Chebyshev smoothing using standard 1st-kind Chebyshev +// polynomials. +// Reference: Adams et al., Parallel multigrid smoothing: polynomial versus Gauss–Seidel, +// JCP (2003). +// +template +class ChebyshevSmoother1stKind : public Solver +{ + using VecType = typename Solver::VecType; + +private: + // MPI communicator associated with the solver operator and vectors. + MPI_Comm comm; + + // Number of smoother iterations and polynomial order. + const int pc_it, order; + + // System matrix (not owned). + const OperType *A; + + // Inverse diagonal scaling of the operator (real-valued for now). + VecType dinv; + + // Parameters depending on maximum and minimum operator eigenvalue estimates for Chebyshev + // polynomial smoothing. + double theta, delta, sf_max, sf_min; + + // Temporary vector for smoother application. 
+ mutable VecType d, r; + +public: + ChebyshevSmoother1stKind(MPI_Comm comm, int smooth_it, int poly_order, double sf_max, + double sf_min); + + void SetOperator(const OperType &op) override; + + void Mult(const VecType &x, VecType &y) const override + { + if (r.Size() != y.Size()) + { + r.SetSize(y.Size()); + r.UseDevice(true); + } + Mult2(x, y, r); + } + + void MultTranspose(const VecType &x, VecType &y) const override + { + if (r.Size() != y.Size()) + { + r.SetSize(y.Size()); + r.UseDevice(true); + } + MultTranspose2(x, y, r); + } + + void Mult2(const VecType &x, VecType &y, VecType &r) const override; + + void MultTranspose2(const VecType &x, VecType &y, VecType &r) const override + { + Mult2(x, y, r); // Assumes operator symmetry + } +}; + +} // namespace palace + +#endif // PALACE_LINALG_CHEBYSHEV_SMOOTHER_HPP diff --git a/palace/linalg/densematrix.cpp b/palace/linalg/densematrix.cpp new file mode 100644 index 0000000000..cdedd61408 --- /dev/null +++ b/palace/linalg/densematrix.cpp @@ -0,0 +1,283 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "densematrix.hpp" + +#include +#include +#include +#include + +namespace palace +{ + +namespace +{ + +// Compute matrix functions for symmetric real-valued 2x2 or 3x3 matrices. Returns the +// matrix U * f(Λ) * U' for input U * Λ * U'. +// Reference: Deledalle et al., Closed-form expressions of the eigen decomposition of 2x2 +// and 3x3 Hermitian matrices, HAL hal-01501221 (2017). +mfem::DenseMatrix MatrixFunction(const mfem::DenseMatrix &M, + const std::function &functor) +{ + MFEM_ASSERT(M.Height() == M.Width(), + "MatrixFunction only available for square matrices!"); + const auto N = M.Height(); + constexpr auto tol = 10.0 * std::numeric_limits::epsilon(); + for (int i = 0; i < N; i++) + { + for (int j = i + 1; j < N; j++) + { + MFEM_VERIFY(std::abs(M(i, j) - M(j, i)) < tol, + "MatrixFunction only available for symmetric matrices (" + << M(i, j) << " != " << M(j, i) << ")!"); + } + } + mfem::DenseMatrix Mout(N, N); + Mout = 0.0; + if (N == 2) + { + MFEM_ABORT("2x2 MatrixFunction is not implemented yet!"); + } + else if (N == 3) + { + // Need to specialize based on the number of zeros and their locations. 
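
As a concrete reference for the nonzero-pattern specializations that follow, the fully diagonal case reduces MatrixFunction to an entry-wise application of the scalar function. The short standalone check below is illustrative only; it assumes the public linalg::MatrixSqrt wrapper defined later in this file and verifies that behavior for a diagonal 3x3 matrix:

```cpp
// Illustrative only: for a diagonal (trivially symmetric) 3x3 matrix, MatrixSqrt should
// return the entry-wise square root, so the diagonal of S * S recovers M.
#include <cmath>
#include <mfem.hpp>
#include "linalg/densematrix.hpp"

int main()
{
  mfem::DenseMatrix M(3, 3);
  M = 0.0;
  M(0, 0) = 4.0;
  M(1, 1) = 9.0;
  M(2, 2) = 16.0;
  const mfem::DenseMatrix S = palace::linalg::MatrixSqrt(M);
  mfem::DenseMatrix SS(3, 3);
  mfem::Mult(S, S, SS);  // SS = S * S.
  for (int i = 0; i < 3; i++)
  {
    MFEM_VERIFY(std::abs(SS(i, i) - M(i, i)) < 1.0e-12, "MatrixSqrt check failed!");
  }
  return 0;
}
```
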
+ const auto &a = M(0, 0), &b = M(1, 1), &c = M(2, 2); + const auto &d = M(0, 1), &e = M(1, 2), &f = M(0, 2); + const bool d_non_zero = std::abs(d) > tol; + const bool e_non_zero = std::abs(e) > tol; + const bool f_non_zero = std::abs(f) > tol; + if (!d_non_zero && !e_non_zero && !f_non_zero) + { + // a 0 0 + // 0 b 0 + // 0 0 c + for (int i = 0; i < 3; i++) + { + Mout(i, i) = functor(M(i, i)); + } + return Mout; + } + if (d_non_zero && !e_non_zero && !f_non_zero) + { + // a d 0 + // d b 0 + // 0 0 c + const double disc = std::sqrt(a * a - 2.0 * a * b + b * b + 4.0 * d * d); + const double lambda1 = c; + const double lambda2 = (a + b - disc) / 2.0; + const double lambda3 = (a + b + disc) / 2.0; + const mfem::Vector v1{{0.0, 0.0, 1.0}}; + const mfem::Vector v2{{-(-a + b + disc) / (2.0 * d), 1.0, 0.0}}; + const mfem::Vector v3{{-(-a + b - disc) / (2.0 * d), 1.0, 0.0}}; + AddMult_a_VVt(functor(lambda1), v1, Mout); + AddMult_a_VVt(functor(lambda2), v2, Mout); + AddMult_a_VVt(functor(lambda3), v3, Mout); + return Mout; + } + if (!d_non_zero && e_non_zero && !f_non_zero) + { + // a 0 0 + // 0 b e + // 0 e c + const double disc = std::sqrt(b * b - 2.0 * b * c + c * c + 4.0 * e * e); + const double lambda1 = a; + const double lambda2 = 0.5 * (b + c - disc); + const double lambda3 = 0.5 * (b + c + disc); + const mfem::Vector v1{{1.0, 0.0, 0.0}}; + const mfem::Vector v2{{0.0, -(-b + c + disc) / (2.0 * e), 1.0}}; + const mfem::Vector v3{{0.0, -(-b + c - disc) / (2.0 * e), 1.0}}; + AddMult_a_VVt(functor(lambda1), v1, Mout); + AddMult_a_VVt(functor(lambda2), v2, Mout); + AddMult_a_VVt(functor(lambda3), v3, Mout); + return Mout; + } + if (!d_non_zero && !e_non_zero && f_non_zero) + { + // a 0 f + // 0 b 0 + // f 0 c + const double disc = std::sqrt(a * a - 2.0 * a * c + c * c + 4.0 * f * f); + const double lambda1 = b; + const double lambda2 = 0.5 * (a + c - disc); + const double lambda3 = 0.5 * (a + c + disc); + const mfem::Vector v1{{0.0, 1.0, 0.0}}; + const mfem::Vector v2{{-(-a + c + disc) / (2.0 * f), 0.0, 1.0}}; + const mfem::Vector v3{{-(-a + c - disc) / (2.0 * f), 0.0, 1.0}}; + AddMult_a_VVt(functor(lambda1), v1, Mout); + AddMult_a_VVt(functor(lambda2), v2, Mout); + AddMult_a_VVt(functor(lambda3), v3, Mout); + return Mout; + } + if ((!d_non_zero && e_non_zero && f_non_zero) || + (d_non_zero && !e_non_zero && f_non_zero) || + (d_non_zero && e_non_zero && !f_non_zero)) + { + MFEM_ABORT("This nonzero pattern is not currently supported for MatrixFunction!"); + } + // General case for all nonzero: + // a d f + // d b e + // f e c + const double a2 = a * a, b2 = b * b, c2 = c * c, d2 = d * d, e2 = e * e, f2 = f * f; + const double a2mbmc = 2.0 * a - b - c; + const double b2mamc = 2.0 * b - a - c; + const double c2mamb = 2.0 * c - a - b; + const double x1 = a2 + b2 + c2 - a * b - b * c + 3.0 * (d2 + e2 + f2); + const double x2 = -(a2mbmc * b2mamc * c2mamb) + + 9.0 * (c2mamb * d2 + b2mamc * f2 + a2mbmc * e2) - 54.0 * d * e * f; + const double phi = std::atan2(std::sqrt(4.0 * x1 * x1 * x1 - x2 * x2), x2); + const double lambda1 = (a + b + c - 2.0 * std::sqrt(x1) * std::cos(phi / 3.0)) / 3.0; + const double lambda2 = + (a + b + c + 2.0 * std::sqrt(x1) * std::cos((phi - M_PI) / 3.0)) / 3.0; + const double lambda3 = + (a + b + c + 2.0 * std::sqrt(x1) * std::cos((phi + M_PI) / 3.0)) / 3.0; + + auto SafeDivide = [&](double x, double y) + { + if (std::abs(x) <= tol) + { + return 0.0; + } + if (std::abs(x) >= tol && std::abs(y) <= tol) + { + MFEM_ABORT("Logic error: Zero denominator with nonzero numerator!"); + 
return 0.0; + } + return x / y; + }; + const double m1 = SafeDivide(d * (c - lambda1) - e * f, f * (b - lambda1) - d * e); + const double m2 = SafeDivide(d * (c - lambda2) - e * f, f * (b - lambda2) - d * e); + const double m3 = SafeDivide(d * (c - lambda3) - e * f, f * (b - lambda3) - d * e); + const double l1mcmem1 = lambda1 - c - e * m1; + const double l2mcmem2 = lambda2 - c - e * m2; + const double l3mcmem3 = lambda3 - c - e * m3; + const double n1 = 1.0 + m1 * m1 + SafeDivide(std::pow(l1mcmem1, 2), f2); + const double n2 = 1.0 + m2 * m2 + SafeDivide(std::pow(l2mcmem2, 2), f2); + const double n3 = 1.0 + m3 * m3 + SafeDivide(std::pow(l3mcmem3, 2), f2); + const double tlambda1 = functor(lambda1) / n1; + const double tlambda2 = functor(lambda2) / n2; + const double tlambda3 = functor(lambda3) / n3; + + const double at = (tlambda1 * l1mcmem1 * l1mcmem1 + tlambda2 * l2mcmem2 * l2mcmem2 + + tlambda3 * l3mcmem3 * l3mcmem3) / + f2; + const double bt = tlambda1 * m1 * m1 + tlambda2 * m2 * m2 + tlambda3 * m3 * m3; + const double ct = tlambda1 + tlambda2 + tlambda3; + const double dt = + (tlambda1 * m1 * l1mcmem1 + tlambda2 * m2 * l2mcmem2 + tlambda3 * m3 * l3mcmem3) / + f; + const double et = tlambda1 * m1 + tlambda2 * m2 + tlambda3 * m3; + const double ft = (tlambda1 * l1mcmem1 + tlambda2 * l2mcmem2 + tlambda3 * l3mcmem3) / f; + Mout(0, 0) = at; + Mout(0, 1) = dt; + Mout(0, 2) = ft; + Mout(1, 0) = dt; + Mout(1, 1) = bt; + Mout(1, 2) = et; + Mout(2, 0) = ft; + Mout(2, 1) = et; + Mout(2, 2) = ct; + return Mout; + } + else + { + MFEM_ABORT("MatrixFunction only supports 2x2 or 3x3 matrices, N: " << N << "!"); + } + return Mout; +} + +} // namespace + +namespace linalg +{ + +mfem::DenseMatrix MatrixSqrt(const mfem::DenseMatrix &M) +{ + return MatrixFunction(M, [](auto s) { return std::sqrt(s); }); +} + +mfem::DenseTensor MatrixSqrt(const mfem::DenseTensor &T) +{ + mfem::DenseTensor S(T); + mfem::DenseMatrix buffS, buffT; + for (int k = 0; k < T.SizeK(); k++) + { + S(k, buffS) = MatrixSqrt(T(k, buffT)); + } + return S; +} + +mfem::DenseMatrix MatrixPow(const mfem::DenseMatrix &M, double p) +{ + return MatrixFunction(M, [p](auto s) { return std::pow(s, p); }); +} + +mfem::DenseTensor MatrixPow(const mfem::DenseTensor &T, double p) +{ + mfem::DenseTensor S(T); + mfem::DenseMatrix buffS, buffT; + for (int k = 0; k < T.SizeK(); k++) + { + S(k, buffS) = MatrixPow(T(k, buffT), p); + } + return S; +} + +double SingularValueMax(const mfem::DenseMatrix &M) +{ + MFEM_ASSERT( + M.Height() == M.Width() && M.Height() > 0 && M.Height() <= 3, + "Matrix singular values only available for square matrices of dimension <= 3!"); + const int N = M.Height(); + if (N == 1) + { + return M(0, 0); + } + else if (N == 2) + { + return mfem::kernels::CalcSingularvalue<2>(M.Data(), 0); + } + else + { + return mfem::kernels::CalcSingularvalue<3>(M.Data(), 0); + } +} + +double SingularValueMin(const mfem::DenseMatrix &M) +{ + MFEM_ASSERT( + M.Height() == M.Width() && M.Height() > 0 && M.Height() <= 3, + "Matrix singular values only available for square matrices of dimension <= 3!"); + const int N = M.Height(); + if (N == 1) + { + return M(0, 0); + } + else if (N == 2) + { + return mfem::kernels::CalcSingularvalue<2>(M.Data(), 1); + } + else + { + return mfem::kernels::CalcSingularvalue<3>(M.Data(), 2); + } +} + +mfem::DenseTensor Mult(const mfem::DenseTensor &A, const mfem::DenseTensor &B) +{ + MFEM_VERIFY(A.SizeK() == B.SizeK(), + "Size mismatch for product of two DenseTensor objects!"); + mfem::DenseTensor C(A.SizeI(), B.SizeJ(), 
A.SizeK()); + mfem::DenseMatrix buffA, buffB, buffC; + for (int k = 0; k < C.SizeK(); k++) + { + Mult(A(k, buffA), B(k, buffB), C(k, buffC)); + } + return C; +} + +} // namespace linalg + +} // namespace palace diff --git a/palace/linalg/densematrix.hpp b/palace/linalg/densematrix.hpp new file mode 100644 index 0000000000..5bb393b719 --- /dev/null +++ b/palace/linalg/densematrix.hpp @@ -0,0 +1,40 @@ + +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LINALG_DENSE_MATRIX_HPP +#define PALACE_LINALG_DENSE_MATRIX_HPP + +namespace mfem +{ + +class DenseMatrix; +class DenseTensor; + +} // namespace mfem + +namespace palace::linalg +{ + +// +// Functionality for manipulating small dense matrices which extends the capabilities of +// mfem::DenseMatrix. +// + +mfem::DenseMatrix MatrixSqrt(const mfem::DenseMatrix &M); + +mfem::DenseTensor MatrixSqrt(const mfem::DenseTensor &T); + +mfem::DenseMatrix MatrixPow(const mfem::DenseMatrix &M, double p); + +mfem::DenseTensor MatrixPow(const mfem::DenseTensor &T, double p); + +double SingularValueMax(const mfem::DenseMatrix &M); + +double SingularValueMin(const mfem::DenseMatrix &M); + +mfem::DenseTensor Mult(const mfem::DenseTensor &A, const mfem::DenseTensor &B); + +} // namespace palace::linalg + +#endif // PALACE_LINALG_DENSE_MATRIX_HPP diff --git a/palace/linalg/distrelaxation.cpp b/palace/linalg/distrelaxation.cpp index 7ef9d36634..da9f8a3041 100644 --- a/palace/linalg/distrelaxation.cpp +++ b/palace/linalg/distrelaxation.cpp @@ -1,152 +1,156 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "distrelaxation.hpp" - -#include -#include "linalg/chebyshev.hpp" -#include "linalg/rap.hpp" - -namespace palace -{ - -template -DistRelaxationSmoother::DistRelaxationSmoother( - const Operator &G, int smooth_it, int cheby_smooth_it, int cheby_order, - double cheby_sf_max, double cheby_sf_min, bool cheby_4th_kind) - : Solver(), pc_it(smooth_it), G(&G), A(nullptr), A_G(nullptr), - dbc_tdof_list_G(nullptr) -{ - // Initialize smoothers. - if (cheby_4th_kind) - { - B = std::make_unique>(cheby_smooth_it, cheby_order, - cheby_sf_max); - B_G = std::make_unique>(cheby_smooth_it, cheby_order, - cheby_sf_max); - } - else - { - B = std::make_unique>(cheby_smooth_it, cheby_order, - cheby_sf_max, cheby_sf_min); - B_G = std::make_unique>(cheby_smooth_it, cheby_order, - cheby_sf_max, cheby_sf_min); - } - B_G->SetInitialGuess(false); -} - -template -void DistRelaxationSmoother::SetOperators(const OperType &op, - const OperType &op_G) -{ - using ParOperType = - typename std::conditional::value, - ComplexParOperator, ParOperator>::type; - - MFEM_VERIFY(op.Height() == G->Height() && op.Width() == G->Height() && - op_G.Height() == G->Width() && op_G.Width() == G->Width(), - "Invalid operator sizes for DistRelaxationSmoother!"); - A = &op; - A_G = &op_G; - r.SetSize(op.Height()); - x_G.SetSize(op_G.Height()); - y_G.SetSize(op_G.Height()); - - const auto *PtAP_G = dynamic_cast(&op_G); - MFEM_VERIFY(PtAP_G, - "ChebyshevSmoother requires a ParOperator or ComplexParOperator operator!"); - dbc_tdof_list_G = PtAP_G->GetEssentialTrueDofs(); - - // Set up smoothers for A and A_G. 
- B->SetOperator(op); - B_G->SetOperator(op_G); - - this->height = op.Height(); - this->width = op.Width(); -} - -namespace -{ - -inline void RealAddMult(const Operator &op, const Vector &x, Vector &y) -{ - op.AddMult(x, y, 1.0); -} - -inline void RealAddMult(const Operator &op, const ComplexVector &x, ComplexVector &y) -{ - op.AddMult(x.Real(), y.Real(), 1.0); - op.AddMult(x.Imag(), y.Imag(), 1.0); -} - -inline void RealMultTranspose(const Operator &op, const Vector &x, Vector &y) -{ - op.MultTranspose(x, y); -} - -inline void RealMultTranspose(const Operator &op, const ComplexVector &x, ComplexVector &y) -{ - op.MultTranspose(x.Real(), y.Real()); - op.MultTranspose(x.Imag(), y.Imag()); -} - -} // namespace - -template -void DistRelaxationSmoother::Mult(const VecType &x, VecType &y) const -{ - // Apply smoother. - for (int it = 0; it < pc_it; it++) - { - // y = y + B (x - A y) - B->SetInitialGuess(this->initial_guess || it > 0); - B->Mult(x, y); - - // y = y + G B_G Gᵀ (x - A y) - A->Mult(y, r); - linalg::AXPBY(1.0, x, -1.0, r); - RealMultTranspose(*G, r, x_G); - if (dbc_tdof_list_G) - { - linalg::SetSubVector(x_G, *dbc_tdof_list_G, 0.0); - } - B_G->Mult(x_G, y_G); - RealAddMult(*G, y_G, y); - } -} - -template -void DistRelaxationSmoother::MultTranspose(const VecType &x, VecType &y) const -{ - // Apply transpose. - B->SetInitialGuess(true); - for (int it = 0; it < pc_it; it++) - { - // y = y + G B_Gᵀ Gᵀ (x - A y) - if (this->initial_guess || it > 0) - { - A->Mult(y, r); - linalg::AXPBY(1.0, x, -1.0, r); - RealMultTranspose(*G, r, x_G); - } - else - { - y = 0.0; - RealMultTranspose(*G, x, x_G); - } - if (dbc_tdof_list_G) - { - linalg::SetSubVector(x_G, *dbc_tdof_list_G, 0.0); - } - B_G->MultTranspose(x_G, y_G); - RealAddMult(*G, y_G, y); - - // y = y + Bᵀ (x - A y) - B->MultTranspose(x, y); - } -} - -template class DistRelaxationSmoother; -template class DistRelaxationSmoother; - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "distrelaxation.hpp" + +#include +#include "linalg/chebyshev.hpp" +#include "linalg/rap.hpp" + +namespace palace +{ + +template +DistRelaxationSmoother::DistRelaxationSmoother( + MPI_Comm comm, const Operator &G, int smooth_it, int cheby_smooth_it, int cheby_order, + double cheby_sf_max, double cheby_sf_min, bool cheby_4th_kind) + : Solver(), pc_it(smooth_it), G(&G), A(nullptr), A_G(nullptr), + dbc_tdof_list_G(nullptr) +{ + // Initialize smoothers. 
+ if (cheby_4th_kind) + { + B = std::make_unique>(comm, cheby_smooth_it, cheby_order, + cheby_sf_max); + B_G = std::make_unique>(comm, cheby_smooth_it, cheby_order, + cheby_sf_max); + } + else + { + B = std::make_unique>( + comm, cheby_smooth_it, cheby_order, cheby_sf_max, cheby_sf_min); + B_G = std::make_unique>( + comm, cheby_smooth_it, cheby_order, cheby_sf_max, cheby_sf_min); + } + B_G->SetInitialGuess(false); +} + +template +void DistRelaxationSmoother::SetOperators(const OperType &op, + const OperType &op_G) +{ + using ParOperType = + typename std::conditional::value, + ComplexParOperator, ParOperator>::type; + + MFEM_VERIFY(op.Height() == G->Height() && op.Width() == G->Height() && + op_G.Height() == G->Width() && op_G.Width() == G->Width(), + "Invalid operator sizes for DistRelaxationSmoother!"); + A = &op; + A_G = &op_G; + x_G.SetSize(op_G.Height()); + y_G.SetSize(op_G.Height()); + r_G.SetSize(op_G.Height()); + x_G.UseDevice(true); + y_G.UseDevice(true); + r_G.UseDevice(true); + + const auto *PtAP_G = dynamic_cast(&op_G); + MFEM_VERIFY(PtAP_G, + "ChebyshevSmoother requires a ParOperator or ComplexParOperator operator!"); + dbc_tdof_list_G = PtAP_G->GetEssentialTrueDofs(); + + // Set up smoothers for A and A_G. + B->SetOperator(op); + B_G->SetOperator(op_G); + + this->height = op.Height(); + this->width = op.Width(); +} + +namespace +{ + +inline void RealAddMult(const Operator &op, const Vector &x, Vector &y) +{ + op.AddMult(x, y, 1.0); +} + +inline void RealAddMult(const Operator &op, const ComplexVector &x, ComplexVector &y) +{ + op.AddMult(x.Real(), y.Real(), 1.0); + op.AddMult(x.Imag(), y.Imag(), 1.0); +} + +inline void RealMultTranspose(const Operator &op, const Vector &x, Vector &y) +{ + op.MultTranspose(x, y); +} + +inline void RealMultTranspose(const Operator &op, const ComplexVector &x, ComplexVector &y) +{ + op.MultTranspose(x.Real(), y.Real()); + op.MultTranspose(x.Imag(), y.Imag()); +} + +} // namespace + +template +void DistRelaxationSmoother::Mult2(const VecType &x, VecType &y, VecType &r) const +{ + // Apply smoother. + for (int it = 0; it < pc_it; it++) + { + // y = y + B (x - A y) + B->SetInitialGuess(this->initial_guess || it > 0); + B->Mult2(x, y, r); + + // y = y + G B_G Gᵀ (x - A y) + A->Mult(y, r); + linalg::AXPBY(1.0, x, -1.0, r); + RealMultTranspose(*G, r, x_G); + if (dbc_tdof_list_G) + { + linalg::SetSubVector(x_G, *dbc_tdof_list_G, 0.0); + } + B_G->Mult2(x_G, y_G, r_G); + RealAddMult(*G, y_G, y); + } +} + +template +void DistRelaxationSmoother::MultTranspose2(const VecType &x, VecType &y, + VecType &r) const +{ + // Apply transpose. + B->SetInitialGuess(true); + for (int it = 0; it < pc_it; it++) + { + // y = y + G B_Gᵀ Gᵀ (x - A y) + if (this->initial_guess || it > 0) + { + A->Mult(y, r); + linalg::AXPBY(1.0, x, -1.0, r); + RealMultTranspose(*G, r, x_G); + } + else + { + y = 0.0; + RealMultTranspose(*G, x, x_G); + } + if (dbc_tdof_list_G) + { + linalg::SetSubVector(x_G, *dbc_tdof_list_G, 0.0); + } + B_G->MultTranspose2(x_G, y_G, r_G); + RealAddMult(*G, y_G, y); + + // y = y + Bᵀ (x - A y) + B->MultTranspose2(x, y, r); + } +} + +template class DistRelaxationSmoother; +template class DistRelaxationSmoother; + +} // namespace palace diff --git a/palace/linalg/distrelaxation.hpp b/palace/linalg/distrelaxation.hpp index c447f71861..8c1992695c 100644 --- a/palace/linalg/distrelaxation.hpp +++ b/palace/linalg/distrelaxation.hpp @@ -1,71 +1,92 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LINALG_DIST_RELAXATION_SMOOTHER_HPP -#define PALACE_LINALG_DIST_RELAXATION_SMOOTHER_HPP - -#include -#include "linalg/operator.hpp" -#include "linalg/solver.hpp" -#include "linalg/vector.hpp" - -namespace mfem -{ - -template -class Array; - -} // namespace mfem - -namespace palace -{ - -// -// Hiptmair distributive relaxation smoother applying smoothers to both the operator in the -// primary space as well as its projection into an auxiliary space. -// Reference: Hiptmair, Multigrid method for Maxwell's equations, SIAM J. Numer. Anal. -// (1998). -// -template -class DistRelaxationSmoother : public Solver -{ - using VecType = typename Solver::VecType; - -private: - // Number of smoother iterations. - const int pc_it; - - // Discrete gradient matrix (not owned). - const Operator *G; - - // System matrix and its projection GᵀAG (not owned). - const OperType *A, *A_G; - const mfem::Array *dbc_tdof_list_G; - - // Point smoother objects for each matrix. - mutable std::unique_ptr> B; - std::unique_ptr> B_G; - - // Temporary vectors for smoother application. - mutable VecType r, x_G, y_G; - -public: - DistRelaxationSmoother(const Operator &G, int smooth_it, int cheby_smooth_it, - int cheby_order, double cheby_sf_max, double cheby_sf_min, - bool cheby_4th_kind); - - void SetOperator(const OperType &op) override - { - MFEM_ABORT("SetOperator with a single operator is not implemented for " - "DistRelaxationSmoother, use the two argument signature instead!"); - } - void SetOperators(const OperType &op, const OperType &op_G); - - void Mult(const VecType &x, VecType &y) const override; - - void MultTranspose(const VecType &x, VecType &y) const override; -}; - -} // namespace palace - -#endif // PALACE_LINALG_DIST_RELAXATION_SMOOTHER_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LINALG_DIST_RELAXATION_SMOOTHER_HPP +#define PALACE_LINALG_DIST_RELAXATION_SMOOTHER_HPP + +#include +#include "linalg/operator.hpp" +#include "linalg/solver.hpp" +#include "linalg/vector.hpp" + +namespace mfem +{ + +template +class Array; + +} // namespace mfem + +namespace palace +{ + +// +// Hiptmair distributive relaxation smoother applying smoothers to both the operator in the +// primary space as well as its projection into an auxiliary space. +// Reference: Hiptmair, Multigrid method for Maxwell's equations, SIAM J. Numer. Anal. +// (1998). +// +template +class DistRelaxationSmoother : public Solver +{ + using VecType = typename Solver::VecType; + +private: + // Number of smoother iterations. + const int pc_it; + + // Discrete gradient matrix (not owned). + const Operator *G; + + // System matrix and its projection GᵀAG (not owned). + const OperType *A, *A_G; + const mfem::Array *dbc_tdof_list_G; + + // Point smoother objects for each matrix. + mutable std::unique_ptr> B; + std::unique_ptr> B_G; + + // Temporary vectors for smoother application. 
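// --------------------------------------------------------------------------------------
// A usage sketch for the revised DistRelaxationSmoother interface (editorial
// illustration, not part of the patch). A is the operator on the primary Nedelec space,
// A_G = Gᵀ A G its projection onto the auxiliary H1 space (both expected to wrap
// ParOperator, as SetOperators verifies), and G the discrete gradient; all are assumed
// to be assembled elsewhere as in the multigrid setup. The smoother parameters are
// placeholders.
#include <mpi.h>
#include "linalg/distrelaxation.hpp"
#include "linalg/operator.hpp"
#include "linalg/vector.hpp"

inline void DistRelaxationSketch(MPI_Comm comm, const palace::Operator &G,
                                 const palace::Operator &A, const palace::Operator &A_G,
                                 const palace::Vector &x, palace::Vector &y)
{
  palace::DistRelaxationSmoother<palace::Operator> B(
      comm, G, /*smooth_it*/ 1, /*cheby_smooth_it*/ 1, /*cheby_order*/ 4,
      /*cheby_sf_max*/ 1.0, /*cheby_sf_min*/ 0.0, /*cheby_4th_kind*/ true);
  B.SetOperators(A, A_G);

  // Convenience entry point: sizes the residual workspace internally on first use.
  B.Mult(x, y);

  // Allocation-free variant with a caller-owned residual vector, e.g. inside a V-cycle.
  palace::Vector r(y.Size());
  r.UseDevice(true);
  B.Mult2(x, y, r);
}
// --------------------------------------------------------------------------------------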
+ mutable VecType x_G, y_G, r_G, r; + +public: + DistRelaxationSmoother(MPI_Comm comm, const Operator &G, int smooth_it, + int cheby_smooth_it, int cheby_order, double cheby_sf_max, + double cheby_sf_min, bool cheby_4th_kind); + + void SetOperator(const OperType &op) override + { + MFEM_ABORT("SetOperator with a single operator is not implemented for " + "DistRelaxationSmoother, use the two argument signature instead!"); + } + + void SetOperators(const OperType &op, const OperType &op_G); + + void Mult(const VecType &x, VecType &y) const override + { + if (r.Size() != y.Size()) + { + r.SetSize(y.Size()); + r.UseDevice(true); + } + Mult2(x, y, r); + } + + void MultTranspose(const VecType &x, VecType &y) const override + { + if (r.Size() != y.Size()) + { + r.SetSize(y.Size()); + r.UseDevice(true); + } + MultTranspose2(x, y, r); + } + + void Mult2(const VecType &x, VecType &y, VecType &r) const override; + + void MultTranspose2(const VecType &x, VecType &y, VecType &r) const override; +}; + +} // namespace palace + +#endif // PALACE_LINALG_DIST_RELAXATION_SMOOTHER_HPP diff --git a/palace/linalg/divfree.cpp b/palace/linalg/divfree.cpp index e997e06c95..df35a799e6 100644 --- a/palace/linalg/divfree.cpp +++ b/palace/linalg/divfree.cpp @@ -1,86 +1,192 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "divfree.hpp" - -#include -#include -#include "fem/bilinearform.hpp" -#include "fem/coefficient.hpp" -#include "fem/fespace.hpp" -#include "fem/integrator.hpp" -#include "linalg/amg.hpp" -#include "linalg/gmg.hpp" -#include "linalg/iterative.hpp" -#include "linalg/rap.hpp" -#include "models/materialoperator.hpp" - -namespace palace -{ - -DivFreeSolver::DivFreeSolver(const MaterialOperator &mat_op, - const FiniteElementSpace &nd_fespace, - const AuxiliaryFiniteElementSpaceHierarchy &h1_fespaces, - const std::vector> &h1_bdr_tdof_lists, - double tol, int max_it, int print, int pa_order_threshold) -{ - constexpr bool skip_zeros = false; - constexpr auto MatType = MaterialPropertyType::PERMITTIVITY_REAL; - MaterialPropertyCoefficient epsilon_func(mat_op); - { - auto M_mg = std::make_unique(h1_fespaces.GetNumLevels()); - for (std::size_t l = 0; l < h1_fespaces.GetNumLevels(); l++) - { - // Force coarse level operator to be fully assembled always. - const auto &h1_fespace_l = h1_fespaces.GetFESpaceAtLevel(l); - BilinearForm m(h1_fespace_l); - m.AddDomainIntegrator(epsilon_func); - auto M_l = std::make_unique( - m.Assemble((l > 0) ? pa_order_threshold : 99, skip_zeros), h1_fespace_l); - M_l->SetEssentialTrueDofs(h1_bdr_tdof_lists[l], Operator::DiagonalPolicy::DIAG_ONE); - M_mg->AddOperator(std::move(M_l)); - } - M = std::move(M_mg); - } - { - BilinearForm weakdiv(nd_fespace, h1_fespaces.GetFinestFESpace()); - weakdiv.AddDomainIntegrator(epsilon_func); - WeakDiv = - std::make_unique(weakdiv.Assemble(pa_order_threshold, skip_zeros), - nd_fespace, h1_fespaces.GetFinestFESpace(), false); - } - Grad = &h1_fespaces.GetFinestFESpace().GetDiscreteInterpolator(); - bdr_tdof_list_M = &h1_bdr_tdof_lists.back(); - - // The system matrix for the projection is real and SPD. 
- auto amg = - std::make_unique>(std::make_unique(1, 1, 0)); - std::unique_ptr> pc; - if (h1_fespaces.GetNumLevels() > 1) - { - const int mg_smooth_order = - std::max(h1_fespaces.GetFinestFESpace().GetMaxElementOrder(), 2); - pc = std::make_unique>( - std::move(amg), h1_fespaces.GetProlongationOperators(), nullptr, 1, 1, - mg_smooth_order, 1.0, 0.0, true); - } - else - { - pc = std::move(amg); - } - - auto pcg = - std::make_unique>(h1_fespaces.GetFinestFESpace().GetComm(), print); - pcg->SetInitialGuess(false); - pcg->SetRelTol(tol); - pcg->SetAbsTol(std::numeric_limits::epsilon()); - pcg->SetMaxIter(max_it); - - ksp = std::make_unique(std::move(pcg), std::move(pc)); - ksp->SetOperators(*M, *M); - - psi.SetSize(h1_fespaces.GetFinestFESpace().GetTrueVSize()); - rhs.SetSize(h1_fespaces.GetFinestFESpace().GetTrueVSize()); -} - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "divfree.hpp" + +#include +#include +#include "fem/bilinearform.hpp" +#include "fem/fespace.hpp" +#include "fem/integrator.hpp" +#include "linalg/amg.hpp" +#include "linalg/gmg.hpp" +#include "linalg/iterative.hpp" +#include "linalg/rap.hpp" +#include "models/materialoperator.hpp" +#include "utils/timer.hpp" + +namespace palace +{ + +namespace +{ + +template +auto BuildLevelParOperator(std::unique_ptr &&a, + const FiniteElementSpace &fespace); + +template <> +auto BuildLevelParOperator(std::unique_ptr &&a, + const FiniteElementSpace &fespace) +{ + return std::make_unique(std::move(a), fespace); +} + +template <> +auto BuildLevelParOperator(std::unique_ptr &&a, + const FiniteElementSpace &fespace) +{ + return std::make_unique(std::move(a), nullptr, fespace); +} + +} // namespace + +template +DivFreeSolver::DivFreeSolver( + const MaterialOperator &mat_op, FiniteElementSpace &nd_fespace, + FiniteElementSpaceHierarchy &h1_fespaces, + const std::vector> &h1_bdr_tdof_lists, double tol, int max_it, + int print) +{ + BlockTimer bt(Timer::DIV_FREE); + + // If no boundaries on the mesh have been marked, add a single degree of freedom + // constraint so the system for the projection is not singular. This amounts to enforcing + // a scalar potential of 0 at a point in space if it is otherwise completely + // unconstrained. + const auto *ptr_h1_bdr_tdof_lists = &h1_bdr_tdof_lists; + { + MFEM_VERIFY( + !h1_bdr_tdof_lists.empty(), + "Unexpected empty list of boundary true dofs for finite element space hierarchy!"); + HYPRE_BigInt coarse_bdr_tdofs = h1_bdr_tdof_lists[0].Size(); + MPI_Comm comm = h1_fespaces.GetFESpaceAtLevel(0).GetComm(); + Mpi::GlobalSum(1, &coarse_bdr_tdofs, comm); + if (coarse_bdr_tdofs == 0) + { + int root = (h1_fespaces.GetFESpaceAtLevel(0).GetTrueVSize() == 0) ? Mpi::Size(comm) + : Mpi::Rank(comm); + Mpi::GlobalMin(1, &root, comm); + MFEM_VERIFY(root < Mpi::Size(comm), + "No root process found for single true dof constraint!"); + if (root == Mpi::Rank(comm)) + { + aux_tdof_lists.reserve(h1_fespaces.GetNumLevels()); + for (std::size_t l = 0; l < h1_fespaces.GetNumLevels(); l++) + { + auto &tdof_list = aux_tdof_lists.emplace_back(1); + tdof_list[0] = 0; + } + ptr_h1_bdr_tdof_lists = &aux_tdof_lists; + } + } + } + + // Create the mass and weak divergence operators for divergence-free projection. 
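// Editorial note (not part of the patch): written out, the projection applied by Mult()
// further below is
//
//   y ← y − G (Gᵀ M G)⁻¹ Gᵀ M y ,
//
// up to the sign convention of the weak divergence form, after which Gᵀ M y = 0 and y is
// discretely divergence-free. The boundary (or single pinned) true dofs above are what
// keep the Poisson-like system Gᵀ M G nonsingular.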
+ MaterialPropertyCoefficient epsilon_func(mat_op.GetAttributeToMaterial(), + mat_op.GetPermittivityReal()); + { + constexpr bool skip_zeros = false; + BilinearForm m(h1_fespaces.GetFinestFESpace()); + m.AddDomainIntegrator(epsilon_func); + // m.AssembleQuadratureData(); + auto m_vec = m.Assemble(h1_fespaces, skip_zeros); + auto M_mg = + std::make_unique>(h1_fespaces.GetNumLevels()); + for (std::size_t l = 0; l < h1_fespaces.GetNumLevels(); l++) + { + const auto &h1_fespace_l = h1_fespaces.GetFESpaceAtLevel(l); + auto M_l = BuildLevelParOperator(std::move(m_vec[l]), h1_fespace_l); + M_l->SetEssentialTrueDofs((*ptr_h1_bdr_tdof_lists)[l], + Operator::DiagonalPolicy::DIAG_ONE); + if (l == h1_fespaces.GetNumLevels() - 1) + { + bdr_tdof_list_M = M_l->GetEssentialTrueDofs(); + } + M_mg->AddOperator(std::move(M_l)); + } + M = std::move(M_mg); + } + { + // Weak divergence operator is always partially assembled. + BilinearForm weakdiv(nd_fespace, h1_fespaces.GetFinestFESpace()); + weakdiv.AddDomainIntegrator(epsilon_func); + WeakDiv = std::make_unique(weakdiv.PartialAssemble(), nd_fespace, + h1_fespaces.GetFinestFESpace(), false); + } + Grad = &nd_fespace.GetDiscreteInterpolator(h1_fespaces.GetFinestFESpace()); + + // The system matrix for the projection is real and SPD. + auto amg = std::make_unique>( + std::make_unique(1, 1, true, 0)); + amg->SetDropSmallEntries(false); + std::unique_ptr> pc; + if (h1_fespaces.GetNumLevels() > 1) + { + const int mg_smooth_order = + std::max(h1_fespaces.GetFinestFESpace().GetMaxElementOrder(), 2); + pc = std::make_unique>( + h1_fespaces.GetFinestFESpace().GetComm(), std::move(amg), + h1_fespaces.GetProlongationOperators(), nullptr, 1, 1, mg_smooth_order, 1.0, 0.0, + true); + } + else + { + pc = std::move(amg); + } + + auto pcg = + std::make_unique>(h1_fespaces.GetFinestFESpace().GetComm(), print); + pcg->SetInitialGuess(false); + pcg->SetRelTol(tol); + pcg->SetAbsTol(std::numeric_limits::epsilon()); + pcg->SetMaxIter(max_it); + + ksp = std::make_unique>(std::move(pcg), std::move(pc)); + ksp->SetOperators(*M, *M); + + psi.SetSize(h1_fespaces.GetFinestFESpace().GetTrueVSize()); + rhs.SetSize(h1_fespaces.GetFinestFESpace().GetTrueVSize()); + psi.UseDevice(true); + rhs.UseDevice(true); +} + +template +void DivFreeSolver::Mult(VecType &y) const +{ + BlockTimer bt(Timer::DIV_FREE); + + // Compute the divergence of y. + if constexpr (std::is_same::value) + { + WeakDiv->Mult(y.Real(), rhs.Real()); + WeakDiv->Mult(y.Imag(), rhs.Imag()); + } + else + { + WeakDiv->Mult(y, rhs); + } + + // Apply essential BC and solve the linear system. + if (bdr_tdof_list_M) + { + linalg::SetSubVector(rhs, *bdr_tdof_list_M, 0.0); + } + ksp->Mult(rhs, psi); + + // Compute the irrotational portion of y and subtract. + if constexpr (std::is_same::value) + { + Grad->AddMult(psi.Real(), y.Real(), 1.0); + Grad->AddMult(psi.Imag(), y.Imag(), 1.0); + } + else + { + Grad->AddMult(psi, y, 1.0); + } +} + +template class DivFreeSolver; +template class DivFreeSolver; + +} // namespace palace diff --git a/palace/linalg/divfree.hpp b/palace/linalg/divfree.hpp index ead820895e..1906967267 100644 --- a/palace/linalg/divfree.hpp +++ b/palace/linalg/divfree.hpp @@ -1,93 +1,77 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LINALG_DIV_FREE_HPP -#define PALACE_LINALG_DIV_FREE_HPP - -#include -#include -#include "linalg/ksp.hpp" -#include "linalg/operator.hpp" -#include "linalg/vector.hpp" - -namespace mfem -{ - -template -class Array; - -} // namespace mfem - -namespace palace -{ - -class AuxiliaryFiniteElementSpaceHierarchy; -class FiniteElementSpace; -class MaterialOperator; - -// -// This solver implements a projection onto a divergence-free space satisfying Gᵀ M x = 0, -// where G represents the discrete gradient matrix with columns spanning the nullspace of -// the curl-curl operator. -// -class DivFreeSolver -{ -private: - // Operators for the divergence-free projection. - std::unique_ptr WeakDiv, M; - const Operator *Grad; - const mfem::Array *bdr_tdof_list_M; - - // Linear solver for the projected linear system (Gᵀ M G) y = x. - std::unique_ptr ksp; - - // Workspace objects for solver application. - mutable Vector psi, rhs; - -public: - DivFreeSolver(const MaterialOperator &mat_op, const FiniteElementSpace &nd_fespace, - const AuxiliaryFiniteElementSpaceHierarchy &h1_fespaces, - const std::vector> &h1_bdr_tdof_lists, double tol, - int max_it, int print, int pa_order_threshold); - - // Given a vector of Nedelec dofs for an arbitrary vector field, compute the Nedelec dofs - // of the irrotational portion of this vector field. The resulting vector will satisfy - // ∇ x y = 0. - void Mult(Vector &y) const - { - // Compute the divergence of y. - WeakDiv->Mult(y, rhs); - - // Apply essential BC and solve the linear system. - if (bdr_tdof_list_M) - { - linalg::SetSubVector(rhs, *bdr_tdof_list_M, 0.0); - } - ksp->Mult(rhs, psi); - - // Compute the irrotational portion of y and subtract. - Grad->AddMult(psi, y, 1.0); - } - - void Mult(const Vector &x, Vector &y) const - { - y = x; - Mult(y); - } - - void Mult(ComplexVector &y) const - { - Mult(y.Real()); - Mult(y.Imag()); - } - - void Mult(const ComplexVector &x, ComplexVector &y) const - { - y = x; - Mult(y); - } -}; - -} // namespace palace - -#endif // PALACE_LINALG_DIV_FREE_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LINALG_DIV_FREE_HPP +#define PALACE_LINALG_DIV_FREE_HPP + +#include +#include +#include "linalg/ksp.hpp" +#include "linalg/operator.hpp" +#include "linalg/vector.hpp" + +namespace mfem +{ + +template +class Array; + +} // namespace mfem + +namespace palace +{ + +class FiniteElementSpaceHierarchy; +class FiniteElementSpace; +class MaterialOperator; + +// +// This solver implements a projection onto a divergence-free space satisfying Gᵀ M x = 0, +// where G represents the discrete gradient matrix with columns spanning the nullspace of +// the curl-curl operator. +// +template +class DivFreeSolver +{ + using OperType = typename std::conditional::value, + ComplexOperator, Operator>::type; + +private: + // Operators for the divergence-free projection. + std::unique_ptr M; + std::unique_ptr WeakDiv; + const Operator *Grad; + const mfem::Array *bdr_tdof_list_M; + + // Optional storage for homogeneous Dirichlet boundary condition on a single true dof, + // used when the input array of H1 boundary dofs is empty to prevent the Poisson operator + // from being singular. + std::vector> aux_tdof_lists; + + // Linear solver for the projected linear system (Gᵀ M G) y = x. + std::unique_ptr> ksp; + + // Workspace objects for solver application. 
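// --------------------------------------------------------------------------------------
// A usage sketch for the templated DivFreeSolver (editorial illustration, not part of the
// patch). The material operator, spaces, and H1 boundary true dof lists are assumed to be
// provided by the calling operator class as elsewhere in Palace; the tolerance, iteration
// limit, and print level are placeholders.
#include <vector>
#include <mfem.hpp>
#include "linalg/divfree.hpp"

inline void DivFreeProjectionSketch(const palace::MaterialOperator &mat_op,
                                    palace::FiniteElementSpace &nd_fespace,
                                    palace::FiniteElementSpaceHierarchy &h1_fespaces,
                                    const std::vector<mfem::Array<int>> &h1_bdr_tdof_lists,
                                    palace::ComplexVector &e)
{
  palace::DivFreeSolver<palace::ComplexVector> divfree(mat_op, nd_fespace, h1_fespaces,
                                                       h1_bdr_tdof_lists, /*tol*/ 1.0e-12,
                                                       /*max_it*/ 1000, /*print*/ 0);

  // In-place projection: removes the gradient (irrotational) component of e so that the
  // result satisfies the discrete constraint Gᵀ M e = 0.
  divfree.Mult(e);
}
// --------------------------------------------------------------------------------------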
+ mutable VecType psi, rhs; + +public: + DivFreeSolver(const MaterialOperator &mat_op, FiniteElementSpace &nd_fespace, + FiniteElementSpaceHierarchy &h1_fespaces, + const std::vector> &h1_bdr_tdof_lists, double tol, + int max_it, int print); + + // Given a vector of Nedelec dofs for an arbitrary vector field, compute the Nedelec dofs + // of the irrotational portion of this vector field. The resulting vector will satisfy + // ∇ x y = 0. + void Mult(VecType &y) const; + + void Mult(const VecType &x, VecType &y) const + { + y = x; + Mult(y); + } +}; + +} // namespace palace + +#endif // PALACE_LINALG_DIV_FREE_HPP diff --git a/palace/linalg/eps.hpp b/palace/linalg/eps.hpp index 1754b19d82..54e562981f 100644 --- a/palace/linalg/eps.hpp +++ b/palace/linalg/eps.hpp @@ -1,112 +1,145 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LINALG_EPS_HPP -#define PALACE_LINALG_EPS_HPP - -#include -#include "linalg/ksp.hpp" -#include "linalg/operator.hpp" -#include "linalg/vector.hpp" - -namespace palace -{ - -class DivFreeSolver; - -// -// Pure abstract base class for solving generalized linear eigenvalue problems problems or -// quadratic polynomial eigenvalue problems. -// -class EigenvalueSolver -{ -public: - enum class ScaleType - { - NONE, - NORM_2 - }; - - enum class WhichType - { - LARGEST_MAGNITUDE, - SMALLEST_MAGNITUDE, - LARGEST_REAL, - SMALLEST_REAL, - LARGEST_IMAGINARY, - SMALLEST_IMAGINARY, - TARGET_MAGNITUDE, - TARGET_REAL, - TARGET_IMAGINARY - }; - - enum class ErrorType - { - ABSOLUTE, - RELATIVE, - BACKWARD - }; - -public: - EigenvalueSolver() = default; - virtual ~EigenvalueSolver() = default; - - // Set operators for the generalized eigenvalue problem or for the quadratic polynomial - // eigenvalue problem. - virtual void SetOperators(const ComplexOperator &K, const ComplexOperator &M, - ScaleType type) = 0; - virtual void SetOperators(const ComplexOperator &K, const ComplexOperator &C, - const ComplexOperator &M, ScaleType type) = 0; - - // For the linear generalized case, the linear solver should be configured to compute the - // action of M⁻¹ (with no spectral transformation) or (K - σ M)⁻¹. For the quadratic - // case, the linear solver should be configured to compute the action of M⁻¹ (with no - // spectral transformation) or P(σ)⁻¹. - virtual void SetLinearSolver(const ComplexKspSolver &ksp) = 0; - - // Set the projection operator for enforcing the divergence-free constraint. - virtual void SetDivFreeProjector(const DivFreeSolver &divfree) = 0; - - // Set optional B matrix used for weighted inner products. This must be set explicitly - // even for generalized problems, otherwise the identity will be used. - virtual void SetBMat(const Operator &B) = 0; - - // Get scaling factors used by the solver. - virtual double GetScalingGamma() const = 0; - virtual double GetScalingDelta() const = 0; - - // Set the number of required eigenmodes. - virtual void SetNumModes(int num_eig, int num_vec = 0) = 0; - - // Set solver tolerance. - virtual void SetTol(double tol) = 0; - - // Set maximum number of Arnoldi update iterations. - virtual void SetMaxIter(int max_it) = 0; - - // Set target spectrum for the eigensolver. When a spectral transformation is used, this - // applies to the spectrum of the shifted operator. - virtual void SetWhichEigenpairs(WhichType type) = 0; - - // Set shift-and-invert spectral transformation. 
- virtual void SetShiftInvert(std::complex s, bool precond = false) = 0;
-
- // Set an initial vector for the solution subspace.
- virtual void SetInitialSpace(const ComplexVector &v) = 0;
-
- // Solve the eigenvalue problem. Returns the number of converged eigenvalues.
- virtual int Solve() = 0;
-
- // Get the corresponding eigenvalue.
- virtual std::complex GetEigenvalue(int i) const = 0;
-
- // Get the corresponding eigenvector.
- virtual void GetEigenvector(int i, ComplexVector &x) const = 0;
-
- // Get the corresponding eigenpair error.
- virtual double GetError(int i, ErrorType type) const = 0;
-};
-
-} // namespace palace
-
-#endif // PALACE_LINALG_EPS_HPP
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef PALACE_LINALG_EPS_HPP
+#define PALACE_LINALG_EPS_HPP
+
+#include
+#include "linalg/ksp.hpp"
+#include "linalg/operator.hpp"
+#include "linalg/vector.hpp"
+
+namespace palace
+{
+
+template
+class DivFreeSolver;
+
+//
+// Pure abstract base class for solving generalized linear eigenvalue problems or
+// quadratic polynomial eigenvalue problems.
+//
+class EigenvalueSolver
+{
+public:
+ enum class ScaleType
+ {
+ NONE,
+ NORM_2
+ };
+
+ enum class WhichType
+ {
+ LARGEST_MAGNITUDE,
+ SMALLEST_MAGNITUDE,
+ LARGEST_REAL,
+ SMALLEST_REAL,
+ LARGEST_IMAGINARY,
+ SMALLEST_IMAGINARY,
+ TARGET_MAGNITUDE,
+ TARGET_REAL,
+ TARGET_IMAGINARY
+ };
+
+ enum class ErrorType
+ {
+ ABSOLUTE,
+ RELATIVE,
+ BACKWARD
+ };
+
+public:
+ EigenvalueSolver() = default;
+ virtual ~EigenvalueSolver() = default;
+
+ // Set operators for the generalized eigenvalue problem, quadratic polynomial
+ // eigenvalue problem, or nonlinear eigenvalue problem.
+ virtual void SetOperators(const ComplexOperator &K, const ComplexOperator &M,
+ ScaleType type)
+ {
+ MFEM_ABORT("SetOperators not defined!");
+ }
+
+ virtual void SetOperators(const ComplexOperator &K, const ComplexOperator &C,
+ const ComplexOperator &M, ScaleType type)
+ {
+ MFEM_ABORT("SetOperators not defined!");
+ }
+
+ virtual void SetOperators(const ComplexOperator &K, const ComplexOperator &M,
+ std::function)> A2,
+ ScaleType type)
+ {
+ MFEM_ABORT("SetOperators not defined!");
+ }
+
+ virtual void SetExtraSystemMatrix(std::function(double)>)
+ {
+ MFEM_ABORT("SetExtraSystemMatrix not defined!");
+ }
+
+ virtual void SetPreconditionerUpdate(
+ std::function(
+ std::complex, std::complex, std::complex, double)>)
+ {
+ MFEM_ABORT("SetPreconditionerUpdate not defined!");
+ }
+
+ // For the linear generalized case, the linear solver should be configured to compute the
+ // action of M⁻¹ (with no spectral transformation) or (K - σ M)⁻¹. For the quadratic
+ // case, the linear solver should be configured to compute the action of M⁻¹ (with no
+ // spectral transformation) or P(σ)⁻¹.
+ virtual void SetLinearSolver(ComplexKspSolver &ksp) = 0;
+
+ // Set the projection operator for enforcing the divergence-free constraint.
+ virtual void SetDivFreeProjector(const DivFreeSolver &divfree) = 0;
+
+ // Set optional B matrix used for weighted inner products. This must be set explicitly
+ // even for generalized problems, otherwise the identity will be used.
+ virtual void SetBMat(const Operator &B) = 0;
+
+ // Get scaling factors used by the solver.
+ virtual double GetScalingGamma() const = 0;
+ virtual double GetScalingDelta() const = 0;
+
+ // Set the number of required eigenmodes.
+ virtual void SetNumModes(int num_eig, int num_vec = 0) = 0;
+
+ // Set solver tolerance.
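// --------------------------------------------------------------------------------------
// A sketch of a typical call sequence against the EigenvalueSolver interface above
// (editorial illustration, not part of the patch). `eigen` is any concrete implementation
// (e.g. the SLEPc- or ARPACK-based solvers elsewhere in Palace); K, M, ksp, and divfree
// are assumed to be configured by the eigenmode driver, the numeric settings are
// placeholders, and the DivFreeSolver template argument is an assumption of this sketch.
#include <complex>
#include "linalg/divfree.hpp"
#include "linalg/eps.hpp"

inline void EigenvalueSolverSketch(palace::EigenvalueSolver &eigen,
                                   const palace::ComplexOperator &K,
                                   const palace::ComplexOperator &M,
                                   palace::ComplexKspSolver &ksp,
                                   const palace::DivFreeSolver<palace::ComplexVector> &divfree,
                                   palace::ComplexVector &x)
{
  using palace::EigenvalueSolver;
  eigen.SetOperators(K, M, EigenvalueSolver::ScaleType::NORM_2);
  eigen.SetLinearSolver(ksp);          // Configured for (K - σ M)⁻¹ under shift-and-invert.
  eigen.SetDivFreeProjector(divfree);  // Keep iterates discretely divergence-free.
  eigen.SetNumModes(5);
  eigen.SetTol(1.0e-8);
  eigen.SetMaxIter(200);
  eigen.SetWhichEigenpairs(EigenvalueSolver::WhichType::TARGET_MAGNITUDE);

  // Shift-and-invert: for the linear problem, eigenvalues θ of (K - σ M)⁻¹ M satisfy
  // θ = 1 / (λ - σ), so modes nearest the target σ become dominant.
  eigen.SetShiftInvert(std::complex<double>(0.0, 1.0), /*precond*/ true);

  const int num_conv = eigen.Solve();
  for (int i = 0; i < num_conv; i++)
  {
    const std::complex<double> lambda = eigen.GetEigenvalue(i);
    eigen.GetEigenvector(i, x);  // x assumed sized to the operator height by the caller.
    (void)lambda;
  }
}
// --------------------------------------------------------------------------------------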
+ virtual void SetTol(double tol) = 0; + + // Set maximum number of Arnoldi update iterations. + virtual void SetMaxIter(int max_it) = 0; + + // Set target spectrum for the eigensolver. When a spectral transformation is used, this + // applies to the spectrum of the shifted operator. + virtual void SetWhichEigenpairs(WhichType type) = 0; + + // Set shift-and-invert spectral transformation. + virtual void SetShiftInvert(std::complex s, bool precond = false) = 0; + + // Set an initial vector for the solution subspace. + virtual void SetInitialSpace(const ComplexVector &v) = 0; + + // Solve the eigenvalue problem. Returns the number of converged eigenvalues. + virtual int Solve() = 0; + + // Get the corresponding eigenvalue. + virtual std::complex GetEigenvalue(int i) const = 0; + + // Get the corresponding eigenvector. Eigenvectors are normalized such that ||x||₂ = 1, + // unless the B-matrix is set for weighted inner products. + virtual void GetEigenvector(int i, ComplexVector &x) const = 0; + + // Get the corresponding eigenpair error. + virtual double GetError(int i, ErrorType type) const = 0; + + // Re-normalize the given number of eigenvectors, for example if the matrix B for weighted + // inner products has changed. This does not perform re-orthogonalization with respect to + // the new matrix, only normalization. + virtual void RescaleEigenvectors(int num_eig) = 0; +}; + +} // namespace palace + +#endif // PALACE_LINALG_EPS_HPP diff --git a/palace/linalg/errorestimator.cpp b/palace/linalg/errorestimator.cpp index 76d75c4038..6be81d7988 100644 --- a/palace/linalg/errorestimator.cpp +++ b/palace/linalg/errorestimator.cpp @@ -1,388 +1,520 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "errorestimator.hpp" - -#include -#include "fem/bilinearform.hpp" -#include "fem/coefficient.hpp" -#include "fem/integrator.hpp" -#include "linalg/amg.hpp" -#include "linalg/gmg.hpp" -#include "linalg/iterative.hpp" -#include "linalg/rap.hpp" -#include "models/materialoperator.hpp" -#include "utils/communication.hpp" -#include "utils/omp.hpp" -#include "utils/timer.hpp" - -namespace palace -{ - -namespace -{ - -std::unique_ptr GetMassMatrix(const FiniteElementSpaceHierarchy &fespaces, - int pa_order_threshold) -{ - constexpr bool skip_zeros = false; - const int dim = fespaces.GetFinestFESpace().GetParMesh()->Dimension(); - const auto type = fespaces.GetFinestFESpace().FEColl()->GetRangeType(dim); - auto M = std::make_unique(fespaces.GetNumLevels()); - for (std::size_t l = 0; l < fespaces.GetNumLevels(); l++) - { - // Force coarse level operator to be fully assembled always. - const auto &fespace_l = fespaces.GetFESpaceAtLevel(l); - BilinearForm m(fespace_l); - if (type == mfem::FiniteElement::SCALAR) - { - MFEM_VERIFY(fespace_l.GetVDim() == 1, - "Scalar mass matrix hierarchy assumes a component-wise solve."); - m.AddDomainIntegrator(); - } - else - { - m.AddDomainIntegrator(); - } - auto M_l = std::make_unique( - m.Assemble((l > 0) ? pa_order_threshold : 99, skip_zeros), fespace_l); - M->AddOperator(std::move(M_l)); - } - return M; -} - -std::unique_ptr -ConfigureLinearSolver(const FiniteElementSpaceHierarchy &fespaces, double tol, int max_it, - int print) -{ - // The system matrix for the projection is real and SPD. 
- auto amg = - std::make_unique>(std::make_unique(1, 1, 0)); - std::unique_ptr> pc; - if (fespaces.GetNumLevels() > 1) - { - const int mg_smooth_order = - std::max(fespaces.GetFinestFESpace().GetMaxElementOrder(), 2); - pc = std::make_unique>( - std::move(amg), fespaces.GetProlongationOperators(), nullptr, 1, 1, mg_smooth_order, - 1.0, 0.0, true); - } - else - { - pc = std::move(amg); - } - - auto pcg = - std::make_unique>(fespaces.GetFinestFESpace().GetComm(), print); - pcg->SetInitialGuess(false); - pcg->SetRelTol(tol); - pcg->SetAbsTol(std::numeric_limits::epsilon()); - pcg->SetMaxIter(max_it); - - return std::make_unique(std::move(pcg), std::move(pc)); -} - -} // namespace - -FluxProjector::FluxProjector(const MaterialOperator &mat_op, - const FiniteElementSpaceHierarchy &nd_fespaces, double tol, - int max_it, int print, int pa_order_threshold) -{ - BlockTimer bt(Timer::CONSTRUCTESTIMATOR); - { - // Flux operator is always partially assembled. - constexpr auto MatType = MaterialPropertyType::INV_PERMEABILITY; - MaterialPropertyCoefficient muinv_func(mat_op); - BilinearForm flux(nd_fespaces.GetFinestFESpace()); - flux.AddDomainIntegrator(muinv_func); - Flux = std::make_unique(flux.Assemble(), nd_fespaces.GetFinestFESpace()); - } - M = GetMassMatrix(nd_fespaces, pa_order_threshold); - - ksp = ConfigureLinearSolver(nd_fespaces, tol, max_it, print); - ksp->SetOperators(*M, *M); - - rhs.SetSize(nd_fespaces.GetFinestFESpace().GetTrueVSize()); -} - -FluxProjector::FluxProjector(const MaterialOperator &mat_op, - const FiniteElementSpaceHierarchy &h1_fespaces, - const FiniteElementSpace &h1d_fespace, double tol, int max_it, - int print, int pa_order_threshold) -{ - BlockTimer bt(Timer::CONSTRUCTESTIMATOR); - { - // Flux operator is always partially assembled. 
- constexpr auto MatType = MaterialPropertyType::PERMITTIVITY_REAL; - MaterialPropertyCoefficient epsilon_func(mat_op); - BilinearForm flux(h1_fespaces.GetFinestFESpace(), h1d_fespace); - flux.AddDomainIntegrator(epsilon_func); - Flux = std::make_unique(flux.Assemble(), h1_fespaces.GetFinestFESpace(), - h1d_fespace, false); - } - M = GetMassMatrix(h1_fespaces, pa_order_threshold); - - ksp = ConfigureLinearSolver(h1_fespaces, tol, max_it, print); - ksp->SetOperators(*M, *M); - - rhs.SetSize(h1d_fespace.GetTrueVSize()); -} - -template -void FluxProjector::Mult(const VecType &x, VecType &y) const -{ - BlockTimer bt(Timer::SOLVEESTIMATOR); - MFEM_ASSERT(y.Size() == rhs.Size(), "Invalid vector dimensions for FluxProjector::Mult!"); - MFEM_ASSERT( - y.Size() % x.Size() == 0, - "Invalid vector dimension for FluxProjector::Mult, does not yield even blocking!"); - auto MultImpl = [this](const Vector &x_, Vector &y_) - { - const int vdim = y_.Size() / x_.Size(); - Flux->Mult(x_, rhs); - if (vdim == 1) - { - // Mpi::Print(" Computing smooth flux projection for error estimation\n"); - ksp->Mult(rhs, y_); - } - else - { - for (int i = 0; i < vdim; i++) - { - // Mpi::Print(" Computing smooth flux projection of flux component {:d}/{:d} for " - // "error estimation\n", - // i + 1, vdim); - const Vector rhsb(rhs, i * x_.Size(), x_.Size()); - Vector yb(y_, i * x_.Size(), x_.Size()); - ksp->Mult(rhsb, yb); - } - } - }; - if constexpr (std::is_same::value) - { - MultImpl(x.Real(), y.Real()); - MultImpl(x.Imag(), y.Imag()); - } - else - { - MultImpl(x, y); - } -} - -template -CurlFluxErrorEstimator::CurlFluxErrorEstimator( - const MaterialOperator &mat_op, const FiniteElementSpaceHierarchy &nd_fespaces, - double tol, int max_it, int print, int pa_order_threshold) - : mat_op(mat_op), nd_fespace(nd_fespaces.GetFinestFESpace()), - projector(mat_op, nd_fespaces, tol, max_it, print, pa_order_threshold), - F(nd_fespace.GetTrueVSize()), F_gf(const_cast(&nd_fespace)), - U_gf(const_cast(&nd_fespace)) -{ -} - -template -ErrorIndicator CurlFluxErrorEstimator::ComputeIndicators(const VecType &U) const -{ - // Compute the projection of the discontinuous flux onto the smooth finite element space - // and populate the corresponding grid functions. - BlockTimer bt(Timer::ESTIMATION); - projector.Mult(U, F); - if constexpr (std::is_same::value) - { - F_gf.real().SetFromTrueDofs(F.Real()); - F_gf.imag().SetFromTrueDofs(F.Imag()); - U_gf.real().SetFromTrueDofs(U.Real()); - U_gf.imag().SetFromTrueDofs(U.Imag()); - } - else - { - F_gf.SetFromTrueDofs(F); - U_gf.SetFromTrueDofs(U); - } - - // Loop over elements and accumulate the estimates from this component. The discontinuous - // flux is μ⁻¹ ∇ × U. 
- auto &mesh = *nd_fespace.GetParMesh(); - Vector estimates(mesh.GetNE()); - double norm2 = 0.0; - PalacePragmaOmp(parallel reduction(+ : norm2)) - { - // Assuming dim == space_dim == curl_dim - mfem::IsoparametricTransformation T; - mfem::Array dofs; - mfem::DofTransformation dof_trans; - mfem::Vector V_ip(mesh.SpaceDimension()), V_smooth(mesh.SpaceDimension()), - V_tmp(mesh.SpaceDimension()), loc_gf; - mfem::DenseMatrix Interp, Curl; - - double loc_norm2 = 0.0; - PalacePragmaOmp(for schedule(static)) - for (int e = 0; e < mesh.GetNE(); e++) - { - const mfem::FiniteElement &fe = *nd_fespace.GetFE(e); - mesh.GetElementTransformation(e, &T); - nd_fespace.GetElementDofs(e, dofs, dof_trans); - Interp.SetSize(fe.GetDof(), V_ip.Size()); - Curl.SetSize(fe.GetDof(), V_ip.Size()); - const int q_order = fem::GetDefaultIntegrationOrder(fe, fe, T); - const mfem::IntegrationRule &ir = - mfem::IntRules.Get(mesh.GetElementGeometry(e), q_order); - - double elem_err = 0.0; - for (int i = 0; i < ir.GetNPoints(); i++) - { - const mfem::IntegrationPoint &ip = ir.IntPoint(i); - T.SetIntPoint(&ip); - fe.CalcVShape(ip, Interp); - fe.CalcCurlShape(ip, Curl); - const double w = ip.weight * T.Weight(); - - auto AccumulateError = - [&](const mfem::ParGridFunction &U_gf_, const mfem::ParGridFunction &F_gf_) - { - // μ⁻¹ ∇ × U - U_gf_.GetSubVector(dofs, loc_gf); - if (dof_trans.GetDofTransformation()) - { - dof_trans.InvTransformPrimal(loc_gf); - } - Curl.MultTranspose(loc_gf, V_ip); - T.Jacobian().Mult(V_ip, V_smooth); - mat_op.GetInvPermeability(T.Attribute).Mult(V_smooth, V_ip); - V_ip *= 1.0 / T.Weight(); - - // Smooth flux - F_gf_.GetSubVector(dofs, loc_gf); - if (dof_trans.GetDofTransformation()) - { - dof_trans.InvTransformPrimal(loc_gf); - } - Interp.MultTranspose(loc_gf, V_tmp); - T.InverseJacobian().MultTranspose(V_tmp, V_smooth); - - V_smooth -= V_ip; - elem_err += w * (V_smooth * V_smooth); - loc_norm2 += w * (V_ip * V_ip); - }; - if constexpr (std::is_same::value) - { - AccumulateError(U_gf.real(), F_gf.real()); - AccumulateError(U_gf.imag(), F_gf.imag()); - } - else - { - AccumulateError(U_gf, F_gf); - } - } - estimates[e] = std::sqrt(elem_err); - } - norm2 += loc_norm2; - } - - // Finalize the element-wise error estimates. - Mpi::GlobalSum(1, &norm2, mesh.GetComm()); - if (norm2 > 0.0) - { - estimates *= 1.0 / std::sqrt(norm2); - } - return ErrorIndicator(std::move(estimates)); -} - -GradFluxErrorEstimator::GradFluxErrorEstimator( - const MaterialOperator &mat_op, const FiniteElementSpaceHierarchy &h1_fespaces, - double tol, int max_it, int print, int pa_order_threshold) - : mat_op(mat_op), h1_fespace(h1_fespaces.GetFinestFESpace()), - h1d_fespace(std::make_unique( - h1_fespace.GetParMesh(), h1_fespace.FEColl(), - h1_fespace.GetParMesh()->SpaceDimension(), mfem::Ordering::byNODES)), - projector(mat_op, h1_fespaces, *h1d_fespace, tol, max_it, print, pa_order_threshold), - F(h1d_fespace->GetTrueVSize()), F_gf(h1d_fespace.get()), - U_gf(const_cast(&h1_fespace)) -{ -} - -ErrorIndicator GradFluxErrorEstimator::ComputeIndicators(const Vector &U) const -{ - // Compute the projection of the discontinuous flux onto the smooth finite element space - // and populate the corresponding grid functions. - BlockTimer bt(Timer::ESTIMATION); - projector.Mult(U, F); - F_gf.SetFromTrueDofs(F); - U_gf.SetFromTrueDofs(U); - - // Loop over elements and accumulate the estimates from this component. The discontinuous - // flux is ε ∇U. 
- auto &mesh = *h1_fespace.GetParMesh(); - Vector estimates(mesh.GetNE()); - double norm2 = 0.0; - PalacePragmaOmp(parallel reduction(+ : norm2)) - { - // Assuming dim == space_dim - mfem::IsoparametricTransformation T; - mfem::Array dofs, vdofs; - mfem::Vector V_ip(h1d_fespace->GetVDim()), V_smooth(h1d_fespace->GetVDim()), loc_gf; - mfem::Vector Interp; - mfem::DenseMatrix Grad; - - double loc_norm2 = 0.0; - PalacePragmaOmp(for schedule(static)) - for (int e = 0; e < mesh.GetNE(); e++) - { - const mfem::FiniteElement &fe = *h1d_fespace->GetFE(e); - mesh.GetElementTransformation(e, &T); - h1_fespace.GetElementDofs(e, dofs); - vdofs = dofs; - h1d_fespace->DofsToVDofs(vdofs); - Interp.SetSize(fe.GetDof()); - Grad.SetSize(fe.GetDof(), V_ip.Size()); - const int q_order = fem::GetDefaultIntegrationOrder(fe, fe, T); - const mfem::IntegrationRule &ir = - mfem::IntRules.Get(mesh.GetElementGeometry(e), q_order); - - double elem_err = 0.0; - for (int i = 0; i < ir.GetNPoints(); i++) - { - const mfem::IntegrationPoint &ip = ir.IntPoint(i); - T.SetIntPoint(&ip); - fe.CalcShape(ip, Interp); - fe.CalcDShape(ip, Grad); - const double w = ip.weight * T.Weight(); - - // ε ∇U - U_gf.GetSubVector(dofs, loc_gf); - Grad.MultTranspose(loc_gf, V_ip); - T.InverseJacobian().MultTranspose(V_ip, V_smooth); - mat_op.GetPermittivityReal(T.Attribute).Mult(V_smooth, V_ip); - - // Smooth flux - F_gf.GetSubVector(vdofs, loc_gf); - for (int k = 0; k < h1d_fespace->GetVDim(); k++) - { - V_smooth(k) = Interp * (&loc_gf(Interp.Size() * k)); - } - - V_smooth -= V_ip; - elem_err += w * (V_smooth * V_smooth); - loc_norm2 += w * (V_ip * V_ip); - } - estimates[e] = std::sqrt(elem_err); - } - norm2 += loc_norm2; - } - - // Finalize the element-wise error estimates. - Mpi::GlobalSum(1, &norm2, mesh.GetComm()); - if (norm2 > 0.0) - { - estimates *= 1.0 / std::sqrt(norm2); - } - return ErrorIndicator(std::move(estimates)); -} - -template void FluxProjector::Mult(const Vector &, Vector &) const; -template void FluxProjector::Mult(const ComplexVector &, ComplexVector &) const; - -template class CurlFluxErrorEstimator; -template class CurlFluxErrorEstimator; - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#include "errorestimator.hpp" + +#include +#include "fem/bilinearform.hpp" +#include "fem/integrator.hpp" +#include "fem/libceed/ceed.hpp" +#include "fem/libceed/libceed_coefficient.hpp" +#include "fem/libceed/libceed_integrator.hpp" +#include "linalg/amg.hpp" +#include "linalg/densematrix.hpp" +#include "linalg/gmg.hpp" +#include "linalg/iterative.hpp" +#include "linalg/jacobi.hpp" +#include "linalg/rap.hpp" +#include "models/materialoperator.hpp" +#include "utils/communication.hpp" +#include "utils/diagnostic.hpp" +#include "utils/omp.hpp" +#include "utils/timer.hpp" + +PalacePragmaDiagnosticPush +PalacePragmaDiagnosticDisableUnused + +#include "fem/qfunctions/hcurlhdiv_error_qf.h" + +PalacePragmaDiagnosticPop + +namespace palace +{ + +namespace +{ + +template +auto BuildLevelParOperator(std::unique_ptr &&a, + const FiniteElementSpace &trial_fespace, + const FiniteElementSpace &test_fespace); + +template <> +auto BuildLevelParOperator(std::unique_ptr &&a, + const FiniteElementSpace &trial_fespace, + const FiniteElementSpace &test_fespace) +{ + return std::make_unique(std::move(a), trial_fespace, test_fespace, false); +} + +template <> +auto BuildLevelParOperator(std::unique_ptr &&a, + const FiniteElementSpace &trial_fespace, + const FiniteElementSpace &test_fespace) +{ + return std::make_unique(std::move(a), nullptr, trial_fespace, + test_fespace, false); +} + +template +auto BuildLevelParOperator(std::unique_ptr &&a, const FiniteElementSpace &fespace) +{ + return BuildLevelParOperator(std::move(a), fespace, fespace); +} + +template +auto ConfigureLinearSolver(const FiniteElementSpaceHierarchy &fespaces, double tol, + int max_it, int print, bool use_mg) +{ + // The system matrix for the projection is real, SPD and diagonally dominant. + std::unique_ptr> pc; + if (!use_mg) + { + // Use eigenvalue estimate to compute optimal Jacobi damping parameter. 
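// (Editorial note, not part of the patch: for an SPD operator A with diagonal D, the
// damped Jacobi iteration matrix is I - ω D⁻¹ A, and ω = 2 / (λ_min + λ_max) of D⁻¹ A
// minimizes its spectral radius, so a cheap estimate of the extremal eigenvalues is
// enough to pick a near-optimal damping factor for the preconditioner built here.)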
+ pc = std::make_unique>(fespaces.GetFinestFESpace().GetComm(), + 0.0); + } + else + { + auto amg = std::make_unique(1, 1, true, 0); + amg->SetStrengthThresh(0.8); // More coarsening to save memory + if (fespaces.GetNumLevels() > 1) + { + const int mg_smooth_order = 2; // Smooth order independent of FE space order + pc = std::make_unique>( + fespaces.GetFinestFESpace().GetComm(), + std::make_unique>(std::move(amg)), + fespaces.GetProlongationOperators(), nullptr, 1, 1, mg_smooth_order, 1.0, 0.0, + true); + } + else + { + pc = std::make_unique>(std::move(amg)); + } + } + auto pcg = + std::make_unique>(fespaces.GetFinestFESpace().GetComm(), print); + pcg->SetInitialGuess(false); + pcg->SetRelTol(tol); + pcg->SetAbsTol(std::numeric_limits::epsilon()); + pcg->SetMaxIter(max_it); + return std::make_unique>(std::move(pcg), std::move(pc)); +} + +} // namespace + +template +FluxProjector::FluxProjector(const MaterialPropertyCoefficient &coeff, + const FiniteElementSpaceHierarchy &smooth_fespaces, + const FiniteElementSpace &rhs_fespace, double tol, + int max_it, int print, bool use_mg) +{ + BlockTimer bt(Timer::CONSTRUCT_ESTIMATOR); + const auto &smooth_fespace = smooth_fespaces.GetFinestFESpace(); + { + constexpr bool skip_zeros = false; + BilinearForm m(smooth_fespace); + m.AddDomainIntegrator(); + if (!use_mg) + { + M = BuildLevelParOperator(m.Assemble(skip_zeros), smooth_fespace); + } + else + { + auto m_vec = m.Assemble(smooth_fespaces, skip_zeros); + auto M_mg = + std::make_unique>(smooth_fespaces.GetNumLevels()); + for (std::size_t l = 0; l < smooth_fespaces.GetNumLevels(); l++) + { + const auto &fespace_l = smooth_fespaces.GetFESpaceAtLevel(l); + M_mg->AddOperator(BuildLevelParOperator(std::move(m_vec[l]), fespace_l)); + } + M = std::move(M_mg); + } + } + { + // Flux operator is always partially assembled. + BilinearForm flux(rhs_fespace, smooth_fespace); + flux.AddDomainIntegrator(coeff); + Flux = BuildLevelParOperator(flux.PartialAssemble(), rhs_fespace, + smooth_fespace); + } + ksp = ConfigureLinearSolver(smooth_fespaces, tol, max_it, print, use_mg); + ksp->SetOperators(*M, *M); + rhs.SetSize(smooth_fespace.GetTrueVSize()); + rhs.UseDevice(true); +} + +template +void FluxProjector::Mult(const VecType &x, VecType &y) const +{ + BlockTimer bt(Timer::SOLVE_ESTIMATOR); + MFEM_ASSERT(x.Size() == Flux->Width() && y.Size() == rhs.Size(), + "Invalid vector dimensions for FluxProjector::Mult!"); + // Mpi::Print(" Computing smooth flux recovery (projection) for error estimation\n"); + Flux->Mult(x, rhs); + ksp->Mult(rhs, y); +} + +namespace +{ + +template +Vector ComputeErrorEstimates(const VecType &F, VecType &F_gf, VecType &G, VecType &G_gf, + const FiniteElementSpace &fespace, + const FiniteElementSpace &smooth_fespace, + const FluxProjector &projector, + const ceed::Operator &integ_op) +{ + // Compute the projection of the discontinuous flux onto the smooth finite element space + // (recovery) and populate the corresponding grid functions. 
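// Editorial note (not part of the patch): the recovery performed by projector.Mult() just
// below solves the smooth-space mass system M q = Flux · u, where Flux is the mixed form
// mapping the primal field to the coefficient-weighted discontinuous flux tested against
// the smooth space. The result q is thus the L² projection of ε E onto the H(div) space
// (gradient estimator) or of μ⁻¹ B onto the H(curl) space (curl estimator).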
+ BlockTimer bt(Timer::ESTIMATION);
+ projector.Mult(F, G);
+ if constexpr (std::is_same::value)
+ {
+ fespace.GetProlongationMatrix()->Mult(F.Real(), F_gf.Real());
+ fespace.GetProlongationMatrix()->Mult(F.Imag(), F_gf.Imag());
+ smooth_fespace.GetProlongationMatrix()->Mult(G.Real(), G_gf.Real());
+ smooth_fespace.GetProlongationMatrix()->Mult(G.Imag(), G_gf.Imag());
+ }
+ else
+ {
+ fespace.GetProlongationMatrix()->Mult(F, F_gf);
+ smooth_fespace.GetProlongationMatrix()->Mult(G, G_gf);
+ }
+
+ // Use libCEED operators to perform the error estimate integration over each element.
+ const auto &mesh = fespace.GetMesh();
+ Vector estimates(mesh.GetNE());
+ estimates.UseDevice(true);
+ estimates = 0.0;
+ PalacePragmaOmp(parallel if (ceed::internal::NumCeeds() > 1))
+ {
+ Ceed ceed = ceed::internal::GetCeedObjects()[utils::GetThreadNum()];
+
+ // We need to update the state of the underlying libCEED vectors to indicate that the
+ // data has changed. Each thread has its own vector, referencing the same underlying
+ // data.
+ CeedVector F_gf_vec, G_gf_vec;
+ {
+ CeedInt nsub_ops;
+ CeedOperator *sub_ops;
+ PalaceCeedCall(
+ ceed, CeedOperatorCompositeGetNumSub(integ_op[utils::GetThreadNum()], &nsub_ops));
+ PalaceCeedCall(
+ ceed, CeedOperatorCompositeGetSubList(integ_op[utils::GetThreadNum()], &sub_ops));
+ MFEM_ASSERT(nsub_ops > 0, "Unexpected empty libCEED composite operator!");
+ CeedOperatorField field;
+ PalaceCeedCall(ceed, CeedOperatorGetFieldByName(sub_ops[0], "u_1", &field));
+ PalaceCeedCall(ceed, CeedOperatorFieldGetVector(field, &F_gf_vec));
+ PalaceCeedCall(ceed, CeedOperatorGetFieldByName(sub_ops[0], "u_2", &field));
+ PalaceCeedCall(ceed, CeedOperatorFieldGetVector(field, &G_gf_vec));
+ if constexpr (std::is_same::value)
+ {
+ ceed::InitCeedVector(F_gf.Real(), ceed, &F_gf_vec, false);
+ ceed::InitCeedVector(G_gf.Real(), ceed, &G_gf_vec, false);
+ }
+ else
+ {
+ ceed::InitCeedVector(F_gf, ceed, &F_gf_vec, false);
+ ceed::InitCeedVector(G_gf, ceed, &G_gf_vec, false);
+ }
+ }
+
+ // Each thread writes to non-overlapping entries of the estimates vector.
+ CeedVector estimates_vec;
+ ceed::InitCeedVector(estimates, ceed, &estimates_vec);
+
+ // Do the integration (both input vectors are passive). For the complex case, add sum of
+ // squares of real and imaginary parts to the estimates before square root.
+ PalaceCeedCall(ceed,
+ CeedOperatorApplyAdd(integ_op[utils::GetThreadNum()], CEED_VECTOR_NONE,
+ estimates_vec, CEED_REQUEST_IMMEDIATE));
+ if constexpr (std::is_same::value)
+ {
+ ceed::InitCeedVector(F_gf.Imag(), ceed, &F_gf_vec, false);
+ ceed::InitCeedVector(G_gf.Imag(), ceed, &G_gf_vec, false);
+ PalaceCeedCall(ceed,
+ CeedOperatorApplyAdd(integ_op[utils::GetThreadNum()], CEED_VECTOR_NONE,
+ estimates_vec, CEED_REQUEST_IMMEDIATE));
+ }
+
+ // Cleanup.
+ PalaceCeedCall(ceed, CeedVectorDestroy(&estimates_vec)); + } + + return estimates; +} + +} // namespace + +template +GradFluxErrorEstimator::GradFluxErrorEstimator( + const MaterialOperator &mat_op, FiniteElementSpace &nd_fespace, + FiniteElementSpaceHierarchy &rt_fespaces, double tol, int max_it, int print, + bool use_mg) + : nd_fespace(nd_fespace), rt_fespace(rt_fespaces.GetFinestFESpace()), + projector(MaterialPropertyCoefficient(mat_op.GetAttributeToMaterial(), + mat_op.GetPermittivityReal()), + rt_fespaces, nd_fespace, tol, max_it, print, use_mg), + integ_op(nd_fespace.GetMesh().GetNE(), nd_fespace.GetVSize()), + E_gf(nd_fespace.GetVSize()), D(rt_fespace.GetTrueVSize()), D_gf(rt_fespace.GetVSize()) +{ + E_gf.UseDevice(true); + D.UseDevice(true); + D_gf.UseDevice(true); + + // Construct the libCEED operator used for integrating the element-wise error. The + // discontinuous flux is ε E = ε ∇V. + const auto &mesh = nd_fespace.GetMesh(); + PalacePragmaOmp(parallel if (ceed::internal::NumCeeds() > 1)) + { + Ceed ceed = ceed::internal::GetCeedObjects()[utils::GetThreadNum()]; + for (const auto &[geom, data] : mesh.GetCeedGeomFactorData(ceed)) + { + // Only integrate over domain elements (not on the boundary). + if (mfem::Geometry::Dimension[geom] < mesh.Dimension()) + { + continue; + } + + // Create libCEED vector wrappers for use with libCEED operators. + CeedVector E_gf_vec, D_gf_vec; + if constexpr (std::is_same::value) + { + ceed::InitCeedVector(E_gf.Real(), ceed, &E_gf_vec); + ceed::InitCeedVector(D_gf.Real(), ceed, &D_gf_vec); + } + else + { + ceed::InitCeedVector(E_gf, ceed, &E_gf_vec); + ceed::InitCeedVector(D_gf, ceed, &D_gf_vec); + } + + // Construct mesh element restriction for elements of this element geometry type. + CeedElemRestriction mesh_elem_restr; + PalaceCeedCall(ceed, CeedElemRestrictionCreate( + ceed, static_cast(data.indices.size()), 1, 1, + mesh.GetNE(), mesh.GetNE(), CEED_MEM_HOST, CEED_USE_POINTER, + data.indices.data(), &mesh_elem_restr)); + + // Element restriction and basis objects for inputs. + CeedElemRestriction nd_restr = + nd_fespace.GetCeedElemRestriction(ceed, geom, data.indices); + CeedElemRestriction rt_restr = + rt_fespace.GetCeedElemRestriction(ceed, geom, data.indices); + CeedBasis nd_basis = nd_fespace.GetCeedBasis(ceed, geom); + CeedBasis rt_basis = rt_fespace.GetCeedBasis(ceed, geom); + + // Construct coefficient for discontinuous flux, then smooth flux. + auto mat_sqrtepsilon = linalg::MatrixSqrt(mat_op.GetPermittivityReal()); + auto mat_invsqrtepsilon = linalg::MatrixPow(mat_op.GetPermittivityReal(), -0.5); + MaterialPropertyCoefficient sqrtepsilon_func(mat_op.GetAttributeToMaterial(), + mat_sqrtepsilon); + MaterialPropertyCoefficient invsqrtepsilon_func(mat_op.GetAttributeToMaterial(), + mat_invsqrtepsilon); + auto ctx = + ceed::PopulateCoefficientContext(mesh.SpaceDimension(), &sqrtepsilon_func, + mesh.SpaceDimension(), &invsqrtepsilon_func); + + // Assemble the libCEED operator. Inputs: E (for discontinuous flux), then smooth + // flux. 
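// Editorial note (not part of the patch): with the √ε and 1/√ε coefficients constructed
// above, the element integrals assembled here evaluate (up to the details of the
// Q-function in hcurlhdiv_error_qf.h) the weighted flux mismatch
//
//   η_K² = ∫_K |√ε E − (1/√ε) D|² dx ,
//
// i.e. the ε⁻¹-weighted L² difference between the discontinuous flux ε E and the
// recovered smooth flux D on element K. The curl-flux estimator below does the same with
// (1/√μ) B and √μ H. The indicators are square-rooted and normalized by the field energy
// via the 0.5 / Et factor in AddErrorIndicator.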
+ ceed::CeedQFunctionInfo info; + info.assemble_q_data = false; + switch (10 * mesh.SpaceDimension() + mesh.Dimension()) + { + case 22: + info.apply_qf = f_apply_hcurlhdiv_error_22; + info.apply_qf_path = PalaceQFunctionRelativePath(f_apply_hcurlhdiv_error_22_loc); + break; + case 33: + info.apply_qf = f_apply_hcurlhdiv_error_33; + info.apply_qf_path = PalaceQFunctionRelativePath(f_apply_hcurlhdiv_error_33_loc); + break; + default: + MFEM_ABORT("Invalid value of (dim, space_dim) = (" + << mesh.Dimension() << ", " << mesh.SpaceDimension() + << ") for GradFluxErrorEstimator!"); + } + info.trial_ops = info.test_ops = ceed::EvalMode::Interp; + + CeedOperator sub_op; + ceed::AssembleCeedElementErrorIntegrator( + info, (void *)ctx.data(), ctx.size() * sizeof(CeedIntScalar), ceed, E_gf_vec, + D_gf_vec, nd_restr, rt_restr, nd_basis, rt_basis, mesh_elem_restr, data.geom_data, + data.geom_data_restr, &sub_op); + integ_op.AddSubOperator(sub_op); // Sub-operator owned by ceed::Operator + + // Element restriction and passive input vectors are owned by the operator. + PalaceCeedCall(ceed, CeedElemRestrictionDestroy(&mesh_elem_restr)); + PalaceCeedCall(ceed, CeedVectorDestroy(&E_gf_vec)); + PalaceCeedCall(ceed, CeedVectorDestroy(&D_gf_vec)); + } + } + + // Finalize the operator (call CeedOperatorCheckReady). + integ_op.Finalize(); +} + +template +void GradFluxErrorEstimator::AddErrorIndicator(const VecType &E, double Et, + ErrorIndicator &indicator) const +{ + auto estimates = + ComputeErrorEstimates(E, E_gf, D, D_gf, nd_fespace, rt_fespace, projector, integ_op); + linalg::Sqrt(estimates, (Et > 0.0) ? 0.5 / Et : 1.0); // Correct factor of 1/2 in energy + indicator.AddIndicator(estimates); +} + +template +CurlFluxErrorEstimator::CurlFluxErrorEstimator( + const MaterialOperator &mat_op, FiniteElementSpace &rt_fespace, + FiniteElementSpaceHierarchy &nd_fespaces, double tol, int max_it, int print, + bool use_mg) + : rt_fespace(rt_fespace), nd_fespace(nd_fespaces.GetFinestFESpace()), + projector(MaterialPropertyCoefficient(mat_op.GetAttributeToMaterial(), + mat_op.GetInvPermeability()), + nd_fespaces, rt_fespace, tol, max_it, print, use_mg), + integ_op(nd_fespace.GetMesh().GetNE(), rt_fespace.GetVSize()), + B_gf(rt_fespace.GetVSize()), H(nd_fespace.GetTrueVSize()), H_gf(nd_fespace.GetVSize()) +{ + B_gf.UseDevice(true); + H.UseDevice(true); + H_gf.UseDevice(true); + + // Construct the libCEED operator used for integrating the element-wise error. The + // discontinuous flux is μ⁻¹ B ≃ μ⁻¹ ∇ × E. + const auto &mesh = rt_fespace.GetMesh(); + PalacePragmaOmp(parallel if (ceed::internal::NumCeeds() > 1)) + { + Ceed ceed = ceed::internal::GetCeedObjects()[utils::GetThreadNum()]; + for (const auto &[geom, data] : mesh.GetCeedGeomFactorData(ceed)) + { + // Only integrate over domain elements (not on the boundary). + if (mfem::Geometry::Dimension[geom] < mesh.Dimension()) + { + continue; + } + + // Create libCEED vector wrappers for use with libCEED operators. + CeedVector B_gf_vec, H_gf_vec; + if constexpr (std::is_same::value) + { + ceed::InitCeedVector(B_gf.Real(), ceed, &B_gf_vec); + ceed::InitCeedVector(H_gf.Real(), ceed, &H_gf_vec); + } + else + { + ceed::InitCeedVector(B_gf, ceed, &B_gf_vec); + ceed::InitCeedVector(H_gf, ceed, &H_gf_vec); + } + + // Construct mesh element restriction for elements of this element geometry type. 
+ CeedElemRestriction mesh_elem_restr; + PalaceCeedCall(ceed, CeedElemRestrictionCreate( + ceed, static_cast(data.indices.size()), 1, 1, + mesh.GetNE(), mesh.GetNE(), CEED_MEM_HOST, CEED_USE_POINTER, + data.indices.data(), &mesh_elem_restr)); + + // Element restriction and basis objects for inputs. + CeedElemRestriction rt_restr = + rt_fespace.GetCeedElemRestriction(ceed, geom, data.indices); + CeedElemRestriction nd_restr = + nd_fespace.GetCeedElemRestriction(ceed, geom, data.indices); + CeedBasis rt_basis = rt_fespace.GetCeedBasis(ceed, geom); + CeedBasis nd_basis = nd_fespace.GetCeedBasis(ceed, geom); + + // Construct coefficient for discontinuous flux, then smooth flux. + auto mat_invsqrtmu = linalg::MatrixSqrt(mat_op.GetInvPermeability()); + auto mat_sqrtmu = linalg::MatrixPow(mat_op.GetInvPermeability(), -0.5); + MaterialPropertyCoefficient invsqrtmu_func(mat_op.GetAttributeToMaterial(), + mat_invsqrtmu); + MaterialPropertyCoefficient sqrtmu_func(mat_op.GetAttributeToMaterial(), mat_sqrtmu); + auto ctx = ceed::PopulateCoefficientContext(mesh.SpaceDimension(), &invsqrtmu_func, + mesh.SpaceDimension(), &sqrtmu_func); + + // Assemble the libCEED operator. Inputs: B (for discontinuous flux), then smooth + // flux. Currently only supports 3D, since curl in 2D requires special treatment. + ceed::CeedQFunctionInfo info; + info.assemble_q_data = false; + switch (10 * mesh.SpaceDimension() + mesh.Dimension()) + { + case 33: + info.apply_qf = f_apply_hdivhcurl_error_33; + info.apply_qf_path = PalaceQFunctionRelativePath(f_apply_hdivhcurl_error_33_loc); + break; + default: + MFEM_ABORT("Invalid value of (dim, space_dim) = (" + << mesh.Dimension() << ", " << mesh.SpaceDimension() + << ") for CurlFluxErrorEstimator!"); + } + info.trial_ops = info.test_ops = ceed::EvalMode::Interp; + + CeedOperator sub_op; + ceed::AssembleCeedElementErrorIntegrator( + info, (void *)ctx.data(), ctx.size() * sizeof(CeedIntScalar), ceed, B_gf_vec, + H_gf_vec, rt_restr, nd_restr, rt_basis, nd_basis, mesh_elem_restr, data.geom_data, + data.geom_data_restr, &sub_op); + integ_op.AddSubOperator(sub_op); // Sub-operator owned by ceed::Operator + + // Element restriction and passive input vectors are owned by the operator. + PalaceCeedCall(ceed, CeedElemRestrictionDestroy(&mesh_elem_restr)); + PalaceCeedCall(ceed, CeedVectorDestroy(&B_gf_vec)); + PalaceCeedCall(ceed, CeedVectorDestroy(&H_gf_vec)); + } + } + + // Finalize the operator (call CeedOperatorCheckReady). + integ_op.Finalize(); +} + +template +void CurlFluxErrorEstimator::AddErrorIndicator(const VecType &B, double Et, + ErrorIndicator &indicator) const +{ + auto estimates = + ComputeErrorEstimates(B, B_gf, H, H_gf, rt_fespace, nd_fespace, projector, integ_op); + linalg::Sqrt(estimates, (Et > 0.0) ? 
0.5 / Et : 1.0);  // Correct factor of 1/2 in energy
+  indicator.AddIndicator(estimates);
+}
+
+template <typename VecType>
+TimeDependentFluxErrorEstimator<VecType>::TimeDependentFluxErrorEstimator(
+    const MaterialOperator &mat_op, FiniteElementSpaceHierarchy &nd_fespaces,
+    FiniteElementSpaceHierarchy &rt_fespaces, double tol, int max_it, int print,
+    bool use_mg)
+  : grad_estimator(mat_op, nd_fespaces.GetFinestFESpace(), rt_fespaces, tol, max_it, print,
+                   use_mg),
+    curl_estimator(mat_op, rt_fespaces.GetFinestFESpace(), nd_fespaces, tol, max_it, print,
+                   use_mg)
+{
+}
+
+template <typename VecType>
+void TimeDependentFluxErrorEstimator<VecType>::AddErrorIndicator(
+    const VecType &E, const VecType &B, double Et, ErrorIndicator &indicator) const
+{
+  auto grad_estimates =
+      ComputeErrorEstimates(E, grad_estimator.E_gf, grad_estimator.D, grad_estimator.D_gf,
+                            grad_estimator.nd_fespace, grad_estimator.rt_fespace,
+                            grad_estimator.projector, grad_estimator.integ_op);
+  auto curl_estimates =
+      ComputeErrorEstimates(B, curl_estimator.B_gf, curl_estimator.H, curl_estimator.H_gf,
+                            curl_estimator.rt_fespace, curl_estimator.nd_fespace,
+                            curl_estimator.projector, curl_estimator.integ_op);
+  grad_estimates += curl_estimates;  // Sum of squares
+  linalg::Sqrt(grad_estimates,
+               (Et > 0.0) ? 0.5 / Et : 1.0);  // Correct factor of 1/2 in energy
+  indicator.AddIndicator(grad_estimates);
+}
+
+template class FluxProjector<Vector>;
+template class FluxProjector<ComplexVector>;
+template class GradFluxErrorEstimator<Vector>;
+template class GradFluxErrorEstimator<ComplexVector>;
+template class CurlFluxErrorEstimator<Vector>;
+template class CurlFluxErrorEstimator<ComplexVector>;
+template class TimeDependentFluxErrorEstimator<Vector>;
+template class TimeDependentFluxErrorEstimator<ComplexVector>;
+
+} // namespace palace
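The estimates produced above are effectively nondimensionalized as η_K ← sqrt(η²_K / (2 Eₜ)) whenever the total field energy Eₜ is positive. A minimal usage sketch of the combined estimator follows; the space hierarchies, solver settings, and energy value are placeholders rather than the drivers' actual configuration, and CalcTotalFieldEnergy is a hypothetical helper standing in for the drivers' energy computation:

    // Hypothetical driver-side use of the combined E/B flux error estimator (real-valued
    // transient case).
    TimeDependentFluxErrorEstimator<Vector> estimator(mat_op, nd_fespaces, rt_fespaces,
                                                      1.0e-8, 200, 0, /*use_mg*/ true);
    ErrorIndicator indicator;
    const double Et = CalcTotalFieldEnergy(E, B);  // Placeholder for the E + B field energy
    estimator.AddErrorIndicator(E, B, Et, indicator);  // Adds sqrt((η²_grad + η²_curl) / (2 Et))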
diff --git a/palace/linalg/errorestimator.hpp b/palace/linalg/errorestimator.hpp index 3ab5333dbb..6945b66462 100644
--- a/palace/linalg/errorestimator.hpp
+++ b/palace/linalg/errorestimator.hpp
@@ -1,129 +1,147 @@
-// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
-// SPDX-License-Identifier: Apache-2.0
-
-#ifndef PALACE_LINALG_ERROR_ESTIMATOR_HPP
-#define PALACE_LINALG_ERROR_ESTIMATOR_HPP
-
-#include <memory>
-#include <mfem.hpp>
-#include "fem/errorindicator.hpp"
-#include "fem/fespace.hpp"
-#include "linalg/ksp.hpp"
-#include "linalg/operator.hpp"
-#include "linalg/vector.hpp"
-
-namespace palace
-{
-
-class MaterialOperator;
-
-//
-// Classes used in the estimation of element-wise solution errors via a global L2 projection
-// of a discontinuous flux onto a smooth space.
-//
-
-// This solver computes a smooth reconstruction of a discontinuous flux. The difference
-// between this resulting smooth flux and the original non-smooth flux provides a
-// localizable error estimate.
-class FluxProjector
-{
-private:
-  // Operator for the mass matrix inversion.
-  std::unique_ptr<Operator> Flux, M;
-
-  // Linear solver and preconditioner for the projected linear system.
-  std::unique_ptr<KspSolver> ksp;
-
-  // Workspace object for solver application.
-  mutable Vector rhs;
-
-public:
-  FluxProjector(const MaterialOperator &mat_op,
-                const FiniteElementSpaceHierarchy &nd_fespaces, double tol, int max_it,
-                int print, int pa_order_threshold);
-  FluxProjector(const MaterialOperator &mat_op,
-                const FiniteElementSpaceHierarchy &h1_fespaces,
-                const FiniteElementSpace &h1d_fespace, double tol, int max_it, int print,
-                int pa_order_threshold);
-
-  template <typename VecType>
-  void Mult(const VecType &x, VecType &y) const;
-};
-
-// Class used for computing curl flux error estimate, i.e. || μ⁻¹ ∇ × Uₕ - F ||_K where F
-// denotes a smooth reconstruction of μ⁻¹ ∇ × Uₕ.
-template -class CurlFluxErrorEstimator -{ - using GridFunctionType = - typename std::conditional::value, - mfem::ParComplexGridFunction, mfem::ParGridFunction>::type; - - // Reference to input data (not owned). - const MaterialOperator &mat_op; - - // Finite element space used to represent U and F. - const FiniteElementSpace &nd_fespace; - - // Global L2 projection solver. - FluxProjector projector; - - // Temporary vectors for error estimation. - mutable VecType F; - mutable GridFunctionType F_gf, U_gf; - -public: - CurlFluxErrorEstimator(const MaterialOperator &mat_op, - const FiniteElementSpaceHierarchy &nd_fespaces, double tol, - int max_it, int print, int pa_order_threshold); - - // Compute elemental error indicators given a vector of true DOF. - ErrorIndicator ComputeIndicators(const VecType &U) const; - - // Compute elemental error indicators given a vector of true DOF and fold into an existing - // indicator. - void AddErrorIndicator(const VecType &U, ErrorIndicator &indicator) const - { - indicator.AddIndicator(ComputeIndicators(U)); - } -}; - -// Class used for computing gradient flux error estimate, i.e. || ε ∇Uₕ - F ||_K, where F -// denotes a smooth reconstruction of ε ∇Uₕ. -class GradFluxErrorEstimator -{ - // Reference to input data (not owned). - const MaterialOperator &mat_op; - - // Finite element space used to represent U. - const FiniteElementSpace &h1_fespace; - - // Vector H1 space used to represent the components of F, ordered by component. - std::unique_ptr h1d_fespace; - - // Global L2 projection solver. - FluxProjector projector; - - // Temporary vectors for error estimation. - mutable Vector F; - mutable mfem::ParGridFunction F_gf, U_gf; - -public: - GradFluxErrorEstimator(const MaterialOperator &mat_op, - const FiniteElementSpaceHierarchy &h1_fespaces, double tol, - int max_it, int print, int pa_order_threshold); - - // Compute elemental error indicators given a vector of true DOF. - ErrorIndicator ComputeIndicators(const Vector &U) const; - - // Compute elemental error indicators given a vector of true DOF and fold into an existing - // indicator. - void AddErrorIndicator(const Vector &U, ErrorIndicator &indicator) const - { - indicator.AddIndicator(ComputeIndicators(U)); - } -}; - -} // namespace palace - -#endif // PALACE_LINALG_ERROR_ESTIMATOR_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LINALG_ERROR_ESTIMATOR_HPP +#define PALACE_LINALG_ERROR_ESTIMATOR_HPP + +#include +#include +#include "fem/errorindicator.hpp" +#include "fem/fespace.hpp" +#include "fem/libceed/operator.hpp" +#include "linalg/ksp.hpp" +#include "linalg/operator.hpp" +#include "linalg/vector.hpp" + +namespace palace +{ + +class MaterialPropertyCoefficient; +class MaterialOperator; + +// +// Classes used in the estimation of element-wise solution errors via a global L2 projection +// of a discontinuous flux onto a smooth space (flux recovery). +// + +template +class TimeDependentFluxErrorEstimator; + +// This solver computes a smooth recovery of a discontinuous flux. The difference between +// this resulting smooth flux and the original non-smooth flux provides a localizable error +// estimate. +template +class FluxProjector +{ + using OperType = typename std::conditional::value, + ComplexOperator, Operator>::type; + +private: + // Operator for the mass matrix inversion. + std::unique_ptr Flux, M; + + // Linear solver and preconditioner for the projected linear system. 
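+  // (Applying Mult amounts to y ≈ M⁻¹ (Flux x): the mixed mass matrix Flux forms the
+  // right-hand side from the primal field, and the smooth-space mass matrix M is then
+  // inverted iteratively by this solver.)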
+ std::unique_ptr> ksp; + + // Workspace object for solver application. + mutable VecType rhs; + +public: + FluxProjector(const MaterialPropertyCoefficient &coeff, + const FiniteElementSpaceHierarchy &smooth_fespaces, + const FiniteElementSpace &rhs_fespace, double tol, int max_it, int print, + bool use_mg); + + void Mult(const VecType &x, VecType &y) const; +}; + +// Class used for computing gradient flux error estimate, η_K = || ε Eₕ - D ||_K, where D +// denotes a smooth reconstruction of ε Eₕ = ε ∇Vₕ with continuous normal component. +template +class GradFluxErrorEstimator +{ + friend class TimeDependentFluxErrorEstimator; + +private: + // Finite element spaces used to represent E and the recovered D. + const FiniteElementSpace &nd_fespace, &rt_fespace; + + // Global L2 projection solver. + FluxProjector projector; + + // Operator which performs the integration of the flux error on each element. + ceed::Operator integ_op; + + // Temporary vectors for error estimation. + mutable VecType E_gf, D, D_gf; + +public: + GradFluxErrorEstimator(const MaterialOperator &mat_op, FiniteElementSpace &nd_fespace, + FiniteElementSpaceHierarchy &rt_fespaces, double tol, int max_it, + int print, bool use_mg); + + // Compute elemental error indicators given the electric field as a vector of true dofs, + // and fold into an existing indicator. The indicators are nondimensionalized using the + // total field energy. + void AddErrorIndicator(const VecType &E, double Et, ErrorIndicator &indicator) const; +}; + +// Class used for computing curl flux error estimate, η_K = || μ⁻¹ Bₕ - H ||_K where H +// denotes a smooth reconstruction of μ⁻¹ Bₕ = μ⁻¹ ∇ × Eₕ with continuous tangential +// component. +template +class CurlFluxErrorEstimator +{ + friend class TimeDependentFluxErrorEstimator; + +private: + // Finite element space used to represent B and the recovered H. + const FiniteElementSpace &rt_fespace, &nd_fespace; + + // Global L2 projection solver. + FluxProjector projector; + + // Operator which performs the integration of the flux error on each element. + ceed::Operator integ_op; + + // Temporary vectors for error estimation. + mutable VecType B_gf, H, H_gf; + +public: + CurlFluxErrorEstimator(const MaterialOperator &mat_op, FiniteElementSpace &rt_fespace, + FiniteElementSpaceHierarchy &nd_fespaces, double tol, int max_it, + int print, bool use_mg); + + // Compute elemental error indicators given the magnetic flux density as a vector of true + // dofs, and fold into an existing indicator. The indicators are nondimensionalized using + // the total field energy. + void AddErrorIndicator(const VecType &B, double Et, ErrorIndicator &indicator) const; +}; + +// Class used for computing sum of the gradient flux and curl flux error estimates, +// η²_K = || ε Eₕ - D ||²_K + || μ⁻¹ Bₕ - H ||²_K, where D and H denote a smooth +// reconstructions of ε Eₕ = ε ∇Vₕ with continuous normal component and μ⁻¹ Bₕ = μ⁻¹ ∇ × Eₕ +// with continuous tangential component. +template +class TimeDependentFluxErrorEstimator +{ +private: + GradFluxErrorEstimator grad_estimator; + CurlFluxErrorEstimator curl_estimator; + +public: + TimeDependentFluxErrorEstimator(const MaterialOperator &mat_op, + FiniteElementSpaceHierarchy &nd_fespaces, + FiniteElementSpaceHierarchy &rt_fespaces, double tol, + int max_it, int print, bool use_mg); + + // Compute elemental error indicators given the electric field and magnetic flux density + // as a vectors of true dofs, and fold into an existing indicator. 
The indicators are + // nondimensionalized using the total field energy. + void AddErrorIndicator(const VecType &E, const VecType &B, double Et, + ErrorIndicator &indicator) const; +}; + +} // namespace palace + +#endif // PALACE_LINALG_ERROR_ESTIMATOR_HPP diff --git a/palace/linalg/floquetcorrection.cpp b/palace/linalg/floquetcorrection.cpp new file mode 100644 index 0000000000..409a45ba8a --- /dev/null +++ b/palace/linalg/floquetcorrection.cpp @@ -0,0 +1,90 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "floquetcorrection.hpp" + +#include +#include +#include "fem/bilinearform.hpp" +#include "fem/fespace.hpp" +#include "fem/integrator.hpp" +#include "linalg/iterative.hpp" +#include "linalg/jacobi.hpp" +#include "linalg/rap.hpp" +#include "models/materialoperator.hpp" + +namespace palace +{ + +template +FloquetCorrSolver::FloquetCorrSolver(const MaterialOperator &mat_op, + FiniteElementSpace &nd_fespace, + FiniteElementSpace &rt_fespace, double tol, + int max_it, int print) +{ + // Create the mass and cross product operators for Floquet correction. + { + constexpr bool skip_zeros = false; + BilinearForm a(rt_fespace); + a.AddDomainIntegrator(); + std::unique_ptr m = a.Assemble(skip_zeros); + if constexpr (std::is_same::value) + { + M = std::make_unique(std::move(m), nullptr, rt_fespace); + } + else + { + M = std::make_unique(std::move(m), rt_fespace); + } + } + + { + MaterialPropertyCoefficient f(mat_op.MaxCeedAttribute()); + f.AddCoefficient(mat_op.GetAttributeToMaterial(), mat_op.GetFloquetCross(), 1.0); + constexpr bool skip_zeros = false; + BilinearForm a(nd_fespace, rt_fespace); + a.AddDomainIntegrator(f); + std::unique_ptr m = a.Assemble(skip_zeros); + if constexpr (std::is_same::value) + { + Cross = std::make_unique(std::move(m), nullptr, nd_fespace, + rt_fespace, false); + } + else + { + Cross = std::make_unique(std::move(m), nd_fespace, rt_fespace, false); + } + } + + // Setup the linear solver. + auto pcg = std::make_unique>(rt_fespace.GetComm(), print); + pcg->SetInitialGuess(0); + pcg->SetRelTol(tol); + pcg->SetAbsTol(std::numeric_limits::epsilon()); + pcg->SetMaxIter(max_it); + auto jac = std::make_unique>(rt_fespace.GetComm()); + ksp = std::make_unique>(std::move(pcg), std::move(jac)); + ksp->SetOperators(*M, *M); + + rhs.SetSize(rt_fespace.GetTrueVSize()); + rhs.UseDevice(true); +} + +template +void FloquetCorrSolver::Mult(const VecType &x, VecType &y) const +{ + Cross->Mult(x, rhs); + ksp->Mult(rhs, y); +} + +template +void FloquetCorrSolver::AddMult(const VecType &x, VecType &y, ScalarType a) const +{ + this->Mult(x, rhs); + rhs *= a; + y += rhs; +} + +template class FloquetCorrSolver; + +} // namespace palace diff --git a/palace/linalg/floquetcorrection.hpp b/palace/linalg/floquetcorrection.hpp new file mode 100644 index 0000000000..5129798e5d --- /dev/null +++ b/palace/linalg/floquetcorrection.hpp @@ -0,0 +1,64 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LINALG_FLOQUET_CORR_HPP +#define PALACE_LINALG_FLOQUET_CORR_HPP + +#include +#include +#include "linalg/ksp.hpp" +#include "linalg/operator.hpp" +#include "linalg/vector.hpp" + +namespace mfem +{ + +template +class Array; + +} // namespace mfem + +namespace palace +{ + +class FiniteElementSpace; +class MaterialOperator; + +// +// This solver calculates a correction for the magnetic flux density field +// when Floquet periodicity is imposed. 
The correction is the cross product +// of the Floquet wave vector with the electric field. +// +template +class FloquetCorrSolver +{ + using OperType = typename std::conditional::value, + ComplexOperator, Operator>::type; + using ScalarType = + typename std::conditional::value, + std::complex, double>::type; + +private: + // Operators for the floquet correction. + std::unique_ptr M, Cross; + + // Linear solver for the linear system M y = x. + std::unique_ptr> ksp; + + // Workspace objects for solver application. + mutable VecType rhs; + +public: + FloquetCorrSolver(const MaterialOperator &mat_op, FiniteElementSpace &nd_fespace, + FiniteElementSpace &rt_fespace, double tol, int max_it, int print); + + // Given a vector of Nedelec dofs for an arbitrary vector field, compute + // the Raviart-Thomas space field y = [kp x] x, where [kp x] is a matrix + // representing the action of the cross product with the Floquet wave vector. + void Mult(const VecType &x, VecType &y) const; + void AddMult(const VecType &x, VecType &y, ScalarType a = 1.0) const; +}; + +} // namespace palace + +#endif // PALACE_LINALG_FLOQUET_CORR_HPP diff --git a/palace/linalg/gmg.cpp b/palace/linalg/gmg.cpp index 9abcf6508f..461202aff8 100644 --- a/palace/linalg/gmg.cpp +++ b/palace/linalg/gmg.cpp @@ -1,206 +1,210 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "gmg.hpp" - -#include -#include "linalg/chebyshev.hpp" -#include "linalg/distrelaxation.hpp" -#include "linalg/rap.hpp" -#include "utils/timer.hpp" - -namespace palace -{ - -template -GeometricMultigridSolver::GeometricMultigridSolver( - std::unique_ptr> &&coarse_solver, - const std::vector &P, const std::vector *G, - int cycle_it, int smooth_it, int cheby_order, double cheby_sf_max, double cheby_sf_min, - bool cheby_4th_kind) - : Solver(), pc_it(cycle_it), P(P.begin(), P.end()), A(P.size() + 1), - dbc_tdof_lists(P.size()), B(P.size() + 1), X(P.size() + 1), Y(P.size() + 1), - R(P.size() + 1) -{ - // Configure levels of geometric coarsening. Multigrid vectors will be configured at first - // call to Mult. The multigrid operator size is set based on the finest space dimension. - const auto n_levels = P.size() + 1; - MFEM_VERIFY(n_levels > 0, - "Empty finite element space hierarchy during multigrid solver setup!"); - MFEM_VERIFY(!G || G->size() == n_levels, - "Invalid input for distributive relaxation smoother auxiliary space transfer " - "operators (mismatch in number of levels)!"); - - // Use the supplied level 0 (coarse) solver. - B[0] = std::move(coarse_solver); - - // Configure level smoothers. Use distributive relaxation smoothing if an auxiliary - // finite element space was provided. 
- for (std::size_t l = 1; l < n_levels; l++) - { - if (G) - { - const int cheby_smooth_it = 1; - B[l] = std::make_unique>( - *(*G)[l], smooth_it, cheby_smooth_it, cheby_order, cheby_sf_max, cheby_sf_min, - cheby_4th_kind); - } - else - { - const int cheby_smooth_it = smooth_it; - if (cheby_4th_kind) - { - B[l] = std::make_unique>(cheby_smooth_it, cheby_order, - cheby_sf_max); - } - else - { - B[l] = std::make_unique>( - cheby_smooth_it, cheby_order, cheby_sf_max, cheby_sf_min); - } - } - } -} - -template -void GeometricMultigridSolver::SetOperator(const OperType &op) -{ - using ParOperType = - typename std::conditional::value, - ComplexParOperator, ParOperator>::type; - - const auto *mg_op = dynamic_cast *>(&op); - MFEM_VERIFY(mg_op, "GeometricMultigridSolver requires a MultigridOperator or " - "ComplexMultigridOperator argument provided to SetOperator!"); - - const auto n_levels = A.size(); - MFEM_VERIFY( - mg_op->GetNumLevels() == n_levels && - (!mg_op->HasAuxiliaryOperators() || mg_op->GetNumAuxiliaryLevels() == n_levels), - "Invalid number of levels for operators in multigrid solver setup!"); - for (std::size_t l = 0; l < n_levels; l++) - { - A[l] = &mg_op->GetOperatorAtLevel(l); - MFEM_VERIFY( - A[l]->Width() == A[l]->Height() && - (n_levels == 1 || - (A[l]->Height() == ((l < n_levels - 1) ? P[l]->Width() : P[l - 1]->Height()))), - "Invalid operator sizes for GeometricMultigridSolver!"); - - const auto *PtAP_l = dynamic_cast(&mg_op->GetOperatorAtLevel(l)); - MFEM_VERIFY( - PtAP_l, - "GeometricMultigridSolver requires ParOperator or ComplexParOperator operators!"); - if (l < n_levels - 1) - { - dbc_tdof_lists[l] = PtAP_l->GetEssentialTrueDofs(); - } - - auto *dist_smoother = dynamic_cast *>(B[l].get()); - if (dist_smoother) - { - MFEM_VERIFY(mg_op->HasAuxiliaryOperators(), - "Distributive relaxation smoother relies on both primary space and " - "auxiliary space operators for multigrid smoothing!"); - dist_smoother->SetOperators(mg_op->GetOperatorAtLevel(l), - mg_op->GetAuxiliaryOperatorAtLevel(l)); - } - else - { - B[l]->SetOperator(mg_op->GetOperatorAtLevel(l)); - } - - X[l].SetSize(A[l]->Height()); - Y[l].SetSize(A[l]->Height()); - R[l].SetSize(A[l]->Height()); - } - this->height = op.Height(); - this->width = op.Width(); -} - -template -void GeometricMultigridSolver::Mult(const VecType &x, VecType &y) const -{ - // Initialize. - const auto n_levels = A.size(); - MFEM_ASSERT(!this->initial_guess, - "Geometric multigrid solver does not use initial guess!"); - MFEM_ASSERT(n_levels > 1 || pc_it == 1, - "Single-level geometric multigrid will not work with multiple iterations!"); - - // Apply V-cycle. The initial guess for y is zero'd at the first pre-smooth iteration. 
- X.back() = x; - for (int it = 0; it < pc_it; it++) - { - VCycle(n_levels - 1, (it > 0)); - } - y = Y.back(); -} - -namespace -{ - -inline void RealMult(const Operator &op, const Vector &x, Vector &y) -{ - op.Mult(x, y); -} - -inline void RealMult(const Operator &op, const ComplexVector &x, ComplexVector &y) -{ - op.Mult(x.Real(), y.Real()); - op.Mult(x.Imag(), y.Imag()); -} - -inline void RealMultTranspose(const Operator &op, const Vector &x, Vector &y) -{ - op.MultTranspose(x, y); -} - -inline void RealMultTranspose(const Operator &op, const ComplexVector &x, ComplexVector &y) -{ - op.MultTranspose(x.Real(), y.Real()); - op.MultTranspose(x.Imag(), y.Imag()); -} - -} // namespace - -template -void GeometricMultigridSolver::VCycle(int l, bool initial_guess) const -{ - // Pre-smooth, with zero initial guess (Y = 0 set inside). This is the coarse solve at - // level 0. Important to note that the smoothers must respect the initial guess flag - // correctly (given X, Y, compute Y <- Y + B (X - A Y)) . - B[l]->SetInitialGuess(initial_guess); - if (l == 0) - { - BlockTimer bt(Timer::COARSESOLVE); - B[l]->Mult(X[l], Y[l]); - return; - } - B[l]->Mult(X[l], Y[l]); - - // Compute residual. - A[l]->Mult(Y[l], R[l]); - linalg::AXPBY(1.0, X[l], -1.0, R[l]); - - // Coarse grid correction. - RealMultTranspose(*P[l - 1], R[l], X[l - 1]); - if (dbc_tdof_lists[l - 1]) - { - linalg::SetSubVector(X[l - 1], *dbc_tdof_lists[l - 1], 0.0); - } - VCycle(l - 1, false); - - // Prolongate and add. - RealMult(*P[l - 1], Y[l - 1], R[l]); - Y[l] += R[l]; - - // Post-smooth, with nonzero initial guess. - B[l]->SetInitialGuess(true); - B[l]->MultTranspose(X[l], Y[l]); -} - -template class GeometricMultigridSolver; -template class GeometricMultigridSolver; - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "gmg.hpp" + +#include +#include "linalg/chebyshev.hpp" +#include "linalg/distrelaxation.hpp" +#include "linalg/rap.hpp" +#include "utils/timer.hpp" + +namespace palace +{ + +template +GeometricMultigridSolver::GeometricMultigridSolver( + MPI_Comm comm, std::unique_ptr> &&coarse_solver, + const std::vector &P, const std::vector *G, + int cycle_it, int smooth_it, int cheby_order, double cheby_sf_max, double cheby_sf_min, + bool cheby_4th_kind) + : Solver(), pc_it(cycle_it), P(P.begin(), P.end()), A(P.size() + 1), + dbc_tdof_lists(P.size()), B(P.size() + 1), X(P.size() + 1), Y(P.size() + 1), + R(P.size() + 1), use_timer(false) +{ + // Configure levels of geometric coarsening. Multigrid vectors will be configured at first + // call to Mult. The multigrid operator size is set based on the finest space dimension. + const auto n_levels = P.size() + 1; + MFEM_VERIFY(n_levels > 0, + "Empty finite element space hierarchy during multigrid solver setup!"); + MFEM_VERIFY(!G || G->size() == n_levels, + "Invalid input for distributive relaxation smoother auxiliary space transfer " + "operators (mismatch in number of levels)!"); + + // Use the supplied level 0 (coarse) solver. + B[0] = std::move(coarse_solver); + + // Configure level smoothers. Use distributive relaxation smoothing if an auxiliary + // finite element space was provided. 
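+  // Summary of the loop below: with auxiliary space transfer operators G, each fine level
+  // gets a distributive relaxation smoother (with a single inner Chebyshev iteration);
+  // otherwise a plain Chebyshev smoother (fourth-kind or standard, per cheby_4th_kind)
+  // with smooth_it iterations is used.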
+ for (std::size_t l = 1; l < n_levels; l++) + { + if (G) + { + const int cheby_smooth_it = 1; + B[l] = std::make_unique>( + comm, *(*G)[l], smooth_it, cheby_smooth_it, cheby_order, cheby_sf_max, + cheby_sf_min, cheby_4th_kind); + } + else + { + const int cheby_smooth_it = smooth_it; + if (cheby_4th_kind) + { + B[l] = std::make_unique>(comm, cheby_smooth_it, + cheby_order, cheby_sf_max); + } + else + { + B[l] = std::make_unique>( + comm, cheby_smooth_it, cheby_order, cheby_sf_max, cheby_sf_min); + } + } + } +} + +template +void GeometricMultigridSolver::SetOperator(const OperType &op) +{ + using ParOperType = + typename std::conditional::value, + ComplexParOperator, ParOperator>::type; + + const auto *mg_op = dynamic_cast *>(&op); + MFEM_VERIFY(mg_op, "GeometricMultigridSolver requires a MultigridOperator or " + "ComplexMultigridOperator argument provided to SetOperator!"); + + const auto n_levels = A.size(); + MFEM_VERIFY( + mg_op->GetNumLevels() == n_levels && + (!mg_op->HasAuxiliaryOperators() || mg_op->GetNumAuxiliaryLevels() == n_levels), + "Invalid number of levels for operators in multigrid solver setup!"); + for (std::size_t l = 0; l < n_levels; l++) + { + A[l] = &mg_op->GetOperatorAtLevel(l); + MFEM_VERIFY( + A[l]->Width() == A[l]->Height() && + (n_levels == 1 || + (A[l]->Height() == ((l < n_levels - 1) ? P[l]->Width() : P[l - 1]->Height()))), + "Invalid operator sizes for GeometricMultigridSolver!"); + + const auto *PtAP_l = dynamic_cast(&mg_op->GetOperatorAtLevel(l)); + MFEM_VERIFY( + PtAP_l, + "GeometricMultigridSolver requires ParOperator or ComplexParOperator operators!"); + if (l < n_levels - 1) + { + dbc_tdof_lists[l] = PtAP_l->GetEssentialTrueDofs(); + } + + auto *dist_smoother = dynamic_cast *>(B[l].get()); + if (dist_smoother) + { + MFEM_VERIFY(mg_op->HasAuxiliaryOperators(), + "Distributive relaxation smoother relies on both primary space and " + "auxiliary space operators for multigrid smoothing!"); + dist_smoother->SetOperators(mg_op->GetOperatorAtLevel(l), + mg_op->GetAuxiliaryOperatorAtLevel(l)); + } + else + { + B[l]->SetOperator(mg_op->GetOperatorAtLevel(l)); + } + + X[l].SetSize(A[l]->Height()); + Y[l].SetSize(A[l]->Height()); + R[l].SetSize(A[l]->Height()); + X[l].UseDevice(true); + Y[l].UseDevice(true); + R[l].UseDevice(true); + } + + this->height = op.Height(); + this->width = op.Width(); +} + +template +void GeometricMultigridSolver::Mult(const VecType &x, VecType &y) const +{ + // Initialize. + const auto n_levels = A.size(); + MFEM_ASSERT(!this->initial_guess, + "Geometric multigrid solver does not use initial guess!"); + MFEM_ASSERT(n_levels > 1 || pc_it == 1, + "Single-level geometric multigrid will not work with multiple iterations!"); + + // Apply V-cycle. The initial guess for y is zero'd at the first pre-smooth iteration. 
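+  // Each VCycle(l, ...) pre-smooths with B[l], restricts the residual X[l] - A[l] Y[l]
+  // with P[l-1]ᵀ, recurses to the next coarser level (level 0 is the coarse solve),
+  // prolongates and adds the coarse correction, and post-smooths with the transpose
+  // smoother.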
+ X.back() = x; + for (int it = 0; it < pc_it; it++) + { + VCycle(n_levels - 1, (it > 0)); + } + y = Y.back(); +} + +namespace +{ + +inline void RealMult(const Operator &op, const Vector &x, Vector &y) +{ + op.Mult(x, y); +} + +inline void RealMult(const Operator &op, const ComplexVector &x, ComplexVector &y) +{ + op.Mult(x.Real(), y.Real()); + op.Mult(x.Imag(), y.Imag()); +} + +inline void RealMultTranspose(const Operator &op, const Vector &x, Vector &y) +{ + op.MultTranspose(x, y); +} + +inline void RealMultTranspose(const Operator &op, const ComplexVector &x, ComplexVector &y) +{ + op.MultTranspose(x.Real(), y.Real()); + op.MultTranspose(x.Imag(), y.Imag()); +} + +} // namespace + +template +void GeometricMultigridSolver::VCycle(int l, bool initial_guess) const +{ + // Pre-smooth, with zero initial guess (Y = 0 set inside). This is the coarse solve at + // level 0. Important to note that the smoothers must respect the initial guess flag + // correctly (given X, Y, compute Y <- Y + B (X - A Y)) . + B[l]->SetInitialGuess(initial_guess); + if (l == 0) + { + BlockTimer bt(Timer::KSP_COARSE_SOLVE, use_timer); + B[l]->Mult(X[l], Y[l]); + return; + } + B[l]->Mult2(X[l], Y[l], R[l]); + + // Compute residual. + A[l]->Mult(Y[l], R[l]); + linalg::AXPBY(1.0, X[l], -1.0, R[l]); + + // Coarse grid correction. + RealMultTranspose(*P[l - 1], R[l], X[l - 1]); + if (dbc_tdof_lists[l - 1]) + { + linalg::SetSubVector(X[l - 1], *dbc_tdof_lists[l - 1], 0.0); + } + VCycle(l - 1, false); + + // Prolongate and add. + RealMult(*P[l - 1], Y[l - 1], R[l]); + Y[l] += R[l]; + + // Post-smooth, with nonzero initial guess. + B[l]->SetInitialGuess(true); + B[l]->MultTranspose2(X[l], Y[l], R[l]); +} + +template class GeometricMultigridSolver; +template class GeometricMultigridSolver; + +} // namespace palace diff --git a/palace/linalg/gmg.hpp b/palace/linalg/gmg.hpp index 958cc23c60..a33f1b5ae0 100644 --- a/palace/linalg/gmg.hpp +++ b/palace/linalg/gmg.hpp @@ -1,81 +1,86 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LINALG_GEOMETRIC_MULTIGRID_HPP -#define PALACE_LINALG_GEOMETRIC_MULTIGRID_HPP - -#include -#include -#include "linalg/operator.hpp" -#include "linalg/solver.hpp" -#include "linalg/vector.hpp" -#include "utils/iodata.hpp" - -namespace mfem -{ - -template -class Array; - -} // namespace mfem - -namespace palace -{ - -// -// Geometric multigrid preconditioner using a given coarse solver for the provided -// hierarchy of finite element spaces. Optionally can be configured to use auxiliary space -// smoothing at each level. -// -template -class GeometricMultigridSolver : public Solver -{ - using VecType = typename Solver::VecType; - -private: - // Number of V-cycles per preconditioner application. - const int pc_it; - - // Prolongation operators (not owned). - std::vector P; - - // System matrices at each multigrid level (not owned). - std::vector A; - std::vector *> dbc_tdof_lists; - - // Smoothers for each level. Coarse level solver is B[0]. - mutable std::vector>> B; - - // Temporary vectors for preconditioner application. The type of these is dictated by the - // MFEM Operator interface for multiple RHS. - mutable std::vector X, Y, R; - - // Internal function to perform a single V-cycle iteration. 
- void VCycle(int l, bool initial_guess) const; - -public: - GeometricMultigridSolver(std::unique_ptr> &&coarse_solver, - const std::vector &P, - const std::vector *G, int cycle_it, - int smooth_it, int cheby_order, double cheby_sf_max, - double cheby_sf_min, bool cheby_4th_kind); - GeometricMultigridSolver(const IoData &iodata, - std::unique_ptr> &&coarse_solver, - const std::vector &P, - const std::vector *G = nullptr) - : GeometricMultigridSolver( - std::move(coarse_solver), P, G, iodata.solver.linear.mg_cycle_it, - iodata.solver.linear.mg_smooth_it, iodata.solver.linear.mg_smooth_order, - iodata.solver.linear.mg_smooth_sf_max, iodata.solver.linear.mg_smooth_sf_min, - iodata.solver.linear.mg_smooth_cheby_4th) - { - } - - void SetOperator(const OperType &op) override; - - void Mult(const VecType &x, VecType &y) const override; -}; - -} // namespace palace - -#endif // PALACE_LINALG_GEOMETRIC_MULTIGRID_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LINALG_GEOMETRIC_MULTIGRID_HPP +#define PALACE_LINALG_GEOMETRIC_MULTIGRID_HPP + +#include +#include +#include "linalg/operator.hpp" +#include "linalg/solver.hpp" +#include "linalg/vector.hpp" +#include "utils/iodata.hpp" + +namespace mfem +{ + +template +class Array; + +} // namespace mfem + +namespace palace +{ + +// +// Geometric multigrid preconditioner using a given coarse solver for the provided +// hierarchy of finite element spaces. Optionally can be configured to use auxiliary space +// smoothing at each level. +// +template +class GeometricMultigridSolver : public Solver +{ + using VecType = typename Solver::VecType; + +private: + // Number of V-cycles per preconditioner application. + const int pc_it; + + // Prolongation operators (not owned). + std::vector P; + + // System matrices at each multigrid level (not owned). + std::vector A; + std::vector *> dbc_tdof_lists; + + // Smoothers for each level. Coarse-level solver is B[0]. + mutable std::vector>> B; + + // Temporary vectors for preconditioner application. The type of these is dictated by the + // MFEM Operator interface for multiple RHS. + mutable std::vector X, Y, R; + + // Enable timer contribution for Timer::KSP_COARSE_SOLVE. + bool use_timer; + + // Internal function to perform a single V-cycle iteration. 
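+  // (l is the level index, with n_levels - 1 the finest level; initial_guess indicates
+  // whether Y[l] already holds an iterate to be corrected rather than starting from zero.)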
+ void VCycle(int l, bool initial_guess) const; + +public: + GeometricMultigridSolver(MPI_Comm comm, std::unique_ptr> &&coarse_solver, + const std::vector &P, + const std::vector *G, int cycle_it, + int smooth_it, int cheby_order, double cheby_sf_max, + double cheby_sf_min, bool cheby_4th_kind); + GeometricMultigridSolver(const IoData &iodata, MPI_Comm comm, + std::unique_ptr> &&coarse_solver, + const std::vector &P, + const std::vector *G = nullptr) + : GeometricMultigridSolver( + comm, std::move(coarse_solver), P, G, iodata.solver.linear.mg_cycle_it, + iodata.solver.linear.mg_smooth_it, iodata.solver.linear.mg_smooth_order, + iodata.solver.linear.mg_smooth_sf_max, iodata.solver.linear.mg_smooth_sf_min, + iodata.solver.linear.mg_smooth_cheby_4th) + { + } + + void SetOperator(const OperType &op) override; + + void Mult(const VecType &x, VecType &y) const override; + + void EnableTimer() { use_timer = true; } +}; + +} // namespace palace + +#endif // PALACE_LINALG_GEOMETRIC_MULTIGRID_HPP diff --git a/palace/linalg/hcurl.cpp b/palace/linalg/hcurl.cpp index 3a8b1395ba..2199a713d1 100644 --- a/palace/linalg/hcurl.cpp +++ b/palace/linalg/hcurl.cpp @@ -1,100 +1,123 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "hcurl.hpp" - -#include -#include "fem/bilinearform.hpp" -#include "fem/coefficient.hpp" -#include "fem/fespace.hpp" -#include "fem/integrator.hpp" -#include "linalg/ams.hpp" -#include "linalg/gmg.hpp" -#include "linalg/iterative.hpp" -#include "linalg/rap.hpp" -#include "models/materialoperator.hpp" - -namespace palace -{ - -WeightedHCurlNormSolver::WeightedHCurlNormSolver( - const MaterialOperator &mat_op, const FiniteElementSpaceHierarchy &nd_fespaces, - const AuxiliaryFiniteElementSpaceHierarchy &h1_fespaces, - const std::vector> &nd_dbc_tdof_lists, - const std::vector> &h1_dbc_tdof_lists, double tol, int max_it, - int print, int pa_order_threshold) -{ - constexpr bool skip_zeros = false; - constexpr auto MatTypeMuInv = MaterialPropertyType::INV_PERMEABILITY; - constexpr auto MatTypeEps = MaterialPropertyType::PERMITTIVITY_REAL; - MaterialPropertyCoefficient muinv_func(mat_op); - MaterialPropertyCoefficient epsilon_func(mat_op); - { - MFEM_VERIFY(h1_fespaces.GetNumLevels() == nd_fespaces.GetNumLevels(), - "Multigrid hierarchy mismatch for auxiliary space preconditioning!"); - const auto n_levels = nd_fespaces.GetNumLevels(); - auto A_mg = std::make_unique(n_levels); - for (bool aux : {false, true}) - { - for (std::size_t l = 0; l < n_levels; l++) - { - // Force coarse level operator to be fully assembled always. - const auto &fespace_l = - aux ? h1_fespaces.GetFESpaceAtLevel(l) : nd_fespaces.GetFESpaceAtLevel(l); - const auto &dbc_tdof_lists_l = aux ? h1_dbc_tdof_lists[l] : nd_dbc_tdof_lists[l]; - BilinearForm a(fespace_l); - if (aux) - { - a.AddDomainIntegrator(epsilon_func); - } - else - { - a.AddDomainIntegrator(muinv_func, epsilon_func); - } - auto A_l = std::make_unique( - a.Assemble((l > 0) ? pa_order_threshold : 99, skip_zeros), fespace_l); - A_l->SetEssentialTrueDofs(dbc_tdof_lists_l, Operator::DiagonalPolicy::DIAG_ONE); - if (aux) - { - A_mg->AddAuxiliaryOperator(std::move(A_l)); - } - else - { - A_mg->AddOperator(std::move(A_l)); - } - } - } - A = std::move(A_mg); - } - - // The system matrix K + M is real and SPD. We use Hypre's AMS solver as the coarse-level - // multigrid solve. 
- auto ams = std::make_unique>(std::make_unique( - nd_fespaces.GetFESpaceAtLevel(0), h1_fespaces.GetFESpaceAtLevel(0), 1, 1, 1, false, - false, 0)); - std::unique_ptr> pc; - if (nd_fespaces.GetNumLevels() > 1) - { - const auto G = h1_fespaces.GetDiscreteInterpolators(); - const int mg_smooth_order = - std::max(nd_fespaces.GetFinestFESpace().GetMaxElementOrder(), 2); - pc = std::make_unique>( - std::move(ams), nd_fespaces.GetProlongationOperators(), &G, 1, 1, mg_smooth_order, - 1.0, 0.0, true); - } - else - { - pc = std::move(ams); - } - - auto pcg = - std::make_unique>(nd_fespaces.GetFinestFESpace().GetComm(), print); - pcg->SetInitialGuess(false); - pcg->SetRelTol(tol); - pcg->SetMaxIter(max_it); - - ksp = std::make_unique(std::move(pcg), std::move(pc)); - ksp->SetOperators(*A, *A); -} - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "hcurl.hpp" + +#include +#include "fem/bilinearform.hpp" +#include "fem/fespace.hpp" +#include "fem/integrator.hpp" +#include "linalg/ams.hpp" +#include "linalg/gmg.hpp" +#include "linalg/iterative.hpp" +#include "linalg/rap.hpp" +#include "models/materialoperator.hpp" + +namespace palace +{ + +namespace +{ + +template +auto BuildLevelParOperator(std::unique_ptr &&a, + const FiniteElementSpace &fespace); + +template <> +auto BuildLevelParOperator(std::unique_ptr &&a, + const FiniteElementSpace &fespace) +{ + return std::make_unique(std::move(a), fespace); +} + +template <> +auto BuildLevelParOperator(std::unique_ptr &&a, + const FiniteElementSpace &fespace) +{ + return std::make_unique(std::move(a), nullptr, fespace); +} + +} // namespace + +template +WeightedHCurlNormSolver::WeightedHCurlNormSolver( + const MaterialOperator &mat_op, FiniteElementSpaceHierarchy &nd_fespaces, + FiniteElementSpaceHierarchy &h1_fespaces, + const std::vector> &nd_dbc_tdof_lists, + const std::vector> &h1_dbc_tdof_lists, double tol, int max_it, + int print) +{ + MFEM_VERIFY(h1_fespaces.GetNumLevels() == nd_fespaces.GetNumLevels(), + "Multigrid hierarchy mismatch for auxiliary space preconditioning!"); + const auto n_levels = nd_fespaces.GetNumLevels(); + { + constexpr bool skip_zeros = false; + MaterialPropertyCoefficient muinv_func(mat_op.GetAttributeToMaterial(), + mat_op.GetInvPermeability()); + MaterialPropertyCoefficient epsilon_func(mat_op.GetAttributeToMaterial(), + mat_op.GetPermittivityReal()); + BilinearForm a(nd_fespaces.GetFinestFESpace()), a_aux(h1_fespaces.GetFinestFESpace()); + a.AddDomainIntegrator(muinv_func, epsilon_func); + a_aux.AddDomainIntegrator(epsilon_func); + // a.AssembleQuadratureData(); + // a_aux.AssembleQuadratureData(); + auto a_vec = a.Assemble(nd_fespaces, skip_zeros); + auto a_aux_vec = a_aux.Assemble(h1_fespaces, skip_zeros); + auto A_mg = std::make_unique>(n_levels); + for (bool aux : {false, true}) + { + for (std::size_t l = 0; l < n_levels; l++) + { + const auto &fespace_l = + aux ? h1_fespaces.GetFESpaceAtLevel(l) : nd_fespaces.GetFESpaceAtLevel(l); + const auto &dbc_tdof_lists_l = aux ? h1_dbc_tdof_lists[l] : nd_dbc_tdof_lists[l]; + auto A_l = BuildLevelParOperator(std::move(aux ? a_aux_vec[l] : a_vec[l]), + fespace_l); + A_l->SetEssentialTrueDofs(dbc_tdof_lists_l, Operator::DiagonalPolicy::DIAG_ONE); + if (aux) + { + A_mg->AddAuxiliaryOperator(std::move(A_l)); + } + else + { + A_mg->AddOperator(std::move(A_l)); + } + } + } + A = std::move(A_mg); + } + + // The system matrix K + M is real and SPD. 
We use Hypre's AMS solver as the coarse-level + // multigrid solve. + auto ams = std::make_unique>(std::make_unique( + nd_fespaces.GetFESpaceAtLevel(0), h1_fespaces.GetFESpaceAtLevel(0), 1, 1, false, true, + false, 0)); + std::unique_ptr> pc; + if (n_levels > 1) + { + const auto G = nd_fespaces.GetDiscreteInterpolators(h1_fespaces); + const int mg_smooth_order = + std::max(nd_fespaces.GetFinestFESpace().GetMaxElementOrder(), 2); + pc = std::make_unique>( + nd_fespaces.GetFinestFESpace().GetComm(), std::move(ams), + nd_fespaces.GetProlongationOperators(), &G, 1, 1, mg_smooth_order, 1.0, 0.0, true); + } + else + { + pc = std::move(ams); + } + + auto pcg = + std::make_unique>(nd_fespaces.GetFinestFESpace().GetComm(), print); + pcg->SetInitialGuess(false); + pcg->SetRelTol(tol); + pcg->SetMaxIter(max_it); + + ksp = std::make_unique>(std::move(pcg), std::move(pc)); + ksp->SetOperators(*A, *A); +} + +template class WeightedHCurlNormSolver; +template class WeightedHCurlNormSolver; + +} // namespace palace diff --git a/palace/linalg/hcurl.hpp b/palace/linalg/hcurl.hpp index 579d2b1264..08d48b080b 100644 --- a/palace/linalg/hcurl.hpp +++ b/palace/linalg/hcurl.hpp @@ -1,61 +1,58 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LINALG_HCURL_HPP -#define PALACE_LINALG_HCURL_HPP - -#include -#include -#include "linalg/ksp.hpp" -#include "linalg/operator.hpp" -#include "linalg/vector.hpp" - -namespace mfem -{ - -template -class Array; - -} // namespace mfem - -namespace palace -{ - -class AuxiliaryFiniteElementSpaceHierarchy; -class FiniteElementSpaceHierarchy; -class MaterialOperator; - -// -// This solver implements a solver for the operator K + M in a Nedelec space. -// -class WeightedHCurlNormSolver -{ -private: - // H(curl) norm operator A = K + M and its projection Gᵀ A G. - std::unique_ptr A; - - // Linear solver for the linear system A y = x; - std::unique_ptr ksp; - -public: - WeightedHCurlNormSolver(const MaterialOperator &mat_op, - const FiniteElementSpaceHierarchy &nd_fespaces, - const AuxiliaryFiniteElementSpaceHierarchy &h1_fespaces, - const std::vector> &nd_dbc_tdof_lists, - const std::vector> &h1_dbc_tdof_lists, - double tol, int max_it, int print, int pa_order_threshold); - - const Operator &GetOperator() { return *A; } - - void Mult(const Vector &x, Vector &y) const { ksp->Mult(x, y); } - - void Mult(const ComplexVector &x, ComplexVector &y) - { - Mult(x.Real(), y.Real()); - Mult(x.Imag(), y.Imag()); - } -}; - -} // namespace palace - -#endif // PALACE_LINALG_HCURL_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LINALG_HCURL_HPP +#define PALACE_LINALG_HCURL_HPP + +#include +#include +#include "linalg/ksp.hpp" +#include "linalg/operator.hpp" +#include "linalg/vector.hpp" + +namespace mfem +{ + +template +class Array; + +} // namespace mfem + +namespace palace +{ + +class FiniteElementSpaceHierarchy; +class MaterialOperator; + +// +// This solver implements a solver for the operator K + M in a Nedelec space. +// +template +class WeightedHCurlNormSolver +{ + using OperType = typename std::conditional::value, + ComplexOperator, Operator>::type; + +private: + // H(curl) norm operator A = K + M and its projection Gᵀ A G. 
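// The BuildLevelParOperator<OperType> helper in hcurl.cpp above uses a declared-only
// function template with explicit specializations so one assembly loop can construct
// either the real or the complex parallel operator. A standalone sketch of that dispatch
// pattern; RealOp, ComplexOp, MakeLevel, and BuildHierarchy are illustrative placeholders,
// not Palace types.
#include <memory>
#include <string>

struct RealOp
{
  std::string kind = "real";
};
struct ComplexOp
{
  std::string kind = "complex";
};

template <typename OperType>
auto MakeLevel();  // primary template: declared only, specialized per supported type

template <>
auto MakeLevel<RealOp>()
{
  return std::make_unique<RealOp>();
}

template <>
auto MakeLevel<ComplexOp>()
{
  return std::make_unique<ComplexOp>();
}

template <typename OperType>
std::string BuildHierarchy(int n_levels)
{
  std::string out;
  for (int l = 0; l < n_levels; l++)
  {
    out += MakeLevel<OperType>()->kind + " ";  // same loop body, type-specific construction
  }
  return out;  // BuildHierarchy<RealOp>(2) == "real real "
}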
+ std::unique_ptr A; + + // Linear solver for the linear system A y = x; + std::unique_ptr> ksp; + +public: + WeightedHCurlNormSolver(const MaterialOperator &mat_op, + FiniteElementSpaceHierarchy &nd_fespaces, + FiniteElementSpaceHierarchy &h1_fespaces, + const std::vector> &nd_dbc_tdof_lists, + const std::vector> &h1_dbc_tdof_lists, + double tol, int max_it, int print); + + const OperType &GetOperator() { return *A; } + + void Mult(const VecType &x, VecType &y) const { ksp->Mult(x, y); } +}; + +} // namespace palace + +#endif // PALACE_LINALG_HCURL_HPP diff --git a/palace/linalg/hypre.cpp b/palace/linalg/hypre.cpp new file mode 100644 index 0000000000..d1a1e4c575 --- /dev/null +++ b/palace/linalg/hypre.cpp @@ -0,0 +1,126 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "hypre.hpp" + +namespace palace::hypre +{ + +HypreVector::HypreVector(hypre_Vector *vec) : vec(vec) {} + +HypreVector::HypreVector(const Vector &x) : vec(nullptr) +{ + Update(x); +} + +HypreVector::~HypreVector() +{ + hypre_SeqVectorDestroy(vec); +} + +void HypreVector::Update(const Vector &x) +{ + const HYPRE_Int N = x.Size(); + if (!vec) + { + vec = hypre_SeqVectorCreate(N); + hypre_SeqVectorSetDataOwner(vec, 0); + hypre_VectorData(vec) = const_cast(x.Read()); + hypre_SeqVectorInitialize(vec); + } + else + { + hypre_SeqVectorSetSize(vec, N); + hypre_VectorData(vec) = const_cast(x.Read()); + } +} + +HypreCSRMatrix::HypreCSRMatrix(int h, int w, int nnz) + : palace::Operator(h, w), hypre_own_I(true) +{ + mat = hypre_CSRMatrixCreate(h, w, nnz); + hypre_CSRMatrixInitialize(mat); +} + +HypreCSRMatrix::HypreCSRMatrix(hypre_CSRMatrix *mat) : mat(mat), hypre_own_I(true) +{ + height = hypre_CSRMatrixNumRows(mat); + width = hypre_CSRMatrixNumCols(mat); +} + +HypreCSRMatrix::HypreCSRMatrix(const mfem::SparseMatrix &m) + : palace::Operator(m.Height(), m.Width()), hypre_own_I(false) +{ + const int nnz = m.NumNonZeroElems(); + mat = hypre_CSRMatrixCreate(height, width, nnz); + hypre_CSRMatrixSetDataOwner(mat, 0); + hypre_CSRMatrixData(mat) = const_cast(m.ReadData()); +#if !defined(HYPRE_BIGINT) + hypre_CSRMatrixI(mat) = const_cast(m.ReadI()); + hypre_CSRMatrixJ(mat) = const_cast(m.ReadJ()); +#else + data_I.SetSize(height); + data_J.SetSize(nnz); + { + const auto *I = m.ReadI(); + const auto *J = m.ReadJ(); + auto *DI = data_I.Write(); + auto *DJ = data_J.Write(); + mfem::forall(height, [=] MFEM_HOST_DEVICE(int i) { DI[i] = I[i]; }); + mfem::forall(nnz, [=] MFEM_HOST_DEVICE(int i) { DJ[i] = J[i]; }); + } +#endif + hypre_CSRMatrixInitialize(mat); +} + +HypreCSRMatrix::~HypreCSRMatrix() +{ + if (!hypre_own_I) + { + hypre_CSRMatrixI(mat) = nullptr; + } + hypre_CSRMatrixDestroy(mat); +} + +void HypreCSRMatrix::AssembleDiagonal(Vector &diag) const +{ + diag.SetSize(height); + hypre_CSRMatrixExtractDiagonal(mat, diag.Write(), 0); +} + +namespace +{ + +static HypreVector X, Y; + +} // namespace + +void HypreCSRMatrix::Mult(const Vector &x, Vector &y) const +{ + X.Update(x); + Y.Update(y); + hypre_CSRMatrixMatvec(1.0, mat, X, 0.0, Y); +} + +void HypreCSRMatrix::AddMult(const Vector &x, Vector &y, const double a) const +{ + X.Update(x); + Y.Update(y); + hypre_CSRMatrixMatvec(a, mat, X, 1.0, Y); +} + +void HypreCSRMatrix::MultTranspose(const Vector &x, Vector &y) const +{ + X.Update(x); + Y.Update(y); + hypre_CSRMatrixMatvecT(1.0, mat, X, 0.0, Y); +} + +void HypreCSRMatrix::AddMultTranspose(const Vector &x, Vector &y, const double a) const +{ + X.Update(x); + Y.Update(y); 
+ hypre_CSRMatrixMatvecT(a, mat, X, 1.0, Y); +} + +} // namespace palace::hypre diff --git a/palace/linalg/hypre.hpp b/palace/linalg/hypre.hpp new file mode 100644 index 0000000000..9c70d349d1 --- /dev/null +++ b/palace/linalg/hypre.hpp @@ -0,0 +1,85 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LINALG_HYPRE_HPP +#define PALACE_LINALG_HYPRE_HPP + +#include +#include "linalg/operator.hpp" +#include "linalg/vector.hpp" + +namespace palace::hypre +{ + +// Helper function to initialize HYPRE and control use of GPU at runtime. This will call +// HYPRE_SetMemoryLocation and HYPRE_SetExecutionPolicy to match the mfem::Device +// configuration. +inline void Initialize() +{ + mfem::Hypre::Init(); + // HYPRE_SetSpGemmUseCusparse(1); // MFEM sets to zero, so leave as is for now +} + +// +// Wrapper class for HYPRE's hypre_Vector, which can alias an mfem::Vector object for use +// with HYPRE. +// +class HypreVector +{ +private: + hypre_Vector *vec; + +public: + HypreVector(hypre_Vector *vec = nullptr); + HypreVector(const Vector &x); + ~HypreVector(); + + auto Size() const { return hypre_VectorSize(vec); } + + void Update(const Vector &x); + + operator hypre_Vector *() const { return vec; } +}; + +// +// Wrapper class for HYPRE's hypre_CSRMatrix, an alternative to mfem::SparseMatrix with +// increased functionality from HYPRE. +// +class HypreCSRMatrix : public palace::Operator +{ +private: + hypre_CSRMatrix *mat; + mfem::Array data_I, data_J; + bool hypre_own_I; + +public: + HypreCSRMatrix(int h, int w, int nnz); + HypreCSRMatrix(hypre_CSRMatrix *mat); + HypreCSRMatrix(const mfem::SparseMatrix &m); + ~HypreCSRMatrix(); + + auto NNZ() const { return hypre_CSRMatrixNumNonzeros(mat); } + + const auto *GetI() const { return hypre_CSRMatrixI(mat); } + auto *GetI() { return hypre_CSRMatrixI(mat); } + const auto *GetJ() const { return hypre_CSRMatrixJ(mat); } + auto *GetJ() { return hypre_CSRMatrixJ(mat); } + const auto *GetData() const { return hypre_CSRMatrixData(mat); } + auto *GetData() { return hypre_CSRMatrixData(mat); } + + void AssembleDiagonal(Vector &diag) const override; + + void Mult(const Vector &x, Vector &y) const override; + + void AddMult(const Vector &x, Vector &y, const double a = 1.0) const override; + + void MultTranspose(const Vector &x, Vector &y) const override; + + void AddMultTranspose(const Vector &x, Vector &y, const double a = 1.0) const override; + + operator hypre_CSRMatrix *() const { return mat; } +}; + +} // namespace palace::hypre + +#endif // PALACE_LINALG_HYPRE_HPP diff --git a/palace/linalg/iterative.cpp b/palace/linalg/iterative.cpp index 985954d945..9fa42fc9b1 100644 --- a/palace/linalg/iterative.cpp +++ b/palace/linalg/iterative.cpp @@ -1,848 +1,866 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "iterative.hpp" - -#include -#include -#include -#include -#include "linalg/orthog.hpp" -#include "utils/communication.hpp" -#include "utils/timer.hpp" - -namespace palace -{ - -namespace -{ - -template -inline void CheckDot(T dot, const char *msg) -{ - MFEM_ASSERT(std::isfinite(dot) && dot >= 0.0, msg << dot << "!"); -} - -template -inline void CheckDot(std::complex dot, const char *msg) -{ - MFEM_ASSERT(std::isfinite(dot.real()) && std::isfinite(dot.imag()) && dot.real() >= 0.0, - msg << dot << "!"); -} - -template -inline constexpr T SafeMin() -{ - // Originally part of LAPACK. 
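// A minimal usage sketch for the hypre wrappers introduced above, assuming a Palace/MFEM
// build: wrap a small mfem::SparseMatrix without copying its values and apply it through
// hypre's CSR matvec. The function name WrapAndMultiply is illustrative only.
#include <mfem.hpp>
#include "linalg/hypre.hpp"

void WrapAndMultiply()
{
  palace::hypre::Initialize();  // align hypre memory/execution with mfem::Device
  mfem::SparseMatrix m(2, 2);
  m.Add(0, 0, 2.0);
  m.Add(1, 1, 3.0);
  m.Finalize();
  palace::hypre::HypreCSRMatrix A(m);  // aliases the CSR data of m
  palace::Vector x(2), y(2);
  x = 1.0;
  A.Mult(x, y);  // y = (2, 3)
}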
- // LAPACK is free software: you can redistribute it and/or modify it under - // the terms of the BSD 3-Clause license. - // - // Copyright (c) 2021-2023, University of Colorado Denver. All rights reserved. - // Copyright (c) 2017-2021, University of Tennessee. All rights reserved. - // - // Original author: Weslley S Pereira, University of Colorado Denver, USA - constexpr int fradix = std::numeric_limits::radix; - constexpr int expm = std::numeric_limits::min_exponent; - constexpr int expM = std::numeric_limits::max_exponent; - // Note: pow is not constexpr in C++17 so this actually might not return a constexpr for - // all compilers. - return std::max(std::pow(fradix, T(expm - 1)), std::pow(fradix, T(1 - expM))); -} - -template -inline constexpr T SafeMax() -{ - // Originally part of LAPACK. - // LAPACK is free software: you can redistribute it and/or modify it under - // the terms of the BSD 3-Clause license. - // - // Copyright (c) 2021-2023, University of Colorado Denver. All rights reserved. - // Copyright (c) 2017-2021, University of Tennessee. All rights reserved. - // - // Original author: Weslley S Pereira, University of Colorado Denver, USA - constexpr int fradix = std::numeric_limits::radix; - constexpr int expm = std::numeric_limits::min_exponent; - constexpr int expM = std::numeric_limits::max_exponent; - // Note: pow is not constexpr in C++17 so this actually might not return a constexpr for - // all compilers. - return std::min(std::pow(fradix, T(1 - expm)), std::pow(fradix, T(expM - 1))); -} - -template -inline void GeneratePlaneRotation(const T dx, const T dy, T &cs, T &sn) -{ - // See LAPACK's s/dlartg. - const T safmin = SafeMin(); - const T safmax = SafeMax(); - const T root_min = std::sqrt(safmin); - const T root_max = std::sqrt(safmax / 2); - if (dy == 0.0) - { - cs = 1.0; - sn = 0.0; - return; - } - if (dx == 0.0) - { - cs = 0.0; - sn = std::copysign(1.0, dy); - return; - } - T dx1 = std::abs(dx); - T dy1 = std::abs(dy); - if (dx1 > root_min && dx1 < root_max && dy1 > root_min && dy1 < root_max) - { - T d = std::sqrt(dx * dx + dy * dy); - cs = dx1 / d; - sn = dy / std::copysign(d, dx); - } - else - { - T u = std::min(safmax, std::max(safmin, std::max(dx1, dy1))); - T dxs = dx / u; - T dys = dy / u; - T d = std::sqrt(dxs * dxs + dys * dys); - cs = std::abs(dxs) / d; - sn = dys / std::copysign(d, dx); - } -} - -template -inline void GeneratePlaneRotation(const std::complex dx, const std::complex dy, T &cs, - std::complex &sn) -{ - // Generates a plane rotation so that: - // [ cs sn ] [ dx ] = [ r ] - // [ -conj(sn) cs ] [ dy ] [ 0 ] - // where cs is real and cs² + |sn|² = 1. See LAPACK's c/zlartg. 
- const T safmin = SafeMin(); - const T safmax = SafeMax(); - if (dy == 0.0) - { - cs = 1.0; - sn = 0.0; - return; - } - if (dx == 0.0) - { - cs = 0.0; - if (dy.real() == 0.0) - { - sn = std::conj(dy) / std::abs(dy.imag()); - } - else if (dy.imag() == 0.0) - { - sn = std::conj(dy) / std::abs(dy.real()); - } - else - { - const T root_min = std::sqrt(safmin); - const T root_max = std::sqrt(safmax / 2); - T dy1 = std::max(std::abs(dy.real()), std::abs(dy.imag())); - if (dy1 > root_min && dy1 < root_max) - { - sn = std::conj(dy) / std::sqrt(dy.real() * dy.real() + dy.imag() * dy.imag()); - } - else - { - T u = std::min(safmax, std::max(safmin, dy1)); - std::complex dys = dy / u; - sn = std::conj(dys) / std::sqrt(dys.real() * dys.real() + dys.imag() * dys.imag()); - } - } - return; - } - const T root_min = std::sqrt(safmin); - const T root_max = std::sqrt(safmax / 4); - T dx1 = std::max(std::abs(dx.real()), std::abs(dx.imag())); - T dy1 = std::max(std::abs(dy.real()), std::abs(dy.imag())); - if (dx1 > root_min && dx1 < root_max && dy1 > root_min && dy1 < root_max) - { - T dx2 = dx.real() * dx.real() + dx.imag() * dx.imag(); - T dy2 = dy.real() * dy.real() + dy.imag() * dy.imag(); - T dz2 = dx2 + dy2; - if (dx2 >= dz2 * safmin) - { - cs = std::sqrt(dx2 / dz2); - if (dx2 > root_min && dz2 < root_max * 2) - { - sn = std::conj(dy) * (dx / std::sqrt(dx2 * dz2)); - } - else - { - sn = std::conj(dy) * ((dx / cs) / dz2); - } - } - else - { - T d = std::sqrt(dx2 * dz2); - cs = dx2 / d; - sn = std::conj(dy) * (dx / d); - } - } - else - { - T u = std::min(safmax, std::max(safmin, std::max(dx1, dy1))), w; - std::complex dys = dy / u, dxs; - T dy2 = dys.real() * dys.real() + dys.imag() * dys.imag(), dx2, dz2; - if (dx1 / u < root_min) - { - T v = std::min(safmax, std::max(safmin, dx1)); - w = v / u; - dxs = dx / v; - dx2 = dxs.real() * dxs.real() + dxs.imag() * dxs.imag(); - dz2 = dx2 * w * w + dy2; - } - else - { - w = 1.0; - dxs = dx / u; - dx2 = dxs.real() * dxs.real() + dxs.imag() * dxs.imag(); - dz2 = dx2 + dy2; - } - if (dx2 >= dz2 * safmin) - { - cs = std::sqrt(dx2 / dz2); - if (dx2 > root_min && dz2 < root_max * 2) - { - sn = std::conj(dys) * (dxs / std::sqrt(dx2 * dz2)); - } - else - { - sn = std::conj(dys) * ((dxs / cs) / dz2); - } - } - else - { - T d = std::sqrt(dx2 * dz2); - cs = dx2 / d; - sn = std::conj(dys) * (dxs / d); - } - cs *= w; - } -} - -template -inline void ApplyPlaneRotation(T &dx, T &dy, const T cs, const T sn) -{ - T t = cs * dx + sn * dy; - dy = -sn * dx + cs * dy; - dx = t; -} - -template -inline void ApplyPlaneRotation(std::complex &dx, std::complex &dy, const T cs, - const std::complex sn) -{ - std::complex t = cs * dx + sn * dy; - dy = -std::conj(sn) * dx + cs * dy; - dx = t; -} - -template -inline void ApplyB(const Solver *B, const VecType &x, VecType &y) -{ - BlockTimer bt(Timer::PRECONDITIONER); - MFEM_ASSERT(B, "Missing preconditioner in ApplyB!"); - B->Mult(x, y); -} - -template -inline void InitialResidual(PrecSide side, const OperType *A, const Solver *B, - const VecType &b, VecType &x, VecType &r, VecType &z, - bool initial_guess) -{ - if (B && side == GmresSolver::PrecSide::LEFT) - { - if (initial_guess) - { - A->Mult(x, z); - linalg::AXPBY(1.0, b, -1.0, z); - ApplyB(B, z, r); - } - else - { - ApplyB(B, b, r); - x = 0.0; - } - } - else // !B || side == PrecSide::RIGHT - { - if (initial_guess) - { - A->Mult(x, r); - linalg::AXPBY(1.0, b, -1.0, r); - } - else - { - r = b; - x = 0.0; - } - } -} - -template -inline void ApplyBA(PrecSide side, const OperType *A, const 
Solver *B, - const VecType &x, VecType &y, VecType &z) -{ - if (B && side == GmresSolver::PrecSide::LEFT) - { - A->Mult(x, z); - ApplyB(B, z, y); - } - else if (B && side == GmresSolver::PrecSide::RIGHT) - { - ApplyB(B, x, z); - A->Mult(z, y); - } - else - { - A->Mult(x, y); - } -} - -template -inline void OrthogonalizeIteration(OrthogType type, MPI_Comm comm, - const std::vector &V, VecType &w, - ScalarType *Hj, int j) -{ - using OperType = typename std::conditional::value, - ComplexOperator, Operator>::type; - - // Orthogonalize w against the leading j + 1 columns of V. - switch (type) - { - case GmresSolver::OrthogType::MGS: - linalg::OrthogonalizeColumnMGS(comm, V, w, Hj, j + 1); - break; - case GmresSolver::OrthogType::CGS: - linalg::OrthogonalizeColumnCGS(comm, V, w, Hj, j + 1); - break; - case GmresSolver::OrthogType::CGS2: - linalg::OrthogonalizeColumnCGS(comm, V, w, Hj, j + 1, true); - break; - } -} - -} // namespace - -template -IterativeSolver::IterativeSolver(MPI_Comm comm, int print) - : Solver(), comm(comm), A(nullptr), B(nullptr) -{ - print_opts.Warnings(); - if (print > 0) - { - print_opts.Summary(); - if (print > 1) - { - print_opts.Iterations(); - if (print > 2) - { - print_opts.All(); - } - } - } - int_width = 3; - tab_width = 0; - - rel_tol = abs_tol = 0.0; - max_it = 100; - - converged = false; - initial_res = 1.0; - final_res = 0.0; - final_it = 0; -} - -template -void CgSolver::Mult(const VecType &b, VecType &x) const -{ - // Set up workspace. - ScalarType beta, beta_prev = 0.0, alpha, denom; - RealType res, eps; - MFEM_VERIFY(A, "Operator must be set for CgSolver::Mult!"); - MFEM_ASSERT(A->Width() == x.Size() && A->Height() == b.Size(), - "Size mismatch for CgSolver::Mult!"); - r.SetSize(A->Height()); - z.SetSize(A->Height()); - p.SetSize(A->Height()); - - // Initialize. - if (this->initial_guess) - { - A->Mult(x, r); - linalg::AXPBY(1.0, b, -1.0, r); - } - else - { - r = b; - x = 0.0; - } - if (B) - { - ApplyB(B, r, z); - } - else - { - z = r; - } - beta = linalg::Dot(comm, z, r); - CheckDot(beta, "PCG preconditioner is not positive definite: (Br, r) = "); - res = std::sqrt(std::abs(beta)); - if (this->initial_guess && B) - { - ApplyB(B, b, p); - auto beta_rhs = linalg::Dot(comm, p, b); - CheckDot(beta_rhs, "PCG preconditioner is not positive definite: (Bb, b) = "); - initial_res = std::sqrt(std::abs(beta_rhs)); - } - else - { - initial_res = res; - } - eps = std::max(rel_tol * initial_res, abs_tol); - converged = (res < eps); - - // Begin iterations. 
- int it = 0; - if (print_opts.iterations) - { - Mpi::Print(comm, "{}Residual norms for PCG solve\n", - std::string(tab_width + int_width - 1, ' ')); - } - for (; it < max_it && !converged; it++) - { - if (print_opts.iterations) - { - Mpi::Print(comm, "{}{:{}d} KSP residual norm ||r||_B = {:.6e}\n", - std::string(tab_width, ' '), it, int_width, res); - } - if (!it) - { - p = z; - } - else - { - linalg::AXPBY(ScalarType(1.0), z, beta / beta_prev, p); - } - - A->Mult(p, z); - denom = linalg::Dot(comm, z, p); - CheckDot(denom, "PCG operator is not positive definite: (Ap, p) = "); - alpha = beta / denom; - - x.Add(alpha, p); - r.Add(-alpha, z); - - beta_prev = beta; - if (B) - { - ApplyB(B, r, z); - } - else - { - z = r; - } - beta = linalg::Dot(comm, z, r); - CheckDot(beta, "PCG preconditioner is not positive definite: (Br, r) = "); - res = std::sqrt(std::abs(beta)); - converged = (res < eps); - } - if (print_opts.iterations) - { - Mpi::Print(comm, "{}{:{}d} KSP residual norm ||r||_B = {:.6e}\n", - std::string(tab_width, ' '), it, int_width, res); - } - if (print_opts.summary || (print_opts.warnings && eps > 0.0 && !converged)) - { - Mpi::Print(comm, "{}PCG solver {} in {:d} iteration{}", std::string(tab_width, ' '), - converged ? "converged" : "did NOT converge", it, (it == 1) ? "" : "s"); - if (it > 0) - { - Mpi::Print(comm, " (avg. reduction factor: {:.3e})\n", - std::pow(res / initial_res, 1.0 / it)); - } - else - { - Mpi::Print(comm, "\n"); - } - } - final_res = res; - final_it = it; -} - -template -void GmresSolver::Initialize() const -{ - if (!V.empty()) - { - MFEM_ASSERT(V.size() == static_cast(max_dim + 1) && - V[0].Size() == A->Height(), - "Repeated solves with GmresSolver should not modify the operator size or " - "restart dimension!"); - return; - } - if (max_dim < 0) - { - max_dim = max_it; - } - constexpr int init_size = 5; - V.resize(max_dim + 1); - for (int j = 0; j < std::min(init_size, max_dim + 1); j++) - { - V[j].SetSize(A->Height()); - } - H.resize((max_dim + 1) * max_dim); - s.resize(max_dim + 1); - cs.resize(max_dim + 1); - sn.resize(max_dim + 1); -} - -template -void GmresSolver::Update(int j) const -{ - // Add storage for basis vectors in increments. - constexpr int add_size = 10; - for (int k = j + 1; k < std::min(j + 1 + add_size, max_dim + 1); k++) - { - V[k].SetSize(A->Height()); - } -} - -template -void GmresSolver::Mult(const VecType &b, VecType &x) const -{ - // Set up workspace. - RealType beta = 0.0, true_beta, eps = 0.0; - MFEM_VERIFY(A, "Operator must be set for GmresSolver::Mult!"); - MFEM_ASSERT(A->Width() == x.Size() && A->Height() == b.Size(), - "Size mismatch for GmresSolver::Mult!"); - r.SetSize(A->Height()); - Initialize(); - - // Begin iterations. - converged = false; - int it = 0, restart = 0; - if (print_opts.iterations) - { - Mpi::Print(comm, "{}Residual norms for GMRES solve\n", - std::string(tab_width + int_width - 1, ' ')); - } - for (; it < max_it; restart++) - { - // Initialize. 
- InitialResidual(pc_side, A, B, b, x, r, V[0], (this->initial_guess || restart > 0)); - true_beta = linalg::Norml2(comm, r); - CheckDot(true_beta, "GMRES residual norm is not valid: beta = "); - if (it == 0) - { - if (this->initial_guess) - { - RealType beta_rhs; - if (B && pc_side == PrecSide::LEFT) - { - ApplyB(B, b, V[0]); - beta_rhs = linalg::Norml2(comm, V[0]); - } - else // !B || pc_side == PrecSide::RIGHT - { - beta_rhs = linalg::Norml2(comm, b); - } - CheckDot(beta_rhs, "GMRES residual norm is not valid: beta_rhs = "); - initial_res = beta_rhs; - } - else - { - initial_res = true_beta; - } - eps = std::max(rel_tol * initial_res, abs_tol); - } - else if (beta > 0.0 && std::abs(beta - true_beta) > 0.1 * true_beta && - print_opts.warnings) - { - Mpi::Print( - comm, - "{}GMRES residual at restart ({:.6e}) is far from the residual norm estimate " - "from the recursion formula ({:.6e}) (initial residual = {:.6e})\n", - std::string(tab_width, ' '), true_beta, beta, initial_res); - } - beta = true_beta; - if (beta < eps) - { - converged = true; - break; - } - - V[0] = 0.0; - V[0].Add(1.0 / beta, r); - std::fill(s.begin(), s.end(), 0.0); - s[0] = beta; - - int j = 0; - for (;; j++, it++) - { - if (print_opts.iterations) - { - Mpi::Print(comm, "{}{:{}d} (restart {:d}) KSP residual norm {:.6e}\n", - std::string(tab_width, ' '), it, int_width, restart, beta); - } - VecType &w = V[j + 1]; - if (w.Size() == 0) - { - Update(j); - } - ApplyBA(pc_side, A, B, V[j], w, r); - - ScalarType *Hj = H.data() + j * (max_dim + 1); - OrthogonalizeIteration(orthog_type, comm, V, w, Hj, j); - Hj[j + 1] = linalg::Norml2(comm, w); - w *= 1.0 / Hj[j + 1]; - - for (int k = 0; k < j; k++) - { - ApplyPlaneRotation(Hj[k], Hj[k + 1], cs[k], sn[k]); - } - GeneratePlaneRotation(Hj[j], Hj[j + 1], cs[j], sn[j]); - ApplyPlaneRotation(Hj[j], Hj[j + 1], cs[j], sn[j]); - ApplyPlaneRotation(s[j], s[j + 1], cs[j], sn[j]); - - beta = std::abs(s[j + 1]); - CheckDot(beta, "GMRES residual norm is not valid: beta = "); - converged = (beta < eps); - if (converged || j + 1 == max_dim || it + 1 == max_it) - { - it++; - break; - } - } - - // Reconstruct the solution (for restart or due to convergence or maximum iterations). - for (int i = j; i >= 0; i--) - { - ScalarType *Hi = H.data() + i * (max_dim + 1); - s[i] /= Hi[i]; - for (int k = i - 1; k >= 0; k--) - { - s[k] -= Hi[k] * s[i]; - } - } - if (!B || pc_side == PrecSide::LEFT) - { - for (int k = 0; k <= j; k++) - { - x.Add(s[k], V[k]); - } - } - else // B && pc_side == PrecSide::RIGHT - { - r = 0.0; - for (int k = 0; k <= j; k++) - { - r.Add(s[k], V[k]); - } - ApplyB(B, r, V[0]); - x += V[0]; - } - if (converged) - { - break; - } - } - if (print_opts.iterations) - { - Mpi::Print(comm, "{}{:{}d} (restart {:d}) KSP residual norm {:.6e}\n", - std::string(tab_width, ' '), it, int_width, restart, beta); - } - if (print_opts.summary || (print_opts.warnings && eps > 0.0 && !converged)) - { - Mpi::Print(comm, "{}GMRES solver {} in {:d} iteration{}", std::string(tab_width, ' '), - converged ? "converged" : "did NOT converge", it, (it == 1) ? "" : "s"); - if (it > 0) - { - Mpi::Print(comm, " (avg. 
reduction factor: {:.3e})\n", - std::pow(beta / initial_res, 1.0 / it)); - } - else - { - Mpi::Print(comm, "\n"); - } - } - final_res = beta; - final_it = it; -} - -template -void FgmresSolver::Initialize() const -{ - GmresSolver::Initialize(); - constexpr int init_size = 5; - Z.resize(max_dim + 1); - for (int j = 0; j < std::min(init_size, max_dim + 1); j++) - { - Z[j].SetSize(A->Height()); - } -} - -template -void FgmresSolver::Update(int j) const -{ - // Add storage for basis vectors in increments. - GmresSolver::Update(j); - constexpr int add_size = 10; - for (int k = j + 1; k < std::min(j + 1 + add_size, max_dim + 1); k++) - { - Z[k].SetSize(A->Height()); - } -} - -template -void FgmresSolver::Mult(const VecType &b, VecType &x) const -{ - // Set up workspace. - RealType beta = 0.0, true_beta, eps = 0.0; - MFEM_VERIFY(A && B, "Operator and preconditioner must be set for FgmresSolver::Mult!"); - MFEM_ASSERT(A->Width() == x.Size() && A->Height() == b.Size(), - "Size mismatch for FgmresSolver::Mult!"); - Initialize(); - - // Begin iterations. - converged = false; - int it = 0, restart = 0; - if (print_opts.iterations) - { - Mpi::Print(comm, "{}Residual norms for FGMRES solve\n", - std::string(tab_width + int_width - 1, ' ')); - } - for (; it < max_it; restart++) - { - // Initialize. - InitialResidual(PrecSide::RIGHT, A, B, b, x, Z[0], V[0], - (this->initial_guess || restart > 0)); - true_beta = linalg::Norml2(comm, Z[0]); - CheckDot(true_beta, "FGMRES residual norm is not valid: beta = "); - if (it == 0) - { - if (this->initial_guess) - { - auto beta_rhs = linalg::Norml2(comm, b); - CheckDot(beta_rhs, "GMRES residual norm is not valid: beta_rhs = "); - initial_res = beta_rhs; - } - else - { - initial_res = true_beta; - } - eps = std::max(rel_tol * initial_res, abs_tol); - } - else if (beta > 0.0 && std::abs(beta - true_beta) > 0.1 * true_beta && - print_opts.warnings) - { - Mpi::Print( - comm, - "{}FGMRES residual at restart ({:.6e}) is far from the residual norm estimate " - "from the recursion formula ({:.6e}) (initial residual = {:.6e})\n", - std::string(tab_width, ' '), true_beta, beta, initial_res); - } - beta = true_beta; - if (beta < eps) - { - converged = true; - break; - } - - V[0] = 0.0; - V[0].Add(1.0 / beta, Z[0]); - std::fill(s.begin(), s.end(), 0.0); - s[0] = beta; - - int j = 0; - for (;; j++, it++) - { - if (print_opts.iterations) - { - Mpi::Print(comm, "{}{:{}d} (restart {:d}) KSP residual norm {:.6e}\n", - std::string(tab_width, ' '), it, int_width, restart, beta); - } - VecType &w = V[j + 1]; - if (w.Size() == 0) - { - Update(j); - } - ApplyBA(PrecSide::RIGHT, A, B, V[j], w, Z[j]); - - ScalarType *Hj = H.data() + j * (max_dim + 1); - OrthogonalizeIteration(orthog_type, comm, V, w, Hj, j); - Hj[j + 1] = linalg::Norml2(comm, w); - w *= 1.0 / Hj[j + 1]; - - for (int k = 0; k < j; k++) - { - ApplyPlaneRotation(Hj[k], Hj[k + 1], cs[k], sn[k]); - } - GeneratePlaneRotation(Hj[j], Hj[j + 1], cs[j], sn[j]); - ApplyPlaneRotation(Hj[j], Hj[j + 1], cs[j], sn[j]); - ApplyPlaneRotation(s[j], s[j + 1], cs[j], sn[j]); - - beta = std::abs(s[j + 1]); - CheckDot(beta, "FGMRES residual norm is not valid: beta = "); - converged = (beta < eps); - if (converged || j + 1 == max_dim || it + 1 == max_it) - { - it++; - break; - } - } - - // Reconstruct the solution (for restart or due to convergence or maximum iterations). 
- for (int i = j; i >= 0; i--) - { - ScalarType *Hi = H.data() + i * (max_dim + 1); - s[i] /= Hi[i]; - for (int k = i - 1; k >= 0; k--) - { - s[k] -= Hi[k] * s[i]; - } - } - for (int k = 0; k <= j; k++) - { - x.Add(s[k], Z[k]); - } - if (converged) - { - break; - } - } - if (print_opts.iterations) - { - Mpi::Print(comm, "{}{:{}d} (restart {:d}) KSP residual norm {:.6e}\n", - std::string(tab_width, ' '), it, int_width, restart, beta); - } - if (print_opts.summary || (print_opts.warnings && eps > 0.0 && !converged)) - { - Mpi::Print(comm, "{}FGMRES solver {} in {:d} iteration{}", std::string(tab_width, ' '), - converged ? "converged" : "did NOT converge", it, (it == 1) ? "" : "s"); - if (it > 0) - { - Mpi::Print(comm, " (avg. reduction factor: {:.3e})\n", - std::pow(beta / initial_res, 1.0 / it)); - } - else - { - Mpi::Print(comm, "\n"); - } - } - final_res = beta; - final_it = it; -} - -template class IterativeSolver; -template class IterativeSolver; -template class CgSolver; -template class CgSolver; -template class GmresSolver; -template class GmresSolver; -template class FgmresSolver; -template class FgmresSolver; - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "iterative.hpp" + +#include +#include +#include +#include +#include "linalg/orthog.hpp" +#include "utils/communication.hpp" +#include "utils/timer.hpp" + +namespace palace +{ + +namespace +{ + +template +inline void CheckDot(T dot, const char *msg) +{ + MFEM_ASSERT(std::isfinite(dot) && dot >= 0.0, msg << dot << "!"); +} + +template +inline void CheckDot(std::complex dot, const char *msg) +{ + MFEM_ASSERT(std::isfinite(dot.real()) && std::isfinite(dot.imag()) && dot.real() >= 0.0, + msg << dot << "!"); +} + +template +inline constexpr T SafeMin() +{ + // Originally part of LAPACK. + // LAPACK is free software: you can redistribute it and/or modify it under + // the terms of the BSD 3-Clause license. + // + // Copyright (c) 2021-2023, University of Colorado Denver. All rights reserved. + // Copyright (c) 2017-2021, University of Tennessee. All rights reserved. + // + // Original author: Weslley S Pereira, University of Colorado Denver, USA + constexpr int fradix = std::numeric_limits::radix; + constexpr int expm = std::numeric_limits::min_exponent; + constexpr int expM = std::numeric_limits::max_exponent; + // Note: pow is not constexpr in C++17 so this actually might not return a constexpr for + // all compilers. + return std::max(std::pow(fradix, T(expm - 1)), std::pow(fradix, T(1 - expM))); +} + +template +inline constexpr T SafeMax() +{ + // Originally part of LAPACK. + // LAPACK is free software: you can redistribute it and/or modify it under + // the terms of the BSD 3-Clause license. + // + // Copyright (c) 2021-2023, University of Colorado Denver. All rights reserved. + // Copyright (c) 2017-2021, University of Tennessee. All rights reserved. + // + // Original author: Weslley S Pereira, University of Colorado Denver, USA + constexpr int fradix = std::numeric_limits::radix; + constexpr int expm = std::numeric_limits::min_exponent; + constexpr int expM = std::numeric_limits::max_exponent; + // Note: pow is not constexpr in C++17 so this actually might not return a constexpr for + // all compilers. + return std::min(std::pow(fradix, T(1 - expm)), std::pow(fradix, T(expM - 1))); +} + +template +inline void GeneratePlaneRotation(const T dx, const T dy, T &cs, T &sn) +{ + // See LAPACK's s/dlartg. 
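// For IEEE-754 double (radix 2, min_exponent -1021, max_exponent 1024), the SafeMin/SafeMax
// helpers above evaluate to 2^-1022 ≈ 2.2e-308 and 2^1022 ≈ 4.5e307, chosen so that a value
// and its reciprocal are both finite. A standalone check of those formulas:
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <limits>

int main()
{
  constexpr int fradix = std::numeric_limits<double>::radix;
  constexpr int expm = std::numeric_limits<double>::min_exponent;
  constexpr int expM = std::numeric_limits<double>::max_exponent;
  const double safmin =
      std::max(std::pow(fradix, double(expm - 1)), std::pow(fradix, double(1 - expM)));
  const double safmax =
      std::min(std::pow(fradix, double(1 - expm)), std::pow(fradix, double(expM - 1)));
  std::printf("safmin = %.3e, safmax = %.3e, 1/safmin = %.3e\n", safmin, safmax,
              1.0 / safmin);  // ~2.225e-308, ~4.494e+307, ~4.494e+307
  return 0;
}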
+ const T safmin = SafeMin(); + const T safmax = SafeMax(); + const T root_min = std::sqrt(safmin); + const T root_max = std::sqrt(safmax / 2); + if (dy == 0.0) + { + cs = 1.0; + sn = 0.0; + return; + } + if (dx == 0.0) + { + cs = 0.0; + sn = std::copysign(1.0, dy); + return; + } + T dx1 = std::abs(dx); + T dy1 = std::abs(dy); + if (dx1 > root_min && dx1 < root_max && dy1 > root_min && dy1 < root_max) + { + T d = std::sqrt(dx * dx + dy * dy); + cs = dx1 / d; + sn = dy / std::copysign(d, dx); + } + else + { + T u = std::min(safmax, std::max(safmin, std::max(dx1, dy1))); + T dxs = dx / u; + T dys = dy / u; + T d = std::sqrt(dxs * dxs + dys * dys); + cs = std::abs(dxs) / d; + sn = dys / std::copysign(d, dx); + } +} + +template +inline void GeneratePlaneRotation(const std::complex dx, const std::complex dy, T &cs, + std::complex &sn) +{ + // Generates a plane rotation so that: + // [ cs sn ] [ dx ] = [ r ] + // [ -conj(sn) cs ] [ dy ] [ 0 ] + // where cs is real and cs² + |sn|² = 1. See LAPACK's c/zlartg. + const T safmin = SafeMin(); + const T safmax = SafeMax(); + if (dy == 0.0) + { + cs = 1.0; + sn = 0.0; + return; + } + if (dx == 0.0) + { + cs = 0.0; + if (dy.real() == 0.0) + { + sn = std::conj(dy) / std::abs(dy.imag()); + } + else if (dy.imag() == 0.0) + { + sn = std::conj(dy) / std::abs(dy.real()); + } + else + { + const T root_min = std::sqrt(safmin); + const T root_max = std::sqrt(safmax / 2); + T dy1 = std::max(std::abs(dy.real()), std::abs(dy.imag())); + if (dy1 > root_min && dy1 < root_max) + { + sn = std::conj(dy) / std::sqrt(dy.real() * dy.real() + dy.imag() * dy.imag()); + } + else + { + T u = std::min(safmax, std::max(safmin, dy1)); + std::complex dys = dy / u; + sn = std::conj(dys) / std::sqrt(dys.real() * dys.real() + dys.imag() * dys.imag()); + } + } + return; + } + const T root_min = std::sqrt(safmin); + const T root_max = std::sqrt(safmax / 4); + T dx1 = std::max(std::abs(dx.real()), std::abs(dx.imag())); + T dy1 = std::max(std::abs(dy.real()), std::abs(dy.imag())); + if (dx1 > root_min && dx1 < root_max && dy1 > root_min && dy1 < root_max) + { + T dx2 = dx.real() * dx.real() + dx.imag() * dx.imag(); + T dy2 = dy.real() * dy.real() + dy.imag() * dy.imag(); + T dz2 = dx2 + dy2; + if (dx2 >= dz2 * safmin) + { + cs = std::sqrt(dx2 / dz2); + if (dx2 > root_min && dz2 < root_max * 2) + { + sn = std::conj(dy) * (dx / std::sqrt(dx2 * dz2)); + } + else + { + sn = std::conj(dy) * ((dx / cs) / dz2); + } + } + else + { + T d = std::sqrt(dx2 * dz2); + cs = dx2 / d; + sn = std::conj(dy) * (dx / d); + } + } + else + { + T u = std::min(safmax, std::max(safmin, std::max(dx1, dy1))), w; + std::complex dys = dy / u, dxs; + T dy2 = dys.real() * dys.real() + dys.imag() * dys.imag(), dx2, dz2; + if (dx1 / u < root_min) + { + T v = std::min(safmax, std::max(safmin, dx1)); + w = v / u; + dxs = dx / v; + dx2 = dxs.real() * dxs.real() + dxs.imag() * dxs.imag(); + dz2 = dx2 * w * w + dy2; + } + else + { + w = 1.0; + dxs = dx / u; + dx2 = dxs.real() * dxs.real() + dxs.imag() * dxs.imag(); + dz2 = dx2 + dy2; + } + if (dx2 >= dz2 * safmin) + { + cs = std::sqrt(dx2 / dz2); + if (dx2 > root_min && dz2 < root_max * 2) + { + sn = std::conj(dys) * (dxs / std::sqrt(dx2 * dz2)); + } + else + { + sn = std::conj(dys) * ((dxs / cs) / dz2); + } + } + else + { + T d = std::sqrt(dx2 * dz2); + cs = dx2 / d; + sn = std::conj(dys) * (dxs / d); + } + cs *= w; + } +} + +template +inline void ApplyPlaneRotation(T &dx, T &dy, const T cs, const T sn) +{ + T t = cs * dx + sn * dy; + dy = -sn * dx + cs * dy; + dx = t; +} + 
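// Quick check of what the real GeneratePlaneRotation/ApplyPlaneRotation pair above
// accomplishes, using the well-scaled branch directly (standalone, since the helpers are
// file-local): the rotation zeroes the second entry and moves the 2-norm into the first.
#include <cassert>
#include <cmath>
#include <cstdio>

int main()
{
  const double dx = 3.0, dy = 4.0;
  const double d = std::sqrt(dx * dx + dy * dy);  // = 5
  const double cs = std::abs(dx) / d;             // = 0.6
  const double sn = dy / std::copysign(d, dx);    // = 0.8
  double a = cs * dx + sn * dy;                   // = 5 (the norm)
  double b = -sn * dx + cs * dy;                  // = 0 (eliminated entry)
  std::printf("cs = %g, sn = %g, rotated = (%g, %g)\n", cs, sn, a, b);
  assert(std::abs(a - d) < 1e-14 && std::abs(b) < 1e-14);
  return 0;
}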
+template +inline void ApplyPlaneRotation(std::complex &dx, std::complex &dy, const T cs, + const std::complex sn) +{ + std::complex t = cs * dx + sn * dy; + dy = -std::conj(sn) * dx + cs * dy; + dx = t; +} + +template +inline void ApplyB(const Solver *B, const VecType &x, VecType &y, + bool use_timer = true) +{ + BlockTimer bt(Timer::KSP_PRECONDITIONER, use_timer); + MFEM_ASSERT(B, "Missing preconditioner in ApplyB!"); + B->Mult(x, y); +} + +template +inline void InitialResidual(PreconditionerSide side, const OperType *A, + const Solver *B, const VecType &b, VecType &x, + VecType &r, VecType &z, bool initial_guess, + bool use_timer = true) +{ + if (B && side == PreconditionerSide::LEFT) + { + if (initial_guess) + { + A->Mult(x, z); + linalg::AXPBY(1.0, b, -1.0, z); + ApplyB(B, z, r, use_timer); + } + else + { + ApplyB(B, b, r, use_timer); + x = 0.0; + } + } + else // !B || side == PreconditionerSide::RIGHT + { + if (initial_guess) + { + A->Mult(x, r); + linalg::AXPBY(1.0, b, -1.0, r); + } + else + { + r = b; + x = 0.0; + } + } +} + +template +inline void ApplyBA(PreconditionerSide side, const OperType *A, const Solver *B, + const VecType &x, VecType &y, VecType &z, bool use_timer = true) +{ + if (B && side == PreconditionerSide::LEFT) + { + A->Mult(x, z); + ApplyB(B, z, y, use_timer); + } + else if (B && side == PreconditionerSide::RIGHT) + { + ApplyB(B, x, z, use_timer); + A->Mult(z, y); + } + else + { + A->Mult(x, y); + } +} + +template +inline void OrthogonalizeIteration(Orthogonalization type, MPI_Comm comm, + const std::vector &V, VecType &w, + ScalarType *Hj, int j) +{ + // Orthogonalize w against the leading j + 1 columns of V. + switch (type) + { + case Orthogonalization::MGS: + linalg::OrthogonalizeColumnMGS(comm, V, w, Hj, j + 1); + break; + case Orthogonalization::CGS: + linalg::OrthogonalizeColumnCGS(comm, V, w, Hj, j + 1); + break; + case Orthogonalization::CGS2: + linalg::OrthogonalizeColumnCGS(comm, V, w, Hj, j + 1, true); + break; + } +} + +} // namespace + +template +IterativeSolver::IterativeSolver(MPI_Comm comm, int print) + : Solver(), comm(comm), A(nullptr), B(nullptr) +{ + print_opts.Warnings(); + if (print > 0) + { + print_opts.Summary(); + if (print > 1) + { + print_opts.Iterations(); + if (print > 2) + { + print_opts.All(); + } + } + } + int_width = 3; + tab_width = 0; + + rel_tol = abs_tol = 0.0; + max_it = 100; + + converged = false; + initial_res = 1.0; + final_res = 0.0; + final_it = 0; + + use_timer = false; +} + +template +void CgSolver::Mult(const VecType &b, VecType &x) const +{ + // Set up workspace. + ScalarType beta, beta_prev = 0.0, alpha, denom; + RealType res, eps; + MFEM_VERIFY(A, "Operator must be set for CgSolver::Mult!"); + MFEM_ASSERT(A->Width() == x.Size() && A->Height() == b.Size(), + "Size mismatch for CgSolver::Mult!"); + r.SetSize(A->Height()); + z.SetSize(A->Height()); + p.SetSize(A->Height()); + r.UseDevice(true); + z.UseDevice(true); + p.UseDevice(true); + + // Initialize. 
+ if (this->initial_guess) + { + A->Mult(x, r); + linalg::AXPBY(1.0, b, -1.0, r); + } + else + { + r = b; + x = 0.0; + } + if (B) + { + ApplyB(B, r, z, this->use_timer); + } + else + { + z = r; + } + beta = linalg::Dot(comm, z, r); + CheckDot(beta, "PCG preconditioner is not positive definite: (Br, r) = "); + res = std::sqrt(std::abs(beta)); + if (this->initial_guess) + { + ScalarType beta_rhs; + if (B) + { + ApplyB(B, b, p, this->use_timer); + beta_rhs = linalg::Dot(comm, p, b); + } + else + { + beta_rhs = linalg::Norml2(comm, b); + } + CheckDot(beta_rhs, "PCG preconditioner is not positive definite: (Bb, b) = "); + initial_res = std::sqrt(std::abs(beta_rhs)); + } + else + { + initial_res = res; + } + eps = std::max(rel_tol * initial_res, abs_tol); + converged = (res < eps); + + // Begin iterations. + int it = 0; + if (print_opts.iterations) + { + Mpi::Print(comm, "{}Residual norms for PCG solve\n", + std::string(tab_width + int_width - 1, ' ')); + } + for (; it < max_it && !converged; it++) + { + if (print_opts.iterations) + { + Mpi::Print(comm, "{}{:{}d} KSP residual norm ||r||_B = {:.6e}\n", + std::string(tab_width, ' '), it, int_width, res); + } + if (!it) + { + p = z; + } + else + { + linalg::AXPBY(ScalarType(1.0), z, beta / beta_prev, p); + } + + A->Mult(p, z); + denom = linalg::Dot(comm, z, p); + CheckDot(denom, "PCG operator is not positive definite: (Ap, p) = "); + alpha = beta / denom; + + x.Add(alpha, p); + r.Add(-alpha, z); + + beta_prev = beta; + if (B) + { + ApplyB(B, r, z, this->use_timer); + } + else + { + z = r; + } + beta = linalg::Dot(comm, z, r); + CheckDot(beta, "PCG preconditioner is not positive definite: (Br, r) = "); + res = std::sqrt(std::abs(beta)); + converged = (res < eps); + } + if (print_opts.iterations) + { + Mpi::Print(comm, "{}{:{}d} KSP residual norm ||r||_B = {:.6e}\n", + std::string(tab_width, ' '), it, int_width, res); + } + if (print_opts.summary || (print_opts.warnings && eps > 0.0 && !converged)) + { + Mpi::Print(comm, "{}PCG solver {} in {:d} iteration{}", std::string(tab_width, ' '), + converged ? "converged" : "did NOT converge", it, (it == 1) ? "" : "s"); + if (it > 0) + { + Mpi::Print(comm, " (avg. reduction factor: {:.3e})\n", + std::pow(res / initial_res, 1.0 / it)); + } + else + { + Mpi::Print(comm, "\n"); + } + } + final_res = res; + final_it = it; +} + +template +void GmresSolver::Initialize() const +{ + if (!V.empty()) + { + MFEM_ASSERT(V.size() == static_cast(max_dim + 1) && + V[0].Size() == A->Height(), + "Repeated solves with GmresSolver should not modify the operator size or " + "restart dimension!"); + return; + } + if (max_dim < 0) + { + max_dim = max_it; + } + constexpr int init_size = 5; + V.resize(max_dim + 1); + for (int j = 0; j < std::min(init_size, max_dim + 1); j++) + { + V[j].SetSize(A->Height()); + V[j].UseDevice(true); + } + H.resize((max_dim + 1) * max_dim); + s.resize(max_dim + 1); + cs.resize(max_dim + 1); + sn.resize(max_dim + 1); +} + +template +void GmresSolver::Update(int j) const +{ + // Add storage for basis vectors in increments. + constexpr int add_size = 10; + for (int k = j + 1; k < std::min(j + 1 + add_size, max_dim + 1); k++) + { + V[k].SetSize(A->Height()); + V[k].UseDevice(true); + } +} + +template +void GmresSolver::Mult(const VecType &b, VecType &x) const +{ + // Set up workspace. 
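// A minimal usage sketch for the CgSolver defined above, assuming a Palace build with an
// already assembled SPD operator A and preconditioner B (both hypothetical here); the
// helper function SolveWithPcg is illustrative only.
#include <mpi.h>
#include "linalg/iterative.hpp"
#include "linalg/operator.hpp"
#include "linalg/solver.hpp"
#include "linalg/vector.hpp"

namespace palace
{

inline void SolveWithPcg(MPI_Comm comm, const Operator &A, const Solver<Operator> &B,
                         const Vector &b, Vector &x)
{
  CgSolver<Operator> pcg(comm, /*print=*/1);
  pcg.SetOperator(A);
  pcg.SetPreconditioner(B);
  pcg.SetRelTol(1.0e-8);
  pcg.SetMaxIter(200);
  pcg.SetInitialGuess(false);  // x is overwritten by the solve
  pcg.Mult(b, x);
  MFEM_VERIFY(pcg.GetConverged(), "PCG did not converge!");
}

}  // namespace palace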
+ RealType beta = 0.0, true_beta, eps = 0.0; + MFEM_VERIFY(A, "Operator must be set for GmresSolver::Mult!"); + MFEM_ASSERT(A->Width() == x.Size() && A->Height() == b.Size(), + "Size mismatch for GmresSolver::Mult!"); + r.SetSize(A->Height()); + r.UseDevice(true); + Initialize(); + + // Begin iterations. + converged = false; + int it = 0, restart = 0; + if (print_opts.iterations) + { + Mpi::Print(comm, "{}Residual norms for GMRES solve\n", + std::string(tab_width + int_width - 1, ' ')); + } + for (; it < max_it; restart++) + { + // Initialize. + InitialResidual(pc_side, A, B, b, x, r, V[0], (this->initial_guess || restart > 0), + this->use_timer); + true_beta = linalg::Norml2(comm, r); + CheckDot(true_beta, "GMRES residual norm is not valid: beta = "); + if (it == 0) + { + if (this->initial_guess) + { + RealType beta_rhs; + if (B && pc_side == PreconditionerSide::LEFT) + { + ApplyB(B, b, V[0], this->use_timer); + beta_rhs = linalg::Norml2(comm, V[0]); + } + else // !B || pc_side == PreconditionerSide::RIGHT + { + beta_rhs = linalg::Norml2(comm, b); + } + CheckDot(beta_rhs, "GMRES residual norm is not valid: beta_rhs = "); + initial_res = beta_rhs; + } + else + { + initial_res = true_beta; + } + eps = std::max(rel_tol * initial_res, abs_tol); + } + else if (beta > 0.0 && std::abs(beta - true_beta) > 0.1 * true_beta && + print_opts.warnings) + { + Mpi::Print( + comm, + "{}GMRES residual at restart ({:.6e}) is far from the residual norm estimate " + "from the recursion formula ({:.6e}) (initial residual = {:.6e})\n", + std::string(tab_width, ' '), true_beta, beta, initial_res); + } + beta = true_beta; + if (beta < eps) + { + converged = true; + break; + } + + V[0] = 0.0; + V[0].Add(1.0 / beta, r); + std::fill(s.begin(), s.end(), 0.0); + s[0] = beta; + + int j = 0; + for (;; j++, it++) + { + if (print_opts.iterations) + { + Mpi::Print(comm, "{}{:{}d} (restart {:d}) KSP residual norm {:.6e}\n", + std::string(tab_width, ' '), it, int_width, restart, beta); + } + VecType &w = V[j + 1]; + if (w.Size() == 0) + { + Update(j); + } + ApplyBA(pc_side, A, B, V[j], w, r, this->use_timer); + + ScalarType *Hj = H.data() + j * (max_dim + 1); + OrthogonalizeIteration(gs_orthog, comm, V, w, Hj, j); + Hj[j + 1] = linalg::Norml2(comm, w); + w *= 1.0 / Hj[j + 1]; + + for (int k = 0; k < j; k++) + { + ApplyPlaneRotation(Hj[k], Hj[k + 1], cs[k], sn[k]); + } + GeneratePlaneRotation(Hj[j], Hj[j + 1], cs[j], sn[j]); + ApplyPlaneRotation(Hj[j], Hj[j + 1], cs[j], sn[j]); + ApplyPlaneRotation(s[j], s[j + 1], cs[j], sn[j]); + + beta = std::abs(s[j + 1]); + CheckDot(beta, "GMRES residual norm is not valid: beta = "); + converged = (beta < eps); + if (converged || j + 1 == max_dim || it + 1 == max_it) + { + it++; + break; + } + } + + // Reconstruct the solution (for restart or due to convergence or maximum iterations). 
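// The reconstruction loop below is back-substitution on the (j + 1) x (j + 1)
// upper-triangular matrix accumulated in H, which is stored column by column with leading
// dimension max_dim + 1. A standalone 2 x 2 example using the same storage and update
// order (values are illustrative):
#include <cassert>

int main()
{
  const int ld = 3;                    // leading dimension (max_dim + 1 = 3 here)
  double H[ld * 2] = {2.0, 0.0, 0.0,   // column 0: H(0,0), H(1,0), unused
                      1.0, 4.0, 0.0};  // column 1: H(0,1), H(1,1), unused
  double s[2] = {4.0, 8.0};            // right-hand side after the Givens rotations
  for (int i = 1; i >= 0; i--)
  {
    double *Hi = H + i * ld;
    s[i] /= Hi[i];
    for (int k = i - 1; k >= 0; k--)
    {
      s[k] -= Hi[k] * s[i];
    }
  }
  assert(s[0] == 1.0 && s[1] == 2.0);  // solves [2 1; 0 4] y = [4; 8]
  return 0;
}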
+ for (int i = j; i >= 0; i--) + { + ScalarType *Hi = H.data() + i * (max_dim + 1); + s[i] /= Hi[i]; + for (int k = i - 1; k >= 0; k--) + { + s[k] -= Hi[k] * s[i]; + } + } + if (!B || pc_side == PreconditionerSide::LEFT) + { + for (int k = 0; k <= j; k++) + { + x.Add(s[k], V[k]); + } + } + else // B && pc_side == PreconditionerSide::RIGHT + { + r = 0.0; + for (int k = 0; k <= j; k++) + { + r.Add(s[k], V[k]); + } + ApplyB(B, r, V[0], this->use_timer); + x += V[0]; + } + if (converged) + { + break; + } + } + if (print_opts.iterations) + { + Mpi::Print(comm, "{}{:{}d} (restart {:d}) KSP residual norm {:.6e}\n", + std::string(tab_width, ' '), it, int_width, restart, beta); + } + if (print_opts.summary || (print_opts.warnings && eps > 0.0 && !converged)) + { + Mpi::Print(comm, "{}GMRES solver {} in {:d} iteration{}", std::string(tab_width, ' '), + converged ? "converged" : "did NOT converge", it, (it == 1) ? "" : "s"); + if (it > 0) + { + Mpi::Print(comm, " (avg. reduction factor: {:.3e})\n", + std::pow(beta / initial_res, 1.0 / it)); + } + else + { + Mpi::Print(comm, "\n"); + } + } + final_res = beta; + final_it = it; +} + +template +void FgmresSolver::Initialize() const +{ + GmresSolver::Initialize(); + constexpr int init_size = 5; + Z.resize(max_dim + 1); + for (int j = 0; j < std::min(init_size, max_dim + 1); j++) + { + Z[j].SetSize(A->Height()); + Z[j].UseDevice(true); + } +} + +template +void FgmresSolver::Update(int j) const +{ + // Add storage for basis vectors in increments. + GmresSolver::Update(j); + constexpr int add_size = 10; + for (int k = j + 1; k < std::min(j + 1 + add_size, max_dim + 1); k++) + { + Z[k].SetSize(A->Height()); + Z[k].UseDevice(true); + } +} + +template +void FgmresSolver::Mult(const VecType &b, VecType &x) const +{ + // Set up workspace. + RealType beta = 0.0, true_beta, eps = 0.0; + MFEM_VERIFY(A && B, "Operator and preconditioner must be set for FgmresSolver::Mult!"); + MFEM_ASSERT(A->Width() == x.Size() && A->Height() == b.Size(), + "Size mismatch for FgmresSolver::Mult!"); + Initialize(); + + // Begin iterations. + converged = false; + int it = 0, restart = 0; + if (print_opts.iterations) + { + Mpi::Print(comm, "{}Residual norms for FGMRES solve\n", + std::string(tab_width + int_width - 1, ' ')); + } + for (; it < max_it; restart++) + { + // Initialize. 
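// A minimal configuration sketch for the GMRES solver in this file, assuming a Palace
// build with an assembled operator A and preconditioner B (both hypothetical here). The
// Orthogonalization and PreconditionerSide enums come from utils/labels.hpp; the helper
// function SolveWithGmres is illustrative only.
#include <mpi.h>
#include "linalg/iterative.hpp"
#include "linalg/operator.hpp"
#include "linalg/solver.hpp"
#include "linalg/vector.hpp"
#include "utils/labels.hpp"

namespace palace
{

inline void SolveWithGmres(MPI_Comm comm, const Operator &A, const Solver<Operator> &B,
                           const Vector &b, Vector &x)
{
  GmresSolver<Operator> gmres(comm, /*print=*/1);
  gmres.SetOperator(A);
  gmres.SetPreconditioner(B);
  gmres.SetRelTol(1.0e-8);
  gmres.SetMaxIter(400);
  gmres.SetRestartDim(60);                                 // restart every 60 basis vectors
  gmres.SetOrthogonalization(Orthogonalization::CGS2);     // reorthogonalized CGS
  gmres.SetPreconditionerSide(PreconditionerSide::RIGHT);  // monitor the true residual
  gmres.SetInitialGuess(false);
  gmres.Mult(b, x);
}

}  // namespace palace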
+ InitialResidual(PreconditionerSide::RIGHT, A, B, b, x, Z[0], V[0], + (this->initial_guess || restart > 0), this->use_timer); + true_beta = linalg::Norml2(comm, Z[0]); + CheckDot(true_beta, "FGMRES residual norm is not valid: beta = "); + if (it == 0) + { + if (this->initial_guess) + { + auto beta_rhs = linalg::Norml2(comm, b); + CheckDot(beta_rhs, "GMRES residual norm is not valid: beta_rhs = "); + initial_res = beta_rhs; + } + else + { + initial_res = true_beta; + } + eps = std::max(rel_tol * initial_res, abs_tol); + } + else if (beta > 0.0 && std::abs(beta - true_beta) > 0.1 * true_beta && + print_opts.warnings) + { + Mpi::Print( + comm, + "{}FGMRES residual at restart ({:.6e}) is far from the residual norm estimate " + "from the recursion formula ({:.6e}) (initial residual = {:.6e})\n", + std::string(tab_width, ' '), true_beta, beta, initial_res); + } + beta = true_beta; + if (beta < eps) + { + converged = true; + break; + } + + V[0] = 0.0; + V[0].Add(1.0 / beta, Z[0]); + std::fill(s.begin(), s.end(), 0.0); + s[0] = beta; + + int j = 0; + for (;; j++, it++) + { + if (print_opts.iterations) + { + Mpi::Print(comm, "{}{:{}d} (restart {:d}) KSP residual norm {:.6e}\n", + std::string(tab_width, ' '), it, int_width, restart, beta); + } + VecType &w = V[j + 1]; + if (w.Size() == 0) + { + Update(j); + } + ApplyBA(PreconditionerSide::RIGHT, A, B, V[j], w, Z[j], this->use_timer); + + ScalarType *Hj = H.data() + j * (max_dim + 1); + OrthogonalizeIteration(gs_orthog, comm, V, w, Hj, j); + Hj[j + 1] = linalg::Norml2(comm, w); + w *= 1.0 / Hj[j + 1]; + + for (int k = 0; k < j; k++) + { + ApplyPlaneRotation(Hj[k], Hj[k + 1], cs[k], sn[k]); + } + GeneratePlaneRotation(Hj[j], Hj[j + 1], cs[j], sn[j]); + ApplyPlaneRotation(Hj[j], Hj[j + 1], cs[j], sn[j]); + ApplyPlaneRotation(s[j], s[j + 1], cs[j], sn[j]); + + beta = std::abs(s[j + 1]); + CheckDot(beta, "FGMRES residual norm is not valid: beta = "); + converged = (beta < eps); + if (converged || j + 1 == max_dim || it + 1 == max_it) + { + it++; + break; + } + } + + // Reconstruct the solution (for restart or due to convergence or maximum iterations). + for (int i = j; i >= 0; i--) + { + ScalarType *Hi = H.data() + i * (max_dim + 1); + s[i] /= Hi[i]; + for (int k = i - 1; k >= 0; k--) + { + s[k] -= Hi[k] * s[i]; + } + } + for (int k = 0; k <= j; k++) + { + x.Add(s[k], Z[k]); + } + if (converged) + { + break; + } + } + if (print_opts.iterations) + { + Mpi::Print(comm, "{}{:{}d} (restart {:d}) KSP residual norm {:.6e}\n", + std::string(tab_width, ' '), it, int_width, restart, beta); + } + if (print_opts.summary || (print_opts.warnings && eps > 0.0 && !converged)) + { + Mpi::Print(comm, "{}FGMRES solver {} in {:d} iteration{}", std::string(tab_width, ' '), + converged ? "converged" : "did NOT converge", it, (it == 1) ? "" : "s"); + if (it > 0) + { + Mpi::Print(comm, " (avg. reduction factor: {:.3e})\n", + std::pow(beta / initial_res, 1.0 / it)); + } + else + { + Mpi::Print(comm, "\n"); + } + } + final_res = beta; + final_it = it; +} + +template class IterativeSolver; +template class IterativeSolver; +template class CgSolver; +template class CgSolver; +template class GmresSolver; +template class GmresSolver; +template class FgmresSolver; +template class FgmresSolver; + +} // namespace palace diff --git a/palace/linalg/iterative.hpp b/palace/linalg/iterative.hpp index 536632ebb2..6e0de5f297 100644 --- a/palace/linalg/iterative.hpp +++ b/palace/linalg/iterative.hpp @@ -1,284 +1,273 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LINALG_ITERATIVE_HPP -#define PALACE_LINALG_ITERATIVE_HPP - -#include -#include -#include -#include "linalg/operator.hpp" -#include "linalg/solver.hpp" -#include "linalg/vector.hpp" - -namespace palace -{ - -// -// Iterative solvers based on Krylov subspace methods with optional preconditioning, for -// real- or complex-valued systems. -// - -// Base class for iterative solvers based on Krylov subspace methods with optional -// preconditioning. -template -class IterativeSolver : public Solver -{ -protected: - using RealType = double; - using ScalarType = - typename std::conditional::value, - std::complex, RealType>::type; - - // MPI communicator associated with the solver. - MPI_Comm comm; - - // Control level of printing during solves. - mfem::IterativeSolver::PrintLevel print_opts; - int int_width, tab_width; - - // Relative and absolute tolerances. - double rel_tol, abs_tol; - - // Limit for the number of solver iterations. - int max_it; - - // Operator and (optional) preconditioner associated with the iterative solver (not - // owned). - const OperType *A; - const Solver *B; - - // Variables set during solve to capture solve statistics. - mutable bool converged; - mutable double initial_res, final_res; - mutable int final_it; - -public: - IterativeSolver(MPI_Comm comm, int print); - - // Set an indentation for all log printing. - void SetTabWidth(int width) { tab_width = width; } - - // Set the relative convergence tolerance. - void SetTol(double tol) { SetRelTol(tol); } - void SetRelTol(double tol) { rel_tol = tol; } - - // Set the absolute convergence tolerance. - void SetAbsTol(double tol) { abs_tol = tol; } - - // Set the maximum number of iterations. - void SetMaxIter(int its) - { - max_it = its; - int_width = 1 + static_cast(std::log10(its)); - } - - // Set the operator for the solver. - void SetOperator(const OperType &op) override - { - A = &op; - this->height = op.Height(); - this->width = op.Width(); - } - - // Set the preconditioner for the solver. - void SetPreconditioner(const Solver &pc) { B = &pc; } - - // Returns if the previous solve converged or not. - bool GetConverged() const { return converged && (rel_tol > 0.0 || abs_tol > 0.0); } - - // Returns the initial (absolute) residual for the previous solve. - double GetInitialRes() const { return initial_res; } - - // Returns the final (absolute) residual for the previous solve, which may be an estimate - // to the true residual. - double GetFinalRes() const { return final_res; } - - // Returns the number of iterations for the previous solve. - int GetNumIterations() const { return final_it; } - - // Get the associated MPI communicator. - MPI_Comm GetComm() const { return comm; } -}; - -// Preconditioned Conjugate Gradient (CG) method for SPD linear systems. -template -class CgSolver : public IterativeSolver -{ -protected: - using VecType = typename Solver::VecType; - using RealType = typename IterativeSolver::RealType; - using ScalarType = typename IterativeSolver::ScalarType; - - using IterativeSolver::comm; - using IterativeSolver::print_opts; - using IterativeSolver::int_width; - using IterativeSolver::tab_width; - - using IterativeSolver::rel_tol; - using IterativeSolver::abs_tol; - using IterativeSolver::max_it; - - using IterativeSolver::A; - using IterativeSolver::B; - - using IterativeSolver::converged; - using IterativeSolver::initial_res; - using IterativeSolver::final_res; - using IterativeSolver::final_it; - - // Temporary workspace for solve. 
- mutable VecType r, z, p; - -public: - CgSolver(MPI_Comm comm, int print) : IterativeSolver(comm, print) {} - - void Mult(const VecType &b, VecType &x) const override; -}; - -// Preconditioned Generalized Minimum Residual Method (GMRES) for general nonsymmetric -// linear systems. -template -class GmresSolver : public IterativeSolver -{ -public: - enum class OrthogType - { - MGS, - CGS, - CGS2 - }; - - enum class PrecSide - { - LEFT, - RIGHT - }; - -protected: - using VecType = typename Solver::VecType; - using RealType = typename IterativeSolver::RealType; - using ScalarType = typename IterativeSolver::ScalarType; - - using IterativeSolver::comm; - using IterativeSolver::print_opts; - using IterativeSolver::int_width; - using IterativeSolver::tab_width; - - using IterativeSolver::rel_tol; - using IterativeSolver::abs_tol; - using IterativeSolver::max_it; - - using IterativeSolver::A; - using IterativeSolver::B; - - using IterativeSolver::converged; - using IterativeSolver::initial_res; - using IterativeSolver::final_res; - using IterativeSolver::final_it; - - // Maximum subspace dimension for restarted GMRES. - mutable int max_dim; - - // Orthogonalization method for orthonormalizing a newly computed vector against a basis - // at each iteration. - OrthogType orthog_type; - - // Use left or right preconditioning. - PrecSide pc_side; - - // Temporary workspace for solve. - mutable std::vector V; - mutable VecType r; - mutable std::vector H; - mutable std::vector s, sn; - mutable std::vector cs; - - // Allocate storage for solve. - virtual void Initialize() const; - virtual void Update(int j) const; - -public: - GmresSolver(MPI_Comm comm, int print) - : IterativeSolver(comm, print), max_dim(-1), orthog_type(OrthogType::MGS), - pc_side(PrecSide::LEFT) - { - } - - // Set the dimension for restart. - void SetRestartDim(int dim) { max_dim = dim; } - - // Set the orthogonalization method. - void SetOrthogonalization(OrthogType type) { orthog_type = type; } - - // Set the side for preconditioning. - virtual void SetPrecSide(PrecSide side) { pc_side = side; } - - void Mult(const VecType &b, VecType &x) const override; -}; - -// Preconditioned Flexible Generalized Minimum Residual Method (FGMRES) for general -// nonsymmetric linear systems with a non-constant preconditioner. -template -class FgmresSolver : public GmresSolver -{ -public: - using OrthogType = typename GmresSolver::OrthogType; - using PrecSide = typename GmresSolver::PrecSide; - -protected: - using VecType = typename GmresSolver::VecType; - using RealType = typename GmresSolver::RealType; - using ScalarType = typename GmresSolver::ScalarType; - - using GmresSolver::comm; - using GmresSolver::print_opts; - using GmresSolver::int_width; - using GmresSolver::tab_width; - - using GmresSolver::rel_tol; - using GmresSolver::abs_tol; - using GmresSolver::max_it; - - using GmresSolver::A; - using GmresSolver::B; - - using GmresSolver::converged; - using GmresSolver::initial_res; - using GmresSolver::final_res; - using GmresSolver::final_it; - - using GmresSolver::max_dim; - using GmresSolver::orthog_type; - using GmresSolver::pc_side; - using GmresSolver::V; - using GmresSolver::H; - using GmresSolver::s; - using GmresSolver::sn; - using GmresSolver::cs; - - // Temporary workspace for solve. - mutable std::vector Z; - - // Allocate storage for solve. 
- void Initialize() const override; - void Update(int j) const override; - -public: - FgmresSolver(MPI_Comm comm, int print) : GmresSolver(comm, print) - { - pc_side = PrecSide::RIGHT; - } - - void SetPrecSide(PrecSide side) override - { - MFEM_VERIFY(side == PrecSide::RIGHT, - "FGMRES solver only supports right preconditioning!"); - } - - void Mult(const VecType &b, VecType &x) const override; -}; - -} // namespace palace - -#endif // PALACE_LINALG_ITERATIVE_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LINALG_ITERATIVE_HPP +#define PALACE_LINALG_ITERATIVE_HPP + +#include +#include +#include +#include "linalg/operator.hpp" +#include "linalg/solver.hpp" +#include "linalg/vector.hpp" +#include "utils/labels.hpp" + +namespace palace +{ + +// +// Iterative solvers based on Krylov subspace methods with optional preconditioning, for +// real- or complex-valued systems. +// + +// Base class for iterative solvers based on Krylov subspace methods with optional +// preconditioning. +template +class IterativeSolver : public Solver +{ +protected: + using RealType = double; + using ScalarType = + typename std::conditional::value, + std::complex, RealType>::type; + + // MPI communicator associated with the solver. + MPI_Comm comm; + + // Control level of printing during solves. + mfem::IterativeSolver::PrintLevel print_opts; + int int_width, tab_width; + + // Relative and absolute tolerances. + double rel_tol, abs_tol; + + // Limit for the number of solver iterations. + int max_it; + + // Operator and (optional) preconditioner associated with the iterative solver (not + // owned). + const OperType *A; + const Solver *B; + + // Variables set during solve to capture solve statistics. + mutable bool converged; + mutable double initial_res, final_res; + mutable int final_it; + + // Enable timer contribution for Timer::PRECONDITIONER. + bool use_timer; + +public: + IterativeSolver(MPI_Comm comm, int print); + + // Set an indentation for all log printing. + void SetTabWidth(int width) { tab_width = width; } + + // Set the relative convergence tolerance. + void SetTol(double tol) { SetRelTol(tol); } + void SetRelTol(double tol) { rel_tol = tol; } + + // Set the absolute convergence tolerance. + void SetAbsTol(double tol) { abs_tol = tol; } + + // Set the maximum number of iterations. + void SetMaxIter(int its) + { + max_it = its; + int_width = 1 + static_cast(std::log10(its)); + } + + // Set the operator for the solver. + void SetOperator(const OperType &op) override + { + A = &op; + this->height = op.Height(); + this->width = op.Width(); + } + + // Set the preconditioner for the solver. + void SetPreconditioner(const Solver &pc) { B = &pc; } + + // Returns if the previous solve converged or not. + bool GetConverged() const { return converged && (rel_tol > 0.0 || abs_tol > 0.0); } + + // Returns the initial (absolute) residual for the previous solve. + double GetInitialRes() const { return initial_res; } + + // Returns the final (absolute) residual for the previous solve, which may be an estimate + // to the true residual. + double GetFinalRes() const { return final_res; } + + // Returns the number of iterations for the previous solve. + int GetNumIterations() const { return final_it; } + + // Get the associated MPI communicator. + MPI_Comm GetComm() const { return comm; } + + // Activate preconditioner timing during solves. 
+ void EnableTimer() { use_timer = true; } +}; + +// Preconditioned Conjugate Gradient (CG) method for SPD linear systems. +template +class CgSolver : public IterativeSolver +{ +protected: + using VecType = typename Solver::VecType; + using RealType = typename IterativeSolver::RealType; + using ScalarType = typename IterativeSolver::ScalarType; + + using IterativeSolver::comm; + using IterativeSolver::print_opts; + using IterativeSolver::int_width; + using IterativeSolver::tab_width; + + using IterativeSolver::rel_tol; + using IterativeSolver::abs_tol; + using IterativeSolver::max_it; + + using IterativeSolver::A; + using IterativeSolver::B; + + using IterativeSolver::converged; + using IterativeSolver::initial_res; + using IterativeSolver::final_res; + using IterativeSolver::final_it; + + // Temporary workspace for solve. + mutable VecType r, z, p; + +public: + CgSolver(MPI_Comm comm, int print) : IterativeSolver(comm, print) {} + + void Mult(const VecType &b, VecType &x) const override; +}; + +// Preconditioned Generalized Minimum Residual Method (GMRES) for general nonsymmetric +// linear systems. +template +class GmresSolver : public IterativeSolver +{ +protected: + using VecType = typename Solver::VecType; + using RealType = typename IterativeSolver::RealType; + using ScalarType = typename IterativeSolver::ScalarType; + + using IterativeSolver::comm; + using IterativeSolver::print_opts; + using IterativeSolver::int_width; + using IterativeSolver::tab_width; + + using IterativeSolver::rel_tol; + using IterativeSolver::abs_tol; + using IterativeSolver::max_it; + + using IterativeSolver::A; + using IterativeSolver::B; + + using IterativeSolver::converged; + using IterativeSolver::initial_res; + using IterativeSolver::final_res; + using IterativeSolver::final_it; + + // Maximum subspace dimension for restarted GMRES. + mutable int max_dim; + + // Orthogonalization method for orthonormalizing a newly computed vector against a basis + // at each iteration. + Orthogonalization gs_orthog; + + // Use left or right preconditioning. + PreconditionerSide pc_side; + + // Temporary workspace for solve. + mutable std::vector V; + mutable VecType r; + mutable std::vector H; + mutable std::vector s, sn; + mutable std::vector cs; + + // Allocate storage for solve. + virtual void Initialize() const; + virtual void Update(int j) const; + +public: + GmresSolver(MPI_Comm comm, int print) + : IterativeSolver(comm, print), max_dim(-1), + gs_orthog(Orthogonalization::MGS), pc_side(PreconditionerSide::LEFT) + { + } + + // Set the dimension for restart. + void SetRestartDim(int dim) { max_dim = dim; } + + // Set the orthogonalization method. + void SetOrthogonalization(Orthogonalization orthog) { gs_orthog = orthog; } + + // Set the side for preconditioning. + virtual void SetPreconditionerSide(PreconditionerSide side) { pc_side = side; } + + void Mult(const VecType &b, VecType &x) const override; +}; + +// Preconditioned Flexible Generalized Minimum Residual Method (FGMRES) for general +// nonsymmetric linear systems with a non-constant preconditioner. 
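A minimal usage sketch of the Krylov interface declared above, before the flexible variant that follows: the operator, preconditioner, and vector names are placeholders, and only member functions already shown in this header are assumed.

#include <mpi.h>
#include "linalg/iterative.hpp"
#include "linalg/operator.hpp"
#include "linalg/solver.hpp"
#include "linalg/vector.hpp"

namespace palace
{

// Drive a restarted, right-preconditioned GMRES solve for a complex-valued system
// (illustrative sketch only; A, P, b, x are assumed to be set up elsewhere).
inline void ExampleGmresSolve(MPI_Comm comm, const ComplexOperator &A,
                              const Solver<ComplexOperator> &P, const ComplexVector &b,
                              ComplexVector &x)
{
  GmresSolver<ComplexOperator> gmres(comm, /*print=*/1);
  gmres.SetRelTol(1.0e-8);
  gmres.SetMaxIter(200);
  gmres.SetRestartDim(50);                             // Restarted GMRES(50)
  gmres.SetOrthogonalization(Orthogonalization::MGS);  // Matches the constructor default
  gmres.SetPreconditionerSide(PreconditionerSide::RIGHT);
  gmres.SetOperator(A);
  gmres.SetPreconditioner(P);
  gmres.Mult(b, x);
  MFEM_VERIFY(gmres.GetConverged(), "Example GMRES solve did not converge!");
}

}  // namespace palace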
+template +class FgmresSolver : public GmresSolver +{ +protected: + using VecType = typename GmresSolver::VecType; + using RealType = typename GmresSolver::RealType; + using ScalarType = typename GmresSolver::ScalarType; + + using GmresSolver::comm; + using GmresSolver::print_opts; + using GmresSolver::int_width; + using GmresSolver::tab_width; + + using GmresSolver::rel_tol; + using GmresSolver::abs_tol; + using GmresSolver::max_it; + + using GmresSolver::A; + using GmresSolver::B; + + using GmresSolver::converged; + using GmresSolver::initial_res; + using GmresSolver::final_res; + using GmresSolver::final_it; + + using GmresSolver::max_dim; + using GmresSolver::gs_orthog; + using GmresSolver::pc_side; + using GmresSolver::V; + using GmresSolver::H; + using GmresSolver::s; + using GmresSolver::sn; + using GmresSolver::cs; + + // Temporary workspace for solve. + mutable std::vector Z; + + // Allocate storage for solve. + void Initialize() const override; + void Update(int j) const override; + +public: + FgmresSolver(MPI_Comm comm, int print) : GmresSolver(comm, print) + { + pc_side = PreconditionerSide::RIGHT; + } + + void SetPreconditionerSide(const PreconditionerSide side) override + { + MFEM_VERIFY(side == PreconditionerSide::RIGHT, + "FGMRES solver only supports right preconditioning!"); + } + + void Mult(const VecType &b, VecType &x) const override; +}; + +} // namespace palace + +#endif // PALACE_LINALG_ITERATIVE_HPP diff --git a/palace/linalg/jacobi.cpp b/palace/linalg/jacobi.cpp index 3848841546..733fdef4d2 100644 --- a/palace/linalg/jacobi.cpp +++ b/palace/linalg/jacobi.cpp @@ -1,85 +1,109 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "jacobi.hpp" - -#include -#include "linalg/rap.hpp" - -namespace palace -{ - -namespace -{ - -template -inline void Apply(const Vector &dinv, const Vector &x, Vector &y) -{ - const int N = dinv.Size(); - const auto *DI = dinv.Read(); - const auto *X = x.Read(); - auto *Y = y.Write(); - mfem::forall(N, [=] MFEM_HOST_DEVICE(int i) { Y[i] = DI[i] * X[i]; }); -} - -template -inline void Apply(const ComplexVector &dinv, const ComplexVector &x, ComplexVector &y) -{ - const int N = dinv.Size(); - const auto *DIR = dinv.Real().Read(); - const auto *DII = dinv.Imag().Read(); - const auto *XR = x.Real().Read(); - const auto *XI = x.Imag().Read(); - auto *YR = y.Real().Write(); - auto *YI = y.Imag().Write(); - if constexpr (!Transpose) - { - mfem::forall(N, - [=] MFEM_HOST_DEVICE(int i) - { - YR[i] = DIR[i] * XR[i] - DII[i] * XI[i]; - YI[i] = DII[i] * XR[i] + DIR[i] * XI[i]; - }); - } - else - { - mfem::forall(N, - [=] MFEM_HOST_DEVICE(int i) - { - YR[i] = DIR[i] * XR[i] + DII[i] * XI[i]; - YI[i] = -DII[i] * XR[i] + DIR[i] * XI[i]; - }); - } -} - -} // namespace - -template -void JacobiSmoother::SetOperator(const OperType &op) -{ - using ParOperType = - typename std::conditional::value, - ComplexParOperator, ParOperator>::type; - - const auto *PtAP = dynamic_cast(&op); - MFEM_VERIFY(PtAP, - "JacobiSmoother requires a ParOperator or ComplexParOperator operator!"); - dinv.SetSize(op.Height()); - PtAP->AssembleDiagonal(dinv); - dinv.Reciprocal(); - - this->height = op.Height(); - this->width = op.Width(); -} - -template -void JacobiSmoother::Mult(const VecType &x, VecType &y) const -{ - MFEM_ASSERT(!this->initial_guess, "JacobiSmoother does not use initial guess!"); - Apply(dinv, x, y); -} - -template class JacobiSmoother; -template class JacobiSmoother; - -} // namespace palace +// 
Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "jacobi.hpp" + +#include + +namespace palace +{ + +namespace +{ + +double GetLambdaMax(MPI_Comm comm, const Operator &A, const Vector &dinv) +{ + // Assumes A SPD (diag(A) > 0) to use Hermitian eigenvalue solver. + DiagonalOperator Dinv(dinv); + ProductOperator DinvA(Dinv, A); + return linalg::SpectralNorm(comm, DinvA, true); +} + +double GetLambdaMax(MPI_Comm comm, const ComplexOperator &A, const ComplexVector &dinv) +{ + // Assumes A SPD (diag(A) > 0) to use Hermitian eigenvalue solver. + ComplexDiagonalOperator Dinv(dinv); + ComplexProductOperator DinvA(Dinv, A); + return linalg::SpectralNorm(comm, DinvA, A.IsReal()); +} + +template +inline void Apply(const Vector &dinv, const Vector &x, Vector &y) +{ + const bool use_dev = dinv.UseDevice() || x.UseDevice() || y.UseDevice(); + const int N = dinv.Size(); + const auto *DI = dinv.Read(use_dev); + const auto *X = x.Read(use_dev); + auto *Y = y.Write(use_dev); + mfem::forall_switch(use_dev, N, [=] MFEM_HOST_DEVICE(int i) { Y[i] = DI[i] * X[i]; }); +} + +template +inline void Apply(const ComplexVector &dinv, const ComplexVector &x, ComplexVector &y) +{ + const bool use_dev = dinv.UseDevice() || x.UseDevice() || y.UseDevice(); + const int N = dinv.Size(); + const auto *DIR = dinv.Real().Read(use_dev); + const auto *DII = dinv.Imag().Read(use_dev); + const auto *XR = x.Real().Read(use_dev); + const auto *XI = x.Imag().Read(use_dev); + auto *YR = y.Real().Write(use_dev); + auto *YI = y.Imag().Write(use_dev); + if constexpr (!Transpose) + { + mfem::forall_switch(use_dev, N, + [=] MFEM_HOST_DEVICE(int i) + { + YR[i] = DIR[i] * XR[i] - DII[i] * XI[i]; + YI[i] = DII[i] * XR[i] + DIR[i] * XI[i]; + }); + } + else + { + mfem::forall_switch(use_dev, N, + [=] MFEM_HOST_DEVICE(int i) + { + YR[i] = DIR[i] * XR[i] + DII[i] * XI[i]; + YI[i] = -DII[i] * XR[i] + DIR[i] * XI[i]; + }); + } +} + +} // namespace + +template +void JacobiSmoother::SetOperator(const OperType &op) +{ + dinv.SetSize(op.Height()); + dinv.UseDevice(true); + op.AssembleDiagonal(dinv); + dinv.Reciprocal(); + + // Damping factor. If the given damping is zero, estimate the spectral radius-minimizing + // damping factor. + if (omega == 0.0) + { + auto lambda_max = GetLambdaMax(comm, op, dinv); + auto lambda_min = (sf_max - 1.0) * lambda_max; + omega = 2.0 / (lambda_min + lambda_max); + } + if (omega != 1.0) + { + dinv *= omega; + } + + this->height = op.Height(); + this->width = op.Width(); +} + +template +void JacobiSmoother::Mult(const VecType &x, VecType &y) const +{ + MFEM_ASSERT(!this->initial_guess, "JacobiSmoother does not use initial guess!"); + Apply(dinv, x, y); +} + +template class JacobiSmoother; +template class JacobiSmoother; + +} // namespace palace diff --git a/palace/linalg/jacobi.hpp b/palace/linalg/jacobi.hpp index 3296c98d0f..494729efb5 100644 --- a/palace/linalg/jacobi.hpp +++ b/palace/linalg/jacobi.hpp @@ -1,39 +1,48 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LINALG_JACOBI_SMOOTHER_HPP -#define PALACE_LINALG_JACOBI_SMOOTHER_HPP - -#include "linalg/operator.hpp" -#include "linalg/solver.hpp" -#include "linalg/vector.hpp" - -namespace palace -{ - -// -// Simple Jacobi smoother using the diagonal vector from OperType::AssembleDiagonal(), -// which allows for (approximate) diagonal construction for matrix-free operators. 
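For reference, the damping-factor arithmetic in the new JacobiSmoother::SetOperator above reduces to two lines. The helper below is an illustrative sketch of just that arithmetic, with lambda_max standing in for the spectral-norm estimate of D⁻¹A and sf_max for the user-supplied safety factor; it is not code from the patch.

// Optimal damping for the diagonally scaled operator: with lambda_min chosen as
// (sf_max - 1) * lambda_max, omega = 2 / (lambda_min + lambda_max). In particular
// sf_max = 1 recovers the familiar omega = 2 / lambda_max.
inline double EstimateJacobiDamping(double lambda_max, double sf_max = 1.0)
{
  const double lambda_min = (sf_max - 1.0) * lambda_max;
  return 2.0 / (lambda_min + lambda_max);
}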
-// -template -class JacobiSmoother : public Solver -{ - using VecType = typename Solver::VecType; - -private: - // Inverse diagonal scaling of the operator (real-valued for now). - VecType dinv; - -public: - JacobiSmoother() : Solver() {} - - void SetOperator(const OperType &op) override; - - void Mult(const VecType &x, VecType &y) const override; - - void MultTranspose(const VecType &x, VecType &y) const override { Mult(x, y); } -}; - -} // namespace palace - -#endif // PALACE_LINALG_JACOBI_SMOOTHER_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LINALG_JACOBI_SMOOTHER_HPP +#define PALACE_LINALG_JACOBI_SMOOTHER_HPP + +#include "linalg/operator.hpp" +#include "linalg/solver.hpp" +#include "linalg/vector.hpp" + +namespace palace +{ + +// +// Simple Jacobi smoother using the diagonal vector from OperType::AssembleDiagonal(), +// which allows for (approximate) diagonal construction for matrix-free operators. +// +template +class JacobiSmoother : public Solver +{ + using VecType = typename Solver::VecType; + +private: + // MPI communicator associated with the solver operator and vectors. + MPI_Comm comm; + + // Inverse diagonal scaling of the operator (real-valued for now). + VecType dinv; + + // Damping factor and scaling factor for maximum eigenvalue. + double omega, sf_max; + +public: + JacobiSmoother(MPI_Comm comm, double omega = 1.0, double sf_max = 1.0) + : Solver(), comm(comm), omega(omega), sf_max(sf_max) + { + } + + void SetOperator(const OperType &op) override; + + void Mult(const VecType &x, VecType &y) const override; + + void MultTranspose(const VecType &x, VecType &y) const override { Mult(x, y); } +}; + +} // namespace palace + +#endif // PALACE_LINALG_JACOBI_SMOOTHER_HPP diff --git a/palace/linalg/ksp.cpp b/palace/linalg/ksp.cpp index 0c59949be9..9780f89fbf 100644 --- a/palace/linalg/ksp.cpp +++ b/palace/linalg/ksp.cpp @@ -1,251 +1,287 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "ksp.hpp" - -#include -#include "fem/fespace.hpp" -#include "linalg/amg.hpp" -#include "linalg/ams.hpp" -#include "linalg/gmg.hpp" -#include "linalg/mumps.hpp" -#include "linalg/strumpack.hpp" -#include "linalg/superlu.hpp" -#include "utils/communication.hpp" -#include "utils/iodata.hpp" - -namespace palace -{ - -namespace -{ - -template -std::unique_ptr> ConfigureKrylovSolver(MPI_Comm comm, - const IoData &iodata) -{ - // Create the solver. 
- std::unique_ptr> ksp; - const auto type = iodata.solver.linear.ksp_type; - const int print = iodata.problem.verbose; - switch (type) - { - case config::LinearSolverData::KspType::CG: - ksp = std::make_unique>(comm, print); - break; - case config::LinearSolverData::KspType::GMRES: - { - auto gmres = std::make_unique>(comm, print); - gmres->SetRestartDim(iodata.solver.linear.max_size); - ksp = std::move(gmres); - } - break; - case config::LinearSolverData::KspType::FGMRES: - { - auto fgmres = std::make_unique>(comm, print); - fgmres->SetRestartDim(iodata.solver.linear.max_size); - ksp = std::move(fgmres); - } - break; - case config::LinearSolverData::KspType::MINRES: - case config::LinearSolverData::KspType::BICGSTAB: - case config::LinearSolverData::KspType::DEFAULT: - MFEM_ABORT("Unexpected solver type for Krylov solver configuration!"); - break; - } - ksp->SetInitialGuess(iodata.solver.linear.initial_guess); - ksp->SetRelTol(iodata.solver.linear.tol); - ksp->SetMaxIter(iodata.solver.linear.max_it); - - // Configure preconditioning side (only for GMRES). - if (iodata.solver.linear.pc_side_type != config::LinearSolverData::SideType::DEFAULT && - type != config::LinearSolverData::KspType::GMRES) - { - Mpi::Warning(comm, - "Preconditioner side will be ignored for non-GMRES iterative solvers!\n"); - } - else - { - auto *gmres = static_cast *>(ksp.get()); - switch (iodata.solver.linear.pc_side_type) - { - case config::LinearSolverData::SideType::LEFT: - gmres->SetPrecSide(GmresSolver::PrecSide::LEFT); - break; - case config::LinearSolverData::SideType::RIGHT: - gmres->SetPrecSide(GmresSolver::PrecSide::RIGHT); - break; - case config::LinearSolverData::SideType::DEFAULT: - // Do nothing - break; - } - } - - // Configure orthogonalization method for GMRES/FMGRES. - if (type == config::LinearSolverData::KspType::GMRES || - type == config::LinearSolverData::KspType::FGMRES) - { - // Because FGMRES inherits from GMRES, this is OK. - auto *gmres = static_cast *>(ksp.get()); - switch (iodata.solver.linear.gs_orthog_type) - { - case config::LinearSolverData::OrthogType::MGS: - gmres->SetOrthogonalization(GmresSolver::OrthogType::MGS); - break; - case config::LinearSolverData::OrthogType::CGS: - gmres->SetOrthogonalization(GmresSolver::OrthogType::CGS); - break; - case config::LinearSolverData::OrthogType::CGS2: - gmres->SetOrthogonalization(GmresSolver::OrthogType::CGS2); - break; - } - } - - return ksp; -} - -template -std::unique_ptr> -ConfigurePreconditionerSolver(MPI_Comm comm, const IoData &iodata, - const FiniteElementSpaceHierarchy &fespaces, - const AuxiliaryFiniteElementSpaceHierarchy *aux_fespaces) -{ - // Create the real-valued solver first. - std::unique_ptr pc0; - const auto type = iodata.solver.linear.type; - const int print = iodata.problem.verbose - 1; - switch (type) - { - case config::LinearSolverData::Type::AMS: - // Can either be the coarse solve for geometric multigrid or the solver at the finest - // space (in which case fespaces.GetNumLevels() == 1). 
- MFEM_VERIFY(aux_fespaces, "AMS solver relies on both primary space " - "and auxiliary spaces for construction!"); - pc0 = std::make_unique(iodata, fespaces.GetNumLevels() > 1, - fespaces.GetFESpaceAtLevel(0), - aux_fespaces->GetFESpaceAtLevel(0), print); - break; - case config::LinearSolverData::Type::BOOMER_AMG: - pc0 = std::make_unique(iodata, fespaces.GetNumLevels() > 1, print); - break; - case config::LinearSolverData::Type::SUPERLU: -#if defined(MFEM_USE_SUPERLU) - pc0 = std::make_unique(comm, iodata, print); -#else - MFEM_ABORT("Solver was not built with SuperLU_DIST support, please choose a " - "different solver!"); -#endif - break; - case config::LinearSolverData::Type::STRUMPACK: -#if defined(MFEM_USE_STRUMPACK) - pc0 = std::make_unique(comm, iodata, print); -#else - MFEM_ABORT("Solver was not built with STRUMPACK support, please choose a " - "different solver!"); -#endif - break; - case config::LinearSolverData::Type::STRUMPACK_MP: -#if defined(MFEM_USE_STRUMPACK) - pc0 = std::make_unique(comm, iodata, print); -#else - MFEM_ABORT("Solver was not built with STRUMPACK support, please choose a " - "different solver!"); -#endif - break; - case config::LinearSolverData::Type::MUMPS: -#if defined(MFEM_USE_MUMPS) - pc0 = std::make_unique(comm, iodata, print); -#else - MFEM_ABORT( - "Solver was not built with MUMPS support, please choose a different solver!"); -#endif - break; - case config::LinearSolverData::Type::DEFAULT: - MFEM_ABORT("Unexpected solver type for preconditioner configuration!"); - break; - } - - // Construct the actual solver, which has the right value type. - auto pc = std::make_unique>(std::move(pc0)); - if (fespaces.GetNumLevels() > 1) - { - // This will construct the multigrid hierarchy using pc as the coarse solver - // (ownership of pc is transferred to the GeometricMultigridSolver). When a special - // auxiliary space smoother for pre-/post-smoothing is not desired, the auxiliary - // space is a nullptr here. 
- if (iodata.solver.linear.mg_smooth_aux) - { - MFEM_VERIFY(aux_fespaces, "Multigrid with auxiliary space smoothers requires both " - "primary space and auxiliary spaces for construction!"); - const auto G = aux_fespaces->GetDiscreteInterpolators(); - return std::make_unique>( - iodata, std::move(pc), fespaces.GetProlongationOperators(), &G); - } - else - { - return std::make_unique>( - iodata, std::move(pc), fespaces.GetProlongationOperators()); - } - } - else - { - return pc; - } -} - -} // namespace - -template -BaseKspSolver::BaseKspSolver( - const IoData &iodata, const FiniteElementSpaceHierarchy &fespaces, - const AuxiliaryFiniteElementSpaceHierarchy *aux_fespaces) - : BaseKspSolver( - ConfigureKrylovSolver(fespaces.GetFinestFESpace().GetComm(), iodata), - ConfigurePreconditionerSolver(fespaces.GetFinestFESpace().GetComm(), - iodata, fespaces, aux_fespaces)) -{ -} - -template -BaseKspSolver::BaseKspSolver(std::unique_ptr> &&ksp, - std::unique_ptr> &&pc) - : ksp(std::move(ksp)), pc(std::move(pc)), ksp_mult(0), ksp_mult_it(0) -{ - this->ksp->SetPreconditioner(*this->pc); -} - -template -void BaseKspSolver::SetOperators(const OperType &op, const OperType &pc_op) -{ - ksp->SetOperator(op); - const auto *mg_op = dynamic_cast *>(&pc_op); - const auto *mg_pc = dynamic_cast *>(pc.get()); - if (mg_op && !mg_pc) - { - pc->SetOperator(mg_op->GetFinestOperator()); - } - else - { - pc->SetOperator(pc_op); - } -} - -template -void BaseKspSolver::Mult(const VecType &x, VecType &y) const -{ - ksp->Mult(x, y); - if (!ksp->GetConverged()) - { - Mpi::Warning( - ksp->GetComm(), - "Linear solver did not converge, norm(Ax-b)/norm(b) = {:.3e} (norm(b) = {:.3e})!\n", - ksp->GetFinalRes() / ksp->GetInitialRes(), ksp->GetInitialRes()); - } - ksp_mult++; - ksp_mult_it += ksp->GetNumIterations(); -} - -template class BaseKspSolver; -template class BaseKspSolver; - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "ksp.hpp" + +#include +#include "fem/fespace.hpp" +#include "linalg/amg.hpp" +#include "linalg/ams.hpp" +#include "linalg/gmg.hpp" +#include "linalg/jacobi.hpp" +#include "linalg/mumps.hpp" +#include "linalg/strumpack.hpp" +#include "linalg/superlu.hpp" +#include "utils/communication.hpp" +#include "utils/iodata.hpp" +#include "utils/timer.hpp" + +namespace palace +{ + +namespace +{ + +template +std::unique_ptr> ConfigureKrylovSolver(const IoData &iodata, + MPI_Comm comm) +{ + // Create the solver. + std::unique_ptr> ksp; + const auto type = iodata.solver.linear.krylov_solver; + const int print = iodata.problem.verbose; + switch (type) + { + case KrylovSolver::CG: + ksp = std::make_unique>(comm, print); + break; + case KrylovSolver::GMRES: + { + auto gmres = std::make_unique>(comm, print); + gmres->SetRestartDim(iodata.solver.linear.max_size); + ksp = std::move(gmres); + } + break; + case KrylovSolver::FGMRES: + { + auto fgmres = std::make_unique>(comm, print); + fgmres->SetRestartDim(iodata.solver.linear.max_size); + ksp = std::move(fgmres); + } + break; + case KrylovSolver::MINRES: + case KrylovSolver::BICGSTAB: + case KrylovSolver::DEFAULT: + MFEM_ABORT("Unexpected solver type for Krylov solver configuration!"); + break; + } + ksp->SetInitialGuess(iodata.solver.linear.initial_guess); + ksp->SetRelTol(iodata.solver.linear.tol); + ksp->SetMaxIter(iodata.solver.linear.max_it); + + // Configure preconditioning side (only for GMRES). 
+ if (iodata.solver.linear.pc_side != PreconditionerSide::DEFAULT && + type != KrylovSolver::GMRES) + { + Mpi::Warning(comm, + "Preconditioner side will be ignored for non-GMRES iterative solvers!\n"); + } + else + { + if (type == KrylovSolver::GMRES || type == KrylovSolver::FGMRES) + { + auto *gmres = static_cast *>(ksp.get()); + switch (iodata.solver.linear.pc_side) + { + case PreconditionerSide::LEFT: + gmres->SetPreconditionerSide(PreconditionerSide::LEFT); + break; + case PreconditionerSide::RIGHT: + gmres->SetPreconditionerSide(PreconditionerSide::RIGHT); + break; + case PreconditionerSide::DEFAULT: + // Do nothing. Set in ctors. + break; + } + } + } + + // Configure orthogonalization method for GMRES/FMGRES. + if (type == KrylovSolver::GMRES || type == KrylovSolver::FGMRES) + { + // Because FGMRES inherits from GMRES, this is OK. + auto *gmres = static_cast *>(ksp.get()); + gmres->SetOrthogonalization(iodata.solver.linear.gs_orthog); + } + + // Configure timing for the primary linear solver. + ksp->EnableTimer(); + + return ksp; +} + +template +auto MakeWrapperSolver(const IoData &iodata, U &&...args) +{ + // Sparse direct solver types copy the input matrix, so there is no need to save the + // parallel assembled operator. + constexpr bool save_assembled = !(false || +#if defined(MFEM_USE_SUPERLU) + std::is_same::value || +#endif +#if defined(MFEM_USE_STRUMPACK) + std::is_same::value || + std::is_same::value || +#endif +#if defined(MFEM_USE_MUMPS) + std::is_same::value || +#endif + false); + return std::make_unique>( + std::make_unique(iodata, std::forward(args)...), save_assembled, + iodata.solver.linear.complex_coarse_solve, iodata.solver.linear.drop_small_entries, + iodata.solver.linear.reorder_reuse); +} + +template +std::unique_ptr> +ConfigurePreconditionerSolver(const IoData &iodata, MPI_Comm comm, + FiniteElementSpaceHierarchy &fespaces, + FiniteElementSpaceHierarchy *aux_fespaces) +{ + // Create the real-valued solver first. + std::unique_ptr> pc; + const auto type = iodata.solver.linear.type; + const int print = iodata.problem.verbose - 1; + switch (type) + { + case LinearSolver::AMS: + // Can either be the coarse solve for geometric multigrid or the solver at the finest + // space (in which case fespaces.GetNumLevels() == 1). 
+ MFEM_VERIFY(aux_fespaces, "AMS solver relies on both primary space " + "and auxiliary spaces for construction!"); + pc = MakeWrapperSolver( + iodata, fespaces.GetNumLevels() > 1, fespaces.GetFESpaceAtLevel(0), + aux_fespaces->GetFESpaceAtLevel(0), print); + break; + case LinearSolver::BOOMER_AMG: + pc = MakeWrapperSolver(iodata, fespaces.GetNumLevels() > 1, + print); + break; + case LinearSolver::SUPERLU: +#if defined(MFEM_USE_SUPERLU) + pc = MakeWrapperSolver(iodata, comm, print); +#else + MFEM_ABORT("Solver was not built with SuperLU_DIST support, please choose a " + "different solver!"); +#endif + break; + case LinearSolver::STRUMPACK: +#if defined(MFEM_USE_STRUMPACK) + pc = MakeWrapperSolver(iodata, comm, print); +#else + MFEM_ABORT("Solver was not built with STRUMPACK support, please choose a " + "different solver!"); +#endif + break; + case LinearSolver::STRUMPACK_MP: +#if defined(MFEM_USE_STRUMPACK) + pc = MakeWrapperSolver(iodata, comm, print); +#else + MFEM_ABORT("Solver was not built with STRUMPACK support, please choose a " + "different solver!"); +#endif + break; + case LinearSolver::MUMPS: +#if defined(MFEM_USE_MUMPS) + pc = MakeWrapperSolver(iodata, comm, print); +#else + MFEM_ABORT( + "Solver was not built with MUMPS support, please choose a different solver!"); +#endif + break; + case LinearSolver::JACOBI: + pc = std::make_unique>(comm); + break; + case LinearSolver::DEFAULT: + MFEM_ABORT("Unexpected solver type for preconditioner configuration!"); + break; + } + + // Construct the actual solver, which has the right value type. + if (fespaces.GetNumLevels() > 1) + { + // This will construct the multigrid hierarchy using pc as the coarse solver + // (ownership of pc is transferred to the GeometricMultigridSolver). When a special + // auxiliary space smoother for pre-/post-smoothing is not desired, the auxiliary + // space is a nullptr here. 
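The MakeWrapperSolver helper above decides at compile time whether the parallel-assembled operator must be retained, by folding std::is_same checks over the sparse direct solver types that copy their input matrix. A stripped-down sketch of the same idiom, using hypothetical placeholder types rather than the Palace solver classes:

#include <type_traits>

struct CopyingDirectSolver   // placeholder: copies the assembled matrix internally
{
};
struct MatrixFreeSmoother    // placeholder: applies the operator without assembling it
{
};

template <typename SolverType>
constexpr bool SaveAssembledOperator()
{
  // Types known to copy the input matrix do not need the assembled operator kept alive.
  return !(std::is_same<SolverType, CopyingDirectSolver>::value);
}

static_assert(!SaveAssembledOperator<CopyingDirectSolver>(), "Direct solver copies input");
static_assert(SaveAssembledOperator<MatrixFreeSmoother>(), "Smoother needs the operator");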
+ auto gmg = [&]() + { + if (iodata.solver.linear.mg_smooth_aux) + { + MFEM_VERIFY(aux_fespaces, "Multigrid with auxiliary space smoothers requires both " + "primary space and auxiliary spaces for construction!"); + const auto G = fespaces.GetDiscreteInterpolators(*aux_fespaces); + return std::make_unique>( + iodata, comm, std::move(pc), fespaces.GetProlongationOperators(), &G); + } + else + { + return std::make_unique>( + iodata, comm, std::move(pc), fespaces.GetProlongationOperators()); + } + }(); + gmg->EnableTimer(); // Enable timing for primary geometric multigrid solver + return gmg; + } + else + { + return pc; + } +} + +} // namespace + +template +BaseKspSolver::BaseKspSolver(const IoData &iodata, + FiniteElementSpaceHierarchy &fespaces, + FiniteElementSpaceHierarchy *aux_fespaces) + : BaseKspSolver( + ConfigureKrylovSolver(iodata, fespaces.GetFinestFESpace().GetComm()), + ConfigurePreconditionerSolver( + iodata, fespaces.GetFinestFESpace().GetComm(), fespaces, aux_fespaces)) +{ + use_timer = true; +} + +template +BaseKspSolver::BaseKspSolver(std::unique_ptr> &&ksp, + std::unique_ptr> &&pc) + : ksp(std::move(ksp)), pc(std::move(pc)), ksp_mult(0), ksp_mult_it(0), use_timer(false) +{ + if (this->pc) + { + this->ksp->SetPreconditioner(*this->pc); + } +} + +template +void BaseKspSolver::SetOperators(const OperType &op, const OperType &pc_op) +{ + BlockTimer bt(Timer::KSP_SETUP, use_timer); + ksp->SetOperator(op); + if (pc) + { + const auto *mg_op = dynamic_cast *>(&pc_op); + const auto *mg_pc = dynamic_cast *>(pc.get()); + if (mg_op && !mg_pc) + { + pc->SetOperator(mg_op->GetFinestOperator()); + } + else + { + pc->SetOperator(pc_op); + } + } +} + +template +void BaseKspSolver::Mult(const VecType &x, VecType &y) const +{ + BlockTimer bt(Timer::KSP, use_timer); + ksp->Mult(x, y); + if (!ksp->GetConverged()) + { + Mpi::Warning( + ksp->GetComm(), + "Linear solver did not converge, norm(Ax-b)/norm(b) = {:.3e} (norm(b) = {:.3e})!\n", + ksp->GetFinalRes() / ksp->GetInitialRes(), ksp->GetInitialRes()); + } + ksp_mult++; + ksp_mult_it += ksp->GetNumIterations(); +} + +template class BaseKspSolver; +template class BaseKspSolver; + +} // namespace palace diff --git a/palace/linalg/ksp.hpp b/palace/linalg/ksp.hpp index 4af3f4da66..b826f9698f 100644 --- a/palace/linalg/ksp.hpp +++ b/palace/linalg/ksp.hpp @@ -1,61 +1,63 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LINALG_KSP_HPP -#define PALACE_LINALG_KSP_HPP - -#include -#include -#include "linalg/iterative.hpp" -#include "linalg/operator.hpp" -#include "linalg/solver.hpp" - -namespace palace -{ - -class AuxiliaryFiniteElementSpaceHierarchy; -class FiniteElementSpaceHierarchy; -class IoData; - -// -// Linear solver class composing an iterative solver and preconditioner object. -// -template -class BaseKspSolver -{ - static_assert(std::is_same::value || - std::is_same::value, - "Solver can only be defined for OperType = Operator or ComplexOperator!"); - - using VecType = typename std::conditional::value, - ComplexVector, Vector>::type; - -protected: - // The actual solver and preconditioner objects. - std::unique_ptr> ksp; - std::unique_ptr> pc; - - // Counters for number of calls to Mult method for linear solves, and cumulative number - // of iterations. 
- mutable int ksp_mult, ksp_mult_it; - -public: - BaseKspSolver(const IoData &iodata, const FiniteElementSpaceHierarchy &fespaces, - const AuxiliaryFiniteElementSpaceHierarchy *aux_fespaces = nullptr); - BaseKspSolver(std::unique_ptr> &&ksp, - std::unique_ptr> &&pc); - - int NumTotalMult() const { return ksp_mult; } - int NumTotalMultIterations() const { return ksp_mult_it; } - - void SetOperators(const OperType &op, const OperType &pc_op); - - void Mult(const VecType &x, VecType &y) const; -}; - -using KspSolver = BaseKspSolver; -using ComplexKspSolver = BaseKspSolver; - -} // namespace palace - -#endif // PALACE_LINALG_KSP_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LINALG_KSP_HPP +#define PALACE_LINALG_KSP_HPP + +#include +#include +#include "linalg/iterative.hpp" +#include "linalg/operator.hpp" +#include "linalg/solver.hpp" + +namespace palace +{ + +class FiniteElementSpaceHierarchy; +class IoData; + +// +// Linear solver class composing an iterative solver and preconditioner object. +// +template +class BaseKspSolver +{ + static_assert(std::is_same::value || + std::is_same::value, + "Solver can only be defined for OperType = Operator or ComplexOperator!"); + + using VecType = typename std::conditional::value, + ComplexVector, Vector>::type; + +protected: + // The actual solver and preconditioner objects. + std::unique_ptr> ksp; + std::unique_ptr> pc; + + // Counters for number of calls to Mult method for linear solves, and cumulative number + // of iterations. + mutable int ksp_mult, ksp_mult_it; + + // Enable timer contribution for Timer::KSP_PRECONDITIONER. + bool use_timer; + +public: + BaseKspSolver(const IoData &iodata, FiniteElementSpaceHierarchy &fespaces, + FiniteElementSpaceHierarchy *aux_fespaces = nullptr); + BaseKspSolver(std::unique_ptr> &&ksp, + std::unique_ptr> &&pc); + + int NumTotalMult() const { return ksp_mult; } + int NumTotalMultIterations() const { return ksp_mult_it; } + + void SetOperators(const OperType &op, const OperType &pc_op); + + void Mult(const VecType &x, VecType &y) const; +}; + +using KspSolver = BaseKspSolver; +using ComplexKspSolver = BaseKspSolver; + +} // namespace palace + +#endif // PALACE_LINALG_KSP_HPP diff --git a/palace/linalg/mumps.cpp b/palace/linalg/mumps.cpp index 8026eb8030..1789c0e4c2 100644 --- a/palace/linalg/mumps.cpp +++ b/palace/linalg/mumps.cpp @@ -1,64 +1,58 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "mumps.hpp" - -#if defined(MFEM_USE_MUMPS) - -#include "linalg/rap.hpp" - -namespace palace -{ - -MumpsSolver::MumpsSolver(MPI_Comm comm, mfem::MUMPSSolver::MatType sym, - config::LinearSolverData::SymFactType reorder, double blr_tol, - int print) - : mfem::MUMPSSolver(comm) -{ - // Configure the solver (must be called before SetOperator). 
- SetPrintLevel(print); - SetMatrixSymType(sym); - if (reorder == config::LinearSolverData::SymFactType::METIS) - { - SetReorderingStrategy(mfem::MUMPSSolver::METIS); - } - else if (reorder == config::LinearSolverData::SymFactType::PARMETIS) - { - SetReorderingStrategy(mfem::MUMPSSolver::PARMETIS); - } - else if (reorder == config::LinearSolverData::SymFactType::SCOTCH) - { - SetReorderingStrategy(mfem::MUMPSSolver::SCOTCH); - } - else if (reorder == config::LinearSolverData::SymFactType::PTSCOTCH) - { - SetReorderingStrategy(mfem::MUMPSSolver::PTSCOTCH); - } - else - { - // SetReorderingStrategy(mfem::MUMPSSolver::AUTOMATIC); // Should have good default - SetReorderingStrategy(mfem::MUMPSSolver::PORD); - } - SetReorderingReuse(true); // Repeated calls use same sparsity pattern - if (blr_tol > 0.0) - { - SetBLRTol(blr_tol); - } -} - -void MumpsSolver::SetOperator(const Operator &op) -{ - const auto *PtAP = dynamic_cast(&op); - if (PtAP) - { - mfem::MUMPSSolver::SetOperator(PtAP->ParallelAssemble()); - } - else - { - mfem::MUMPSSolver::SetOperator(op); - } -} - -} // namespace palace - -#endif +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "mumps.hpp" + +#if defined(MFEM_USE_MUMPS) + +namespace palace +{ + +MumpsSolver::MumpsSolver(MPI_Comm comm, mfem::MUMPSSolver::MatType sym, + SymbolicFactorization reorder, double blr_tol, bool reorder_reuse, + int print) + : mfem::MUMPSSolver(comm) +{ + // Configure the solver (must be called before SetOperator). + SetPrintLevel(print); + SetMatrixSymType(sym); + switch (reorder) + { + case SymbolicFactorization::METIS: + SetReorderingStrategy(mfem::MUMPSSolver::METIS); + break; + case SymbolicFactorization::PARMETIS: + SetReorderingStrategy(mfem::MUMPSSolver::PARMETIS); + break; + case SymbolicFactorization::SCOTCH: + SetReorderingStrategy(mfem::MUMPSSolver::SCOTCH); + break; + case SymbolicFactorization::PTSCOTCH: + SetReorderingStrategy(mfem::MUMPSSolver::PTSCOTCH); + break; + case SymbolicFactorization::PORD: + SetReorderingStrategy(mfem::MUMPSSolver::PORD); + break; + case SymbolicFactorization::AMD: + case SymbolicFactorization::RCM: + SetReorderingStrategy(mfem::MUMPSSolver::AMD); + break; + case SymbolicFactorization::DEFAULT: + SetReorderingStrategy(mfem::MUMPSSolver::AUTOMATIC); // Should have good default + break; + } + SetReorderingReuse(reorder_reuse); // If true repeated calls use same sparsity pattern + if (blr_tol > 0.0) + { + SetBLRTol(blr_tol); + } +} + +void MumpsSolver::SetReorderReuse(bool reorder_reuse) +{ + SetReorderingReuse(reorder_reuse); // If true repeated calls use same sparsity pattern +} + +} // namespace palace + +#endif diff --git a/palace/linalg/mumps.hpp b/palace/linalg/mumps.hpp index f98bd02663..f84308939f 100644 --- a/palace/linalg/mumps.hpp +++ b/palace/linalg/mumps.hpp @@ -8,7 +8,6 @@ #if defined(MFEM_USE_MUMPS) -#include "linalg/operator.hpp" #include "utils/iodata.hpp" namespace palace @@ -20,26 +19,28 @@ namespace palace class MumpsSolver : public mfem::MUMPSSolver { public: - MumpsSolver(MPI_Comm comm, mfem::MUMPSSolver::MatType sym, - config::LinearSolverData::SymFactType reorder, double blr_tol, int print); - MumpsSolver(MPI_Comm comm, const IoData &iodata, int print) - : MumpsSolver(comm, - (iodata.solver.linear.pc_mat_shifted || - iodata.problem.type == config::ProblemData::Type::TRANSIENT || - iodata.problem.type == config::ProblemData::Type::ELECTROSTATIC || - iodata.problem.type == config::ProblemData::Type::MAGNETOSTATIC) - ? 
mfem::MUMPSSolver::SYMMETRIC_POSITIVE_DEFINITE - : mfem::MUMPSSolver::SYMMETRIC_INDEFINITE, - iodata.solver.linear.sym_fact_type, - (iodata.solver.linear.strumpack_compression_type == - config::LinearSolverData::CompressionType::BLR) - ? iodata.solver.linear.strumpack_lr_tol - : 0.0, - print) + MumpsSolver(MPI_Comm comm, mfem::MUMPSSolver::MatType sym, SymbolicFactorization reorder, + double blr_tol, bool reorder_reuse, int print); + MumpsSolver(const IoData &iodata, MPI_Comm comm, int print) + : MumpsSolver( + comm, + (iodata.solver.linear.pc_mat_shifted || + iodata.problem.type == ProblemType::TRANSIENT || + iodata.problem.type == ProblemType::ELECTROSTATIC || + iodata.problem.type == ProblemType::MAGNETOSTATIC) + ? mfem::MUMPSSolver::SYMMETRIC_POSITIVE_DEFINITE + : iodata.solver.linear.complex_coarse_solve + ? mfem::MUMPSSolver::UNSYMMETRIC + : mfem::MUMPSSolver::SYMMETRIC_INDEFINITE, + iodata.solver.linear.sym_factorization, + (iodata.solver.linear.strumpack_compression_type == SparseCompression::BLR) + ? iodata.solver.linear.strumpack_lr_tol + : 0.0, + iodata.solver.linear.reorder_reuse, print) { } - void SetOperator(const Operator &op) override; + void SetReorderReuse(bool reorder_reuse); }; } // namespace palace diff --git a/palace/linalg/mumps.hpp.bak b/palace/linalg/mumps.hpp.bak new file mode 100644 index 0000000000..e3dc5c22af --- /dev/null +++ b/palace/linalg/mumps.hpp.bak @@ -0,0 +1,50 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LINALG_MUMPS_HPP +#define PALACE_LINALG_MUMPS_HPP + +#include + +#if defined(MFEM_USE_MUMPS) + +#include "utils/iodata.hpp" + +namespace palace +{ + +// +// A wrapper for the MUMPS direct solver package. +// +class MumpsSolver : public mfem::MUMPSSolver +{ +public: + MumpsSolver(MPI_Comm comm, mfem::MUMPSSolver::MatType sym, SymbolicFactorization reorder, + double blr_tol, bool reorder_reuse, int print); + MumpsSolver(const IoData &iodata, MPI_Comm comm, int print) + : MumpsSolver( + comm, + (iodata.solver.linear.pc_mat_shifted || + iodata.problem.type == ProblemType::TRANSIENT || + iodata.problem.type == ProblemType::ELECTROSTATIC || + iodata.problem.type == ProblemType::MAGNETOSTATIC) + ? mfem::MUMPSSolver::SYMMETRIC_POSITIVE_DEFINITE + : iodata.boundaries.periodic.wave_vector == std::array{0.0, 0.0, 0.0} + ? mfem::MUMPSSolver::SYMMETRIC_INDEFINITE + : mfem::MUMPSSolver::UNSYMMETRIC, + iodata.solver.linear.sym_factorization, + (iodata.solver.linear.strumpack_compression_type == SparseCompression::BLR) + ? iodata.solver.linear.strumpack_lr_tol + : 0.0, + iodata.solver.linear.reorder_reuse, print) + { + } + + void SetReorderReuse(bool reorder_reuse); +}; + +} // namespace palace + +#endif + +#endif // PALACE_LINALG_MUMPS_HPP diff --git a/palace/linalg/nleps.cpp b/palace/linalg/nleps.cpp new file mode 100644 index 0000000000..eb7b3babae --- /dev/null +++ b/palace/linalg/nleps.cpp @@ -0,0 +1,888 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "nleps.hpp" + +#include +#include +#include +#include +#include "linalg/divfree.hpp" +#include "linalg/rap.hpp" +#include "utils/communication.hpp" + +namespace palace +{ + +using namespace std::complex_literals; +// Base class methods. + +NonLinearEigenvalueSolver::NonLinearEigenvalueSolver(MPI_Comm comm, int print) + : comm(comm), print(print) +{ + // Initialization. 
+ nleps_it = 0; + + gamma = delta = 1.0; + sigma = 0.0; + + opInv = nullptr; + opProj = nullptr; + opB = nullptr; +} + +void NonLinearEigenvalueSolver::SetLinearSolver(ComplexKspSolver &ksp) +{ + opInv = &ksp; +} + +void NonLinearEigenvalueSolver::SetDivFreeProjector( + const DivFreeSolver &divfree) +{ + opProj = &divfree; +} + +void NonLinearEigenvalueSolver::SetBMat(const Operator &B) +{ + opB = &B; +} + +void NonLinearEigenvalueSolver::SetNumModes(int num_eig, int num_vec) +{ + if (nev > 0 && num_eig != nev) + { + res.reset(); + xscale.reset(); + perm.reset(); + } + nev = num_eig; +} + +void NonLinearEigenvalueSolver::SetTol(double tol) +{ + rtol = tol; +} + +void NonLinearEigenvalueSolver::SetMaxIter(int max_it) +{ + nleps_it = max_it; +} + +void NonLinearEigenvalueSolver::SetWhichEigenpairs(EigenvalueSolver::WhichType type) +{ + which_type = type; +} + +void NonLinearEigenvalueSolver::SetShiftInvert(std::complex s, bool precond) +{ + MFEM_VERIFY(!precond, "Nonlinear eigenvalue solver does not support preconditioned " + "spectral transformation option!"); + sigma = s; + sinvert = true; +} + +void NonLinearEigenvalueSolver::SetInitialSpace(const ComplexVector &v) +{ + MFEM_ABORT("SetInitialSpace not defined for base class NonLinearEigenvalueSolver!"); +} + +std::complex NonLinearEigenvalueSolver::GetEigenvalue(int i) const +{ + MFEM_VERIFY(i >= 0 && i < nev, + "Out of range eigenpair requested (i = " << i << ", nev = " << nev << ")!"); + const int &j = perm.get()[i]; + return eigenvalues[j]; +} + +void NonLinearEigenvalueSolver::GetEigenvector(int i, ComplexVector &x) const +{ + MFEM_VERIFY(i >= 0 && i < nev, + "Out of range eigenpair requested (i = " << i << ", nev = " << nev << ")!"); + MFEM_VERIFY(x.Size() == n, "Invalid size mismatch for provided eigenvector!"); + const int &j = perm.get()[i]; + x = eigenvectors[j]; + if (xscale.get()[j] > 0.0) + { + x *= xscale.get()[j]; + } +} + +double NonLinearEigenvalueSolver::GetEigenvectorNorm(const ComplexVector &x, + ComplexVector &Bx) const +{ + if (opB) + { + return linalg::Norml2(comm, x, *opB, Bx); + } + else + { + return linalg::Norml2(comm, x); + } +} + +double NonLinearEigenvalueSolver::GetError(int i, EigenvalueSolver::ErrorType type) const +{ + MFEM_VERIFY(i >= 0 && i < nev, + "Out of range eigenpair requested (i = " << i << ", nev = " << nev << ")!"); + const int &j = perm.get()[i]; + switch (type) + { + case ErrorType::ABSOLUTE: + return res.get()[j]; + case ErrorType::RELATIVE: + return res.get()[j] / std::abs(eigenvalues[j]); + case ErrorType::BACKWARD: + return res.get()[j] / GetBackwardScaling(eigenvalues[j]); + } + return 0.0; +} + +void NonLinearEigenvalueSolver::RescaleEigenvectors(int num_eig) +{ + res = std::make_unique(num_eig); + xscale = std::make_unique(num_eig); + for (int i = 0; i < num_eig; i++) + { + x1 = eigenvectors[i]; + xscale.get()[i] = 1.0 / GetEigenvectorNorm(x1, y1); + res.get()[i] = GetResidualNorm(eigenvalues[i], x1, y1) / linalg::Norml2(comm, x1); + } +} + +// Quasi-Newton specific methods. +QuasiNewtonSolver::QuasiNewtonSolver(MPI_Comm comm, + std::unique_ptr linear_eigensolver, + int num_conv, int print, bool refine) + : NonLinearEigenvalueSolver(comm, print), + linear_eigensolver_(std::move(linear_eigensolver)), nev_linear(num_conv), + refine_nonlinear(refine) +{ + opK = opC = opM = nullptr; + normK = normC = normM = 0.0; +} + +// Set the update frequency of the preconditioner. 
+void QuasiNewtonSolver::SetPreconditionerLag(int preconditioner_update_freq, + double preconditioner_update_tol) +{ + preconditioner_lag = preconditioner_update_freq; + preconditioner_tol = preconditioner_update_tol; +} + +// Set the maximum number of restarts with the same initial guess. +void QuasiNewtonSolver::SetMaxRestart(int max_num_restart) +{ + max_restart = max_num_restart; +} + +void QuasiNewtonSolver::SetExtraSystemMatrix( + std::function(double)> A2) +{ + funcA2 = A2; +} + +void QuasiNewtonSolver::SetPreconditionerUpdate( + std::function( + std::complex, std::complex, std::complex, double)> + P) +{ + funcP = P; +} + +void QuasiNewtonSolver::SetOperators(const ComplexOperator &K, const ComplexOperator &M, + EigenvalueSolver::ScaleType type) +{ + MFEM_VERIFY(!opK || K.Height() == n, "Invalid modification of eigenvalue problem size!"); + bool first = (opK == nullptr); + opK = &K; + opM = &M; + + if (first && type != ScaleType::NONE) + { + normK = linalg::SpectralNorm(comm, *opK, opK->IsReal()); + normM = linalg::SpectralNorm(comm, *opM, opM->IsReal()); + MFEM_VERIFY(normK >= 0.0 && normM >= 0.0, + "Invalid matrix norms for Quasi-Newton scaling!"); + if (normK > 0 && normM > 0.0) + { + gamma = std::sqrt(normK / normM); + delta = 2.0 / (normK); + } + } + + n = opK->Height(); + + // Set up workspace. + x1.SetSize(opK->Height()); + y1.SetSize(opK->Height()); + x1.UseDevice(true); + y1.UseDevice(true); +} + +void QuasiNewtonSolver::SetOperators(const ComplexOperator &K, const ComplexOperator &C, + const ComplexOperator &M, + EigenvalueSolver::ScaleType type) +{ + MFEM_VERIFY(!opK || K.Height() == n, "Invalid modification of eigenvalue problem size!"); + bool first = (opK == nullptr); + opK = &K; + opC = &C; + opM = &M; + + if (first && type != ScaleType::NONE) + { + normK = linalg::SpectralNorm(comm, *opK, opK->IsReal()); + normC = linalg::SpectralNorm(comm, *opC, opC->IsReal()); + normM = linalg::SpectralNorm(comm, *opM, opM->IsReal()); + MFEM_VERIFY(normK >= 0.0 && normC >= 0.0 && normM >= 0.0, + "Invalid matrix norms for Quasi-Newton scaling!"); + if (normK > 0 && normC >= 0.0 && normM > 0.0) + { + gamma = std::sqrt(normK / normM); + delta = 2.0 / (normK + gamma * normC); + } + } + + n = opK->Height(); + + // Set up workspace. + x1.SetSize(opK->Height()); + y1.SetSize(opK->Height()); + x1.UseDevice(true); + y1.UseDevice(true); +} + +void QuasiNewtonSolver::SetInitialGuess() +{ + + MFEM_VERIFY(n > 0, "Must call SetOperators before using SetInitialguess for nonlinear " + "eigenvalue solver!"); + MFEM_VERIFY(nev > 0, "Must call SetNumModes before using SetInitialguess for nonlinear " + "eigenvalue solver!"); + + // Get eigenmodes initial guesses from linear eigensolver + eigenvalues.resize(nev_linear); + eigenvectors.resize(nev_linear); + for (int i = 0; i < nev_linear; i++) + { + eigenvalues[i] = linear_eigensolver_->GetEigenvalue(i); + linear_eigensolver_->GetEigenvector(i, x1); + eigenvectors[i] = x1; + } + + // Compute errors. + RescaleEigenvectors(nev_linear); + + // Initialize eigenpairs ordering. + perm = std::make_unique(nev_linear); + std::iota(perm.get(), perm.get() + nev_linear, 0); + + // Early return if nonlinear Newton won't be used. + if (!refine_nonlinear) + return; + + // If the number of initial guesses is greater than the number of requested modes + // de-prioritize the initial guesses that have larger errors. 
+ std::vector indices(nev_linear); + std::iota(indices.begin(), indices.end(), 0); + if (nev_linear > nev) + { + double min_error = res.get()[0]; + for (int i = 0; i < nev_linear; i++) + { + min_error = std::min(min_error, res.get()[i]); + } + const double threshold = 100.0 * min_error; + std::sort(indices.begin(), indices.end(), + [&](const auto i, const auto j) + { + if (res.get()[i] < threshold && res.get()[j] > threshold) + { + return true; + } + else if (res.get()[i] > threshold && res.get()[j] < threshold) + { + return false; + } + else + { + return eigenvalues[i].imag() < eigenvalues[j].imag(); + } + }); + } + for (int i = 0; i < nev_linear; i++) + { + eigenvalues[i] = linear_eigensolver_->GetEigenvalue(indices[i]); + linear_eigensolver_->GetEigenvector(indices[i], x1); + linalg::NormalizePhase(comm, x1); + eigenvectors[i] = x1; + } + + // Get ordering of the eigenpairs. + std::sort(perm.get(), perm.get() + nev_linear, [&eig = this->eigenvalues](auto l, auto r) + { return eig[l].imag() < eig[r].imag(); }); +} + +namespace +{ +// Multiply an (n x k) matrix (vector of size k of ComplexVectors of size n) by a vector of +// size k, returning a ComplexVector of size n. +ComplexVector MatVecMult(const std::vector &X, const Eigen::VectorXcd &y) +{ + // Cast to avoid compiler warnings about types. + MFEM_ASSERT(static_cast(X.size()) == y.size(), + "Mismatch in dimension of input vectors!"); + const int k = X.size(); + const int n = X[0].Size(); + const bool use_dev = X[0].UseDevice(); + ComplexVector z; + z.SetSize(n); + z.UseDevice(use_dev); + z = 0.0; + for (int j = 0; j < k; j++) + { + linalg::AXPBYPCZ(y(j).real(), X[j].Real(), -y(j).imag(), X[j].Imag(), 1.0, z.Real()); + linalg::AXPBYPCZ(y(j).imag(), X[j].Real(), y(j).real(), X[j].Imag(), 1.0, z.Imag()); + } + return z; +} + +} // namespace + +int QuasiNewtonSolver::Solve() +{ + // Quasi-Newton method for nonlinear eigenvalue problems. + // Reference: Jarlebring, Koskela, Mele, Disguised and new quasi-Newton methods for + // nonlinear eigenvalue problems, Numerical Algorithms (2018). + // Using the deflation scheme used by SLEPc's NEP solver with minimality index set to 1. + // Reference: Effenberger, Robust successive computation of eigenpairs for nonlinear + // eigenvalue problems, SIAM J. Matrix Anal. Appl. (2013). + // The deflation scheme solves an extended problem of size n + k, where n is the original + // problem size and k is the number of converged eigenpairs. The extended operators are + // never explicitly constructed and two separate vectors of length n and k are used to + // store the extended solution: [v, v2] where v is a ComplexVector distributed across all + // processes and v2 is an Eigen::VectorXcd stored redundantly on all processes. + + // Set initial guess from linear eigensolver. + SetInitialGuess(); + + // Return early if not refining the eigenmodes with Newton. + if (!refine_nonlinear) + { + nev = eigenvalues.size(); + return nev; + } + + // Palace ComplexVectors of size n. + ComplexVector v, u, w, c, w0, z; + v.SetSize(n); + u.SetSize(n); + w.SetSize(n); + c.SetSize(n); + w0.SetSize(n); + z.SetSize(n); + v.UseDevice(true); + u.UseDevice(true); + w.UseDevice(true); + c.UseDevice(true); + w0.UseDevice(true); + z.UseDevice(true); + + // Eigen Matrix/Vectors for extended operator of size k. + Eigen::MatrixXcd H; + Eigen::VectorXcd u2, z2, c2, w2, v2; + + // Storage for eigenpairs. + std::vector X; + std::vector> eigs; + X.reserve(nev); + + // Set defaults. 
+ if (nleps_it <= 0) + { + nleps_it = 100; + } + + // Delta used in to compute divided difference Jacobian. + const auto delta = std::sqrt(std::numeric_limits::epsilon()); + + // Set a seed and distribution for random Eigen vectors to ensure the same values on all + // ranks. + unsigned int seed = 111; + std::mt19937 gen(seed); + std::uniform_real_distribution<> dist_real(-1.0, 1.0); + + const int num_init_guess = eigenvalues.size(); + std::uniform_int_distribution dist_int(0, num_init_guess - 1); + int k = 0, restart = 0, guess_idx = 0; + while (k < nev) + { + // If > max_restart with the same initial guess, skip to next initial guess. + // If we tried all initial guesses and the random guess, end search even if k < nev. + if (restart > max_restart) + { + if (guess_idx < num_init_guess) + { + guess_idx++; + restart = 0; + } + else + { + break; + } + } + + // Set the eigenpair estimate to the initial guess. + std::complex eig, eig_opInv; + if (guess_idx < num_init_guess) + { + eig = eigenvalues[guess_idx]; + v = eigenvectors[guess_idx]; + } + else + { + eig = sigma; + if (num_init_guess < 3) + { + // Set purely random vector. + linalg::SetRandom(GetComm(), v); + linalg::SetSubVector( + v, *dynamic_cast(opK)->GetEssentialTrueDofs(), 0.0); + } + else + { + // Set random vector as the average of two distinct randomly-chosen initial guesses. + int i1 = dist_int(gen); + int i2; + do + { + i2 = dist_int(gen); + } while (i2 == i1); + v.AXPBYPCZ(0.5, eigenvectors[i1], 0.5, eigenvectors[i2], 0.0); + } + } + eig_opInv = eig; // eigenvalue estimate used in the (lagged) preconditioner + + // Set the "random" c vector and the deflation component of the eigenpair initial guess. + linalg::SetRandom(GetComm(), c, seed); // set seed for deterministic behavior + c2.conservativeResize(k); + v2.conservativeResize(k); + for (int i = 0; i < k; i++) + { + c2(i) = std::complex(dist_real(gen), dist_real(gen)); + v2(i) = std::complex(dist_real(gen), dist_real(gen)); + } + + // Normalize random c vector. + double norm_c = std::sqrt(std::abs(linalg::Dot(GetComm(), c, c)) + c2.squaredNorm()); + c *= 1.0 / norm_c; + c2 *= 1.0 / norm_c; + + // Normalize eigenvector estimate. + double norm_v = std::sqrt(std::abs(linalg::Dot(GetComm(), v, v)) + v2.squaredNorm()); + v *= 1.0 / norm_v; + v2 *= 1.0 / norm_v; + + // Set the linear solver operators. + opA2 = (*funcA2)(std::abs(eig.imag())); + opA = BuildParSumOperator({1.0 + 0.0i, eig, eig * eig, 1.0 + 0.0i}, + {opK, opC, opM, opA2.get()}, true); + opP = (*funcP)(1.0 + 0.0i, eig, eig * eig, eig.imag()); + opInv->SetOperators(*opA, *opP); + + // Linear solve with the extended operator of the deflated problem. 
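The deflated_solve lambda defined next eliminates the second block row of the extended system via a Schur complement. As a purely illustrative reference, the same elimination in dense Eigen arithmetic (not the specialized form SS = -X^* X S^{-1} exploited below) reads:

#include <Eigen/Dense>

// Solve [T U; A B] [x1; x2] = [b1; b2] by eliminating x2 with Sc = B - A T^{-1} U:
// x1 <- T^{-1} b1, x2 <- Sc^{-1} (b2 - A x1), then correct x1 for the U-coupling.
inline Eigen::VectorXcd SchurBlockSolve(const Eigen::MatrixXcd &T, const Eigen::MatrixXcd &U,
                                        const Eigen::MatrixXcd &A, const Eigen::MatrixXcd &B,
                                        const Eigen::VectorXcd &b1, const Eigen::VectorXcd &b2)
{
  const Eigen::VectorXcd x1 = T.fullPivLu().solve(b1);
  const Eigen::MatrixXcd Sc = B - A * T.fullPivLu().solve(U);
  const Eigen::VectorXcd x2 = Sc.fullPivLu().solve(b2 - A * x1);
  const Eigen::VectorXcd x1c = x1 - T.fullPivLu().solve(U * x2);
  Eigen::VectorXcd x(x1c.size() + x2.size());
  x << x1c, x2;
  return x;
}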
+ auto deflated_solve = [&](const ComplexVector &b1, const Eigen::VectorXcd &b2, + ComplexVector &x1, Eigen::VectorXcd &x2) + { + // Solve the block linear system + // |T(σ) U(σ)| |x1| = |b1| + // |A(σ) B(σ)| |x2| |b2| + // x1 = T^-1 b1 + // x2 = SS^-1 (b2 - A x1) where SS = (B - A T^-1 U) = - X^* X S^-1 + // x1 = x1 - X S x2 + opInv->Mult(b1, x1); + if (k == 0) // no deflation + { + return; + } + x2.conservativeResize(k); + for (int j = 0; j < k; j++) + { + x2(j) = b2(j) - linalg::Dot(GetComm(), x1, X[j]); + } + Eigen::MatrixXcd SS(k, k); + for (int i = 0; i < k; i++) + { + for (int j = 0; j < k; j++) + { + SS(i, j) = linalg::Dot(GetComm(), X[i], X[j]); + } + } + const Eigen::MatrixXcd S = eig_opInv * Eigen::MatrixXcd::Identity(k, k) - H; + SS = -S.fullPivLu().solve(SS); + x2 = SS.fullPivLu().solve(x2); + const ComplexVector XSx2 = MatVecMult(X, S.fullPivLu().solve(x2)); + linalg::AXPY(-1.0, XSx2, x1); + }; + + // Compute w0 = T^-1 c and normalize it. + deflated_solve(c, c2, w0, w2); + double norm_w0 = std::sqrt(std::abs(linalg::Dot(GetComm(), w0, w0)) + w2.squaredNorm()); + w0 *= 1.0 / norm_w0; + w2 *= 1.0 / norm_w0; + + // Newton iterations. + double res = mfem::infinity(); + int it = 0, diverged_it = 0; + while (it < nleps_it) + { + // Compute u = A * v. + auto A2n = (*funcA2)(std::abs(eig.imag())); + auto A = BuildParSumOperator({1.0 + 0.0i, eig, eig * eig, 1.0 + 0.0i}, + {opK, opC, opM, A2n.get()}, true); + A->Mult(v, u); + if (k > 0) // Deflation + { + // u1 = T(l) v1 + U(l) v2 = T(l) v1 + T(l)X(lI - H)^-1 v2. + const Eigen::MatrixXcd S = eig * Eigen::MatrixXcd::Identity(k, k) - H; + const ComplexVector XSv2 = MatVecMult(X, S.fullPivLu().solve(v2)); + A->AddMult(XSv2, u, 1.0); + // u2 = X^* v1. + u2.conservativeResize(k); + for (int j = 0; j < k; j++) + { + u2(j) = linalg::Dot(GetComm(), v, X[j]); + } + } + + // Compute residual. + res = std::sqrt(std::abs(linalg::Dot(GetComm(), u, u)) + u2.squaredNorm()); + if (print > 0) + { + Mpi::Print(GetComm(), + "{:d} NLEPS (nconv={:d}, restart={:d}) residual norm {:.6e}\n", it, k, + restart, res); + } + + // End if residual below tolerance and eigenvalue above the target. + if (res < rtol) + { + if (print > 0) + { + Mpi::Print(GetComm(), + "Eigenvalue {:d}, Quasi-Newton converged in {:d} iterations " + "({:.3e}{:+.3e}i).\n", + k, it, eig.real(), eig.imag()); + } + // Update the invariant pair with normalization. + const auto scale = linalg::Norml2(GetComm(), v); + v *= 1.0 / scale; + eigs.resize(k + 1); + eigs[k] = eig; + X.resize(k + 1); + X[k] = v; + H.conservativeResizeLike(Eigen::MatrixXd::Zero(k + 1, k + 1)); + H.col(k).head(k) = v2 / scale; + H(k, k) = eig; + k++; + // If the eigenvalue is inside the desired range, increment initial guess index + // Otherwise, use the same initial guess again and increment number of desired + // eigenvalues. + if (eig.imag() > sigma.imag()) + { + guess_idx++; + } + else + { + nev++; + } + restart = 0; // reset restart counter + break; + } + // Stop if large residual for 10 consecutive iterations. + diverged_it = (res > 0.9) ? diverged_it + 1 : 0; + if (diverged_it > 10) + { + if (print > 0) + { + Mpi::Print(GetComm(), + "Eigenvalue {:d}, Quasi-Newton not converging after {:d} iterations, " + "restarting.\n", + k, it); + } + restart++; + break; + } + + // Compute w = J * v. 
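The Jacobian application that follows approximates the derivative of the frequency-dependent block A2 by a one-sided divided difference with relative step delta = sqrt(machine epsilon). The scalar analogue of that formula, as an illustrative sketch only (f stands in for the dependence of A2 on the imaginary part of the eigenvalue):

#include <cmath>
#include <limits>

template <typename F>
double DividedDifference(F f, double omega)
{
  // (f(omega (1 + delta)) - f(omega)) / (delta * omega) ~= f'(omega) to first order.
  const double delta = std::sqrt(std::numeric_limits<double>::epsilon());
  return (f(omega * (1.0 + delta)) - f(omega)) / (delta * omega);
}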
+ auto opA2p = (*funcA2)(std::abs(eig.imag()) * (1.0 + delta)); + const std::complex denom = + std::complex(0.0, delta * std::abs(eig.imag())); + std::unique_ptr opAJ = + BuildParSumOperator({1.0 / denom, -1.0 / denom}, {opA2p.get(), A2n.get()}, true); + auto opJ = BuildParSumOperator({0.0 + 0.0i, 1.0 + 0.0i, 2.0 * eig, 1.0 + 0.0i}, + {opK, opC, opM, opAJ.get()}, true); + opJ->Mult(v, w); + if (k > 0) // Deflation + { + // w1 = T'(l) v1 + U'(l) v2 = T'(l) v1 + T'(l)XS v2 - T(l)XS^2 v2. + const Eigen::MatrixXcd S = eig * Eigen::MatrixXcd::Identity(k, k) - H; + const Eigen::VectorXcd Sv2 = S.fullPivLu().solve(v2); + const ComplexVector XSv2 = MatVecMult(X, Sv2); + const ComplexVector XSSv2 = MatVecMult(X, S.fullPivLu().solve(Sv2)); + opJ->AddMult(XSv2, w, 1.0); + A->AddMult(XSSv2, w, -1.0); + } + + // Compute delta = - dot(w0, u) / dot(w0, w). + const std::complex u2_w0 = std::complex(w2.adjoint() * u2); + const std::complex delta = + -(linalg::Dot(GetComm(), u, w0) + u2_w0) / linalg::Dot(GetComm(), w, w0); + + // Update eigenvalue. + eig += delta; + + // Compute z = -(delta * w + u). + z.AXPBYPCZ(-delta, w, -1.0, u, 0.0); + z2 = -u2; + + // Update preconditioner if needed. Updating the preconditioner as infrequently as + // possible gives the best performance and robustness. Updating the preconditioner + // close to an eigenvalue can lead to numerical instability. + if (it > 0 && it % preconditioner_lag == 0 && res > preconditioner_tol) + { + eig_opInv = eig; + opA2 = (*funcA2)(std::abs(eig.imag())); + opA = BuildParSumOperator({1.0 + 0.0i, eig, eig * eig, 1.0 + 0.0i}, + {opK, opC, opM, opA2.get()}, true); + opP = (*funcP)(1.0 + 0.0i, eig, eig * eig, eig.imag()); + opInv->SetOperators(*opA, *opP); + // Recompute w0 and normalize. + deflated_solve(c, c2, w0, w2); + double norm_w0 = + std::sqrt(std::abs(linalg::Dot(GetComm(), w0, w0)) + w2.squaredNorm()); + w0 *= 1.0 / norm_w0; + w2 *= 1.0 / norm_w0; + } + + // Solve M (v_k+1 - v_k) = z. + deflated_solve(z, z2, u, u2); + + // Update and normalize eigenvector estimate. + v += u; + v2 += u2; + norm_v = std::sqrt(std::abs(linalg::Dot(GetComm(), v, v)) + v2.squaredNorm()); + v *= 1.0 / norm_v; + v2 *= 1.0 / norm_v; + + it++; + if (it == nleps_it) + { + if (print > 0) + { + Mpi::Print(GetComm(), + "Eigenvalue {:d}, Quasi-Newton did not converge in {:d} iterations, " + "restarting.\n", + k, nleps_it); + } + restart++; + } + } + } + nev = k; // in case some guesses did not converge + + // Eigenpair extraction from the invariant pair (X, H). + Eigen::ComplexEigenSolver eps; + eps.compute(H); + // H eigenvectors are ordered arbitrarily, need to match them to order of X. + std::vector order(nev), order_eigen(nev), order2(nev); + std::iota(order.begin(), order.end(), 0); + std::iota(order_eigen.begin(), order_eigen.end(), 0); + std::iota(order2.begin(), order2.end(), 0); + std::sort(order.begin(), order.end(), + [&](auto l, auto r) { return eigs[l].imag() < eigs[r].imag(); }); + std::sort(order_eigen.begin(), order_eigen.end(), + [&epseig = eps.eigenvalues()](auto l, auto r) + { return epseig(l).imag() < epseig(r).imag(); }); + std::sort(order2.begin(), order2.end(), + [&](auto l, auto r) { return order[l] < order[r]; }); + + // Sort Eigen eigenvectors. + std::vector Xeig; + for (int i = 0; i < nev; i++) + { + Xeig.push_back(eps.eigenvectors().col(order_eigen[i])); + } + + // Recover the eigenvectors in the target range. 
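The block below reconstructs eigenvectors of the nonlinear problem from the invariant pair (X, H): eigenvectors of the small matrix H provide the coefficients, and X maps them back to full-length vectors. A dense sketch of that recovery, assuming Eigen (ExtractEigenpairs is an illustrative name):

#include <Eigen/Dense>
#include <complex>
#include <utility>
#include <vector>

using Mat = Eigen::MatrixXcd;
using Vec = Eigen::VectorXcd;

// Given an invariant pair (X, H), X of size n x k and H of size k x k, approximate
// eigenpairs of the nonlinear problem are (theta_i, X z_i), where (theta_i, z_i) are the
// eigenpairs of the small dense matrix H.
std::vector<std::pair<std::complex<double>, Vec>> ExtractEigenpairs(const Mat &X,
                                                                    const Mat &H)
{
  Eigen::ComplexEigenSolver<Mat> es(H);
  std::vector<std::pair<std::complex<double>, Vec>> pairs;
  for (int i = 0; i < H.rows(); i++)
  {
    pairs.emplace_back(es.eigenvalues()(i), X * es.eigenvectors().col(i));
  }
  return pairs;
}

In the patch, X is a set of distributed ComplexVectors, so the product X z is formed with the MatVecMult helper, and the recovered pairs are additionally filtered to Im(λ) > Im(σ) and sorted.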
+ eigenvalues.clear(); + eigenvectors.clear(); + for (int i = 0; i < nev; i++) + { + if (eigs[i].imag() > sigma.imag()) + { + ComplexVector eigv = MatVecMult(X, Xeig[order2[i]]); + eigenvalues.push_back(eigs[i]); + eigenvectors.push_back(eigv); + } + } + nev = eigenvalues.size(); + + // Get ordering of the reported eigenpairs. + perm = std::make_unique(nev); + std::iota(perm.get(), perm.get() + nev, 0); + std::sort(perm.get(), perm.get() + nev, [&eig = this->eigenvalues](auto l, auto r) + { return eig[l].imag() < eig[r].imag(); }); + + // Compute the eigenpair residuals for eigenvalue λ. + RescaleEigenvectors(nev); + + return nev; +} + +double QuasiNewtonSolver::GetResidualNorm(std::complex l, const ComplexVector &x, + ComplexVector &r) const +{ + // Compute the i-th eigenpair residual: || P(λ) x ||₂ = || (K + λ C + λ² M + A2(λ)) x ||₂ + // for eigenvalue λ. + opK->Mult(x, r); + if (opC) + { + opC->AddMult(x, r, l); + } + opM->AddMult(x, r, l * l); + auto A2 = (*funcA2)(std::abs(l.imag())); + A2->AddMult(x, r, 1.0); + return linalg::Norml2(comm, r); +} + +double QuasiNewtonSolver::GetBackwardScaling(std::complex l) const +{ + // Make sure not to use norms from scaling as this can be confusing if they are different. + // Note that SLEPc uses ||.||∞, not the 2-norm. + if (normK <= 0.0) + { + normK = linalg::SpectralNorm(comm, *opK, opK->IsReal()); + } + if (normC <= 0.0 && opC) + { + normC = linalg::SpectralNorm(comm, *opC, opC->IsReal()); + } + if (normM <= 0.0) + { + normM = linalg::SpectralNorm(comm, *opM, opM->IsReal()); + } + double t = std::abs(l); + return normK + t * normC + t * t * normM; +} + +NewtonInterpolationOperator::NewtonInterpolationOperator( + std::function(double)> funcA2, int size) + : funcA2(funcA2) +{ + rhs.SetSize(size); + rhs.UseDevice(true); +} + +// Compute the elementary symmetric polynomial. Used to convert from Newton to monomial +// basis. +template +ScalarType elementarySymmetric(const std::vector &points, int k, int n) +{ + if (k == 0) + { + return 1.0; + } + if (k > n || k < 0 || n == 0) + { + return 0.0; + } + return elementarySymmetric(points, k, n - 1) + + points[n - 1] * elementarySymmetric(points, k - 1, n - 1); +} + +void NewtonInterpolationOperator::Interpolate(const std::complex sigma_min, + const std::complex sigma_max) +{ + // Reset operators and sample points each time Interpolate is called. + ops.clear(); + ops.resize(num_points); + points.clear(); + points.resize(num_points); + + // Linearly spaced sample points. + for (int j = 0; j < num_points; j++) + { + points[j] = sigma_min + (double)j * (sigma_max - sigma_min) / (double)(num_points - 1); + } + + // Build divided difference matrices. + for (int k = 0; k < num_points; k++) + { + for (int j = 0; j < num_points - k; j++) + { + if (k == 0) + { + auto A2j = (funcA2)(points[j].imag()); + ops[k].push_back(std::move(A2j)); + } + else + { + std::complex denom = points[j + k] - points[j]; + auto A2dd = + BuildParSumOperator({1.0 / denom, -1.0 / denom}, + {ops[k - 1][j + 1].get(), ops[k - 1][j].get()}, true); + ops[k].push_back(std::move(A2dd)); + } + } + } + + // Compute monomial coefficients as a function of the Newton polynomial coefficients. + coeffs.clear(); + coeffs.assign(num_points, std::vector>(num_points, 0.0)); + for (int k = 0; k < num_points; k++) + { + for (int j = k; j < num_points; j++) + { + double sign = ((j - k) % 2 == 0) ? 
1 : -1; + coeffs[k][j] = sign * elementarySymmetric(points, j - k, j); + } + } +} + +std::unique_ptr +NewtonInterpolationOperator::GetInterpolationOperator(int order) const +{ + MFEM_VERIFY(order >= 0 && order < num_points, + "Order must be greater than or equal to 0 and smaller than the number of " + "interpolation points!"); + return BuildParSumOperator({coeffs[order][0], coeffs[order][1], coeffs[order][2]}, + {ops[0][0].get(), ops[1][0].get(), ops[2][0].get()}, true); +} + +void NewtonInterpolationOperator::Mult(int order, const ComplexVector &x, + ComplexVector &y) const +{ + MFEM_VERIFY(order >= 0 && order < num_points, + "Order must be greater than or equal to 0 and smaller than the number of " + "interpolation points!"); + + y = 0.0; + for (int j = 0; j < num_points; j++) + { + if (coeffs[order][j] != 0.0) + { + ops[j][0]->AddMult(x, y, coeffs[order][j]); + } + } +} + +void NewtonInterpolationOperator::AddMult(int order, const ComplexVector &x, + ComplexVector &y, std::complex a) const +{ + this->Mult(order, x, rhs); + rhs *= a; + y += rhs; +} + +} // namespace palace diff --git a/palace/linalg/nleps.hpp b/palace/linalg/nleps.hpp new file mode 100644 index 0000000000..6043118eb4 --- /dev/null +++ b/palace/linalg/nleps.hpp @@ -0,0 +1,286 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LINALG_NLEPS_HPP +#define PALACE_LINALG_NLEPS_HPP + +#include +#include +#include +#include +#include "linalg/eps.hpp" +#include "linalg/ksp.hpp" +#include "linalg/operator.hpp" +#include "linalg/vector.hpp" +#include "models/spaceoperator.hpp" + +namespace palace +{ + +// +// Abstract base class for nonlinear eigensolvers. +// Currently only implemented for complex scalar interface. +// +class NonLinearEigenvalueSolver : public EigenvalueSolver +{ +protected: + // MPI communicator. + MPI_Comm comm; + + // Control print level for debugging. + int print; + + // Number of eigenvalues to compute and problem size. + int nev, n; + + // Relative eigenvalue error convergence tolerance for the solver. + double rtol; + + // Maximum number of iterations. + int nleps_it; + + // Specifies which part of the spectrum to search for. + EigenvalueSolver::WhichType which_type; + + // Variables for scaling, from Higham et al., IJNME 2008. + double gamma, delta; + + // Parameters defining the spectral transformation. + std::complex sigma; + bool sinvert; + + // Storage for computed eigenvalues and eigenvectors. + std::vector> eigenvalues; + std::vector eigenvectors; + std::unique_ptr perm; + + // Storage for computed residual norms and eigenvector scalings. + std::unique_ptr res, xscale; + + // Reference to linear solver used for operator action of P(σ)⁻¹ (not owned). + ComplexKspSolver *opInv; + + // Reference to solver for projecting an intermediate vector onto a divergence-free space + // (not owned). + const DivFreeSolver *opProj; + + // Reference to matrix used for weighted inner products (not owned). May be nullptr, in + // which case identity is used. + const Operator *opB; + + // Workspace vector for operator applications. + mutable ComplexVector x1, y1; + + // Helper routine for computing the eigenvector normalization. + double GetEigenvectorNorm(const ComplexVector &x, ComplexVector &Bx) const; + + // Helper routine for computing the eigenpair residual. + virtual double GetResidualNorm(std::complex l, const ComplexVector &x, + ComplexVector &r) const = 0; + + // Helper routine for computing the backward error. 
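The two helpers declared around here supply the ingredients of a standard backward-error estimate: a residual norm and a λ-dependent scaling of the operator norms. A dense sketch of how they combine (Frobenius norms and no A2(λ) term here, purely for illustration; the patch uses linalg::SpectralNorm and includes the A2 contribution in the residual):

#include <Eigen/Dense>
#include <cmath>
#include <complex>

// Backward error of an approximate eigenpair (l, x) of P(l) = K + l C + l^2 M:
//   eta(l, x) = ||P(l) x|| / ((||K|| + |l| ||C|| + |l|^2 ||M||) ||x||),
// i.e. the residual norm divided by the scaling provided by GetBackwardScaling().
double BackwardError(const Eigen::MatrixXcd &K, const Eigen::MatrixXcd &C,
                     const Eigen::MatrixXcd &M, std::complex<double> l,
                     const Eigen::VectorXcd &x)
{
  const double t = std::abs(l);
  const double scale = K.norm() + t * C.norm() + t * t * M.norm();  // Frobenius norms
  const Eigen::VectorXcd r = K * x + l * (C * x) + (l * l) * (M * x);
  return r.norm() / (scale * x.norm());
}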
+ virtual double GetBackwardScaling(std::complex l) const = 0; + + // Get the associated MPI communicator. + virtual MPI_Comm GetComm() const = 0; + + // Return problem type name. + virtual const char *GetName() const = 0; + +public: + NonLinearEigenvalueSolver(MPI_Comm comm, int print); + + // The linear solver will be configured to compute the action of T(σ)⁻¹ + // where σ is the current eigenvalue estimate. + void SetLinearSolver(ComplexKspSolver &ksp) override; + + // Set the projection operator for enforcing the divergence-free constraint. + void SetDivFreeProjector(const DivFreeSolver &divfree) override; + + // Set optional B matrix used for weighted inner products. This must be set explicitly + // even for generalized problems, otherwise the identity will be used. + void SetBMat(const Operator &B) override; + + // Get scaling factors used by the solver. + double GetScalingGamma() const override { return gamma; } + double GetScalingDelta() const override { return delta; } + + // Set the number of required eigenmodes. + void SetNumModes(int num_eig, int num_vec = 0) override; + + // Set solver tolerance. + void SetTol(double tol) override; + + // Set maximum number of Arnoldi update iterations. + void SetMaxIter(int max_it) override; + + // Set target spectrum for the eigensolver. When a spectral transformation is used, this + // applies to the spectrum of the shifted operator. + void SetWhichEigenpairs(WhichType type) override; + + // Set shift-and-invert spectral transformation. + void SetShiftInvert(std::complex s, bool precond = false) override; + + // Set an initial vector for the solution subspace. + void SetInitialSpace(const ComplexVector &v) override; + + // Solve the eigenvalue problem. Returns the number of converged eigenvalues. + int Solve() override = 0; + + // Get the corresponding eigenvalue. + std::complex GetEigenvalue(int i) const override; + + // Get the corresponding eigenvector. Eigenvectors are normalized such that ||x||₂ = 1, + // unless the B-matrix is set for weighted inner products. + void GetEigenvector(int i, ComplexVector &x) const override; + + // Get the corresponding eigenpair error. + double GetError(int i, ErrorType type) const override; + + // Re-normalize the given number of eigenvectors, for example if the matrix B for weighted + // inner products has changed. This does not perform re-orthogonalization with respect to + // the new matrix, only normalization. + void RescaleEigenvectors(int num_eig) override; +}; + +// Quasi-Newton nonlinear eigenvalue solver for (K + λ C + λ² M + A2(λ)) x = 0. +class QuasiNewtonSolver : public NonLinearEigenvalueSolver +{ +private: + // References to matrices defining the nonlinear eigenvalue problem + // (not owned). + const ComplexOperator *opK, *opC, *opM; + + // Operators used in the iterative linear solver. + std::unique_ptr opA2, opA, opP; + + // Function to compute the A2 operator. + std::optional(double)>> funcA2; + + // Function to compute the preconditioner matrix. + std::optional( + std::complex, std::complex, std::complex, double)>> + funcP; + + // Linear eigenvalue solver used to set initial guess. + std::unique_ptr linear_eigensolver_; + + // Number of eigenmode initial guesses. + int nev_linear; + + // Operator norms for scaling. + mutable double normK, normC, normM; + + // Update frequency of the preconditioner during Newton iterations. + int preconditioner_lag; + + // Update tolerance of the preconditioner (no update below tol). 
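These two members implement the lagged-preconditioner policy used in Solve(): refresh only every preconditioner_lag iterations, and skip the refresh once the residual has already dropped below preconditioner_tol. The condition reduces to a simple predicate (sketch only; the free-function name is illustrative):

// Whether to refresh the lagged preconditioner at Newton iteration it: only every lag
// iterations, and never once the residual is below tol. This mirrors the condition
// (it > 0 && it % preconditioner_lag == 0 && res > preconditioner_tol) in Solve().
inline bool ShouldUpdatePreconditioner(int it, int lag, double res, double tol)
{
  return it > 0 && it % lag == 0 && res > tol;
}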
+ double preconditioner_tol; + + // Maximum number of Newton attempts with the same initial guess. + int max_restart; + + // Refine linear eigenvalues with nonlinear Newton eigenvalue solver. + bool refine_nonlinear; + + // Set the initial guesses from the linear eigenvalue solver results. + void SetInitialGuess(); + +protected: + double GetResidualNorm(std::complex l, const ComplexVector &x, + ComplexVector &r) const override; + + double GetBackwardScaling(std::complex l) const override; + + const char *GetName() const override { return "QuasiNewton"; } + +public: + QuasiNewtonSolver(MPI_Comm comm, std::unique_ptr linear_eigensolver, + int num_conv, int print, bool refine); + + using NonLinearEigenvalueSolver::SetOperators; + void SetOperators(const ComplexOperator &K, const ComplexOperator &M, + ScaleType type) override; + void SetOperators(const ComplexOperator &K, const ComplexOperator &C, + const ComplexOperator &M, ScaleType type) override; + + // Set the frequency-dependent A2 matrix function. + void SetExtraSystemMatrix( + std::function(double)>) override; + + // Set the preconditioner update function. + void SetPreconditionerUpdate(std::function( + std::complex, std::complex, + std::complex, double)>) override; + + // Set the update frequency of the preconditioner. + void SetPreconditionerLag(int preconditioner_update_freq, + double preconditioner_update_tol); + + // Set the maximum number of restarts with the same initial guess. + void SetMaxRestart(int max_num_restart); + + // Solve the nonlinear eigenvalue problem. + int Solve() override; + + MPI_Comm GetComm() const override { return comm; } +}; + +// +// Interpolation operators to approximate the nonlinear A2 operator. +// +class Interpolation +{ +public: + Interpolation() = default; + virtual ~Interpolation() = default; + virtual void Interpolate(const std::complex sigma_min, + const std::complex sigma_max) = 0; + virtual std::unique_ptr GetInterpolationOperator(int order) const = 0; + virtual void Mult(int order, const ComplexVector &x, ComplexVector &y) const = 0; + virtual void AddMult(int order, const ComplexVector &x, ComplexVector &y, + std::complex a = 1.0) const = 0; +}; + +// Newton polynomial interpolation to approximate the nonlinear A2 operator. +class NewtonInterpolationOperator : public Interpolation +{ +private: + // Function to compute the A2 operator. + std::function(double)> funcA2; + + // Number of points used in the interpolation (currently always second order). + int num_points = 3; + + // Interpolation points. + std::vector> points; + + // Monomial basis coefficients. + std::vector>> coeffs; + + // Divided difference operators. + std::vector>> ops; + + // Workspace objects for solver application. + mutable ComplexVector rhs; + +public: + NewtonInterpolationOperator( + std::function(double)> funcA2, const int size); + + // Interpolate the A2 matrix between sigma_min and sigma_max with a Newton polynomial. + void Interpolate(const std::complex sigma_min, + const std::complex sigma_max); + + // Get the interpolation operator of specified order. + std::unique_ptr GetInterpolationOperator(int order) const; + + // Perform multiplication with interpolation operator of specified order. 
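Mult and AddMult, declared next, apply the operator-valued monomial coefficient of a given order, obtained from the Newton (divided-difference) form via elementary symmetric polynomials. A scalar sketch of that conversion, with plain numbers standing in for the divided-difference operators (all names here are illustrative):

#include <complex>
#include <cstdio>
#include <vector>

using Cx = std::complex<double>;

// Elementary symmetric polynomial e_k(x_0, ..., x_{n-1}), used to expand the Newton basis
//   N_j(x) = prod_{m<j} (x - x_m) = sum_k (-1)^{j-k} e_{j-k}(x_0, ..., x_{j-1}) x^k.
Cx ElemSym(const std::vector<Cx> &x, int k, int n)
{
  if (k == 0)
  {
    return 1.0;
  }
  if (k > n || k < 0 || n == 0)
  {
    return 0.0;
  }
  return ElemSym(x, k, n - 1) + x[n - 1] * ElemSym(x, k - 1, n - 1);
}

int main()
{
  // Interpolate a quadratic on three points and recover its monomial coefficients.
  auto f = [](Cx x) { return 2.0 + 3.0 * x + 0.5 * x * x; };
  const std::vector<Cx> pts = {Cx(0.0), Cx(1.0), Cx(2.0)};
  // Divided differences d_j = f[x_0, ..., x_j] (the scalar analogue of ops[k][0]).
  std::vector<Cx> d = {f(pts[0]), (f(pts[1]) - f(pts[0])) / (pts[1] - pts[0])};
  const Cx d12 = (f(pts[2]) - f(pts[1])) / (pts[2] - pts[1]);
  d.push_back((d12 - d[1]) / (pts[2] - pts[0]));
  // Monomial coefficient of x^k: sum_j (-1)^{j-k} e_{j-k}(x_0, ..., x_{j-1}) d_j,
  // matching the coeffs[k][j] table built in Interpolate().
  for (int k = 0; k < 3; k++)
  {
    Cx ck = 0.0;
    for (int j = k; j < 3; j++)
    {
      const double sign = ((j - k) % 2 == 0) ? 1.0 : -1.0;
      ck += sign * ElemSym(pts, j - k, j) * d[j];
    }
    std::printf("c_%d = %g\n", k, ck.real());  // should print 2, 3, 0.5
  }
  return 0;
}

Running this should print 2, 3 and 0.5, the monomial coefficients of the quadratic used as test data; in the patch the divided differences are operators assembled with BuildParSumOperator rather than scalars.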
+ void Mult(int order, const ComplexVector &x, ComplexVector &y) const; + void AddMult(int order, const ComplexVector &x, ComplexVector &y, + std::complex a = 1.0) const; +}; + +} // namespace palace + +#endif // PALACE_LINALG_NLEPS_HPP diff --git a/palace/linalg/operator.cpp b/palace/linalg/operator.cpp index a463d36871..001a371059 100644 --- a/palace/linalg/operator.cpp +++ b/palace/linalg/operator.cpp @@ -1,632 +1,679 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "operator.hpp" - -#include -#include "linalg/slepc.hpp" -#include "utils/communication.hpp" - -namespace palace -{ - -bool ComplexOperator::IsReal() const -{ - MFEM_ABORT("IsReal() is not implemented for base class ComplexOperator!"); - return false; -} - -bool ComplexOperator::IsImag() const -{ - MFEM_ABORT("IsImag() is not implemented for base class ComplexOperator!"); - return false; -} - -bool ComplexOperator::HasReal() const -{ - MFEM_ABORT("HasReal() is not implemented for base class ComplexOperator!"); - return false; -} - -bool ComplexOperator::HasImag() const -{ - MFEM_ABORT("HasImag() is not implemented for base class ComplexOperator!"); - return false; -} - -const Operator *ComplexOperator::Real() const -{ - MFEM_ABORT("Real() is not implemented for base class ComplexOperator!"); - return nullptr; -} - -Operator *ComplexOperator::Real() -{ - MFEM_ABORT("Real() is not implemented for base class ComplexOperator!"); - return nullptr; -} - -const Operator *ComplexOperator::Imag() const -{ - MFEM_ABORT("Imag() is not implemented for base class ComplexOperator!"); - return nullptr; -} - -Operator *ComplexOperator::Imag() -{ - MFEM_ABORT("Imag() is not implemented for base class ComplexOperator!"); - return nullptr; -} - -void ComplexOperator::MultTranspose(const ComplexVector &x, ComplexVector &y) const -{ - MFEM_ABORT("Base class ComplexOperator does not implement MultTranspose!"); -} - -void ComplexOperator::MultHermitianTranspose(const ComplexVector &x, ComplexVector &y) const -{ - MFEM_ABORT("Base class ComplexOperator does not implement MultHermitianTranspose!"); -} - -void ComplexOperator::AddMult(const ComplexVector &x, ComplexVector &y, - const std::complex a) const -{ - MFEM_ABORT("Base class ComplexOperator does not implement AddMult!"); -} - -void ComplexOperator::AddMultTranspose(const ComplexVector &x, ComplexVector &y, - const std::complex a) const -{ - MFEM_ABORT("Base class ComplexOperator does not implement AddMultTranspose!"); -} - -void ComplexOperator::AddMultHermitianTranspose(const ComplexVector &x, ComplexVector &y, - const std::complex a) const -{ - MFEM_ABORT("Base class ComplexOperator does not implement AddMultHermitianTranspose!"); -} - -ComplexWrapperOperator::ComplexWrapperOperator(std::unique_ptr &&dAr, - std::unique_ptr &&dAi, - Operator *pAr, Operator *pAi) - : ComplexOperator(), data_Ar(std::move(dAr)), data_Ai(std::move(dAi)), - Ar((data_Ar != nullptr) ? data_Ar.get() : pAr), - Ai((data_Ai != nullptr) ? data_Ai.get() : pAi) -{ - MFEM_VERIFY(Ar || Ai, "Cannot construct ComplexWrapperOperator from an empty matrix!"); - MFEM_VERIFY((!Ar || !Ai) || (Ar->Height() == Ai->Height() && Ar->Width() == Ai->Width()), - "Mismatch in dimension of real and imaginary matrix parts!"); - height = Ar ? Ar->Height() : Ai->Height(); - width = Ar ? 
Ar->Width() : Ai->Width(); -} - -ComplexWrapperOperator::ComplexWrapperOperator(std::unique_ptr &&Ar, - std::unique_ptr &&Ai) - : ComplexWrapperOperator(std::move(Ar), std::move(Ai), nullptr, nullptr) -{ -} - -ComplexWrapperOperator::ComplexWrapperOperator(Operator *Ar, Operator *Ai) - : ComplexWrapperOperator(nullptr, nullptr, Ar, Ai) -{ -} - -void ComplexWrapperOperator::Mult(const ComplexVector &x, ComplexVector &y) const -{ - constexpr bool zero_real = false; - constexpr bool zero_imag = false; - const Vector &xr = x.Real(); - const Vector &xi = x.Imag(); - Vector &yr = y.Real(); - Vector &yi = y.Imag(); - if (Ar) - { - if (!zero_real) - { - Ar->Mult(xr, yr); - } - if (!zero_imag) - { - Ar->Mult(xi, yi); - } - } - else - { - yr = 0.0; - yi = 0.0; - } - if (Ai) - { - if (!zero_imag) - { - Ai->AddMult(xi, yr, -1.0); - } - if (!zero_real) - { - Ai->AddMult(xr, yi, 1.0); - } - } -} - -void ComplexWrapperOperator::MultTranspose(const ComplexVector &x, ComplexVector &y) const -{ - constexpr bool zero_real = false; - constexpr bool zero_imag = false; - const Vector &xr = x.Real(); - const Vector &xi = x.Imag(); - Vector &yr = y.Real(); - Vector &yi = y.Imag(); - if (Ar) - { - if (!zero_real) - { - Ar->MultTranspose(xr, yr); - } - if (!zero_imag) - { - Ar->MultTranspose(xi, yi); - } - } - else - { - yr = 0.0; - yi = 0.0; - } - if (Ai) - { - if (!zero_imag) - { - Ai->AddMultTranspose(xi, yr, -1.0); - } - if (!zero_real) - { - Ai->AddMultTranspose(xr, yi, 1.0); - } - } -} - -void ComplexWrapperOperator::MultHermitianTranspose(const ComplexVector &x, - ComplexVector &y) const -{ - constexpr bool zero_real = false; - constexpr bool zero_imag = false; - const Vector &xr = x.Real(); - const Vector &xi = x.Imag(); - Vector &yr = y.Real(); - Vector &yi = y.Imag(); - if (Ar) - { - if (!zero_real) - { - Ar->MultTranspose(xr, yr); - } - if (!zero_imag) - { - Ar->MultTranspose(xi, yi); - } - } - else - { - yr = 0.0; - yi = 0.0; - } - if (Ai) - { - if (!zero_imag) - { - Ai->AddMultTranspose(xi, yr, 1.0); - } - if (!zero_real) - { - Ai->AddMultTranspose(xr, yi, -1.0); - } - } -} - -void ComplexWrapperOperator::AddMult(const ComplexVector &x, ComplexVector &y, - const std::complex a) const -{ - constexpr bool zero_real = false; - constexpr bool zero_imag = false; - const Vector &xr = x.Real(); - const Vector &xi = x.Imag(); - Vector &yr = y.Real(); - Vector &yi = y.Imag(); - if (a.real() != 0.0 && a.imag() != 0.0) - { - ty.SetSize(height); - Mult(x, ty); - const int N = height; - const double ar = a.real(); - const double ai = a.imag(); - const auto *TYR = ty.Real().Read(); - const auto *TYI = ty.Imag().Read(); - auto *YR = yr.ReadWrite(); - auto *YI = yi.ReadWrite(); - mfem::forall(N, - [=] MFEM_HOST_DEVICE(int i) - { - YR[i] += ar * TYR[i] - ai * TYI[i]; - YI[i] += ai * TYR[i] + ar * TYI[i]; - }); - } - else if (a.real() != 0.0) - { - if (Ar) - { - if (!zero_real) - { - Ar->AddMult(xr, yr, a.real()); - } - if (!zero_imag) - { - Ar->AddMult(xi, yi, a.real()); - } - } - if (Ai) - { - if (!zero_imag) - { - Ai->AddMult(xi, yr, -a.real()); - } - if (!zero_real) - { - Ai->AddMult(xr, yi, a.real()); - } - } - } - else if (a.imag() != 0.0) - { - if (Ar) - { - if (!zero_real) - { - Ar->AddMult(xr, yi, a.imag()); - } - if (!zero_imag) - { - Ar->AddMult(xi, yr, -a.imag()); - } - } - if (Ai) - { - if (!zero_imag) - { - Ai->AddMult(xi, yi, -a.imag()); - } - if (!zero_real) - { - Ai->AddMult(xr, yr, -a.imag()); - } - } - } -} - -void ComplexWrapperOperator::AddMultTranspose(const ComplexVector &x, ComplexVector &y, - 
const std::complex a) const -{ - constexpr bool zero_real = false; - constexpr bool zero_imag = false; - const Vector &xr = x.Real(); - const Vector &xi = x.Imag(); - Vector &yr = y.Real(); - Vector &yi = y.Imag(); - if (a.real() != 0.0 && a.imag() != 0.0) - { - tx.SetSize(width); - MultTranspose(x, tx); - const int N = width; - const double ar = a.real(); - const double ai = a.imag(); - const auto *TXR = tx.Real().Read(); - const auto *TXI = tx.Imag().Read(); - auto *YR = yr.ReadWrite(); - auto *YI = yi.ReadWrite(); - mfem::forall(N, - [=] MFEM_HOST_DEVICE(int i) - { - YR[i] += ar * TXR[i] - ai * TXI[i]; - YI[i] += ai * TXR[i] + ar * TXI[i]; - }); - } - else if (a.real() != 0.0) - { - if (Ar) - { - if (!zero_real) - { - Ar->AddMultTranspose(xr, yr, a.real()); - } - if (!zero_imag) - { - Ar->AddMultTranspose(xi, yi, a.real()); - } - } - if (Ai) - { - if (!zero_imag) - { - Ai->AddMultTranspose(xi, yr, -a.real()); - } - if (!zero_real) - { - Ai->AddMultTranspose(xr, yi, a.real()); - } - } - } - else if (a.imag() != 0.0) - { - if (Ar) - { - if (!zero_real) - { - Ar->AddMultTranspose(xr, yi, a.imag()); - } - if (!zero_imag) - { - Ar->AddMultTranspose(xi, yr, -a.imag()); - } - } - if (Ai) - { - if (!zero_imag) - { - Ai->AddMultTranspose(xi, yi, -a.imag()); - } - if (!zero_real) - { - Ai->AddMultTranspose(xr, yr, -a.imag()); - } - } - } -} - -void ComplexWrapperOperator::AddMultHermitianTranspose(const ComplexVector &x, - ComplexVector &y, - const std::complex a) const -{ - constexpr bool zero_real = false; - constexpr bool zero_imag = false; - const Vector &xr = x.Real(); - const Vector &xi = x.Imag(); - Vector &yr = y.Real(); - Vector &yi = y.Imag(); - if (a.real() != 0.0 && a.imag() != 0.0) - { - tx.SetSize(width); - MultHermitianTranspose(x, tx); - const int N = width; - const double ar = a.real(); - const double ai = a.imag(); - const auto *TXR = tx.Real().Read(); - const auto *TXI = tx.Imag().Read(); - auto *YR = yr.ReadWrite(); - auto *YI = yi.ReadWrite(); - mfem::forall(N, - [=] MFEM_HOST_DEVICE(int i) - { - YR[i] += ar * TXR[i] - ai * TXI[i]; - YI[i] += ai * TXR[i] + ar * TXI[i]; - }); - } - else if (a.real() != 0.0) - { - if (Ar) - { - if (!zero_real) - { - Ar->AddMultTranspose(xr, yr, a.real()); - } - if (!zero_imag) - { - Ar->AddMultTranspose(xi, yi, a.real()); - } - } - if (Ai) - { - if (!zero_imag) - { - Ai->AddMultTranspose(xi, yr, a.real()); - } - if (!zero_real) - { - Ai->AddMultTranspose(xr, yi, -a.real()); - } - } - } - else if (a.imag() != 0.0) - { - if (Ar) - { - if (!zero_real) - { - Ar->AddMultTranspose(xr, yi, a.imag()); - } - if (!zero_imag) - { - Ar->AddMultTranspose(xi, yr, -a.imag()); - } - } - if (Ai) - { - if (!zero_imag) - { - Ai->AddMultTranspose(xi, yi, a.imag()); - } - if (!zero_real) - { - Ai->AddMultTranspose(xr, yr, a.imag()); - } - } - } -} - -SumOperator::SumOperator(const Operator &op, double c) : Operator(op.Height(), op.Width()) -{ - AddOperator(op, c); -} - -void SumOperator::AddOperator(const Operator &op, double c) -{ - MFEM_VERIFY(op.Height() == height && op.Width() == width, - "Invalid Operator dimensions for SumOperator!"); - ops.emplace_back(&op, c); -} - -void SumOperator::Mult(const Vector &x, Vector &y) const -{ - if (ops.size() == 1) - { - ops.front().first->Mult(x, y); - if (ops.front().second != 1.0) - { - y *= ops.front().second; - } - return; - } - y = 0.0; - AddMult(x, y); -} - -void SumOperator::MultTranspose(const Vector &x, Vector &y) const -{ - if (ops.size() == 1) - { - ops.front().first->MultTranspose(x, y); - if (ops.front().second 
!= 1.0) - { - y *= ops.front().second; - } - return; - } - y = 0.0; - AddMultTranspose(x, y); -} - -void SumOperator::AddMult(const Vector &x, Vector &y, const double a) const -{ - for (const auto &[op, c] : ops) - { - op->AddMult(x, y, a * c); - } -} - -void SumOperator::AddMultTranspose(const Vector &x, Vector &y, const double a) const -{ - for (const auto &[op, c] : ops) - { - op->AddMultTranspose(x, y, a * c); - } -} - -template <> -void BaseDiagonalOperator::Mult(const Vector &x, Vector &y) const -{ - const int N = this->height; - const auto *D = d.Read(); - const auto *X = x.Read(); - auto *Y = y.Write(); - mfem::forall(N, [=] MFEM_HOST_DEVICE(int i) { Y[i] = D[i] * X[i]; }); -} - -template <> -void BaseDiagonalOperator::Mult(const ComplexVector &x, - ComplexVector &y) const -{ - const int N = this->height; - const auto *DR = d.Real().Read(); - const auto *DI = d.Imag().Read(); - const auto *XR = x.Real().Read(); - const auto *XI = x.Imag().Read(); - auto *YR = y.Real().Write(); - auto *YI = y.Imag().Write(); - mfem::forall(N, - [=] MFEM_HOST_DEVICE(int i) - { - YR[i] = DR[i] * XR[i] - DI[i] * XI[i]; - YI[i] = DI[i] * XR[i] + DR[i] * XI[i]; - }); -} - -template <> -void DiagonalOperatorHelper, - ComplexOperator>::MultHermitianTranspose(const ComplexVector &x, - ComplexVector &y) const -{ - const ComplexVector &d = - static_cast *>(this)->d; - const int N = this->height; - const auto *DR = d.Real().Read(); - const auto *DI = d.Imag().Read(); - const auto *XR = x.Real().Read(); - const auto *XI = x.Imag().Read(); - auto *YR = y.Real().Write(); - auto *YI = y.Imag().Write(); - mfem::forall(N, - [=] MFEM_HOST_DEVICE(int i) - { - YR[i] = DR[i] * XR[i] + DI[i] * XI[i]; - YI[i] = -DI[i] * XR[i] + DR[i] * XI[i]; - }); -} - -namespace linalg -{ - -double SpectralNorm(MPI_Comm comm, const Operator &A, bool sym, double tol, int max_it) -{ - ComplexWrapperOperator Ar(const_cast(&A), nullptr); // Non-owning constructor - return SpectralNorm(comm, Ar, sym, tol, max_it); -} - -double SpectralNorm(MPI_Comm comm, const ComplexOperator &A, bool herm, double tol, - int max_it) -{ - // XX TODO: Use ARPACK or SLEPc for this when configured. -#if defined(PALACE_WITH_SLEPC) - return slepc::GetMaxSingularValue(comm, A, herm, tol, max_it); -#else - // Power iteration loop: ||A||₂² = λₙ(Aᴴ A). - int it = 0; - double res = 0.0; - double l, l0 = 0.0; - ComplexVector u(A.Height()), v(A.Height()); - SetRandom(comm, u); - Normalize(comm, u); - while (it < max_it) - { - A.Mult(u, v); - if (herm) - { - u = v; - } - else - { - A.MultHermitianTranspose(v, u); - } - l = Normalize(comm, u); - if (it > 0) - { - res = std::abs(l - l0) / l0; - if (res < tol) - { - break; - } - } - l0 = l; - it++; - } - if (it >= max_it) - { - Mpi::Warning(comm, - "Power iteration did not converge in {:d} iterations, res = {:.3e}, " - "lambda = {:.3e}!\n", - it, res, l); - } - return herm ? l : std::sqrt(l); -#endif -} - -} // namespace linalg - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#include "operator.hpp" + +#include +#include "linalg/slepc.hpp" +#include "utils/communication.hpp" + +namespace palace +{ + +const Operator *ComplexOperator::Real() const +{ + MFEM_ABORT("Real() is not implemented for base class ComplexOperator!"); + return nullptr; +} + +const Operator *ComplexOperator::Imag() const +{ + MFEM_ABORT("Imag() is not implemented for base class ComplexOperator!"); + return nullptr; +} + +void ComplexOperator::AssembleDiagonal(ComplexVector &diag) const +{ + MFEM_ABORT("Base class ComplexOperator does not implement AssembleDiagonal!"); +} + +void ComplexOperator::MultTranspose(const ComplexVector &x, ComplexVector &y) const +{ + MFEM_ABORT("Base class ComplexOperator does not implement MultTranspose!"); +} + +void ComplexOperator::MultHermitianTranspose(const ComplexVector &x, ComplexVector &y) const +{ + MFEM_ABORT("Base class ComplexOperator does not implement MultHermitianTranspose!"); +} + +void ComplexOperator::AddMult(const ComplexVector &x, ComplexVector &y, + const std::complex a) const +{ + MFEM_ABORT("Base class ComplexOperator does not implement AddMult!"); +} + +void ComplexOperator::AddMultTranspose(const ComplexVector &x, ComplexVector &y, + const std::complex a) const +{ + MFEM_ABORT("Base class ComplexOperator does not implement AddMultTranspose!"); +} + +void ComplexOperator::AddMultHermitianTranspose(const ComplexVector &x, ComplexVector &y, + const std::complex a) const +{ + MFEM_ABORT("Base class ComplexOperator does not implement AddMultHermitianTranspose!"); +} + +ComplexWrapperOperator::ComplexWrapperOperator(std::unique_ptr &&dAr, + std::unique_ptr &&dAi, + const Operator *pAr, const Operator *pAi) + : ComplexOperator(), data_Ar(std::move(dAr)), data_Ai(std::move(dAi)), + Ar((data_Ar != nullptr) ? data_Ar.get() : pAr), + Ai((data_Ai != nullptr) ? data_Ai.get() : pAi) +{ + MFEM_VERIFY(Ar || Ai, "Cannot construct ComplexWrapperOperator from an empty matrix!"); + MFEM_VERIFY((!Ar || !Ai) || (Ar->Height() == Ai->Height() && Ar->Width() == Ai->Width()), + "Mismatch in dimension of real and imaginary matrix parts!"); + tx.UseDevice(true); + ty.UseDevice(true); + height = Ar ? Ar->Height() : Ai->Height(); + width = Ar ? 
Ar->Width() : Ai->Width(); +} + +ComplexWrapperOperator::ComplexWrapperOperator(std::unique_ptr &&Ar, + std::unique_ptr &&Ai) + : ComplexWrapperOperator(std::move(Ar), std::move(Ai), nullptr, nullptr) +{ +} + +ComplexWrapperOperator::ComplexWrapperOperator(const Operator *Ar, const Operator *Ai) + : ComplexWrapperOperator(nullptr, nullptr, Ar, Ai) +{ +} + +void ComplexWrapperOperator::AssembleDiagonal(ComplexVector &diag) const +{ + diag = 0.0; + if (Ar) + { + Ar->AssembleDiagonal(diag.Real()); + } + if (Ai) + { + Ai->AssembleDiagonal(diag.Imag()); + } +} + +void ComplexWrapperOperator::Mult(const ComplexVector &x, ComplexVector &y) const +{ + constexpr bool zero_real = false; + constexpr bool zero_imag = false; + const Vector &xr = x.Real(); + const Vector &xi = x.Imag(); + Vector &yr = y.Real(); + Vector &yi = y.Imag(); + if (Ai) + { + if (!zero_imag) + { + Ai->Mult(xi, yr); + yr *= -1.0; + } + if (!zero_real) + { + Ai->Mult(xr, yi); + } + } + else + { + yr = 0.0; + yi = 0.0; + } + if (Ar) + { + if (!zero_real) + { + Ar->AddMult(xr, yr); + } + if (!zero_imag) + { + Ar->AddMult(xi, yi); + } + } +} + +void ComplexWrapperOperator::MultTranspose(const ComplexVector &x, ComplexVector &y) const +{ + constexpr bool zero_real = false; + constexpr bool zero_imag = false; + const Vector &xr = x.Real(); + const Vector &xi = x.Imag(); + Vector &yr = y.Real(); + Vector &yi = y.Imag(); + if (Ai) + { + if (!zero_imag) + { + Ai->MultTranspose(xi, yr); + yr *= -1.0; + } + if (!zero_real) + { + Ai->MultTranspose(xr, yi); + } + } + else + { + yr = 0.0; + yi = 0.0; + } + if (Ar) + { + if (!zero_real) + { + Ar->AddMultTranspose(xr, yr); + } + if (!zero_imag) + { + Ar->AddMultTranspose(xi, yi); + } + } +} + +void ComplexWrapperOperator::MultHermitianTranspose(const ComplexVector &x, + ComplexVector &y) const +{ + constexpr bool zero_real = false; + constexpr bool zero_imag = false; + const Vector &xr = x.Real(); + const Vector &xi = x.Imag(); + Vector &yr = y.Real(); + Vector &yi = y.Imag(); + if (Ai) + { + if (!zero_imag) + { + Ai->MultTranspose(xi, yr); + } + if (!zero_real) + { + Ai->MultTranspose(xr, yi); + yi *= -1.0; + } + } + else + { + yr = 0.0; + yi = 0.0; + } + if (Ar) + { + if (!zero_real) + { + Ar->AddMultTranspose(xr, yr); + } + if (!zero_imag) + { + Ar->AddMultTranspose(xi, yi); + } + } +} + +void ComplexWrapperOperator::AddMult(const ComplexVector &x, ComplexVector &y, + const std::complex a) const +{ + constexpr bool zero_real = false; + constexpr bool zero_imag = false; + const Vector &xr = x.Real(); + const Vector &xi = x.Imag(); + Vector &yr = y.Real(); + Vector &yi = y.Imag(); + if (a.real() != 0.0 && a.imag() != 0.0) + { + ty.SetSize(height); + Mult(x, ty); + y.AXPY(a, ty); + } + else if (a.real() != 0.0) + { + if (Ar) + { + if (!zero_real) + { + Ar->AddMult(xr, yr, a.real()); + } + if (!zero_imag) + { + Ar->AddMult(xi, yi, a.real()); + } + } + if (Ai) + { + if (!zero_imag) + { + Ai->AddMult(xi, yr, -a.real()); + } + if (!zero_real) + { + Ai->AddMult(xr, yi, a.real()); + } + } + } + else if (a.imag() != 0.0) + { + if (Ar) + { + if (!zero_real) + { + Ar->AddMult(xr, yi, a.imag()); + } + if (!zero_imag) + { + Ar->AddMult(xi, yr, -a.imag()); + } + } + if (Ai) + { + if (!zero_imag) + { + Ai->AddMult(xi, yi, -a.imag()); + } + if (!zero_real) + { + Ai->AddMult(xr, yr, -a.imag()); + } + } + } +} + +void ComplexWrapperOperator::AddMultTranspose(const ComplexVector &x, ComplexVector &y, + const std::complex a) const +{ + constexpr bool zero_real = false; + constexpr bool zero_imag = false; + const 
Vector &xr = x.Real(); + const Vector &xi = x.Imag(); + Vector &yr = y.Real(); + Vector &yi = y.Imag(); + if (a.real() != 0.0 && a.imag() != 0.0) + { + tx.SetSize(width); + MultTranspose(x, tx); + y.AXPY(a, tx); + } + else if (a.real() != 0.0) + { + if (Ar) + { + if (!zero_real) + { + Ar->AddMultTranspose(xr, yr, a.real()); + } + if (!zero_imag) + { + Ar->AddMultTranspose(xi, yi, a.real()); + } + } + if (Ai) + { + if (!zero_imag) + { + Ai->AddMultTranspose(xi, yr, -a.real()); + } + if (!zero_real) + { + Ai->AddMultTranspose(xr, yi, a.real()); + } + } + } + else if (a.imag() != 0.0) + { + if (Ar) + { + if (!zero_real) + { + Ar->AddMultTranspose(xr, yi, a.imag()); + } + if (!zero_imag) + { + Ar->AddMultTranspose(xi, yr, -a.imag()); + } + } + if (Ai) + { + if (!zero_imag) + { + Ai->AddMultTranspose(xi, yi, -a.imag()); + } + if (!zero_real) + { + Ai->AddMultTranspose(xr, yr, -a.imag()); + } + } + } +} + +void ComplexWrapperOperator::AddMultHermitianTranspose(const ComplexVector &x, + ComplexVector &y, + const std::complex a) const +{ + constexpr bool zero_real = false; + constexpr bool zero_imag = false; + const Vector &xr = x.Real(); + const Vector &xi = x.Imag(); + Vector &yr = y.Real(); + Vector &yi = y.Imag(); + if (a.real() != 0.0 && a.imag() != 0.0) + { + tx.SetSize(width); + MultHermitianTranspose(x, tx); + y.AXPY(a, tx); + } + else if (a.real() != 0.0) + { + if (Ar) + { + if (!zero_real) + { + Ar->AddMultTranspose(xr, yr, a.real()); + } + if (!zero_imag) + { + Ar->AddMultTranspose(xi, yi, a.real()); + } + } + if (Ai) + { + if (!zero_imag) + { + Ai->AddMultTranspose(xi, yr, a.real()); + } + if (!zero_real) + { + Ai->AddMultTranspose(xr, yi, -a.real()); + } + } + } + else if (a.imag() != 0.0) + { + if (Ar) + { + if (!zero_real) + { + Ar->AddMultTranspose(xr, yi, a.imag()); + } + if (!zero_imag) + { + Ar->AddMultTranspose(xi, yr, -a.imag()); + } + } + if (Ai) + { + if (!zero_imag) + { + Ai->AddMultTranspose(xi, yi, a.imag()); + } + if (!zero_real) + { + Ai->AddMultTranspose(xr, yr, a.imag()); + } + } + } +} + +SumOperator::SumOperator(const Operator &op, double a) : Operator(op.Height(), op.Width()) +{ + AddOperator(op, a); + z.UseDevice(true); +} + +void SumOperator::AddOperator(const Operator &op, double a) +{ + MFEM_VERIFY(op.Height() == height && op.Width() == width, + "Invalid Operator dimensions for SumOperator!"); + ops.emplace_back(&op, a); +} + +void SumOperator::Mult(const Vector &x, Vector &y) const +{ + if (ops.size() == 1) + { + ops.front().first->Mult(x, y); + if (ops.front().second != 1.0) + { + y *= ops.front().second; + } + return; + } + y = 0.0; + AddMult(x, y); +} + +void SumOperator::MultTranspose(const Vector &x, Vector &y) const +{ + if (ops.size() == 1) + { + ops.front().first->MultTranspose(x, y); + if (ops.front().second != 1.0) + { + y *= ops.front().second; + } + return; + } + y = 0.0; + AddMultTranspose(x, y); +} + +void SumOperator::AddMult(const Vector &x, Vector &y, const double a) const +{ + z.SetSize(y.Size()); + for (const auto &[op, c] : ops) + { + op->Mult(x, z); + y.Add(a * c, z); + } +} + +void SumOperator::AddMultTranspose(const Vector &x, Vector &y, const double a) const +{ + z.SetSize(y.Size()); + for (const auto &[op, c] : ops) + { + op->MultTranspose(x, z); + y.Add(a * c, z); + } +} + +template <> +void BaseDiagonalOperator::Mult(const Vector &x, Vector &y) const +{ + const bool use_dev = x.UseDevice() || y.UseDevice(); + const int N = this->height; + const auto *D = d.Read(use_dev); + const auto *X = x.Read(use_dev); + auto *Y = 
y.Write(use_dev); + mfem::forall_switch(use_dev, N, [=] MFEM_HOST_DEVICE(int i) { Y[i] = D[i] * X[i]; }); +} + +template <> +void BaseDiagonalOperator::Mult(const ComplexVector &x, + ComplexVector &y) const +{ + const bool use_dev = x.UseDevice() || y.UseDevice(); + const int N = this->height; + const auto *DR = d.Real().Read(use_dev); + const auto *DI = d.Imag().Read(use_dev); + const auto *XR = x.Real().Read(use_dev); + const auto *XI = x.Imag().Read(use_dev); + auto *YR = y.Real().Write(use_dev); + auto *YI = y.Imag().Write(use_dev); + mfem::forall_switch(use_dev, N, + [=] MFEM_HOST_DEVICE(int i) + { + YR[i] = DR[i] * XR[i] - DI[i] * XI[i]; + YI[i] = DI[i] * XR[i] + DR[i] * XI[i]; + }); +} + +template <> +void BaseDiagonalOperator::AddMult(const Vector &x, Vector &y, + const double a) const +{ + const bool use_dev = x.UseDevice() || y.UseDevice(); + const int N = this->height; + const auto *D = d.Read(use_dev); + const auto *X = x.Read(use_dev); + auto *Y = y.Write(use_dev); + mfem::forall_switch(use_dev, N, [=] MFEM_HOST_DEVICE(int i) { Y[i] += a * D[i] * X[i]; }); +} + +template <> +void BaseDiagonalOperator::AddMult(const ComplexVector &x, + ComplexVector &y, + const std::complex a) const +{ + const bool use_dev = x.UseDevice() || y.UseDevice(); + const int N = this->height; + const double ar = a.real(); + const double ai = a.imag(); + const auto *DR = d.Real().Read(use_dev); + const auto *DI = d.Imag().Read(use_dev); + const auto *XR = x.Real().Read(use_dev); + const auto *XI = x.Imag().Read(use_dev); + auto *YR = y.Real().Write(use_dev); + auto *YI = y.Imag().Write(use_dev); + mfem::forall_switch(use_dev, N, + [=] MFEM_HOST_DEVICE(int i) + { + const auto tr = DR[i] * XR[i] - DI[i] * XI[i]; + const auto ti = DI[i] * XR[i] + DR[i] * XI[i]; + YR[i] += ar * tr - ai * ti; + YI[i] += ai * ti + ar * ti; + }); +} + +template <> +void DiagonalOperatorHelper, + ComplexOperator>::MultHermitianTranspose(const ComplexVector &x, + ComplexVector &y) const +{ + const ComplexVector &d = + static_cast *>(this)->d; + const bool use_dev = x.UseDevice() || y.UseDevice(); + const int N = this->height; + const auto *DR = d.Real().Read(use_dev); + const auto *DI = d.Imag().Read(use_dev); + const auto *XR = x.Real().Read(use_dev); + const auto *XI = x.Imag().Read(use_dev); + auto *YR = y.Real().Write(use_dev); + auto *YI = y.Imag().Write(use_dev); + mfem::forall_switch(use_dev, N, + [=] MFEM_HOST_DEVICE(int i) + { + YR[i] = DR[i] * XR[i] + DI[i] * XI[i]; + YI[i] = -DI[i] * XR[i] + DR[i] * XI[i]; + }); +} + +template <> +void DiagonalOperatorHelper, ComplexOperator>:: + AddMultHermitianTranspose(const ComplexVector &x, ComplexVector &y, + const std::complex a) const +{ + const ComplexVector &d = + static_cast *>(this)->d; + const bool use_dev = x.UseDevice() || y.UseDevice(); + const int N = this->height; + const double ar = a.real(); + const double ai = a.imag(); + const auto *DR = d.Real().Read(use_dev); + const auto *DI = d.Imag().Read(use_dev); + const auto *XR = x.Real().Read(use_dev); + const auto *XI = x.Imag().Read(use_dev); + auto *YR = y.Real().Write(use_dev); + auto *YI = y.Imag().Write(use_dev); + mfem::forall_switch(use_dev, N, + [=] MFEM_HOST_DEVICE(int i) + { + const auto tr = DR[i] * XR[i] + DI[i] * XI[i]; + const auto ti = -DI[i] * XR[i] + DR[i] * XI[i]; + YR[i] += ar * tr - ai * ti; + YI[i] += ai * ti + ar * ti; + }); +} + +namespace linalg +{ + +template <> +double Norml2(MPI_Comm comm, const Vector &x, const Operator &B, Vector &Bx) +{ + B.Mult(x, Bx); + double dot = Dot(comm, Bx, x); + 
MFEM_ASSERT(dot > 0.0, + "Non-positive vector norm in normalization (dot = " << dot << ")!"); + return std::sqrt(dot); +} + +template <> +double Norml2(MPI_Comm comm, const ComplexVector &x, const Operator &B, ComplexVector &Bx) +{ + // For SPD B, xᴴ B x is real. + B.Mult(x.Real(), Bx.Real()); + B.Mult(x.Imag(), Bx.Imag()); + std::complex dot = Dot(comm, Bx, x); + MFEM_ASSERT(dot.real() > 0.0 && std::abs(dot.imag()) < 1.0e-9 * dot.real(), + "Non-positive vector norm in normalization (dot = " << dot << ")!"); + return std::sqrt(dot.real()); +} + +double SpectralNorm(MPI_Comm comm, const Operator &A, bool sym, double tol, int max_it) +{ + ComplexWrapperOperator Ar(const_cast(&A), nullptr); // Non-owning constructor + return SpectralNorm(comm, Ar, sym, tol, max_it); +} + +double SpectralNorm(MPI_Comm comm, const ComplexOperator &A, bool herm, double tol, + int max_it) +{ + // XX TODO: Use ARPACK or SLEPc for this when configured. +#if defined(PALACE_WITH_SLEPC) + return slepc::GetMaxSingularValue(comm, A, herm, tol, max_it); +#else + // Power iteration loop: ||A||₂² = λₙ(Aᴴ A). + int it = 0; + double res = 0.0; + double l = 0.0, l0 = 0.0; + ComplexVector u(A.Height()), v(A.Height()); + u.UseDevice(true); + v.UseDevice(true); + SetRandom(comm, u); + Normalize(comm, u); + while (it < max_it) + { + A.Mult(u, v); + if (herm) + { + u = v; + } + else + { + A.MultHermitianTranspose(v, u); + } + l = Normalize(comm, u); + if (it > 0) + { + res = std::abs(l - l0) / l0; + if (res < tol) + { + break; + } + } + l0 = l; + it++; + } + if (it >= max_it) + { + Mpi::Warning(comm, + "Power iteration did not converge in {:d} iterations, res = {:.3e}, " + "lambda = {:.3e}!\n", + it, res, l); + } + return herm ? l : std::sqrt(l); +#endif +} + +} // namespace linalg + +} // namespace palace diff --git a/palace/linalg/operator.hpp b/palace/linalg/operator.hpp index 9ba7cec897..9742252a19 100644 --- a/palace/linalg/operator.hpp +++ b/palace/linalg/operator.hpp @@ -1,330 +1,397 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LINALG_OPERATOR_HPP -#define PALACE_LINALG_OPERATOR_HPP - -#include -#include -#include -#include -#include -#include "linalg/vector.hpp" - -namespace palace -{ - -// -// Functionality extending mfem::Operator from MFEM. -// - -// Abstract base class for complex-valued operators. -class ComplexOperator -{ -protected: - // The size of the complex-valued operator. - int height, width; - -public: - ComplexOperator(int s = 0) : height(s), width(s) {} - ComplexOperator(int h, int w) : height(h), width(w) {} - virtual ~ComplexOperator() = default; - - // Get the height (size of output) of the operator. - int Height() const { return height; } - - // Get the width (size of input) of the operator. - int Width() const { return width; } - - // Test whether or not the operator is purely real or imaginary. - virtual bool IsReal() const; - virtual bool IsImag() const; - - // Test whether or not we can access the real and imaginary operator parts. - virtual bool HasReal() const; - virtual bool HasImag() const; - - // Get access to the real and imaginary operator parts. - virtual const Operator *Real() const; - virtual Operator *Real(); - virtual const Operator *Imag() const; - virtual Operator *Imag(); - - // Operator application. 
- virtual void Mult(const ComplexVector &x, ComplexVector &y) const = 0; - - virtual void MultTranspose(const ComplexVector &x, ComplexVector &y) const; - - virtual void MultHermitianTranspose(const ComplexVector &x, ComplexVector &y) const; - - virtual void AddMult(const ComplexVector &x, ComplexVector &y, - const std::complex a = 1.0) const; - - virtual void AddMultTranspose(const ComplexVector &x, ComplexVector &y, - const std::complex a = 1.0) const; - - virtual void AddMultHermitianTranspose(const ComplexVector &x, ComplexVector &y, - const std::complex a = 1.0) const; -}; - -// A complex-valued operator represented using a block 2 x 2 equivalent-real formulation: -// [ yr ] = [ Ar -Ai ] [ xr ] -// [ yi ] [ Ai Ar ] [ xi ] . -class ComplexWrapperOperator : public ComplexOperator -{ -private: - // Storage and access for real and imaginary parts of the operator. - std::unique_ptr data_Ar, data_Ai; - Operator *Ar, *Ai; - - // Temporary storage for operator application. - mutable ComplexVector tx, ty; - - ComplexWrapperOperator(std::unique_ptr &&dAr, std::unique_ptr &&dAi, - Operator *pAr, Operator *pAi); - -public: - // Construct a complex operator which inherits ownership of the input real and imaginary - // parts. - ComplexWrapperOperator(std::unique_ptr &&Ar, std::unique_ptr &&Ai); - - // Non-owning constructor. - ComplexWrapperOperator(Operator *Ar, Operator *Ai); - - bool IsReal() const override { return Ai == nullptr; } - bool IsImag() const override { return Ar == nullptr; } - bool HasReal() const override { return Ar != nullptr; } - bool HasImag() const override { return Ai != nullptr; } - const Operator *Real() const override { return Ar; } - Operator *Real() override { return Ar; } - const Operator *Imag() const override { return Ai; } - Operator *Imag() override { return Ai; } - - void Mult(const ComplexVector &x, ComplexVector &y) const override; - - void MultTranspose(const ComplexVector &x, ComplexVector &y) const override; - - void MultHermitianTranspose(const ComplexVector &x, ComplexVector &y) const override; - - void AddMult(const ComplexVector &x, ComplexVector &y, - const std::complex a = 1.0) const override; - - void AddMultTranspose(const ComplexVector &x, ComplexVector &y, - const std::complex a = 1.0) const override; - - void AddMultHermitianTranspose(const ComplexVector &x, ComplexVector &y, - const std::complex a = 1.0) const override; -}; - -// Wrap a sequence of operators of the same dimensions and optional coefficients. -class SumOperator : public Operator -{ -private: - std::vector> ops; - -public: - SumOperator(int s) : Operator(s) {} - SumOperator(int h, int w) : Operator(h, w) {} - SumOperator(const Operator &op, double c = 1.0); - - void AddOperator(const Operator &op, double c = 1.0); - - void Mult(const Vector &x, Vector &y) const override; - - void MultTranspose(const Vector &x, Vector &y) const override; - - void AddMult(const Vector &x, Vector &y, const double a = 1.0) const override; - - void AddMultTranspose(const Vector &x, Vector &y, const double a = 1.0) const override; -}; - -// Wraps two operators such that: (AB)ᵀ = BᵀAᵀ and, for complex symmetric operators, the -// Hermitian transpose operation is (AB)ᴴ = BᴴAᴴ. 
-template -class ProductOperatorHelper : public OperType -{ -}; - -template -class ProductOperatorHelper : public Operator -{ -public: - ProductOperatorHelper(int h, int w) : Operator(h, w) {} -}; - -template -class ProductOperatorHelper : public ComplexOperator -{ -public: - ProductOperatorHelper(int h, int w) : ComplexOperator(h, w) {} - void MultHermitianTranspose(const ComplexVector &x, ComplexVector &y) const override - { - const ComplexOperator &A = static_cast(this)->A; - const ComplexOperator &B = static_cast(this)->B; - ComplexVector &z = static_cast(this)->z; - A.MultHermitianTranspose(x, z); - B.MultHermitianTranspose(z, y); - } -}; - -template -class BaseProductOperator - : public ProductOperatorHelper, OperType> -{ - friend class ProductOperatorHelper, OperType>; - - using VecType = typename std::conditional::value, - ComplexVector, Vector>::type; - -private: - const OperType &A, &B; - mutable VecType z; - -public: - BaseProductOperator(const OperType &A, const OperType &B) - : ProductOperatorHelper, OperType>(A.Height(), B.Width()), - A(A), B(B), z(B.Height()) - { - } - - void Mult(const VecType &x, VecType &y) const override - { - B.Mult(x, z); - A.Mult(z, y); - } - - void MultTranspose(const VecType &x, VecType &y) const override - { - A.MultTranspose(x, z); - B.MultTranspose(z, y); - } -}; - -using ProductOperator = BaseProductOperator; -using ComplexProductOperator = BaseProductOperator; - -// Applies the simple, symmetric but not necessarily Hermitian, operator: diag(d). -template -class DiagonalOperatorHelper : public OperType -{ -}; - -template -class DiagonalOperatorHelper : public Operator -{ -public: - DiagonalOperatorHelper(int s) : Operator(s) {} -}; - -template -class DiagonalOperatorHelper : public ComplexOperator -{ -public: - DiagonalOperatorHelper(int s) : ComplexOperator(s) {} - void MultHermitianTranspose(const ComplexVector &x, ComplexVector &y) const override; -}; - -template -class BaseDiagonalOperator - : public DiagonalOperatorHelper, OperType> -{ - friend class DiagonalOperatorHelper, OperType>; - - using VecType = typename std::conditional::value, - ComplexVector, Vector>::type; - -private: - const VecType &d; - -public: - BaseDiagonalOperator(const VecType &d) - : DiagonalOperatorHelper, OperType>(d.Size()), d(d) - { - } - - void Mult(const VecType &x, VecType &y) const override; - - void MultTranspose(const VecType &x, VecType &y) const override { Mult(x, y); } -}; - -using DiagonalOperator = BaseDiagonalOperator; -using ComplexDiagonalOperator = BaseDiagonalOperator; - -// A container for a sequence of operators corresponding to a multigrid hierarchy. -// Optionally includes operators for the auxiliary space at each level as well. The -// Operators are stored from coarsest to finest level. The height and width of this operator -// are never set. 
-template -class BaseMultigridOperator : public OperType -{ - using VecType = typename std::conditional::value, - ComplexVector, Vector>::type; - -private: - std::vector> ops, aux_ops; - -public: - BaseMultigridOperator(std::size_t l) : OperType(0) - { - ops.reserve(l); - aux_ops.reserve(l); - } - - void AddOperator(std::unique_ptr &&op) - { - ops.push_back(std::move(op)); - this->height = ops.back()->Height(); - this->width = ops.back()->Width(); - } - - void AddAuxiliaryOperator(std::unique_ptr &&aux_op) - { - aux_ops.push_back(std::move(aux_op)); - } - - bool HasAuxiliaryOperators() const { return !aux_ops.empty(); } - - auto GetNumLevels() const { return ops.size(); } - auto GetNumAuxiliaryLevels() const { return aux_ops.size(); } - - const OperType &GetFinestOperator() const { return *ops.back(); } - const OperType &GetFinestAuxiliaryOperator() const { return *aux_ops.back(); } - - const OperType &GetOperatorAtLevel(std::size_t l) const - { - MFEM_ASSERT(l >= 0 && l < GetNumLevels(), - "Out of bounds multigrid level operator requested!"); - return *ops[l]; - } - const OperType &GetAuxiliaryOperatorAtLevel(std::size_t l) const - { - MFEM_ASSERT(l < GetNumAuxiliaryLevels(), - "Out of bounds multigrid level auxiliary operator requested!"); - return *aux_ops[l]; - } - - void Mult(const VecType &x, VecType &y) const override { GetFinestOperator().Mult(x, y); } - void MultTranspose(const VecType &x, VecType &y) const override - { - GetFinestOperator().MultTranspose(x, y); - } -}; - -using MultigridOperator = BaseMultigridOperator; -using ComplexMultigridOperator = BaseMultigridOperator; - -namespace linalg -{ - -// Estimate operator 2-norm (spectral norm) using power iteration. Assumes the operator is -// not symmetric or Hermitian unless specified. -double SpectralNorm(MPI_Comm comm, const Operator &A, bool sym = false, double tol = 1.0e-4, - int max_it = 1000); -double SpectralNorm(MPI_Comm comm, const ComplexOperator &A, bool herm = false, - double tol = 1.0e-4, int max_it = 1000); - -} // namespace linalg - -} // namespace palace - -#endif // PALACE_LINALG_OPERATOR_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LINALG_OPERATOR_HPP +#define PALACE_LINALG_OPERATOR_HPP + +#include +#include +#include +#include +#include +#include "linalg/vector.hpp" + +namespace palace +{ + +// +// Functionality extending mfem::Operator from MFEM. +// + +using Operator = mfem::Operator; + +// Abstract base class for complex-valued operators. +class ComplexOperator +{ +protected: + // The size of the complex-valued operator. + int height, width; + +public: + ComplexOperator(int s = 0) : height(s), width(s) {} + ComplexOperator(int h, int w) : height(h), width(w) {} + virtual ~ComplexOperator() = default; + + // Get the height (size of output) of the operator. + int Height() const { return height; } + + // Get the width (size of input) of the operator. + int Width() const { return width; } + + // Test whether or not the operator is purely real or imaginary. + virtual bool IsReal() const { return !Imag(); } + virtual bool IsImag() const { return !Real(); } + + // Get access to the real and imaginary operator parts separately (may be empty if + // operator is purely real or imaginary). + virtual const Operator *Real() const; + virtual const Operator *Imag() const; + + // Diagonal assembly. + virtual void AssembleDiagonal(ComplexVector &diag) const; + + // Operator application. 
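The complex operator application declared below is realized elsewhere in this patch, by ComplexWrapperOperator, through the equivalent-real formulation. A quick Eigen check of that identity, i.e. that yr = Ar xr - Ai xi and yi = Ai xr + Ar xi reproduce (Ar + i Ai)(xr + i xi):

#include <Eigen/Dense>
#include <complex>
#include <iostream>

int main()
{
  // Equivalent-real form of y = (Ar + i Ai)(xr + i xi):
  //   yr = Ar xr - Ai xi,  yi = Ai xr + Ar xi,
  // which is the combination ComplexWrapperOperator::Mult assembles from the real parts.
  using Cx = std::complex<double>;
  const Eigen::MatrixXd Ar = Eigen::MatrixXd::Random(4, 4), Ai = Eigen::MatrixXd::Random(4, 4);
  const Eigen::VectorXd xr = Eigen::VectorXd::Random(4), xi = Eigen::VectorXd::Random(4);
  const Eigen::MatrixXcd A = Ar.cast<Cx>() + Cx(0.0, 1.0) * Ai.cast<Cx>();
  const Eigen::VectorXcd x = xr.cast<Cx>() + Cx(0.0, 1.0) * xi.cast<Cx>();
  const Eigen::VectorXcd y = A * x;
  const Eigen::VectorXd yr = Ar * xr - Ai * xi, yi = Ai * xr + Ar * xi;
  std::cout << (y.real() - yr).norm() + (y.imag() - yi).norm() << "\n";  // ~0
  return 0;
}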
+ virtual void Mult(const ComplexVector &x, ComplexVector &y) const = 0; + + virtual void MultTranspose(const ComplexVector &x, ComplexVector &y) const; + + virtual void MultHermitianTranspose(const ComplexVector &x, ComplexVector &y) const; + + virtual void AddMult(const ComplexVector &x, ComplexVector &y, + const std::complex a = 1.0) const; + + virtual void AddMultTranspose(const ComplexVector &x, ComplexVector &y, + const std::complex a = 1.0) const; + + virtual void AddMultHermitianTranspose(const ComplexVector &x, ComplexVector &y, + const std::complex a = 1.0) const; +}; + +// A complex-valued operator represented using a block 2 x 2 equivalent-real formulation: +// [ yr ] = [ Ar -Ai ] [ xr ] +// [ yi ] [ Ai Ar ] [ xi ] . +class ComplexWrapperOperator : public ComplexOperator +{ +private: + // Storage and access for real and imaginary parts of the operator. + std::unique_ptr data_Ar, data_Ai; + const Operator *Ar, *Ai; + + // Temporary storage for operator application. + mutable ComplexVector tx, ty; + + ComplexWrapperOperator(std::unique_ptr &&dAr, std::unique_ptr &&dAi, + const Operator *pAr, const Operator *pAi); + +public: + // Construct a complex operator which inherits ownership of the input real and imaginary + // parts. + ComplexWrapperOperator(std::unique_ptr &&Ar, std::unique_ptr &&Ai); + + // Non-owning constructor. + ComplexWrapperOperator(const Operator *Ar, const Operator *Ai); + + const Operator *Real() const override { return Ar; } + const Operator *Imag() const override { return Ai; } + + void AssembleDiagonal(ComplexVector &diag) const override; + + void Mult(const ComplexVector &x, ComplexVector &y) const override; + + void MultTranspose(const ComplexVector &x, ComplexVector &y) const override; + + void MultHermitianTranspose(const ComplexVector &x, ComplexVector &y) const override; + + void AddMult(const ComplexVector &x, ComplexVector &y, + const std::complex a = 1.0) const override; + + void AddMultTranspose(const ComplexVector &x, ComplexVector &y, + const std::complex a = 1.0) const override; + + void AddMultHermitianTranspose(const ComplexVector &x, ComplexVector &y, + const std::complex a = 1.0) const override; +}; + +// Wrap a sequence of operators of the same dimensions and optional coefficients. +class SumOperator : public Operator +{ +private: + std::vector> ops; + mutable Vector z; + +public: + SumOperator(int s) : Operator(s) { z.UseDevice(true); } + SumOperator(int h, int w) : Operator(h, w) { z.UseDevice(true); } + SumOperator(const Operator &op, double a = 1.0); + + void AddOperator(const Operator &op, double a = 1.0); + + void Mult(const Vector &x, Vector &y) const override; + + void MultTranspose(const Vector &x, Vector &y) const override; + + void AddMult(const Vector &x, Vector &y, const double a = 1.0) const override; + + void AddMultTranspose(const Vector &x, Vector &y, const double a = 1.0) const override; +}; + +// Wraps two operators such that: (AB)ᵀ = BᵀAᵀ and, for complex symmetric operators, the +// Hermitian transpose operation is (AB)ᴴ = BᴴAᴴ. 
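A quick numerical check of the identity stated in the comment above, (AB)ᴴ x = Bᴴ(Aᴴ x), which is exactly the order of application used by the ProductOperatorHelper specializations that follow (Eigen is used only for the check):

#include <Eigen/Dense>
#include <iostream>

int main()
{
  // (A B)^H x = B^H (A^H x): apply A^H first, then B^H, the order used by
  // MultHermitianTranspose in the product operator.
  const Eigen::MatrixXcd A = Eigen::MatrixXcd::Random(4, 3), B = Eigen::MatrixXcd::Random(3, 5);
  const Eigen::VectorXcd x = Eigen::VectorXcd::Random(4);
  const Eigen::VectorXcd y1 = (A * B).adjoint() * x;
  const Eigen::VectorXcd y2 = B.adjoint() * (A.adjoint() * x);
  std::cout << (y1 - y2).norm() << "\n";  // ~0 up to round-off
  return 0;
}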
+template +class ProductOperatorHelper : public OperType +{ +}; + +template +class ProductOperatorHelper : public Operator +{ +public: + ProductOperatorHelper(int h, int w) : Operator(h, w) {} +}; + +template +class ProductOperatorHelper : public ComplexOperator +{ +public: + ProductOperatorHelper(int h, int w) : ComplexOperator(h, w) {} + + void MultHermitianTranspose(const ComplexVector &x, ComplexVector &y) const override + { + const ComplexOperator &A = static_cast(this)->A; + const ComplexOperator &B = static_cast(this)->B; + ComplexVector &z = static_cast(this)->z; + A.MultHermitianTranspose(x, z); + B.MultHermitianTranspose(z, y); + } + + void AddMultHermitianTranspose(const ComplexVector &x, ComplexVector &y, + const std::complex a = 1.0) const override + { + const ComplexOperator &A = static_cast(this)->A; + const ComplexOperator &B = static_cast(this)->B; + ComplexVector &z = static_cast(this)->z; + A.MultHermitianTranspose(x, z); + B.AddMultHermitianTranspose(z, y, a); + } +}; + +template +class BaseProductOperator + : public ProductOperatorHelper, OperType> +{ + friend class ProductOperatorHelper, OperType>; + + using VecType = typename std::conditional::value, + ComplexVector, Vector>::type; + using ScalarType = + typename std::conditional::value, + std::complex, double>::type; + +private: + const OperType &A, &B; + mutable VecType z; + +public: + BaseProductOperator(const OperType &A, const OperType &B) + : ProductOperatorHelper, OperType>(A.Height(), B.Width()), + A(A), B(B), z(B.Height()) + { + z.UseDevice(true); + } + + void Mult(const VecType &x, VecType &y) const override + { + B.Mult(x, z); + A.Mult(z, y); + } + + void MultTranspose(const VecType &x, VecType &y) const override + { + A.MultTranspose(x, z); + B.MultTranspose(z, y); + } + + void AddMult(const VecType &x, VecType &y, const ScalarType a = 1.0) const override + { + B.Mult(x, z); + A.AddMult(z, y, a); + } + + void AddMultTranspose(const VecType &x, VecType &y, + const ScalarType a = 1.0) const override + { + A.MultTranspose(x, z); + B.AddMultTranspose(z, y, a); + } +}; + +using ProductOperator = BaseProductOperator; +using ComplexProductOperator = BaseProductOperator; + +// Applies the simple, symmetric but not necessarily Hermitian, operator: diag(d). 
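To make the diag(d) comment above concrete, here is a plain C++ reference for the entry-wise action (not the actual device implementation). All containers are assumed to have the same length; the operator is symmetric, and the Hermitian transpose only differs by conjugating d.

```cpp
// Reference semantics sketch: y = diag(d) x, or y = diag(conj(d)) x for the Hermitian
// transpose, which matters only when d has a nonzero imaginary part.
#include <complex>
#include <cstddef>
#include <vector>

void DiagMultReference(const std::vector<std::complex<double>> &d,
                       const std::vector<std::complex<double>> &x,
                       std::vector<std::complex<double>> &y, bool hermitian)
{
  for (std::size_t i = 0; i < d.size(); i++)
  {
    y[i] = (hermitian ? std::conj(d[i]) : d[i]) * x[i];
  }
}
```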
+template +class DiagonalOperatorHelper : public OperType +{ +}; + +template +class DiagonalOperatorHelper : public Operator +{ +public: + DiagonalOperatorHelper(int s) : Operator(s) {} +}; + +template +class DiagonalOperatorHelper : public ComplexOperator +{ +public: + DiagonalOperatorHelper(int s) : ComplexOperator(s) {} + + void MultHermitianTranspose(const ComplexVector &x, ComplexVector &y) const override; + + void AddMultHermitianTranspose(const ComplexVector &x, ComplexVector &y, + const std::complex a = 1.0) const override; +}; + +template +class BaseDiagonalOperator + : public DiagonalOperatorHelper, OperType> +{ + friend class DiagonalOperatorHelper, OperType>; + + using VecType = typename std::conditional::value, + ComplexVector, Vector>::type; + using ScalarType = + typename std::conditional::value, + std::complex, double>::type; + +private: + const VecType &d; + +public: + BaseDiagonalOperator(const VecType &d) + : DiagonalOperatorHelper, OperType>(d.Size()), d(d) + { + } + + void Mult(const VecType &x, VecType &y) const override; + + void MultTranspose(const VecType &x, VecType &y) const override { Mult(x, y); } + + void AddMult(const VecType &x, VecType &y, const ScalarType a = 1.0) const override; + + void AddMultTranspose(const VecType &x, VecType &y, + const ScalarType a = 1.0) const override + { + AddMult(x, y, a); + } +}; + +using DiagonalOperator = BaseDiagonalOperator; +using ComplexDiagonalOperator = BaseDiagonalOperator; + +// A container for a sequence of operators corresponding to a multigrid hierarchy. +// Optionally includes operators for the auxiliary space at each level as well. The +// Operators are stored from coarsest to finest level. The height and width of this operator +// are never set. +template +class BaseMultigridOperator : public OperType +{ + using VecType = typename std::conditional::value, + ComplexVector, Vector>::type; + using ScalarType = + typename std::conditional::value, + std::complex, double>::type; + +private: + std::vector> ops, aux_ops; + +public: + BaseMultigridOperator(std::size_t l) : OperType(0) + { + ops.reserve(l); + aux_ops.reserve(l); + } + + void AddOperator(std::unique_ptr &&op) + { + ops.push_back(std::move(op)); + this->height = ops.back()->Height(); + this->width = ops.back()->Width(); + } + + void AddAuxiliaryOperator(std::unique_ptr &&aux_op) + { + aux_ops.push_back(std::move(aux_op)); + } + + bool HasAuxiliaryOperators() const { return !aux_ops.empty(); } + auto GetNumLevels() const { return ops.size(); } + auto GetNumAuxiliaryLevels() const { return aux_ops.size(); } + + const OperType &GetFinestOperator() const { return *ops.back(); } + const OperType &GetFinestAuxiliaryOperator() const { return *aux_ops.back(); } + + const OperType &GetOperatorAtLevel(std::size_t l) const + { + MFEM_ASSERT(l < GetNumLevels(), "Out of bounds multigrid level operator requested!"); + return *ops[l]; + } + const OperType &GetAuxiliaryOperatorAtLevel(std::size_t l) const + { + MFEM_ASSERT(l < GetNumAuxiliaryLevels(), + "Out of bounds multigrid level auxiliary operator requested!"); + return *aux_ops[l]; + } + + void Mult(const VecType &x, VecType &y) const override { GetFinestOperator().Mult(x, y); } + + void MultTranspose(const VecType &x, VecType &y) const override + { + GetFinestOperator().MultTranspose(x, y); + } + + void AddMult(const VecType &x, VecType &y, const ScalarType a = 1.0) const override + { + GetFinestOperator().AddMult(x, y, a); + } + + void AddMultTranspose(const VecType &x, VecType &y, + const ScalarType a = 1.0) const 
override + { + GetFinestOperator().AddMultTranspose(x, y, a); + } +}; + +using MultigridOperator = BaseMultigridOperator; +using ComplexMultigridOperator = BaseMultigridOperator; + +namespace linalg +{ + +// Calculate the vector norm with respect to an SPD matrix B. +template +double Norml2(MPI_Comm comm, const VecType &x, const Operator &B, VecType &Bx); + +// Normalize the vector with respect to an SPD matrix B. +template +inline double Normalize(MPI_Comm comm, VecType &x, const Operator &B, VecType &Bx) +{ + double norm = Norml2(comm, x, B, Bx); + MFEM_ASSERT(norm > 0.0, "Zero vector norm in normalization!"); + x *= 1.0 / norm; + return norm; +} + +// Estimate operator 2-norm (spectral norm) using power iteration. Assumes the operator is +// not symmetric or Hermitian unless specified. +double SpectralNorm(MPI_Comm comm, const Operator &A, bool sym = false, double tol = 1.0e-4, + int max_it = 1000); +double SpectralNorm(MPI_Comm comm, const ComplexOperator &A, bool herm = false, + double tol = 1.0e-4, int max_it = 1000); + +} // namespace linalg + +} // namespace palace + +#endif // PALACE_LINALG_OPERATOR_HPP diff --git a/palace/linalg/orthog.hpp b/palace/linalg/orthog.hpp index aded6ebd68..7a442dc3f8 100644 --- a/palace/linalg/orthog.hpp +++ b/palace/linalg/orthog.hpp @@ -1,69 +1,69 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LINALG_ORTHOG_HPP -#define PALACE_LINALG_ORTHOG_HPP - -#include -#include "linalg/vector.hpp" -#include "utils/communication.hpp" - -namespace palace::linalg -{ - -// -// Orthogonalization functions for orthogonalizing a vector against a number of basis -// vectors using modified or classical Gram-Schmidt. -// - -template -inline void OrthogonalizeColumnMGS(MPI_Comm comm, const std::vector &V, VecType &w, - ScalarType *H, int m) -{ - MFEM_ASSERT(static_cast(m) <= V.size(), - "Out of bounds number of columns for MGS orthogonalization!"); - for (int j = 0; j < m; j++) - { - H[j] = linalg::Dot(comm, w, V[j]); // Global inner product - w.Add(-H[j], V[j]); - } -} - -template -inline void OrthogonalizeColumnCGS(MPI_Comm comm, const std::vector &V, VecType &w, - ScalarType *H, int m, bool refine = false) -{ - MFEM_ASSERT(static_cast(m) <= V.size(), - "Out of bounds number of columns for CGS orthogonalization!"); - if (m == 0) - { - return; - } - for (int j = 0; j < m; j++) - { - H[j] = w * V[j]; // Local inner product - } - Mpi::GlobalSum(m, H, comm); - for (int j = 0; j < m; j++) - { - w.Add(-H[j], V[j]); - } - if (refine) - { - std::vector dH(m); - for (int j = 0; j < m; j++) - { - dH[j] = w * V[j]; // Local inner product - } - Mpi::GlobalSum(m, dH.data(), comm); - for (int j = 0; j < m; j++) - { - H[j] += dH[j]; - w.Add(-dH[j], V[j]); - } - } -} - -} // namespace palace::linalg - -#endif // PALACE_LINALG_ORTHOG_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LINALG_ORTHOG_HPP +#define PALACE_LINALG_ORTHOG_HPP + +#include +#include "linalg/vector.hpp" +#include "utils/communication.hpp" + +namespace palace::linalg +{ + +// +// Orthogonalization functions for orthogonalizing a vector against a number of basis +// vectors using modified or classical Gram-Schmidt. 
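Before the templated routines that follow, a serial reference sketch of the modified Gram-Schmidt update (plain std::vector, no MPI). In the parallel versions below, each inner product is instead a global reduction over the communicator and the vectors live on the device.

```cpp
// Orthogonalize w against the first m columns of V, storing the projection coefficients
// in H (H must have size at least m; all vectors are assumed to have equal length).
#include <cstddef>
#include <numeric>
#include <vector>

void OrthogonalizeMGSReference(const std::vector<std::vector<double>> &V,
                               std::vector<double> &w, std::vector<double> &H, int m)
{
  for (int j = 0; j < m; j++)
  {
    // Inner product against the already-updated w (this is what makes it "modified").
    H[j] = std::inner_product(w.begin(), w.end(), V[j].begin(), 0.0);
    for (std::size_t i = 0; i < w.size(); i++)
    {
      w[i] -= H[j] * V[j][i];  // w <- w - H[j] V[j]
    }
  }
}
```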
+// + +template +inline void OrthogonalizeColumnMGS(MPI_Comm comm, const std::vector &V, VecType &w, + ScalarType *H, int m) +{ + MFEM_ASSERT(static_cast(m) <= V.size(), + "Out of bounds number of columns for MGS orthogonalization!"); + for (int j = 0; j < m; j++) + { + H[j] = linalg::Dot(comm, w, V[j]); // Global inner product + w.Add(-H[j], V[j]); + } +} + +template +inline void OrthogonalizeColumnCGS(MPI_Comm comm, const std::vector &V, VecType &w, + ScalarType *H, int m, bool refine = false) +{ + MFEM_ASSERT(static_cast(m) <= V.size(), + "Out of bounds number of columns for CGS orthogonalization!"); + if (m == 0) + { + return; + } + for (int j = 0; j < m; j++) + { + H[j] = w * V[j]; // Local inner product + } + Mpi::GlobalSum(m, H, comm); + for (int j = 0; j < m; j++) + { + w.Add(-H[j], V[j]); + } + if (refine) + { + std::vector dH(m); + for (int j = 0; j < m; j++) + { + dH[j] = w * V[j]; // Local inner product + } + Mpi::GlobalSum(m, dH.data(), comm); + for (int j = 0; j < m; j++) + { + H[j] += dH[j]; + w.Add(-dH[j], V[j]); + } + } +} + +} // namespace palace::linalg + +#endif // PALACE_LINALG_ORTHOG_HPP diff --git a/palace/linalg/petsc.hpp b/palace/linalg/petsc.hpp index b62f5e3fb6..b8c6a65cdc 100644 --- a/palace/linalg/petsc.hpp +++ b/palace/linalg/petsc.hpp @@ -1,36 +1,44 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LINALG_PETSC_HPP -#define PALACE_LINALG_PETSC_HPP - -#if defined(PALACE_WITH_SLEPC) - -#include -#include - -#if !defined(PETSC_USE_REAL_DOUBLE) -#error "PETSc should be compiled with double precision!" -#endif -#if defined(PETSC_HAVE_HYPRE) -#error \ - "PETSc should be built without Hypre to avoid conflicts with MFEM's Hypre dependency!" -#endif -#if defined(PETSC_USE_64BIT_INDICES) && !(defined(HYPRE_BIGINT) || defined(HYPRE_MIXEDINT)) -#warning "Mismatch between big HYPRE (32bit) and PETSc (64bit) integer types!" -#endif -#if !defined(PETSC_USE_64BIT_INDICES) && (defined(HYPRE_BIGINT) || defined(HYPRE_MIXEDINT)) -#warning "Mismatch between big HYPRE (64bit) and PETSc (32bit) integer types!" -#endif - -// Forward declarations of PETSc objects. -typedef struct _p_Vec *Vec; -typedef struct _p_Mat *Mat; - -// Error handling similar to Petsc's PetscCallAbort but always aborts on the global -// PETSC_COMM_WORLD communicator. -#define PalacePetscCall(...) PetscCallAbort(PETSC_COMM_WORLD, __VA_ARGS__) - -#endif - -#endif // PALACE_LINALG_PETSC_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LINALG_PETSC_HPP +#define PALACE_LINALG_PETSC_HPP + +#if defined(PALACE_WITH_SLEPC) + +#include +#include + +#if !defined(PETSC_USE_REAL_DOUBLE) +#error "PETSc should be compiled with double precision!" +#endif +#if defined(PETSC_HAVE_HYPRE) +#error \ + "PETSc should be built without Hypre to avoid conflicts with MFEM's Hypre dependency!" +#endif +#if defined(PETSC_USE_64BIT_INDICES) && !(defined(HYPRE_BIGINT) || defined(HYPRE_MIXEDINT)) +#warning "Mismatch between big HYPRE (32bit) and PETSc (64bit) integer types!" +#endif +#if !defined(PETSC_USE_64BIT_INDICES) && (defined(HYPRE_BIGINT) || defined(HYPRE_MIXEDINT)) +#warning "Mismatch between big HYPRE (64bit) and PETSc (32bit) integer types!" +#endif +#if (defined(PETSC_HAVE_CUDA) && !defined(MFEM_USE_CUDA)) || \ + (!defined(PETSC_HAVE_CUDA) && defined(MFEM_USE_CUDA)) +#error "Mismatch between MFEM and PETSc CUDA support!" 
+#endif +#if (defined(PETSC_HAVE_HIP) && !defined(MFEM_USE_HIP)) || \ + (!defined(PETSC_HAVE_HIP) && defined(MFEM_USE_HIP)) +#error "Mismatch between MFEM and PETSc HIP support!" +#endif + +// Forward declarations of PETSc objects. +typedef struct _p_Vec *Vec; +typedef struct _p_Mat *Mat; + +// Error handling similar to Petsc's PetscCallAbort but always aborts on the global +// PETSC_COMM_WORLD communicator. +#define PalacePetscCall(...) PetscCallAbort(PETSC_COMM_WORLD, __VA_ARGS__) + +#endif + +#endif // PALACE_LINALG_PETSC_HPP diff --git a/palace/linalg/rap.cpp b/palace/linalg/rap.cpp index 30ee392463..a56d8480ea 100644 --- a/palace/linalg/rap.cpp +++ b/palace/linalg/rap.cpp @@ -1,768 +1,974 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "rap.hpp" - -#include "fem/bilinearform.hpp" - -namespace palace -{ - -ParOperator::ParOperator(std::unique_ptr &&dA, Operator *pA, - const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - bool test_restrict) - : Operator(test_fespace.GetTrueVSize(), trial_fespace.GetTrueVSize()), - data_A(std::move(dA)), A((data_A != nullptr) ? data_A.get() : pA), - trial_fespace(trial_fespace), test_fespace(test_fespace), use_R(test_restrict), - dbc_tdof_list(nullptr), diag_policy(DiagonalPolicy::DIAG_ONE), RAP(nullptr) -{ - MFEM_VERIFY(A, "Cannot construct ParOperator from an empty matrix!"); - lx.SetSize(A->Width()); - ly.SetSize(A->Height()); - ty.SetSize(width); -} - -ParOperator::ParOperator(std::unique_ptr &&A, - const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - bool test_restrict) - : ParOperator(std::move(A), nullptr, trial_fespace, test_fespace, test_restrict) -{ -} - -ParOperator::ParOperator(Operator &A, const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - bool test_restrict) - : ParOperator(nullptr, &A, trial_fespace, test_fespace, test_restrict) -{ -} - -const Operator &ParOperator::LocalOperator() const -{ - MFEM_VERIFY(A, "No local matrix available for ParOperator::LocalOperator!"); - return *A; -} - -Operator &ParOperator::LocalOperator() -{ - MFEM_VERIFY(A, "No local matrix available for ParOperator::LocalOperator!"); - return *A; -} - -void ParOperator::SetEssentialTrueDofs(const mfem::Array &tdof_list, - DiagonalPolicy policy) -{ - MFEM_VERIFY(policy == DiagonalPolicy::DIAG_ONE || policy == DiagonalPolicy::DIAG_ZERO, - "Essential boundary condition true dof elimination for ParOperator supports " - "only DiagonalPolicy::DIAG_ONE or DiagonalPolicy::DIAG_ZERO!"); - MFEM_VERIFY(height == width, "Set essential true dofs for both test and trial spaces " - "for rectangular ParOperator!"); - dbc_tdof_list = &tdof_list; - diag_policy = policy; -} - -void ParOperator::AssembleDiagonal(Vector &diag) const -{ - if (RAP) - { - RAP->AssembleDiagonal(diag); - return; - } - - // For an AMR mesh, a convergent diagonal is assembled with |P|ᵀ dₗ, where |P| has - // entry-wise absolute values of the conforming prolongation operator. - MFEM_VERIFY(&trial_fespace == &test_fespace, - "Diagonal assembly is only available for square ParOperator!"); - if (const auto *sA = dynamic_cast(A)) - { - sA->GetDiag(ly); - } - else - { - A->AssembleDiagonal(ly); - } - - // Parallel assemble and eliminate essential true dofs. 
- const Operator *P = test_fespace.GetProlongationMatrix(); - if (const auto *hP = dynamic_cast(P)) - { - hP->AbsMultTranspose(1.0, ly, 0.0, diag); - } - else - { - P->MultTranspose(ly, diag); - } - if (dbc_tdof_list) - { - if (diag_policy == DiagonalPolicy::DIAG_ONE) - { - linalg::SetSubVector(diag, *dbc_tdof_list, 1.0); - } - else if (diag_policy == DiagonalPolicy::DIAG_ZERO) - { - linalg::SetSubVector(diag, *dbc_tdof_list, 0.0); - } - } -} - -mfem::HypreParMatrix &ParOperator::ParallelAssemble(bool skip_zeros) const -{ - if (RAP) - { - return *RAP; - } - - // Build the square or rectangular assembled HypreParMatrix. - auto *sA = dynamic_cast(A); - std::unique_ptr data_sA; - if (!sA) - { - auto *cA = dynamic_cast(A); - MFEM_VERIFY(cA, "ParOperator::ParallelAssemble requires A as an mfem::SparseMatrix or " - "ceed::Operator!"); - data_sA = use_R ? DiscreteLinearOperator::FullAssemble(*cA, skip_zeros) - : BilinearForm::FullAssemble(*cA, skip_zeros); - sA = data_sA.get(); - } - if (&trial_fespace == &test_fespace) - { - mfem::HypreParMatrix *hA = - new mfem::HypreParMatrix(trial_fespace.GetComm(), trial_fespace.GlobalVSize(), - trial_fespace.GetDofOffsets(), sA); - const mfem::HypreParMatrix *P = trial_fespace.Dof_TrueDof_Matrix(); - RAP = std::make_unique(hypre_ParCSRMatrixRAP(*P, *hA, *P), true); - delete hA; - } - else - { - mfem::HypreParMatrix *hA = new mfem::HypreParMatrix( - trial_fespace.GetComm(), test_fespace.GlobalVSize(), trial_fespace.GlobalVSize(), - test_fespace.GetDofOffsets(), trial_fespace.GetDofOffsets(), sA); - const mfem::HypreParMatrix *P = trial_fespace.Dof_TrueDof_Matrix(); - if (!use_R) - { - const mfem::HypreParMatrix *Rt = test_fespace.Dof_TrueDof_Matrix(); - RAP = - std::make_unique(hypre_ParCSRMatrixRAP(*Rt, *hA, *P), true); - } - else - { - mfem::SparseMatrix *sRt = mfem::Transpose(*test_fespace.GetRestrictionMatrix()); - mfem::HypreParMatrix *hRt = new mfem::HypreParMatrix( - test_fespace.GetComm(), test_fespace.GlobalVSize(), - test_fespace.GlobalTrueVSize(), test_fespace.GetDofOffsets(), - test_fespace.GetTrueDofOffsets(), sRt); - RAP = std::make_unique(hypre_ParCSRMatrixRAP(*hRt, *hA, *P), - true); - delete sRt; - delete hRt; - } - delete hA; - } - if (data_A) - { - // The local matrix is no longer needed now that we have the parallel-assembled one. - data_A.reset(); - A = nullptr; - } - hypre_ParCSRMatrixSetNumNonzeros(*RAP); - - // Eliminate boundary conditions on the assembled (square) matrix. - if (dbc_tdof_list) - { - RAP->EliminateBC(*dbc_tdof_list, diag_policy); - } - return *RAP; -} - -void ParOperator::EliminateRHS(const Vector &x, Vector &b) const -{ - if (!dbc_tdof_list) - { - return; - } - - MFEM_VERIFY(A, "No local matrix available for ParOperator::EliminateRHS!"); - ty = 0.0; - linalg::SetSubVector(ty, *dbc_tdof_list, x); - trial_fespace.GetProlongationMatrix()->Mult(ty, lx); - - // Apply the unconstrained operator. 
- A->Mult(lx, ly); - - RestrictionMatrixAddMult(ly, b, -1.0); - if (diag_policy == DiagonalPolicy::DIAG_ONE) - { - linalg::SetSubVector(b, *dbc_tdof_list, x); - } - else if (diag_policy == DiagonalPolicy::DIAG_ZERO) - { - linalg::SetSubVector(b, *dbc_tdof_list, 0.0); - } -} - -void ParOperator::Mult(const Vector &x, Vector &y) const -{ - MFEM_ASSERT(x.Size() == width && y.Size() == height, - "Incompatible dimensions for ParOperator::Mult!"); - if (RAP) - { - RAP->Mult(x, y); - return; - } - - if (dbc_tdof_list) - { - ty = x; - linalg::SetSubVector(ty, *dbc_tdof_list, 0.0); - trial_fespace.GetProlongationMatrix()->Mult(ty, lx); - } - else - { - trial_fespace.GetProlongationMatrix()->Mult(x, lx); - } - - // Apply the operator on the L-vector. - A->Mult(lx, ly); - - RestrictionMatrixMult(ly, y); - if (dbc_tdof_list) - { - if (diag_policy == DiagonalPolicy::DIAG_ONE) - { - linalg::SetSubVector(y, *dbc_tdof_list, x); - } - else if (diag_policy == DiagonalPolicy::DIAG_ZERO) - { - linalg::SetSubVector(y, *dbc_tdof_list, 0.0); - } - } -} - -void ParOperator::AddMult(const Vector &x, Vector &y, const double a) const -{ - MFEM_ASSERT(x.Size() == width && y.Size() == height, - "Incompatible dimensions for ParOperator::AddMult!"); - if (RAP) - { - RAP->AddMult(x, y, a); - return; - } - - if (dbc_tdof_list) - { - ty = x; - linalg::SetSubVector(ty, *dbc_tdof_list, 0.0); - trial_fespace.GetProlongationMatrix()->Mult(ty, lx); - } - else - { - trial_fespace.GetProlongationMatrix()->Mult(x, lx); - } - - // Apply the operator on the L-vector. - A->Mult(lx, ly); - - if (dbc_tdof_list) - { - RestrictionMatrixMult(ly, ty); - if (diag_policy == DiagonalPolicy::DIAG_ONE) - { - linalg::SetSubVector(ty, *dbc_tdof_list, x); - } - else if (diag_policy == DiagonalPolicy::DIAG_ZERO) - { - linalg::SetSubVector(ty, *dbc_tdof_list, 0.0); - } - y.Add(a, ty); - } - else - { - RestrictionMatrixAddMult(ly, y, a); - } -} - -void ParOperator::MultTranspose(const Vector &x, Vector &y) const -{ - MFEM_ASSERT(x.Size() == height && y.Size() == width, - "Incompatible dimensions for ParOperator::MultTranspose!"); - if (RAP) - { - RAP->MultTranspose(x, y); - return; - } - - if (dbc_tdof_list) - { - ty = x; - linalg::SetSubVector(ty, *dbc_tdof_list, 0.0); - RestrictionMatrixMultTranspose(ty, ly); - } - else - { - RestrictionMatrixMultTranspose(x, ly); - } - - // Apply the operator on the L-vector. - A->MultTranspose(ly, lx); - - trial_fespace.GetProlongationMatrix()->MultTranspose(lx, y); - if (dbc_tdof_list) - { - if (diag_policy == DiagonalPolicy::DIAG_ONE) - { - linalg::SetSubVector(y, *dbc_tdof_list, x); - } - else if (diag_policy == DiagonalPolicy::DIAG_ZERO) - { - linalg::SetSubVector(y, *dbc_tdof_list, 0.0); - } - } -} - -void ParOperator::AddMultTranspose(const Vector &x, Vector &y, const double a) const -{ - MFEM_ASSERT(x.Size() == height && y.Size() == width, - "Incompatible dimensions for ParOperator::AddMultTranspose!"); - if (RAP) - { - RAP->AddMultTranspose(x, y, a); - return; - } - - if (dbc_tdof_list) - { - ty = x; - linalg::SetSubVector(ty, *dbc_tdof_list, 0.0); - RestrictionMatrixMultTranspose(ty, ly); - } - else - { - RestrictionMatrixMultTranspose(x, ly); - } - - // Apply the operator on the L-vector. 
- A->MultTranspose(ly, lx); - - if (dbc_tdof_list) - { - trial_fespace.GetProlongationMatrix()->MultTranspose(lx, ty); - if (diag_policy == DiagonalPolicy::DIAG_ONE) - { - linalg::SetSubVector(ty, *dbc_tdof_list, x); - } - else if (diag_policy == DiagonalPolicy::DIAG_ZERO) - { - linalg::SetSubVector(ty, *dbc_tdof_list, 0.0); - } - y.Add(a, ty); - } - else - { - trial_fespace.GetProlongationMatrix()->AddMultTranspose(lx, y, a); - } -} - -void ParOperator::RestrictionMatrixMult(const Vector &ly, Vector &ty) const -{ - if (!use_R) - { - test_fespace.GetProlongationMatrix()->MultTranspose(ly, ty); - } - else - { - test_fespace.GetRestrictionMatrix()->Mult(ly, ty); - } -} - -void ParOperator::RestrictionMatrixAddMult(const Vector &ly, Vector &ty, - const double a) const -{ - if (!use_R) - { - test_fespace.GetProlongationMatrix()->AddMultTranspose(ly, ty, a); - } - else - { - test_fespace.GetRestrictionMatrix()->AddMult(ly, ty, a); - } -} - -void ParOperator::RestrictionMatrixMultTranspose(const Vector &ty, Vector &ly) const -{ - if (!use_R) - { - test_fespace.GetProlongationMatrix()->Mult(ty, ly); - } - else - { - test_fespace.GetRestrictionMatrix()->MultTranspose(ty, ly); - } -} - -ComplexParOperator::ComplexParOperator(std::unique_ptr &&dAr, - std::unique_ptr &&dAi, Operator *pAr, - Operator *pAi, - const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - bool test_restrict) - : ComplexOperator(test_fespace.GetTrueVSize(), trial_fespace.GetTrueVSize()), - data_A((dAr != nullptr || dAi != nullptr) - ? std::make_unique(std::move(dAr), std::move(dAi)) - : std::make_unique(pAr, pAi)), - A(data_A.get()), trial_fespace(trial_fespace), test_fespace(test_fespace), - use_R(test_restrict), dbc_tdof_list(nullptr), - diag_policy(Operator::DiagonalPolicy::DIAG_ONE), - RAPr(A->HasReal() - ? std::make_unique(*A->Real(), trial_fespace, test_fespace, use_R) - : nullptr), - RAPi(A->HasImag() - ? std::make_unique(*A->Imag(), trial_fespace, test_fespace, use_R) - : nullptr) -{ - // We use the non-owning constructors for real and imaginary part ParOperators. We know A - // is a ComplexWrapperOperator which has separate access to the real and imaginary - // components. 
- lx.SetSize(A->Width()); - ly.SetSize(A->Height()); - ty.SetSize(width); -} - -ComplexParOperator::ComplexParOperator(std::unique_ptr &&Ar, - std::unique_ptr &&Ai, - const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - bool test_restrict) - : ComplexParOperator(std::move(Ar), std::move(Ai), nullptr, nullptr, trial_fespace, - test_fespace, test_restrict) -{ -} - -ComplexParOperator::ComplexParOperator(Operator *Ar, Operator *Ai, - const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, - bool test_restrict) - : ComplexParOperator(nullptr, nullptr, Ar, Ai, trial_fespace, test_fespace, test_restrict) -{ -} - -const ComplexOperator &ComplexParOperator::LocalOperator() const -{ - MFEM_ASSERT(A, "No local matrix available for ComplexParOperator::LocalOperator!"); - return *A; -} - -ComplexOperator &ComplexParOperator::LocalOperator() -{ - MFEM_ASSERT(A, "No local matrix available for ComplexParOperator::LocalOperator!"); - return *A; -} - -void ComplexParOperator::SetEssentialTrueDofs(const mfem::Array &tdof_list, - Operator::DiagonalPolicy policy) -{ - MFEM_VERIFY(policy == Operator::DiagonalPolicy::DIAG_ONE || - policy == Operator::DiagonalPolicy::DIAG_ZERO, - "Essential boundary condition true dof elimination for ComplexParOperator " - "supports only DiagonalPolicy::DIAG_ONE or DiagonalPolicy::DIAG_ZERO!"); - MFEM_VERIFY( - policy != Operator::DiagonalPolicy::DIAG_ONE || RAPr, - "DiagonalPolicy::DIAG_ONE specified for ComplexParOperator with no real part!"); - MFEM_VERIFY(height == width, "Set essential true dofs for both test and trial spaces " - "for rectangular ComplexParOperator!"); - dbc_tdof_list = &tdof_list; - diag_policy = policy; - if (RAPr) - { - RAPr->SetEssentialTrueDofs(tdof_list, policy); - } - if (RAPi) - { - RAPi->SetEssentialTrueDofs(tdof_list, Operator::DiagonalPolicy::DIAG_ZERO); - } -} - -void ComplexParOperator::AssembleDiagonal(ComplexVector &diag) const -{ - diag = 0.0; - if (RAPr) - { - RAPr->AssembleDiagonal(diag.Real()); - } - if (RAPi) - { - RAPi->AssembleDiagonal(diag.Imag()); - } -} - -void ComplexParOperator::Mult(const ComplexVector &x, ComplexVector &y) const -{ - MFEM_ASSERT(x.Size() == width && y.Size() == height, - "Incompatible dimensions for ComplexParOperator::Mult!"); - if (dbc_tdof_list) - { - ty = x; - linalg::SetSubVector(ty, *dbc_tdof_list, 0.0); - trial_fespace.GetProlongationMatrix()->Mult(ty.Real(), lx.Real()); - trial_fespace.GetProlongationMatrix()->Mult(ty.Imag(), lx.Imag()); - } - else - { - trial_fespace.GetProlongationMatrix()->Mult(x.Real(), lx.Real()); - trial_fespace.GetProlongationMatrix()->Mult(x.Imag(), lx.Imag()); - } - - // Apply the operator on the L-vector. 
- A->Mult(lx, ly); - - RestrictionMatrixMult(ly, y); - if (dbc_tdof_list) - { - if (diag_policy == Operator::DiagonalPolicy::DIAG_ONE) - { - linalg::SetSubVector(y, *dbc_tdof_list, x); - } - else if (diag_policy == Operator::DiagonalPolicy::DIAG_ZERO) - { - linalg::SetSubVector(y, *dbc_tdof_list, 0.0); - } - } -} - -void ComplexParOperator::AddMult(const ComplexVector &x, ComplexVector &y, - const std::complex a) const -{ - MFEM_ASSERT(x.Size() == width && y.Size() == height, - "Incompatible dimensions for ComplexParOperator::AddMult!"); - if (dbc_tdof_list) - { - ty = x; - linalg::SetSubVector(ty, *dbc_tdof_list, 0.0); - trial_fespace.GetProlongationMatrix()->Mult(ty.Real(), lx.Real()); - trial_fespace.GetProlongationMatrix()->Mult(ty.Imag(), lx.Imag()); - } - else - { - trial_fespace.GetProlongationMatrix()->Mult(x.Real(), lx.Real()); - trial_fespace.GetProlongationMatrix()->Mult(x.Imag(), lx.Imag()); - } - - // Apply the operator on the L-vector. - ly = 0.0; - A->AddMult(lx, ly, a); - - if (dbc_tdof_list) - { - RestrictionMatrixMult(ly, ty); - if (diag_policy == Operator::DiagonalPolicy::DIAG_ONE) - { - linalg::SetSubVector(ty, *dbc_tdof_list, x); - } - else if (diag_policy == Operator::DiagonalPolicy::DIAG_ZERO) - { - linalg::SetSubVector(ty, *dbc_tdof_list, 0.0); - } - y += ty; - } - else - { - RestrictionMatrixAddMult(ly, y, 1.0); - } -} - -void ComplexParOperator::MultTranspose(const ComplexVector &x, ComplexVector &y) const -{ - MFEM_ASSERT(x.Size() == height && y.Size() == width, - "Incompatible dimensions for ComplexParOperator::MultTranspose!"); - if (dbc_tdof_list) - { - ty = x; - linalg::SetSubVector(ty, *dbc_tdof_list, 0.0); - RestrictionMatrixMultTranspose(ty, ly); - } - else - { - RestrictionMatrixMultTranspose(x, ly); - } - - // Apply the operator on the L-vector. - A->MultTranspose(ly, lx); - - trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Real(), y.Real()); - trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Imag(), y.Imag()); - if (dbc_tdof_list) - { - if (diag_policy == Operator::DiagonalPolicy::DIAG_ONE) - { - linalg::SetSubVector(y, *dbc_tdof_list, x); - } - else if (diag_policy == Operator::DiagonalPolicy::DIAG_ZERO) - { - linalg::SetSubVector(y, *dbc_tdof_list, 0.0); - } - } -} - -void ComplexParOperator::AddMultTranspose(const ComplexVector &x, ComplexVector &y, - const std::complex a) const -{ - MFEM_ASSERT(x.Size() == height && y.Size() == width, - "Incompatible dimensions for ComplexParOperator::AddMultTranspose!"); - if (dbc_tdof_list) - { - ty = x; - linalg::SetSubVector(ty, *dbc_tdof_list, 0.0); - RestrictionMatrixMultTranspose(ty, ly); - } - else - { - RestrictionMatrixMultTranspose(x, ly); - } - - // Apply the operator on the L-vector. 
- lx = 0.0; - A->AddMultTranspose(ly, lx, a); - - if (dbc_tdof_list) - { - trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Real(), ty.Real()); - trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Imag(), ty.Imag()); - if (diag_policy == Operator::DiagonalPolicy::DIAG_ONE) - { - linalg::SetSubVector(ty, *dbc_tdof_list, x); - } - else if (diag_policy == Operator::DiagonalPolicy::DIAG_ZERO) - { - linalg::SetSubVector(ty, *dbc_tdof_list, 0.0); - } - y += ty; - } - else - { - trial_fespace.GetProlongationMatrix()->AddMultTranspose(lx.Real(), y.Real()); - trial_fespace.GetProlongationMatrix()->AddMultTranspose(lx.Imag(), y.Imag()); - } -} - -void ComplexParOperator::MultHermitianTranspose(const ComplexVector &x, - ComplexVector &y) const -{ - MFEM_ASSERT(x.Size() == height && y.Size() == width, - "Incompatible dimensions for ComplexParOperator::MultHermitianTranspose!"); - if (dbc_tdof_list) - { - ty = x; - linalg::SetSubVector(ty, *dbc_tdof_list, 0.0); - RestrictionMatrixMultTranspose(ty, ly); - } - else - { - RestrictionMatrixMultTranspose(x, ly); - } - - // Apply the operator on the L-vector. - A->MultHermitianTranspose(ly, lx); - - trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Real(), y.Real()); - trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Imag(), y.Imag()); - if (dbc_tdof_list) - { - if (diag_policy == Operator::DiagonalPolicy::DIAG_ONE) - { - linalg::SetSubVector(y, *dbc_tdof_list, x); - } - else if (diag_policy == Operator::DiagonalPolicy::DIAG_ZERO) - { - linalg::SetSubVector(y, *dbc_tdof_list, 0.0); - } - } -} - -void ComplexParOperator::AddMultHermitianTranspose(const ComplexVector &x, ComplexVector &y, - const std::complex a) const -{ - MFEM_ASSERT(x.Size() == height && y.Size() == width, - "Incompatible dimensions for ComplexParOperator::AddMultHermitianTranspose!"); - if (dbc_tdof_list) - { - ty = x; - linalg::SetSubVector(ty, *dbc_tdof_list, 0.0); - RestrictionMatrixMultTranspose(ty, ly); - } - else - { - RestrictionMatrixMultTranspose(x, ly); - } - - // Apply the operator on the L-vector. 
- lx = 0.0; - A->AddMultHermitianTranspose(ly, lx, a); - - if (dbc_tdof_list) - { - trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Real(), ty.Real()); - trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Imag(), ty.Imag()); - if (diag_policy == Operator::DiagonalPolicy::DIAG_ONE) - { - linalg::SetSubVector(ty, *dbc_tdof_list, x); - } - else if (diag_policy == Operator::DiagonalPolicy::DIAG_ZERO) - { - linalg::SetSubVector(ty, *dbc_tdof_list, 0.0); - } - y += ty; - } - else - { - trial_fespace.GetProlongationMatrix()->AddMultTranspose(lx.Real(), y.Real()); - trial_fespace.GetProlongationMatrix()->AddMultTranspose(lx.Imag(), y.Imag()); - } -} - -void ComplexParOperator::RestrictionMatrixMult(const ComplexVector &ly, - ComplexVector &ty) const -{ - if (!use_R) - { - test_fespace.GetProlongationMatrix()->MultTranspose(ly.Real(), ty.Real()); - test_fespace.GetProlongationMatrix()->MultTranspose(ly.Imag(), ty.Imag()); - } - else - { - test_fespace.GetRestrictionMatrix()->Mult(ly.Real(), ty.Real()); - test_fespace.GetRestrictionMatrix()->Mult(ly.Imag(), ty.Imag()); - } -} - -void ComplexParOperator::RestrictionMatrixAddMult(const ComplexVector &ly, - ComplexVector &ty, const double a) const -{ - if (!use_R) - { - test_fespace.GetProlongationMatrix()->AddMultTranspose(ly.Real(), ty.Real(), a); - test_fespace.GetProlongationMatrix()->AddMultTranspose(ly.Imag(), ty.Imag(), a); - } - else - { - test_fespace.GetRestrictionMatrix()->AddMult(ly.Real(), ty.Real(), a); - test_fespace.GetRestrictionMatrix()->AddMult(ly.Imag(), ty.Imag(), a); - } -} - -void ComplexParOperator::RestrictionMatrixMultTranspose(const ComplexVector &ty, - ComplexVector &ly) const -{ - if (!use_R) - { - test_fespace.GetProlongationMatrix()->Mult(ty.Real(), ly.Real()); - test_fespace.GetProlongationMatrix()->Mult(ty.Imag(), ly.Imag()); - } - else - { - test_fespace.GetRestrictionMatrix()->MultTranspose(ty.Real(), ly.Real()); - test_fespace.GetRestrictionMatrix()->MultTranspose(ty.Imag(), ly.Imag()); - } -} - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "rap.hpp" + +#include "fem/bilinearform.hpp" +#include "linalg/hypre.hpp" + +namespace palace +{ + +ParOperator::ParOperator(std::unique_ptr &&dA, const Operator *pA, + const FiniteElementSpace &trial_fespace, + const FiniteElementSpace &test_fespace, bool test_restrict) + : Operator(test_fespace.GetTrueVSize(), trial_fespace.GetTrueVSize()), + data_A(std::move(dA)), A((data_A != nullptr) ? 
data_A.get() : pA), + trial_fespace(trial_fespace), test_fespace(test_fespace), use_R(test_restrict), + diag_policy(DiagonalPolicy::DIAG_ONE), RAP(nullptr) +{ + MFEM_VERIFY(A, "Cannot construct ParOperator from an empty matrix!"); +} + +ParOperator::ParOperator(std::unique_ptr &&A, + const FiniteElementSpace &trial_fespace, + const FiniteElementSpace &test_fespace, bool test_restrict) + : ParOperator(std::move(A), nullptr, trial_fespace, test_fespace, test_restrict) +{ +} + +ParOperator::ParOperator(const Operator &A, const FiniteElementSpace &trial_fespace, + const FiniteElementSpace &test_fespace, bool test_restrict) + : ParOperator(nullptr, &A, trial_fespace, test_fespace, test_restrict) +{ +} + +void ParOperator::SetEssentialTrueDofs(const mfem::Array &tdof_list, + DiagonalPolicy policy) +{ + MFEM_VERIFY(policy == DiagonalPolicy::DIAG_ONE || policy == DiagonalPolicy::DIAG_ZERO, + "Essential boundary condition true dof elimination for ParOperator supports " + "only DiagonalPolicy::DIAG_ONE or DiagonalPolicy::DIAG_ZERO!"); + MFEM_VERIFY(height == width, "Set essential true dofs for both test and trial spaces " + "for rectangular ParOperator!"); + tdof_list.Read(); + dbc_tdof_list.MakeRef(tdof_list); + diag_policy = policy; +} + +Operator::DiagonalPolicy ParOperator::GetDiagonalPolicy() const +{ + MFEM_VERIFY(dbc_tdof_list.Size() > 0, + "There is no DiagonalPolicy if no essential dofs have been set!"); + return diag_policy; +} + +void ParOperator::EliminateRHS(const Vector &x, Vector &b) const +{ + MFEM_VERIFY(A, "No local matrix available for ParOperator::EliminateRHS!"); + auto &lx = trial_fespace.GetLVector(); + auto &ly = GetTestLVector(); + { + auto &tx = trial_fespace.GetTVector(); + tx = 0.0; + linalg::SetSubVector(tx, dbc_tdof_list, x); + trial_fespace.GetProlongationMatrix()->Mult(tx, lx); + } + + // Apply the unconstrained operator. + A->Mult(lx, ly); + + auto &ty = test_fespace.GetTVector(); + RestrictionMatrixMult(ly, ty); + b.Add(-1.0, ty); + if (diag_policy == DiagonalPolicy::DIAG_ONE) + { + linalg::SetSubVector(b, dbc_tdof_list, x); + } + else if (diag_policy == DiagonalPolicy::DIAG_ZERO) + { + linalg::SetSubVector(b, dbc_tdof_list, 0.0); + } +} + +mfem::HypreParMatrix &ParOperator::ParallelAssemble(bool skip_zeros) const +{ + if (RAP) + { + return *RAP; + } + + // Build the square or rectangular assembled HypreParMatrix. 
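For reference, the algebra assembled in this routine is the triple product A_t = Pᵀ A_l P (with R = Pᵀ). The sketch below shows only this algebra with small mfem::DenseMatrix objects; the actual code forms the product on distributed hypre matrices via hypre_ParCSRMatrixRAPKT / hypre_ParCSRMatMat.

```cpp
// Dense-matrix sketch of the RAP triple product: RAP = Pᵀ (A_l P).
#include <mfem.hpp>

void TripleProductRAP(const mfem::DenseMatrix &P, const mfem::DenseMatrix &A,
                      mfem::DenseMatrix &RAP)
{
  mfem::DenseMatrix AP(A.Height(), P.Width());
  mfem::Mult(A, P, AP);  // AP = A_l P
  RAP.SetSize(P.Width(), P.Width());
  mfem::MultAtB(P, AP, RAP);  // RAP = Pᵀ (A_l P)
}
```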
+ const auto *sA = dynamic_cast(A); + std::unique_ptr data_sA; + if (!sA) + { + const auto *cA = dynamic_cast(A); + MFEM_VERIFY(cA, + "ParOperator::ParallelAssemble requires A as an hypre::HypreCSRMatrix or " + "ceed::Operator!"); + data_sA = BilinearForm::FullAssemble(*cA, skip_zeros, use_R); + sA = data_sA.get(); + } + + hypre_ParCSRMatrix *hA = hypre_ParCSRMatrixCreate( + trial_fespace.GetComm(), test_fespace.GlobalVSize(), trial_fespace.GlobalVSize(), + test_fespace.Get().GetDofOffsets(), trial_fespace.Get().GetDofOffsets(), 0, sA->NNZ(), + 0); + hypre_CSRMatrix *hA_diag = hypre_ParCSRMatrixDiag(hA); + hypre_ParCSRMatrixDiag(hA) = *const_cast(sA); + hypre_ParCSRMatrixInitialize(hA); + + const mfem::HypreParMatrix *P = trial_fespace.Get().Dof_TrueDof_Matrix(); + if (!use_R) + { + const mfem::HypreParMatrix *Rt = test_fespace.Get().Dof_TrueDof_Matrix(); + RAP = std::make_unique(hypre_ParCSRMatrixRAPKT(*Rt, hA, *P, 1), + true); + } + else + { + mfem::HypreParMatrix *hR = new mfem::HypreParMatrix( + test_fespace.GetComm(), test_fespace.GlobalTrueVSize(), test_fespace.GlobalVSize(), + test_fespace.Get().GetTrueDofOffsets(), test_fespace.Get().GetDofOffsets(), + const_cast(test_fespace.GetRestrictionMatrix())); + hypre_ParCSRMatrix *AP = hypre_ParCSRMatMat(hA, *P); + RAP = std::make_unique(hypre_ParCSRMatMat(*hR, AP), true); + hypre_ParCSRMatrixDestroy(AP); + delete hR; + } + + hypre_ParCSRMatrixDiag(hA) = hA_diag; + hypre_ParCSRMatrixDestroy(hA); + hypre_ParCSRMatrixSetNumNonzeros(*RAP); + if (&trial_fespace == &test_fespace) + { + // Make sure that the first entry in each row is the diagonal one, for a square matrix. + hypre_CSRMatrixReorder(hypre_ParCSRMatrixDiag((hypre_ParCSRMatrix *)*RAP)); + } + + // Eliminate boundary conditions on the assembled (square) matrix. + if (&trial_fespace == &test_fespace) + { + RAP->EliminateBC(dbc_tdof_list, diag_policy); + } + else + { + MFEM_VERIFY(dbc_tdof_list.Size() == 0, + "Essential BC elimination is only available for square ParOperator!"); + } + + return *RAP; +} + +void ParOperator::AssembleDiagonal(Vector &diag) const +{ + diag.UseDevice(true); + if (RAP) + { + RAP->AssembleDiagonal(diag); + return; + } + + // For an AMR mesh, a convergent diagonal is assembled with |P|ᵀ dₗ, where |P| has + // entry-wise absolute values of the conforming prolongation operator. + MFEM_VERIFY(&trial_fespace == &test_fespace, + "Diagonal assembly is only available for square ParOperator!"); + auto &lx = trial_fespace.GetLVector(); + A->AssembleDiagonal(lx); + + // Parallel assemble and eliminate essential true dofs. + const Operator *P = test_fespace.GetProlongationMatrix(); + if (const auto *hP = dynamic_cast(P)) + { + hP->AbsMultTranspose(1.0, lx, 0.0, diag); + } + else + { + P->MultTranspose(lx, diag); + } + + // Eliminate essential true dofs. 
+ if (dbc_tdof_list.Size()) + { + if (diag_policy == DiagonalPolicy::DIAG_ONE) + { + linalg::SetSubVector(diag, dbc_tdof_list, 1.0); + } + else if (diag_policy == DiagonalPolicy::DIAG_ZERO) + { + linalg::SetSubVector(diag, dbc_tdof_list, 0.0); + } + } +} + +void ParOperator::Mult(const Vector &x, Vector &y) const +{ + MFEM_ASSERT(x.Size() == width && y.Size() == height, + "Incompatible dimensions for ParOperator::Mult!"); + if (RAP) + { + RAP->Mult(x, y); + return; + } + + auto &lx = trial_fespace.GetLVector(); + auto &ly = GetTestLVector(); + if (dbc_tdof_list.Size()) + { + auto &tx = trial_fespace.GetTVector(); + tx = x; + linalg::SetSubVector(tx, dbc_tdof_list, 0.0); + trial_fespace.GetProlongationMatrix()->Mult(tx, lx); + } + else + { + trial_fespace.GetProlongationMatrix()->Mult(x, lx); + } + + // Apply the operator on the L-vector. + A->Mult(lx, ly); + + RestrictionMatrixMult(ly, y); + if (dbc_tdof_list.Size()) + { + if (diag_policy == DiagonalPolicy::DIAG_ONE) + { + linalg::SetSubVector(y, dbc_tdof_list, x); + } + else if (diag_policy == DiagonalPolicy::DIAG_ZERO) + { + linalg::SetSubVector(y, dbc_tdof_list, 0.0); + } + } +} + +void ParOperator::MultTranspose(const Vector &x, Vector &y) const +{ + MFEM_ASSERT(x.Size() == height && y.Size() == width, + "Incompatible dimensions for ParOperator::MultTranspose!"); + if (RAP) + { + RAP->MultTranspose(x, y); + return; + } + + auto &lx = trial_fespace.GetLVector(); + auto &ly = GetTestLVector(); + if (dbc_tdof_list.Size()) + { + auto &ty = test_fespace.GetTVector(); + ty = x; + linalg::SetSubVector(ty, dbc_tdof_list, 0.0); + RestrictionMatrixMultTranspose(ty, ly); + } + else + { + RestrictionMatrixMultTranspose(x, ly); + } + + // Apply the operator on the L-vector. + A->MultTranspose(ly, lx); + + trial_fespace.GetProlongationMatrix()->MultTranspose(lx, y); + if (dbc_tdof_list.Size()) + { + if (diag_policy == DiagonalPolicy::DIAG_ONE) + { + linalg::SetSubVector(y, dbc_tdof_list, x); + } + else if (diag_policy == DiagonalPolicy::DIAG_ZERO) + { + linalg::SetSubVector(y, dbc_tdof_list, 0.0); + } + } +} + +void ParOperator::AddMult(const Vector &x, Vector &y, const double a) const +{ + MFEM_ASSERT(x.Size() == width && y.Size() == height, + "Incompatible dimensions for ParOperator::AddMult!"); + if (RAP) + { + RAP->AddMult(x, y, a); + return; + } + + auto &lx = trial_fespace.GetLVector(); + auto &ly = GetTestLVector(); + if (dbc_tdof_list.Size()) + { + auto &tx = trial_fespace.GetTVector(); + tx = x; + linalg::SetSubVector(tx, dbc_tdof_list, 0.0); + trial_fespace.GetProlongationMatrix()->Mult(tx, lx); + } + else + { + trial_fespace.GetProlongationMatrix()->Mult(x, lx); + } + + // Apply the operator on the L-vector. 
+ A->Mult(lx, ly); + + auto &ty = test_fespace.GetTVector(); + RestrictionMatrixMult(ly, ty); + if (dbc_tdof_list.Size()) + { + if (diag_policy == DiagonalPolicy::DIAG_ONE) + { + linalg::SetSubVector(ty, dbc_tdof_list, x); + } + else if (diag_policy == DiagonalPolicy::DIAG_ZERO) + { + linalg::SetSubVector(ty, dbc_tdof_list, 0.0); + } + } + y.Add(a, ty); +} + +void ParOperator::AddMultTranspose(const Vector &x, Vector &y, const double a) const +{ + MFEM_ASSERT(x.Size() == height && y.Size() == width, + "Incompatible dimensions for ParOperator::AddMultTranspose!"); + if (RAP) + { + RAP->AddMultTranspose(x, y, a); + return; + } + + auto &lx = trial_fespace.GetLVector(); + auto &ly = GetTestLVector(); + if (dbc_tdof_list.Size()) + { + auto &ty = test_fespace.GetTVector(); + ty = x; + linalg::SetSubVector(ty, dbc_tdof_list, 0.0); + RestrictionMatrixMultTranspose(ty, ly); + } + else + { + RestrictionMatrixMultTranspose(x, ly); + } + + // Apply the operator on the L-vector. + A->MultTranspose(ly, lx); + + auto &tx = trial_fespace.GetTVector(); + trial_fespace.GetProlongationMatrix()->MultTranspose(lx, tx); + if (dbc_tdof_list.Size()) + { + if (diag_policy == DiagonalPolicy::DIAG_ONE) + { + linalg::SetSubVector(tx, dbc_tdof_list, x); + } + else if (diag_policy == DiagonalPolicy::DIAG_ZERO) + { + linalg::SetSubVector(tx, dbc_tdof_list, 0.0); + } + } + y.Add(a, tx); +} + +void ParOperator::RestrictionMatrixMult(const Vector &ly, Vector &ty) const +{ + if (!use_R) + { + test_fespace.GetProlongationMatrix()->MultTranspose(ly, ty); + } + else + { + test_fespace.GetRestrictionMatrix()->Mult(ly, ty); + } +} + +void ParOperator::RestrictionMatrixMultTranspose(const Vector &ty, Vector &ly) const +{ + if (!use_R) + { + test_fespace.GetProlongationMatrix()->Mult(ty, ly); + } + else + { + test_fespace.GetRestrictionMatrix()->MultTranspose(ty, ly); + } +} + +Vector &ParOperator::GetTestLVector() const +{ + return (&trial_fespace == &test_fespace) ? trial_fespace.GetLVector2() + : test_fespace.GetLVector(); +} + +ComplexParOperator::ComplexParOperator(std::unique_ptr &&dAr, + std::unique_ptr &&dAi, const Operator *pAr, + const Operator *pAi, + const FiniteElementSpace &trial_fespace, + const FiniteElementSpace &test_fespace, + bool test_restrict) + : ComplexOperator(test_fespace.GetTrueVSize(), trial_fespace.GetTrueVSize()), + data_A((dAr != nullptr || dAi != nullptr) + ? std::make_unique(std::move(dAr), std::move(dAi)) + : std::make_unique(pAr, pAi)), + A(data_A.get()), trial_fespace(trial_fespace), test_fespace(test_fespace), + use_R(test_restrict), diag_policy(Operator::DiagonalPolicy::DIAG_ONE), + RAPr(A->Real() + ? std::make_unique(*A->Real(), trial_fespace, test_fespace, use_R) + : nullptr), + RAPi(A->Imag() + ? std::make_unique(*A->Imag(), trial_fespace, test_fespace, use_R) + : nullptr) +{ + // We use the non-owning constructors for real and imaginary part ParOperators, since we + // construct A as a ComplexWrapperOperator which has separate access to the real and + // imaginary components. 
+} + +ComplexParOperator::ComplexParOperator(std::unique_ptr &&Ar, + std::unique_ptr &&Ai, + const FiniteElementSpace &trial_fespace, + const FiniteElementSpace &test_fespace, + bool test_restrict) + : ComplexParOperator(std::move(Ar), std::move(Ai), nullptr, nullptr, trial_fespace, + test_fespace, test_restrict) +{ +} + +ComplexParOperator::ComplexParOperator(const Operator *Ar, const Operator *Ai, + const FiniteElementSpace &trial_fespace, + const FiniteElementSpace &test_fespace, + bool test_restrict) + : ComplexParOperator(nullptr, nullptr, Ar, Ai, trial_fespace, test_fespace, test_restrict) +{ +} + +void ComplexParOperator::SetEssentialTrueDofs(const mfem::Array &tdof_list, + Operator::DiagonalPolicy policy) +{ + MFEM_VERIFY(policy == Operator::DiagonalPolicy::DIAG_ONE || + policy == Operator::DiagonalPolicy::DIAG_ZERO, + "Essential boundary condition true dof elimination for ComplexParOperator " + "supports only DiagonalPolicy::DIAG_ONE or DiagonalPolicy::DIAG_ZERO!"); + MFEM_VERIFY( + policy != Operator::DiagonalPolicy::DIAG_ONE || RAPr, + "DiagonalPolicy::DIAG_ONE specified for ComplexParOperator with no real part!"); + MFEM_VERIFY(height == width, "Set essential true dofs for both test and trial spaces " + "for rectangular ComplexParOperator!"); + tdof_list.Read(); + dbc_tdof_list.MakeRef(tdof_list); + diag_policy = policy; + if (RAPr) + { + RAPr->SetEssentialTrueDofs(tdof_list, policy); + } + if (RAPi) + { + RAPi->SetEssentialTrueDofs(tdof_list, Operator::DiagonalPolicy::DIAG_ZERO); + } +} + +Operator::DiagonalPolicy ComplexParOperator::GetDiagonalPolicy() const +{ + MFEM_VERIFY(dbc_tdof_list.Size() > 0, + "There is no DiagonalPolicy if no essential dofs have been set!"); + return diag_policy; +} + +void ComplexParOperator::AssembleDiagonal(ComplexVector &diag) const +{ + diag.UseDevice(true); + diag = 0.0; + if (RAPr) + { + RAPr->AssembleDiagonal(diag.Real()); + } + if (RAPi) + { + RAPi->AssembleDiagonal(diag.Imag()); + } +} + +void ComplexParOperator::Mult(const ComplexVector &x, ComplexVector &y) const +{ + MFEM_ASSERT(x.Size() == width && y.Size() == height, + "Incompatible dimensions for ComplexParOperator::Mult!"); + + auto &lx = trial_fespace.GetLVector(); + auto &ly = GetTestLVector(); + if (dbc_tdof_list.Size()) + { + auto &tx = trial_fespace.GetTVector(); + tx = x; + linalg::SetSubVector(tx, dbc_tdof_list, 0.0); + trial_fespace.GetProlongationMatrix()->Mult(tx.Real(), lx.Real()); + trial_fespace.GetProlongationMatrix()->Mult(tx.Imag(), lx.Imag()); + } + else + { + trial_fespace.GetProlongationMatrix()->Mult(x.Real(), lx.Real()); + trial_fespace.GetProlongationMatrix()->Mult(x.Imag(), lx.Imag()); + } + + // Apply the operator on the L-vector. 
+ A->Mult(lx, ly); + + RestrictionMatrixMult(ly, y); + if (dbc_tdof_list.Size()) + { + if (diag_policy == Operator::DiagonalPolicy::DIAG_ONE) + { + linalg::SetSubVector(y, dbc_tdof_list, x); + } + else if (diag_policy == Operator::DiagonalPolicy::DIAG_ZERO) + { + linalg::SetSubVector(y, dbc_tdof_list, 0.0); + } + } +} + +void ComplexParOperator::MultTranspose(const ComplexVector &x, ComplexVector &y) const +{ + MFEM_ASSERT(x.Size() == height && y.Size() == width, + "Incompatible dimensions for ComplexParOperator::MultTranspose!"); + + auto &lx = trial_fespace.GetLVector(); + auto &ly = GetTestLVector(); + if (dbc_tdof_list.Size()) + { + auto &ty = test_fespace.GetTVector(); + ty = x; + linalg::SetSubVector(ty, dbc_tdof_list, 0.0); + RestrictionMatrixMultTranspose(ty, ly); + } + else + { + RestrictionMatrixMultTranspose(x, ly); + } + + // Apply the operator on the L-vector. + A->MultTranspose(ly, lx); + + trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Real(), y.Real()); + trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Imag(), y.Imag()); + if (dbc_tdof_list.Size()) + { + if (diag_policy == Operator::DiagonalPolicy::DIAG_ONE) + { + linalg::SetSubVector(y, dbc_tdof_list, x); + } + else if (diag_policy == Operator::DiagonalPolicy::DIAG_ZERO) + { + linalg::SetSubVector(y, dbc_tdof_list, 0.0); + } + } +} + +void ComplexParOperator::MultHermitianTranspose(const ComplexVector &x, + ComplexVector &y) const +{ + MFEM_ASSERT(x.Size() == height && y.Size() == width, + "Incompatible dimensions for ComplexParOperator::MultHermitianTranspose!"); + + auto &lx = trial_fespace.GetLVector(); + auto &ly = GetTestLVector(); + if (dbc_tdof_list.Size()) + { + auto &ty = test_fespace.GetTVector(); + ty = x; + linalg::SetSubVector(ty, dbc_tdof_list, 0.0); + RestrictionMatrixMultTranspose(ty, ly); + } + else + { + RestrictionMatrixMultTranspose(x, ly); + } + + // Apply the operator on the L-vector. + A->MultHermitianTranspose(ly, lx); + + trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Real(), y.Real()); + trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Imag(), y.Imag()); + if (dbc_tdof_list.Size()) + { + if (diag_policy == Operator::DiagonalPolicy::DIAG_ONE) + { + linalg::SetSubVector(y, dbc_tdof_list, x); + } + else if (diag_policy == Operator::DiagonalPolicy::DIAG_ZERO) + { + linalg::SetSubVector(y, dbc_tdof_list, 0.0); + } + } +} + +void ComplexParOperator::AddMult(const ComplexVector &x, ComplexVector &y, + const std::complex a) const +{ + MFEM_ASSERT(x.Size() == width && y.Size() == height, + "Incompatible dimensions for ComplexParOperator::AddMult!"); + + auto &lx = trial_fespace.GetLVector(); + auto &ly = GetTestLVector(); + if (dbc_tdof_list.Size()) + { + auto &tx = trial_fespace.GetTVector(); + tx = x; + linalg::SetSubVector(tx, dbc_tdof_list, 0.0); + trial_fespace.GetProlongationMatrix()->Mult(tx.Real(), lx.Real()); + trial_fespace.GetProlongationMatrix()->Mult(tx.Imag(), lx.Imag()); + } + else + { + trial_fespace.GetProlongationMatrix()->Mult(x.Real(), lx.Real()); + trial_fespace.GetProlongationMatrix()->Mult(x.Imag(), lx.Imag()); + } + + // Apply the operator on the L-vector. 
+ A->Mult(lx, ly); + + auto &ty = test_fespace.GetTVector(); + RestrictionMatrixMult(ly, ty); + if (dbc_tdof_list.Size()) + { + if (diag_policy == Operator::DiagonalPolicy::DIAG_ONE) + { + linalg::SetSubVector(ty, dbc_tdof_list, x); + } + else if (diag_policy == Operator::DiagonalPolicy::DIAG_ZERO) + { + linalg::SetSubVector(ty, dbc_tdof_list, 0.0); + } + } + y.AXPY(a, ty); +} + +void ComplexParOperator::AddMultTranspose(const ComplexVector &x, ComplexVector &y, + const std::complex a) const +{ + MFEM_ASSERT(x.Size() == height && y.Size() == width, + "Incompatible dimensions for ComplexParOperator::AddMultTranspose!"); + + auto &lx = trial_fespace.GetLVector(); + auto &ly = GetTestLVector(); + if (dbc_tdof_list.Size()) + { + auto &ty = test_fespace.GetTVector(); + ty = x; + linalg::SetSubVector(ty, dbc_tdof_list, 0.0); + RestrictionMatrixMultTranspose(ty, ly); + } + else + { + RestrictionMatrixMultTranspose(x, ly); + } + + // Apply the operator on the L-vector. + A->MultTranspose(ly, lx); + + auto &tx = trial_fespace.GetTVector(); + trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Real(), tx.Real()); + trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Imag(), tx.Imag()); + if (dbc_tdof_list.Size()) + { + if (diag_policy == Operator::DiagonalPolicy::DIAG_ONE) + { + linalg::SetSubVector(tx, dbc_tdof_list, x); + } + else if (diag_policy == Operator::DiagonalPolicy::DIAG_ZERO) + { + linalg::SetSubVector(tx, dbc_tdof_list, 0.0); + } + } + y.AXPY(a, tx); +} + +void ComplexParOperator::AddMultHermitianTranspose(const ComplexVector &x, ComplexVector &y, + const std::complex a) const +{ + MFEM_ASSERT(x.Size() == height && y.Size() == width, + "Incompatible dimensions for ComplexParOperator::AddMultHermitianTranspose!"); + + auto &lx = trial_fespace.GetLVector(); + auto &ly = GetTestLVector(); + if (dbc_tdof_list.Size()) + { + auto &ty = test_fespace.GetTVector(); + ty = x; + linalg::SetSubVector(ty, dbc_tdof_list, 0.0); + RestrictionMatrixMultTranspose(ty, ly); + } + else + { + RestrictionMatrixMultTranspose(x, ly); + } + + // Apply the operator on the L-vector. 
+ A->MultHermitianTranspose(ly, lx); + + auto &tx = trial_fespace.GetTVector(); + trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Real(), tx.Real()); + trial_fespace.GetProlongationMatrix()->MultTranspose(lx.Imag(), tx.Imag()); + if (dbc_tdof_list.Size()) + { + if (diag_policy == Operator::DiagonalPolicy::DIAG_ONE) + { + linalg::SetSubVector(tx, dbc_tdof_list, x); + } + else if (diag_policy == Operator::DiagonalPolicy::DIAG_ZERO) + { + linalg::SetSubVector(tx, dbc_tdof_list, 0.0); + } + } + y.AXPY(a, tx); +} + +void ComplexParOperator::RestrictionMatrixMult(const ComplexVector &ly, + ComplexVector &ty) const +{ + if (!use_R) + { + test_fespace.GetProlongationMatrix()->MultTranspose(ly.Real(), ty.Real()); + test_fespace.GetProlongationMatrix()->MultTranspose(ly.Imag(), ty.Imag()); + } + else + { + test_fespace.GetRestrictionMatrix()->Mult(ly.Real(), ty.Real()); + test_fespace.GetRestrictionMatrix()->Mult(ly.Imag(), ty.Imag()); + } +} + +void ComplexParOperator::RestrictionMatrixMultTranspose(const ComplexVector &ty, + ComplexVector &ly) const +{ + if (!use_R) + { + test_fespace.GetProlongationMatrix()->Mult(ty.Real(), ly.Real()); + test_fespace.GetProlongationMatrix()->Mult(ty.Imag(), ly.Imag()); + } + else + { + test_fespace.GetRestrictionMatrix()->MultTranspose(ty.Real(), ly.Real()); + test_fespace.GetRestrictionMatrix()->MultTranspose(ty.Imag(), ly.Imag()); + } +} + +ComplexVector &ComplexParOperator::GetTestLVector() const +{ + return (&trial_fespace == &test_fespace) ? trial_fespace.GetLVector2() + : test_fespace.GetLVector(); +} + +// Helper that checks if two containers (Vector or Array) are actually references to the +// same underlying data. +template +bool ReferencesSameMemory(const C &c1, const C &c2) +{ + const auto &m1 = c1.GetMemory(); + const auto &m2 = c2.GetMemory(); + return (m1.HostIsValid() && m2.HostIsValid() && c1.HostRead() == c2.HostRead()) || + (m1.DeviceIsValid() && m2.DeviceIsValid() && c1.Read() == c2.Read()); +} + +// Combine a collection of ParOperator into a weighted summation. If set_essential is true, +// extract the essential dofs from the operator array, and apply to the summed operator. +template +std::unique_ptr +BuildParSumOperator(const std::array &coeff, + const std::array &ops, bool set_essential) +{ + auto it = std::find_if(ops.begin(), ops.end(), [](auto p) { return p != nullptr; }); + MFEM_VERIFY(it != ops.end(), + "BuildParSumOperator requires at least one valid ParOperator!"); + const auto first_op = *it; + const auto &fespace = first_op->TrialFiniteElementSpace(); + MFEM_VERIFY( + std::all_of(ops.begin(), ops.end(), [&fespace](auto p) + { return p == nullptr || &p->TrialFiniteElementSpace() == &fespace; }), + "All ComplexParOperators must have the same FiniteElementSpace!"); + + auto sum = std::make_unique(first_op->LocalOperator().Height(), + first_op->LocalOperator().Width()); + for (std::size_t i = 0; i < coeff.size(); i++) + { + if (ops[i] && coeff[i] != 0) + { + sum->AddOperator(ops[i]->LocalOperator(), coeff[i]); + } + } + + auto O = std::make_unique(std::move(sum), fespace); + if (set_essential) + { + // Extract essential dof pointer from first operator with one. + auto it_ess = std::find_if(ops.begin(), ops.end(), [](auto p) + { return p != nullptr && p->GetEssentialTrueDofs(); }); + if (it_ess == ops.end()) + { + return O; + } + const auto *ess_dofs = (*it_ess)->GetEssentialTrueDofs(); + + // Check other existent essential dof arrays are references. 
+ MFEM_VERIFY(std::all_of(ops.begin(), ops.end(), + [&](auto p) + { + if (p == nullptr) + { + return true; + } + auto p_ess_dofs = p->GetEssentialTrueDofs(); + return p_ess_dofs == nullptr || + ReferencesSameMemory(*ess_dofs, *p_ess_dofs); + }), + "If essential dofs are set, all suboperators must agree on them!"); + + // Use implied ordering of enumeration. + Operator::DiagonalPolicy policy = Operator::DiagonalPolicy::DIAG_ZERO; + for (auto p : ops) + { + policy = (p && p->GetEssentialTrueDofs()) ? std::max(policy, p->GetDiagonalPolicy()) + : policy; + } + O->SetEssentialTrueDofs(*ess_dofs, policy); + } + + return O; +} + +// Combine a collection of ComplexParOperator into a weighted summation. If set_essential is +// true, extract the essential dofs from the operator array, and apply to the summed +// operator. +template +std::unique_ptr +BuildParSumOperator(const std::array, N> &coeff, + const std::array &ops, + bool set_essential) +{ + auto it = std::find_if(ops.begin(), ops.end(), [](auto p) { return p != nullptr; }); + MFEM_VERIFY(it != ops.end(), + "BuildParSumOperator requires at least one valid ComplexParOperator!"); + const auto first_op = *it; + const auto &fespace = first_op->TrialFiniteElementSpace(); + MFEM_VERIFY( + std::all_of(ops.begin(), ops.end(), [&fespace](auto p) + { return p == nullptr || &p->TrialFiniteElementSpace() == &fespace; }), + "All ComplexParOperators must have the same FiniteElementSpace!"); + + auto sumr = std::make_unique(first_op->LocalOperator().Height(), + first_op->LocalOperator().Width()); + auto sumi = std::make_unique(first_op->LocalOperator().Height(), + first_op->LocalOperator().Width()); + for (std::size_t i = 0; i < coeff.size(); i++) + { + if (ops[i] && coeff[i].real() != 0) + { + if (ops[i]->LocalOperator().Real()) + { + sumr->AddOperator(*ops[i]->LocalOperator().Real(), coeff[i].real()); + } + if (ops[i]->LocalOperator().Imag()) + { + sumi->AddOperator(*ops[i]->LocalOperator().Imag(), coeff[i].real()); + } + } + if (ops[i] && coeff[i].imag() != 0) + { + if (ops[i]->LocalOperator().Imag()) + { + sumr->AddOperator(*ops[i]->LocalOperator().Imag(), -coeff[i].imag()); + } + if (ops[i]->LocalOperator().Real()) + { + sumi->AddOperator(*ops[i]->LocalOperator().Real(), coeff[i].imag()); + } + } + } + auto O = std::make_unique(std::move(sumr), std::move(sumi), fespace); + if (set_essential) + { + // Extract essential dof pointer from first operator with one. + auto it_ess = std::find_if(ops.begin(), ops.end(), [](auto p) + { return p != nullptr && p->GetEssentialTrueDofs(); }); + if (it_ess == ops.end()) + { + return O; + } + const auto *ess_dofs = (*it_ess)->GetEssentialTrueDofs(); + + // Check other existent essential dof arrays are references. + MFEM_VERIFY(std::all_of(ops.begin(), ops.end(), + [&](auto p) + { + if (p == nullptr) + { + return true; + } + auto p_ess_dofs = p->GetEssentialTrueDofs(); + return p_ess_dofs == nullptr || + ReferencesSameMemory(*ess_dofs, *p_ess_dofs); + }), + "If essential dofs are set, all suboperators must agree on them!"); + + // Use implied ordering of enumeration. + Operator::DiagonalPolicy policy = Operator::DiagonalPolicy::DIAG_ZERO; + for (auto p : ops) + { + policy = (p && p->GetEssentialTrueDofs()) ? std::max(policy, p->GetDiagonalPolicy()) + : policy; + } + O->SetEssentialTrueDofs(*ess_dofs, policy); + } + return O; +} + +// TODO: replace with std::to_array in c++20. +namespace detail +{ +// Helper for conversion to std::array. 
+template +constexpr std::array, N> to_array_impl(T (&&a)[N], + std::index_sequence) +{ + return {{std::move(a[I])...}}; +} +} // namespace detail + +template +constexpr std::array, N> to_array(T (&&a)[N]) +{ + return detail::to_array_impl(std::move(a), std::make_index_sequence{}); +} + +template +std::unique_ptr +BuildParSumOperator(std::complex (&&coeff_in)[N], + const ComplexParOperator *(&&ops_in)[N], bool set_essential) +{ + return BuildParSumOperator(to_array>(std::move(coeff_in)), + to_array(std::move(ops_in)), + set_essential); +} + +template +std::unique_ptr, + ComplexParOperator, ParOperator>> +BuildParSumOperator(ScalarType (&&coeff_in)[N], const OperType *(&&ops_in)[N], + bool set_essential) +{ + using ParOperType = + typename std::conditional_t, + ComplexParOperator, ParOperator>; + + std::array par_ops; + std::transform(ops_in, ops_in + N, par_ops.begin(), + [](const OperType *op) { return dynamic_cast(op); }); + + return BuildParSumOperator(to_array(std::move(coeff_in)), std::move(par_ops), + set_essential); +} + +// Explicit instantiation. +template std::unique_ptr BuildParSumOperator(double (&&)[2], + const Operator *(&&)[2], bool); +template std::unique_ptr BuildParSumOperator(double (&&)[3], + const Operator *(&&)[3], bool); +template std::unique_ptr BuildParSumOperator(double (&&)[4], + const Operator *(&&)[4], bool); +template std::unique_ptr + BuildParSumOperator(std::complex (&&)[2], const ComplexOperator *(&&)[2], bool); +template std::unique_ptr + BuildParSumOperator(std::complex (&&)[3], const ComplexOperator *(&&)[3], bool); +template std::unique_ptr + BuildParSumOperator(std::complex (&&)[4], const ComplexOperator *(&&)[4], bool); + +} // namespace palace diff --git a/palace/linalg/rap.hpp b/palace/linalg/rap.hpp index cbf3e73e82..6ca1a4fdff 100644 --- a/palace/linalg/rap.hpp +++ b/palace/linalg/rap.hpp @@ -1,215 +1,252 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LINALG_RAP_HPP -#define PALACE_LINALG_RAP_HPP - -#include -#include -#include "linalg/operator.hpp" -#include "linalg/vector.hpp" - -namespace palace -{ - -// -// A parallel operator represented by RAP constructed through the actions of R, A, and P, -// usually with R = Pᵀ, and with possible eliminated essential BC. Here R and P are the -// parallel restriction and prolongation matrices. -// - -// Real-valued RAP operator. -class ParOperator : public Operator -{ -private: - // Storage and access for the local operator. - mutable std::unique_ptr data_A; - mutable Operator *A; - - // Finite element spaces for parallel prolongation and restriction. - const mfem::ParFiniteElementSpace &trial_fespace, &test_fespace; - const bool use_R; - - // Lists of constrained essential boundary true dofs for elimination. - const mfem::Array *dbc_tdof_list; - - // Diagonal policy for constrained true dofs. - DiagonalPolicy diag_policy; - - // Assembled operator as a parallel Hypre matrix. If assembled, the local operator is not - // deleted. - mutable std::unique_ptr RAP; - - // Temporary storage for operator application. - mutable Vector lx, ly, ty; - - // Helper methods for operator application. 
- void RestrictionMatrixMult(const Vector &ly, Vector &ty) const; - void RestrictionMatrixAddMult(const Vector &ly, Vector &ty, const double a) const; - void RestrictionMatrixMultTranspose(const Vector &ty, Vector &ly) const; - - ParOperator(std::unique_ptr &&dA, Operator *pA, - const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, bool test_restrict); - -public: - // Construct the parallel operator, inheriting ownership of the local operator. - ParOperator(std::unique_ptr &&A, - const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, bool test_restrict); - ParOperator(std::unique_ptr &&A, const mfem::ParFiniteElementSpace &fespace) - : ParOperator(std::move(A), fespace, fespace, false) - { - } - - // Non-owning constructors. - ParOperator(Operator &A, const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, bool test_restrict); - ParOperator(Operator &A, const mfem::ParFiniteElementSpace &fespace) - : ParOperator(A, fespace, fespace, false) - { - } - - // Get access to the underlying local (L-vector) operator. - const Operator &LocalOperator() const; - Operator &LocalOperator(); - - // Get the associated MPI communicator. - MPI_Comm GetComm() const { return trial_fespace.GetComm(); } - - // Set essential boundary condition true dofs for square operators. - void SetEssentialTrueDofs(const mfem::Array &tdof_list, DiagonalPolicy policy); - - // Get the essential boundary condition true dofs associated with the operator. May be - // nullptr. - const mfem::Array *GetEssentialTrueDofs() const { return dbc_tdof_list; } - - // Assemble the diagonal for the parallel operator. - void AssembleDiagonal(Vector &diag) const override; - - // Assemble the operator as a parallel sparse matrix. The memory associated with the - // local operator is free'd. - mfem::HypreParMatrix &ParallelAssemble(bool skip_zeros = false) const; - - // Steal the assembled parallel sparse matrix. - std::unique_ptr StealParallelAssemble(bool skip_zeros = false) const - { - ParallelAssemble(skip_zeros); - return std::move(RAP); - } - - // Eliminate essential true dofs from the RHS vector b, using the essential boundary - // condition values in x. - void EliminateRHS(const Vector &x, Vector &b) const; - - void Mult(const Vector &x, Vector &y) const override; - - void MultTranspose(const Vector &x, Vector &y) const override; - - void AddMult(const Vector &x, Vector &y, const double a = 1.0) const override; - - void AddMultTranspose(const Vector &x, Vector &y, const double a = 1.0) const override; -}; - -// Complex-valued RAP operator. -class ComplexParOperator : public ComplexOperator -{ -private: - // Storage and access for the local operator. - std::unique_ptr data_A; - ComplexWrapperOperator *A; - - // Finite element spaces for parallel prolongation and restriction. - const mfem::ParFiniteElementSpace &trial_fespace, &test_fespace; - const bool use_R; - - // Lists of constrained essential boundary true dofs for elimination. - mutable const mfem::Array *dbc_tdof_list; - - // Diagonal policy for constrained true dofs. - Operator::DiagonalPolicy diag_policy; - - // Real and imaginary parts of the operator as non-owning ParOperator objects. - std::unique_ptr RAPr, RAPi; - - // Temporary storage for operator application. - mutable ComplexVector lx, ly, ty; - - // Helper methods for operator application. 
- void RestrictionMatrixMult(const ComplexVector &ly, ComplexVector &ty) const; - void RestrictionMatrixAddMult(const ComplexVector &ly, ComplexVector &ty, - const double a) const; - void RestrictionMatrixMultTranspose(const ComplexVector &ty, ComplexVector &ly) const; - - ComplexParOperator(std::unique_ptr &&dAr, std::unique_ptr &&dAi, - Operator *pAr, Operator *pAi, - const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, bool test_restrict); - -public: - // Construct the complex-valued parallel operator from the separate real and imaginary - // parts, inheriting ownership of the local operator. - ComplexParOperator(std::unique_ptr &&Ar, std::unique_ptr &&Ai, - const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, bool test_restrict); - ComplexParOperator(std::unique_ptr &&Ar, std::unique_ptr &&Ai, - const mfem::ParFiniteElementSpace &fespace) - : ComplexParOperator(std::move(Ar), std::move(Ai), fespace, fespace, false) - { - } - - // Non-owning constructors. - ComplexParOperator(Operator *Ar, Operator *Ai, - const mfem::ParFiniteElementSpace &trial_fespace, - const mfem::ParFiniteElementSpace &test_fespace, bool test_restrict); - ComplexParOperator(Operator *Ar, Operator *Ai, const mfem::ParFiniteElementSpace &fespace) - : ComplexParOperator(Ar, Ai, fespace, fespace, false) - { - } - - // Get access to the underlying local (L-vector) operator. - const ComplexOperator &LocalOperator() const; - ComplexOperator &LocalOperator(); - - // Get the associated MPI communicator. - MPI_Comm GetComm() const { return trial_fespace.GetComm(); } - - // Set essential boundary condition true dofs for square operators. - void SetEssentialTrueDofs(const mfem::Array &tdof_list, - Operator::DiagonalPolicy policy); - - // Get the essential boundary condition true dofs associated with the operator. May be - // nullptr. - const mfem::Array *GetEssentialTrueDofs() const { return dbc_tdof_list; } - - // Assemble the diagonal for the parallel operator. - void AssembleDiagonal(ComplexVector &diag) const; - - bool IsReal() const override { return A->IsReal(); } - bool IsImag() const override { return A->IsImag(); } - bool HasReal() const override { return RAPr != nullptr; } - bool HasImag() const override { return RAPi != nullptr; } - const Operator *Real() const override { return RAPr.get(); } - Operator *Real() override { return RAPr.get(); } - const Operator *Imag() const override { return RAPi.get(); } - Operator *Imag() override { return RAPi.get(); } - - void Mult(const ComplexVector &x, ComplexVector &y) const override; - - void MultTranspose(const ComplexVector &x, ComplexVector &y) const override; - - void MultHermitianTranspose(const ComplexVector &x, ComplexVector &y) const override; - - void AddMult(const ComplexVector &x, ComplexVector &y, - const std::complex a = 1.0) const override; - - void AddMultTranspose(const ComplexVector &x, ComplexVector &y, - const std::complex a = 1.0) const override; - - void AddMultHermitianTranspose(const ComplexVector &x, ComplexVector &y, - const std::complex a = 1.0) const override; -}; - -} // namespace palace - -#endif // PALACE_LINALG_RAP_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LINALG_RAP_HPP +#define PALACE_LINALG_RAP_HPP + +#include +#include +#include +#include "fem/fespace.hpp" +#include "linalg/operator.hpp" +#include "linalg/vector.hpp" + +namespace palace +{ + +// +// A parallel operator represented by RAP constructed through the actions of R, A, and P, +// usually with R = Pᵀ, and with possible eliminated essential BC. Here R and P are the +// parallel restriction and prolongation matrices. +// + +// Real-valued RAP operator. +class ParOperator : public Operator +{ +private: + // Storage and access for the local operator. + std::unique_ptr data_A; + const Operator *A; + + // Finite element spaces for parallel prolongation and restriction. + const FiniteElementSpace &trial_fespace, &test_fespace; + const bool use_R; + + // Lists of constrained essential boundary true dofs for elimination. + mfem::Array dbc_tdof_list; + + // Diagonal policy for constrained true dofs. + DiagonalPolicy diag_policy = DiagonalPolicy::DIAG_ZERO; + + // Assembled operator as a parallel Hypre matrix. If assembled, the local operator is not + // deleted. + mutable std::unique_ptr RAP; + + // Helper methods for operator application. + void RestrictionMatrixMult(const Vector &ly, Vector &ty) const; + void RestrictionMatrixMultTranspose(const Vector &ty, Vector &ly) const; + Vector &GetTestLVector() const; + + ParOperator(std::unique_ptr &&dA, const Operator *pA, + const FiniteElementSpace &trial_fespace, + const FiniteElementSpace &test_fespace, bool test_restrict); + +public: + // Construct the parallel operator, inheriting ownership of the local operator. + ParOperator(std::unique_ptr &&A, const FiniteElementSpace &trial_fespace, + const FiniteElementSpace &test_fespace, bool test_restrict); + ParOperator(std::unique_ptr &&A, const FiniteElementSpace &fespace) + : ParOperator(std::move(A), fespace, fespace, false) + { + } + + // Non-owning constructors. + ParOperator(const Operator &A, const FiniteElementSpace &trial_fespace, + const FiniteElementSpace &test_fespace, bool test_restrict); + ParOperator(const Operator &A, const FiniteElementSpace &fespace) + : ParOperator(A, fespace, fespace, false) + { + } + + // Get access to the underlying local (L-vector) operator. + const Operator &LocalOperator() const { return *A; } + + // Get the associated MPI communicator. + MPI_Comm GetComm() const { return trial_fespace.GetComm(); } + + // Accessor for trial finite element space. + const FiniteElementSpace &TrialFiniteElementSpace() const { return trial_fespace; } + + // Accessor for test finite element space. + const FiniteElementSpace &TestFiniteElementSpace() const { return test_fespace; } + + // Set essential boundary condition true dofs for square operators. + void SetEssentialTrueDofs(const mfem::Array &tdof_list, DiagonalPolicy policy); + + // Get the essential boundary condition true dofs associated with the operator. May be + // nullptr. + const mfem::Array *GetEssentialTrueDofs() const + { + return dbc_tdof_list.Size() ? &dbc_tdof_list : nullptr; + } + + // Get the diagonal policy that was most recently used. If there are no essential dofs, + // and thus no valid policy, will error. + DiagonalPolicy GetDiagonalPolicy() const; + + // Eliminate essential true dofs from the RHS vector b, using the essential boundary + // condition values in x. + void EliminateRHS(const Vector &x, Vector &b) const; + + // Assemble the operator as a parallel sparse matrix. The memory associated with the + // local operator is free'd. 
+ mfem::HypreParMatrix &ParallelAssemble(bool skip_zeros = false) const; + + // Steal the assembled parallel sparse matrix. + std::unique_ptr StealParallelAssemble(bool skip_zeros = false) const + { + ParallelAssemble(skip_zeros); + return std::move(RAP); + } + + void AssembleDiagonal(Vector &diag) const override; + + void Mult(const Vector &x, Vector &y) const override; + + void MultTranspose(const Vector &x, Vector &y) const override; + + void AddMult(const Vector &x, Vector &y, const double a = 1.0) const override; + + void AddMultTranspose(const Vector &x, Vector &y, const double a = 1.0) const override; +}; + +// Complex-valued RAP operator. +class ComplexParOperator : public ComplexOperator +{ +private: + // Storage and access for the local operator. + std::unique_ptr data_A; + const ComplexWrapperOperator *A; + + // Finite element spaces for parallel prolongation and restriction. + const FiniteElementSpace &trial_fespace, &test_fespace; + const bool use_R; + + // Lists of constrained essential boundary true dofs for elimination. + mfem::Array dbc_tdof_list; + + // Diagonal policy for constrained true dofs. + Operator::DiagonalPolicy diag_policy = Operator::DiagonalPolicy::DIAG_ZERO; + + // Real and imaginary parts of the operator as non-owning ParOperator objects. + std::unique_ptr RAPr, RAPi; + + // Helper methods for operator application. + void RestrictionMatrixMult(const ComplexVector &ly, ComplexVector &ty) const; + void RestrictionMatrixMultTranspose(const ComplexVector &ty, ComplexVector &ly) const; + ComplexVector &GetTestLVector() const; + + ComplexParOperator(std::unique_ptr &&dAr, std::unique_ptr &&dAi, + const Operator *pAr, const Operator *pAi, + const FiniteElementSpace &trial_fespace, + const FiniteElementSpace &test_fespace, bool test_restrict); + +public: + // Construct the complex-valued parallel operator from the separate real and imaginary + // parts, inheriting ownership of the local operator. + ComplexParOperator(std::unique_ptr &&Ar, std::unique_ptr &&Ai, + const FiniteElementSpace &trial_fespace, + const FiniteElementSpace &test_fespace, bool test_restrict); + ComplexParOperator(std::unique_ptr &&Ar, std::unique_ptr &&Ai, + const FiniteElementSpace &fespace) + : ComplexParOperator(std::move(Ar), std::move(Ai), fespace, fespace, false) + { + } + + // Non-owning constructors. + ComplexParOperator(const Operator *Ar, const Operator *Ai, + const FiniteElementSpace &trial_fespace, + const FiniteElementSpace &test_fespace, bool test_restrict); + ComplexParOperator(const Operator *Ar, const Operator *Ai, + const FiniteElementSpace &fespace) + : ComplexParOperator(Ar, Ai, fespace, fespace, false) + { + } + + const Operator *Real() const override { return RAPr.get(); } + const Operator *Imag() const override { return RAPi.get(); } + + // Get access to the underlying local (L-vector) operator. + const ComplexOperator &LocalOperator() const { return *A; } + + // Get the associated MPI communicator. + MPI_Comm GetComm() const { return trial_fespace.GetComm(); } + + // Accessor for trial finite element space. + const FiniteElementSpace &TrialFiniteElementSpace() const { return trial_fespace; } + + // Accessor for test finite element space. + const FiniteElementSpace &TestFiniteElementSpace() const { return test_fespace; } + + // Set essential boundary condition true dofs for square operators. + void SetEssentialTrueDofs(const mfem::Array &tdof_list, + Operator::DiagonalPolicy policy); + + // Get the essential boundary condition true dofs associated with the operator. 
May be + // nullptr. + const mfem::Array *GetEssentialTrueDofs() const + { + return dbc_tdof_list.Size() ? &dbc_tdof_list : nullptr; + } + + // Get the diagonal policy that was most recently used. If there are no essential dofs, + // and thus no valid policy, will error. + Operator::DiagonalPolicy GetDiagonalPolicy() const; + + void AssembleDiagonal(ComplexVector &diag) const override; + + void Mult(const ComplexVector &x, ComplexVector &y) const override; + + void MultTranspose(const ComplexVector &x, ComplexVector &y) const override; + + void MultHermitianTranspose(const ComplexVector &x, ComplexVector &y) const override; + + void AddMult(const ComplexVector &x, ComplexVector &y, + const std::complex a = 1.0) const override; + + void AddMultTranspose(const ComplexVector &x, ComplexVector &y, + const std::complex a = 1.0) const override; + + void AddMultHermitianTranspose(const ComplexVector &x, ComplexVector &y, + const std::complex a = 1.0) const override; +}; + +// Combine a collection of ParOperator into a weighted summation. If set_essential is true, +// extract the essential dofs from the operator array, and apply to the summed operator. +// Requires explicit instantiation. +template +std::unique_ptr +BuildParSumOperator(const std::array &coeff, + const std::array &ops, + bool set_essential = true); + +// Combine a collection of ComplexParOperator into a weighted summation. If set_essential is +// true, extract the essential dofs from the operator array, and apply to the summed +// operator. Requires explicit instantiation. +template +std::unique_ptr +BuildParSumOperator(const std::array, N> &coeff, + const std::array &ops, + bool set_essential = true); + +// Dispatcher to convert initializer list or C arrays into std::array whilst deducing sizes +// and types. +template +std::unique_ptr, + ComplexParOperator, ParOperator>> +BuildParSumOperator(ScalarType (&&coeff_in)[N], const OperType *(&&ops_in)[N], + bool set_essential = true); + +} // namespace palace + +#endif // PALACE_LINALG_RAP_HPP diff --git a/palace/linalg/slepc.cpp b/palace/linalg/slepc.cpp index f2b63ba3d9..529022f456 100644 --- a/palace/linalg/slepc.cpp +++ b/palace/linalg/slepc.cpp @@ -1,1708 +1,2204 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
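For reference, a minimal usage sketch of the BuildParSumOperator dispatcher declared in rap.hpp above (illustrative only, not part of the patch): the handles K, C, M and the angular frequency omega are assumed to come from the surrounding operator assembly, and the combination A(ω) = K + iω C − ω² M is chosen purely as an example.

  // Hypothetical sketch: sum previously assembled complex operators with frequency-
  // dependent coefficients; set_essential defaults to true, so the (shared) essential
  // true dofs of the suboperators are applied to the result.
  const palace::ComplexOperator *k = K.get(), *c = C.get(), *m = M.get();
  const std::complex<double> a0(1.0, 0.0), a1(0.0, omega), a2(-omega * omega, 0.0);
  auto A = palace::BuildParSumOperator({a0, a1, a2}, {k, c, m});  // N = 3 deduced

The dispatcher converts the C arrays to std::array, dynamic_casts each ComplexOperator to ComplexParOperator, and forwards to the std::array overload, which rap.cpp explicitly instantiates for N = 2, 3, 4 with both double/Operator and std::complex<double>/ComplexOperator arguments.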
-// SPDX-License-Identifier: Apache-2.0 - -#include "slepc.hpp" - -#if defined(PALACE_WITH_SLEPC) - -#include -#include -#include -#include -#include "linalg/divfree.hpp" -#include "utils/communication.hpp" - -static PetscErrorCode __mat_apply_EPS_A0(Mat, Vec, Vec); -static PetscErrorCode __mat_apply_EPS_A1(Mat, Vec, Vec); -static PetscErrorCode __mat_apply_EPS_B(Mat, Vec, Vec); -static PetscErrorCode __pc_apply_EPS(PC, Vec, Vec); -static PetscErrorCode __mat_apply_PEPLinear_L0(Mat, Vec, Vec); -static PetscErrorCode __mat_apply_PEPLinear_L1(Mat, Vec, Vec); -static PetscErrorCode __mat_apply_PEPLinear_B(Mat, Vec, Vec); -static PetscErrorCode __pc_apply_PEPLinear(PC, Vec, Vec); -static PetscErrorCode __mat_apply_PEP_A0(Mat, Vec, Vec); -static PetscErrorCode __mat_apply_PEP_A1(Mat, Vec, Vec); -static PetscErrorCode __mat_apply_PEP_A2(Mat, Vec, Vec); -static PetscErrorCode __mat_apply_PEP_B(Mat, Vec, Vec); -static PetscErrorCode __pc_apply_PEP(PC, Vec, Vec); - -namespace palace::slepc -{ - -namespace -{ - -struct MatShellContext -{ - const ComplexOperator &A; - ComplexVector &x, &y; -}; - -PetscErrorCode __mat_apply_shell(Mat A, Vec x, Vec y) -{ - PetscFunctionBeginUser; - MatShellContext *ctx; - PetscCall(MatShellGetContext(A, (void **)&ctx)); - MFEM_VERIFY(ctx, "Invalid PETSc shell matrix context for SLEPc!"); - - PetscInt n; - PetscCall(VecGetLocalSize(x, &n)); - - const PetscScalar *px; - PetscCall(VecGetArrayRead(x, &px)); - ctx->x.Set(px, n); - PetscCall(VecRestoreArrayRead(x, &px)); - - ctx->A.Mult(ctx->x, ctx->y); - - PetscScalar *py; - PetscCall(VecGetArrayWrite(y, &py)); - ctx->y.Get(py, n); - PetscCall(VecRestoreArrayWrite(y, &py)); - - PetscFunctionReturn(0); -} - -PetscErrorCode __mat_apply_transpose_shell(Mat A, Vec x, Vec y) -{ - PetscFunctionBeginUser; - MatShellContext *ctx; - PetscCall(MatShellGetContext(A, (void **)&ctx)); - MFEM_VERIFY(ctx, "Invalid PETSc shell matrix context for SLEPc!"); - - PetscInt n; - PetscCall(VecGetLocalSize(x, &n)); - - const PetscScalar *px; - PetscCall(VecGetArrayRead(x, &px)); - ctx->x.Set(px, n); - PetscCall(VecRestoreArrayRead(x, &px)); - - ctx->A.MultTranspose(ctx->x, ctx->y); - - PetscScalar *py; - PetscCall(VecGetArrayWrite(y, &py)); - ctx->y.Get(py, n); - PetscCall(VecRestoreArrayWrite(y, &py)); - - PetscFunctionReturn(0); -} - -PetscErrorCode __mat_apply_hermitian_transpose_shell(Mat A, Vec x, Vec y) -{ - PetscFunctionBeginUser; - MatShellContext *ctx; - PetscCall(MatShellGetContext(A, (void **)&ctx)); - MFEM_VERIFY(ctx, "Invalid PETSc shell matrix context for SLEPc!"); - - PetscInt n; - PetscCall(VecGetLocalSize(x, &n)); - - const PetscScalar *px; - PetscCall(VecGetArrayRead(x, &px)); - ctx->x.Set(px, n); - PetscCall(VecRestoreArrayRead(x, &px)); - - ctx->A.MultHermitianTranspose(ctx->x, ctx->y); - - PetscScalar *py; - PetscCall(VecGetArrayWrite(y, &py)); - ctx->y.Get(py, n); - PetscCall(VecRestoreArrayWrite(y, &py)); - - PetscFunctionReturn(0); -}; - -void ConfigurePCShell(ST st, void *ctx, PetscErrorCode (*__pc_apply)(PC, Vec, Vec)) -{ - KSP ksp; - PC pc; - PalacePetscCall(STGetKSP(st, &ksp)); - PalacePetscCall(KSPGetPC(ksp, &pc)); - PalacePetscCall(PCSetType(pc, PCSHELL)); - PalacePetscCall(PCShellSetContext(pc, ctx)); - PalacePetscCall(PCShellSetApply(pc, __pc_apply)); -} - -void ConfigureRG(RG rg, PetscReal lr, PetscReal ur, PetscReal li, PetscReal ui, - bool complement = false) -{ - PalacePetscCall(RGSetType(rg, RGINTERVAL)); - PalacePetscCall(RGIntervalSetEndpoints(rg, lr, ur, li, ui)); - if (complement) - { - 
PalacePetscCall(RGSetComplement(rg, PETSC_TRUE)); - } -} - -} // namespace - -void Initialize(int &argc, char **&argv, const char rc_file[], const char help[]) -{ - // Remove default PETSc signal handling, since it can be confusing when the errors occur - // not from within SLEPc/PETSc. - PalacePetscCall(SlepcInitialize(&argc, &argv, rc_file, help)); - PalacePetscCall(PetscPopSignalHandler()); -} - -void Initialize() -{ - // Remove default PETSc signal handling, since it can be confusing when the errors occur - // not from within SLEPc/PETSc. - PalacePetscCall(SlepcInitializeNoArguments()); - PalacePetscCall(PetscPopSignalHandler()); -} - -void Finalize() -{ - PalacePetscCall(SlepcFinalize()); -} - -PetscReal GetMaxSingularValue(MPI_Comm comm, const ComplexOperator &A, bool herm, - PetscReal tol, PetscInt max_it) -{ - // This method assumes the provided operator has the required operations for SLEPc's EPS - // or SVD solvers, namely MATOP_MULT and MATOP_MULT_HERMITIAN_TRANSPOSE (if the matrix - // is not Hermitian). - Mat A0; - PetscInt n = A.Height(); - ComplexVector x(n), y(n); - MatShellContext ctx = {A, x, y}; - PalacePetscCall( - MatCreateShell(comm, n, n, PETSC_DECIDE, PETSC_DECIDE, (void *)&ctx, &A0)); - PalacePetscCall(MatShellSetOperation(A0, MATOP_MULT, (void (*)(void))__mat_apply_shell)); - if (herm) - { - EPS eps; - PetscInt num_conv; - PetscScalar eig; - PalacePetscCall(EPSCreate(comm, &eps)); - PalacePetscCall(EPSSetOperators(eps, A0, nullptr)); - PalacePetscCall(EPSSetProblemType(eps, EPS_HEP)); - PalacePetscCall(EPSSetWhichEigenpairs(eps, EPS_LARGEST_MAGNITUDE)); - PalacePetscCall(EPSSetDimensions(eps, 1, PETSC_DEFAULT, PETSC_DEFAULT)); - PalacePetscCall(EPSSetTolerances(eps, tol, max_it)); - PalacePetscCall(EPSSolve(eps)); - PalacePetscCall(EPSGetConverged(eps, &num_conv)); - if (num_conv < 1) - { - Mpi::Warning(comm, "SLEPc EPS solve did not converge for maximum singular value!\n"); - eig = 0.0; - } - else - { - PalacePetscCall(EPSGetEigenvalue(eps, 0, &eig, nullptr)); - MFEM_VERIFY(PetscImaginaryPart(eig) == 0.0, - "Unexpected complex eigenvalue for Hermitian matrix (λ = " << eig - << ")!"); - } - PalacePetscCall(EPSDestroy(&eps)); - PalacePetscCall(MatDestroy(&A0)); - return PetscAbsScalar(eig); - } - else - { - PalacePetscCall(MatShellSetOperation(A0, MATOP_MULT_TRANSPOSE, - (void (*)(void))__mat_apply_transpose_shell)); - PalacePetscCall( - MatShellSetOperation(A0, MATOP_MULT_HERMITIAN_TRANSPOSE, - (void (*)(void))__mat_apply_hermitian_transpose_shell)); - - SVD svd; - PetscInt num_conv; - PetscReal sigma; - PalacePetscCall(SVDCreate(comm, &svd)); - PalacePetscCall(SVDSetOperators(svd, A0, nullptr)); - PalacePetscCall(SVDSetProblemType(svd, SVD_STANDARD)); - PalacePetscCall(SVDSetWhichSingularTriplets(svd, SVD_LARGEST)); - PalacePetscCall(SVDSetDimensions(svd, 1, PETSC_DEFAULT, PETSC_DEFAULT)); - PalacePetscCall(SVDSetTolerances(svd, tol, max_it)); - PalacePetscCall(SVDSolve(svd)); - PalacePetscCall(SVDGetConverged(svd, &num_conv)); - if (num_conv < 1) - { - Mpi::Warning(comm, "SLEPc SVD solve did not converge for maximum singular value!\n"); - sigma = 0.0; - } - else - { - PalacePetscCall(SVDGetSingularTriplet(svd, 0, &sigma, nullptr, nullptr)); - } - PalacePetscCall(SVDDestroy(&svd)); - PalacePetscCall(MatDestroy(&A0)); - return sigma; - } -} - -// Eigensolver base class methods - -SlepcEigenvalueSolver::SlepcEigenvalueSolver(int print) : print(print) -{ - sinvert = false; - region = true; - sigma = 0.0; - gamma = delta = 1.0; - - opInv = nullptr; - opProj = 
nullptr; - opB = nullptr; - - B0 = nullptr; - v0 = nullptr; - - cl_custom = false; -} - -SlepcEigenvalueSolver::~SlepcEigenvalueSolver() -{ - PalacePetscCall(MatDestroy(&B0)); - PalacePetscCall(VecDestroy(&v0)); -} - -void SlepcEigenvalueSolver::SetOperators(const ComplexOperator &K, const ComplexOperator &M, - EigenvalueSolver::ScaleType type) -{ - MFEM_ABORT("SetOperators not defined for base class SlepcEigenvalueSolver!"); -} - -void SlepcEigenvalueSolver::SetOperators(const ComplexOperator &K, const ComplexOperator &C, - const ComplexOperator &M, - EigenvalueSolver::ScaleType type) -{ - MFEM_ABORT("SetOperators not defined for base class SlepcEigenvalueSolver!"); -} - -void SlepcEigenvalueSolver::SetLinearSolver(const ComplexKspSolver &ksp) -{ - opInv = &ksp; -} - -void SlepcEigenvalueSolver::SetDivFreeProjector(const DivFreeSolver &divfree) -{ - opProj = &divfree; -} - -void SlepcEigenvalueSolver::SetBMat(const Operator &B) -{ - opB = &B; -} - -void SlepcEigenvalueSolver::SetShiftInvert(PetscScalar s, bool precond) -{ - ST st = GetST(); - if (precond) - { - PalacePetscCall(STSetType(st, STPRECOND)); - } - else - { - PalacePetscCall(STSetType(st, STSINVERT)); - } - PalacePetscCall(STSetTransform(st, PETSC_TRUE)); - PalacePetscCall(STSetMatMode(st, ST_MATMODE_SHELL)); - sigma = s; // Wait until solve time to call EPS/PEPSetTarget - sinvert = true; -} - -void SlepcEigenvalueSolver::SetOrthogonalization(bool mgs, bool cgs2) -{ - // The SLEPc default is CGS with refinement if needed. - if (mgs || cgs2) - { - BV bv = GetBV(); - BVOrthogType type; - BVOrthogRefineType refine; - if (mgs) - { - type = BV_ORTHOG_MGS; - refine = BV_ORTHOG_REFINE_NEVER; - } - else // cgs2 - { - type = BV_ORTHOG_CGS; - refine = BV_ORTHOG_REFINE_ALWAYS; - } - PalacePetscCall(BVSetOrthogonalization(bv, type, refine, 1.0, BV_ORTHOG_BLOCK_GS)); - } -} - -void SlepcEigenvalueSolver::Customize() -{ - // Configure the KSP object for non-preconditioned spectral transformations. - PetscBool precond; - ST st = GetST(); - PalacePetscCall( - PetscObjectTypeCompare(reinterpret_cast(st), STPRECOND, &precond)); - if (!precond) - { - KSP ksp; - PalacePetscCall(STGetKSP(st, &ksp)); - PalacePetscCall(KSPSetType(ksp, KSPPREONLY)); - } - - // Configure the region based on the given target if necessary. 
- if (sinvert && region) - { - if (PetscImaginaryPart(sigma) == 0.0) - { - PetscReal sr = PetscRealPart(sigma); - if (sr > 0.0) - { - ConfigureRG(GetRG(), sr / gamma, mfem::infinity(), -mfem::infinity(), - mfem::infinity()); - } - else if (sr < 0.0) - { - ConfigureRG(GetRG(), -mfem::infinity(), sr / gamma, -mfem::infinity(), - mfem::infinity()); - } - } - else if (PetscRealPart(sigma) == 0.0) - { - PetscReal si = PetscImaginaryPart(sigma); - if (si > 0.0) - { - ConfigureRG(GetRG(), -mfem::infinity(), mfem::infinity(), si / gamma, - mfem::infinity()); - } - else if (si < 0.0) - { - ConfigureRG(GetRG(), -mfem::infinity(), mfem::infinity(), -mfem::infinity(), - si / gamma); - } - } - else - { - MFEM_ABORT("Shift-and-invert with general complex eigenvalue target is unsupported!"); - } - } -} - -PetscReal SlepcEigenvalueSolver::GetError(int i, EigenvalueSolver::ErrorType type) const -{ - switch (type) - { - case ErrorType::ABSOLUTE: - return res.get()[i]; - case ErrorType::RELATIVE: - return res.get()[i] / PetscAbsScalar(GetEigenvalue(i)); - case ErrorType::BACKWARD: - return res.get()[i] / GetBackwardScaling(GetEigenvalue(i)); - } - return 0.0; -} - -// EPS specific methods - -SlepcEPSSolverBase::SlepcEPSSolverBase(MPI_Comm comm, int print, const std::string &prefix) - : SlepcEigenvalueSolver(print) -{ - PalacePetscCall(EPSCreate(comm, &eps)); - PalacePetscCall(EPSSetOptionsPrefix(eps, prefix.c_str())); - if (print > 0) - { - std::string opts = "-eps_monitor"; - if (print > 2) - { - opts.append(" -eps_view"); - } - if (prefix.length() > 0) - { - PetscOptionsPrefixPush(nullptr, prefix.c_str()); - } - PetscOptionsInsertString(nullptr, opts.c_str()); - if (prefix.length() > 0) - { - PetscOptionsPrefixPop(nullptr); - } - } - A0 = A1 = nullptr; -} - -SlepcEPSSolverBase::~SlepcEPSSolverBase() -{ - PalacePetscCall(EPSDestroy(&eps)); - PalacePetscCall(MatDestroy(&A0)); - PalacePetscCall(MatDestroy(&A1)); -} - -void SlepcEPSSolverBase::SetNumModes(int num_eig, int num_vec) -{ - PalacePetscCall(EPSSetDimensions(eps, num_eig, (num_vec > 0) ? num_vec : PETSC_DEFAULT, - PETSC_DEFAULT)); -} - -void SlepcEPSSolverBase::SetTol(PetscReal tol) -{ - PalacePetscCall(EPSSetTolerances(eps, tol, PETSC_DEFAULT)); - PalacePetscCall(EPSSetConvergenceTest(eps, EPS_CONV_REL)); - // PalacePetscCall(EPSSetTrackAll(eps, PETSC_TRUE)); - // PalacePetscCall(EPSSetTrueResidual(eps, PETSC_TRUE)); -} - -void SlepcEPSSolverBase::SetMaxIter(int max_it) -{ - PalacePetscCall( - EPSSetTolerances(eps, PETSC_DEFAULT, (max_it > 0) ? 
max_it : PETSC_DEFAULT)); -} - -void SlepcEPSSolverBase::SetWhichEigenpairs(EigenvalueSolver::WhichType type) -{ - switch (type) - { - case WhichType::LARGEST_MAGNITUDE: - PalacePetscCall(EPSSetWhichEigenpairs(eps, EPS_LARGEST_MAGNITUDE)); - region = false; - break; - case WhichType::SMALLEST_MAGNITUDE: - PalacePetscCall(EPSSetWhichEigenpairs(eps, EPS_SMALLEST_MAGNITUDE)); - region = false; - break; - case WhichType::LARGEST_REAL: - PalacePetscCall(EPSSetWhichEigenpairs(eps, EPS_LARGEST_REAL)); - break; - case WhichType::SMALLEST_REAL: - PalacePetscCall(EPSSetWhichEigenpairs(eps, EPS_SMALLEST_REAL)); - break; - case WhichType::LARGEST_IMAGINARY: - PalacePetscCall(EPSSetWhichEigenpairs(eps, EPS_LARGEST_IMAGINARY)); - break; - case WhichType::SMALLEST_IMAGINARY: - PalacePetscCall(EPSSetWhichEigenpairs(eps, EPS_SMALLEST_IMAGINARY)); - break; - case WhichType::TARGET_MAGNITUDE: - PalacePetscCall(EPSSetWhichEigenpairs(eps, EPS_TARGET_MAGNITUDE)); - region = false; - break; - case WhichType::TARGET_REAL: - PalacePetscCall(EPSSetWhichEigenpairs(eps, EPS_TARGET_REAL)); - break; - case WhichType::TARGET_IMAGINARY: - PalacePetscCall(EPSSetWhichEigenpairs(eps, EPS_TARGET_IMAGINARY)); - break; - } -} - -void SlepcEPSSolverBase::SetProblemType(SlepcEigenvalueSolver::ProblemType type) -{ - switch (type) - { - case ProblemType::HERMITIAN: - PalacePetscCall(EPSSetProblemType(eps, EPS_HEP)); - break; - case ProblemType::NON_HERMITIAN: - PalacePetscCall(EPSSetProblemType(eps, EPS_NHEP)); - break; - case ProblemType::GEN_HERMITIAN: - PalacePetscCall(EPSSetProblemType(eps, EPS_GHEP)); - break; - case ProblemType::GEN_INDEFINITE: - PalacePetscCall(EPSSetProblemType(eps, EPS_GHIEP)); - break; - case ProblemType::GEN_NON_HERMITIAN: - PalacePetscCall(EPSSetProblemType(eps, EPS_GNHEP)); - // PalacePetscCall(EPSSetProblemType(eps, EPS_PGNHEP)); // If B is SPD - break; - case ProblemType::HYPERBOLIC: - case ProblemType::GYROSCOPIC: - MFEM_ABORT("Problem type not implemented!"); - break; - } -} - -void SlepcEPSSolverBase::SetType(SlepcEigenvalueSolver::Type type) -{ - switch (type) - { - case Type::KRYLOVSCHUR: - PalacePetscCall(EPSSetType(eps, EPSKRYLOVSCHUR)); - break; - case Type::POWER: - PalacePetscCall(EPSSetType(eps, EPSPOWER)); - break; - case Type::SUBSPACE: - PalacePetscCall(EPSSetType(eps, EPSSUBSPACE)); - break; - case Type::JD: - PalacePetscCall(EPSSetType(eps, EPSJD)); - region = false; - break; - case Type::TOAR: - case Type::STOAR: - case Type::QARNOLDI: - MFEM_ABORT("Eigenvalue solver type not implemented!"); - break; - } -} - -void SlepcEPSSolverBase::SetInitialSpace(const ComplexVector &v) -{ - MFEM_VERIFY( - A0 && A1, - "Must call SetOperators before using SetInitialSpace for SLEPc eigenvalue solver!"); - if (!v0) - { - PalacePetscCall(MatCreateVecs(A0, nullptr, &v0)); - } - - PetscInt n; - PalacePetscCall(VecGetLocalSize(v0, &n)); - MFEM_VERIFY(v.Size() == n, "Invalid size mismatch for provided initial space vector!"); - - PetscScalar *pv0; - PalacePetscCall(VecGetArrayWrite(v0, &pv0)); - v.Get(pv0, n); - PalacePetscCall(VecRestoreArrayWrite(v0, &pv0)); - - Vec is[1] = {v0}; - PalacePetscCall(EPSSetInitialSpace(eps, 1, is)); -} - -void SlepcEPSSolverBase::Customize() -{ - SlepcEigenvalueSolver::Customize(); - PalacePetscCall(EPSSetTarget(eps, sigma / gamma)); - if (!cl_custom) - { - PalacePetscCall(EPSSetFromOptions(eps)); - if (print > 0) - { - PetscOptionsView(nullptr, PETSC_VIEWER_STDOUT_(GetComm())); - Mpi::Print(GetComm(), "\n"); - } - cl_custom = true; - } -} - -int 
SlepcEPSSolverBase::Solve() -{ - MFEM_VERIFY(A0 && A1 && opInv, "Operators are not set for SlepcEPSSolverBase!"); - - // Solve the eigenvalue problem. - PetscInt num_conv; - Customize(); - PalacePetscCall(EPSSolve(eps)); - PalacePetscCall(EPSGetConverged(eps, &num_conv)); - if (print > 0) - { - Mpi::Print(GetComm(), "\n"); - PalacePetscCall(EPSConvergedReasonView(eps, PETSC_VIEWER_STDOUT_(GetComm()))); - Mpi::Print(GetComm(), - " Total number of linear systems solved: {:d}\n" - " Total number of linear solver iterations: {:d}\n", - opInv->NumTotalMult(), opInv->NumTotalMultIterations()); - } - - // Compute and store the eigenpair residuals. - res = std::make_unique(num_conv); - for (int i = 0; i < num_conv; i++) - { - res.get()[i] = GetResidualNorm(i); - } - return (int)num_conv; -} - -PetscScalar SlepcEPSSolverBase::GetEigenvalue(int i) const -{ - PetscScalar l; - PalacePetscCall(EPSGetEigenvalue(eps, i, &l, nullptr)); - return l * gamma; -} - -void SlepcEPSSolverBase::GetEigenvector(int i, ComplexVector &x) const -{ - MFEM_VERIFY( - v0, - "Must call SetOperators before using GetEigenvector for SLEPc eigenvalue solver!"); - PalacePetscCall(EPSGetEigenvector(eps, i, v0, nullptr)); - - PetscInt n; - PalacePetscCall(VecGetLocalSize(v0, &n)); - MFEM_VERIFY(x.Size() == n, "Invalid size mismatch for provided eigenvector!"); - - const PetscScalar *pv0; - PalacePetscCall(VecGetArrayRead(v0, &pv0)); - x.Set(pv0, n); - PalacePetscCall(VecRestoreArrayRead(v0, &pv0)); -} - -BV SlepcEPSSolverBase::GetBV() const -{ - BV bv; - PalacePetscCall(EPSGetBV(eps, &bv)); - return bv; -} - -ST SlepcEPSSolverBase::GetST() const -{ - ST st; - PalacePetscCall(EPSGetST(eps, &st)); - return st; -} - -RG SlepcEPSSolverBase::GetRG() const -{ - RG rg; - PalacePetscCall(EPSGetRG(eps, &rg)); - return rg; -} - -SlepcEPSSolver::SlepcEPSSolver(MPI_Comm comm, int print, const std::string &prefix) - : SlepcEPSSolverBase(comm, print, prefix) -{ - opK = opM = nullptr; - normK = normM = 0.0; -} - -void SlepcEPSSolver::SetOperators(const ComplexOperator &K, const ComplexOperator &M, - EigenvalueSolver::ScaleType type) -{ - // Construct shell matrices for the scaled operators which define the generalized - // eigenvalue problem. - bool first = (opK == nullptr); - opK = &K; - opM = &M; - - if (first) - { - PetscInt n = opK->Height(); - PalacePetscCall( - MatCreateShell(GetComm(), n, n, PETSC_DECIDE, PETSC_DECIDE, (void *)this, &A0)); - PalacePetscCall( - MatCreateShell(GetComm(), n, n, PETSC_DECIDE, PETSC_DECIDE, (void *)this, &A1)); - PalacePetscCall( - MatShellSetOperation(A0, MATOP_MULT, (void (*)(void))__mat_apply_EPS_A0)); - PalacePetscCall( - MatShellSetOperation(A1, MATOP_MULT, (void (*)(void))__mat_apply_EPS_A1)); - PalacePetscCall(EPSSetOperators(eps, A0, A1)); - } - - if (first && type != ScaleType::NONE) - { - normK = linalg::SpectralNorm(GetComm(), *opK, opK->IsReal()); - normM = linalg::SpectralNorm(GetComm(), *opM, opM->IsReal()); - MFEM_VERIFY(normK >= 0.0 && normM >= 0.0, "Invalid matrix norms for EPS scaling!"); - if (normK > 0 && normM > 0.0) - { - gamma = normK / normM; // Store γ² for linear problem - delta = 2.0 / normK; - } - } - - // Set up workspace. - if (!v0) - { - PalacePetscCall(MatCreateVecs(A0, nullptr, &v0)); - } - x.SetSize(opK->Height()); - y.SetSize(opK->Height()); - - // Configure linear solver for generalized problem or spectral transformation. This also - // allows use of the divergence-free projector as a linear solve side-effect. 
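For context on the shell preconditioner configured here, the standard shift-and-invert relation (stated as an assumption rather than taken from the patch) is why eigenvalues nearest the target σ become the largest-magnitude eigenvalues of the transformed operator,

$$(K - \sigma M)^{-1} M\,x = \theta\,x \;\Longleftrightarrow\; K x = \lambda M x, \qquad \theta = \frac{1}{\lambda - \sigma}, \quad \lambda = \sigma + \frac{1}{\theta},$$

so each outer iteration requires the linear solve y = (K − σ M)⁻¹ x, which is what opInv performs inside the __pc_apply_* callbacks below.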
- if (first) - { - ConfigurePCShell(GetST(), (void *)this, __pc_apply_EPS); - } -} - -void SlepcEPSSolver::SetBMat(const Operator &B) -{ - SlepcEigenvalueSolver::SetBMat(B); - - PetscInt n = B.Height(); - PalacePetscCall( - MatCreateShell(GetComm(), n, n, PETSC_DECIDE, PETSC_DECIDE, (void *)this, &B0)); - PalacePetscCall(MatShellSetOperation(B0, MATOP_MULT, (void (*)(void))__mat_apply_EPS_B)); - - BV bv = GetBV(); - PalacePetscCall(BVSetMatrix(bv, B0, PETSC_FALSE)); -} - -PetscReal SlepcEPSSolver::GetResidualNorm(int i) const -{ - // Compute the i-th eigenpair residual: || (K - λ M) x ||₂ for eigenvalue λ. - PetscScalar l = GetEigenvalue(i); - GetEigenvector(i, x); - opK->Mult(x, y); - opM->AddMult(x, y, -l); - return linalg::Norml2(GetComm(), y); -} - -PetscReal SlepcEPSSolver::GetBackwardScaling(PetscScalar l) const -{ - // Make sure not to use norms from scaling as this can be confusing if they are different. - // Note that SLEPc typically uses ||.||∞, not the 2-norm. - if (normK <= 0.0) - { - normK = linalg::SpectralNorm(GetComm(), *opK, opK->IsReal()); - } - if (normM <= 0.0) - { - normM = linalg::SpectralNorm(GetComm(), *opM, opM->IsReal()); - } - return normK + PetscAbsScalar(l) * normM; -} - -SlepcPEPLinearSolver::SlepcPEPLinearSolver(MPI_Comm comm, int print, - const std::string &prefix) - : SlepcEPSSolverBase(comm, print, prefix) -{ - opK = opC = opM = nullptr; - normK = normC = normM = 0.0; -} - -void SlepcPEPLinearSolver::SetOperators(const ComplexOperator &K, const ComplexOperator &C, - const ComplexOperator &M, - EigenvalueSolver::ScaleType type) -{ - // Construct shell matrices for the scaled linearized operators which define the block 2x2 - // eigenvalue problem. - bool first = (opK == nullptr); - opK = &K; - opC = &C; - opM = &M; - - if (first) - { - PetscInt n = opK->Height(); - PalacePetscCall(MatCreateShell(GetComm(), 2 * n, 2 * n, PETSC_DECIDE, PETSC_DECIDE, - (void *)this, &A0)); - PalacePetscCall(MatCreateShell(GetComm(), 2 * n, 2 * n, PETSC_DECIDE, PETSC_DECIDE, - (void *)this, &A1)); - PalacePetscCall( - MatShellSetOperation(A0, MATOP_MULT, (void (*)(void))__mat_apply_PEPLinear_L0)); - PalacePetscCall( - MatShellSetOperation(A1, MATOP_MULT, (void (*)(void))__mat_apply_PEPLinear_L1)); - PalacePetscCall(EPSSetOperators(eps, A0, A1)); - } - - if (first && type != ScaleType::NONE) - { - normK = linalg::SpectralNorm(GetComm(), *opK, opK->IsReal()); - normC = linalg::SpectralNorm(GetComm(), *opC, opC->IsReal()); - normM = linalg::SpectralNorm(GetComm(), *opM, opM->IsReal()); - MFEM_VERIFY(normK >= 0.0 && normC >= 0.0 && normM >= 0.0, - "Invalid matrix norms for PEP scaling!"); - if (normK > 0 && normC > 0.0 && normM > 0.0) - { - gamma = std::sqrt(normK / normM); - delta = 2.0 / (normK + gamma * normC); - } - } - - // Set up workspace. - if (!v0) - { - PalacePetscCall(MatCreateVecs(A0, nullptr, &v0)); - } - x1.SetSize(opK->Height()); - x2.SetSize(opK->Height()); - y1.SetSize(opK->Height()); - y2.SetSize(opK->Height()); - - // Configure linear solver. 
- if (first) - { - ConfigurePCShell(GetST(), (void *)this, __pc_apply_PEPLinear); - } -} - -void SlepcPEPLinearSolver::SetBMat(const Operator &B) -{ - SlepcEigenvalueSolver::SetBMat(B); - - PetscInt n = B.Height(); - PalacePetscCall(MatCreateShell(GetComm(), 2 * n, 2 * n, PETSC_DECIDE, PETSC_DECIDE, - (void *)this, &B0)); - PalacePetscCall( - MatShellSetOperation(B0, MATOP_MULT, (void (*)(void))__mat_apply_PEPLinear_B)); - - BV bv = GetBV(); - PalacePetscCall(BVSetMatrix(bv, B0, PETSC_FALSE)); -} - -void SlepcPEPLinearSolver::SetInitialSpace(const ComplexVector &v) -{ - MFEM_VERIFY( - A0 && A1, - "Must call SetOperators before using SetInitialSpace for SLEPc eigenvalue solver!"); - if (!v0) - { - PalacePetscCall(MatCreateVecs(A0, nullptr, &v0)); - } - - PetscInt n; - PalacePetscCall(VecGetLocalSize(v0, &n)); - MFEM_VERIFY(2 * v.Size() == n, - "Invalid size mismatch for provided initial space vector!"); - - PetscScalar *pv0; - PalacePetscCall(VecGetArrayWrite(v0, &pv0)); - v.Get(pv0, n / 2); - std::fill(pv0 + n / 2, pv0 + n, 0.0); - PalacePetscCall(VecRestoreArrayWrite(v0, &pv0)); - - Vec is[1] = {v0}; - PalacePetscCall(EPSSetInitialSpace(eps, 1, is)); -} - -void SlepcPEPLinearSolver::GetEigenvector(int i, ComplexVector &x) const -{ - // Select the most accurate x for y = [x₁; x₂] from the linearized eigenvalue problem. Or, - // just take x = x₁. - MFEM_VERIFY( - v0, - "Must call SetOperators before using GetEigenvector for SLEPc eigenvalue solver!"); - PalacePetscCall(EPSGetEigenvector(eps, i, v0, nullptr)); - PetscInt n; - PalacePetscCall(VecGetLocalSize(v0, &n)); - MFEM_VERIFY(2 * x.Size() == n, "Invalid size mismatch for provided eigenvector!"); - - const PetscScalar *pv0; - PalacePetscCall(VecGetArrayRead(v0, &pv0)); - x.Set(pv0, n / 2); - PalacePetscCall(VecRestoreArrayRead(v0, &pv0)); - - if (opB) - { - linalg::Normalize(GetComm(), x, *opB, y1); - } - else - { - linalg::Normalize(GetComm(), x); - } -} - -PetscReal SlepcPEPLinearSolver::GetResidualNorm(int i) const -{ - // Compute the i-th eigenpair residual: || P(λ) x ||₂ = || (K + λ C + λ² M) x ||₂ for - // eigenvalue λ. - PetscScalar l = GetEigenvalue(i); - GetEigenvector(i, x1); - opK->Mult(x1, y1); - opC->AddMult(x1, y1, l); - opM->AddMult(x1, y1, l * l); - return linalg::Norml2(GetComm(), y1); -} - -PetscReal SlepcPEPLinearSolver::GetBackwardScaling(PetscScalar l) const -{ - // Make sure not to use norms from scaling as this can be confusing if they are different. - // Note that SLEPc typically uses ||.||∞, not the 2-norm. 
- if (normK <= 0.0) - { - normK = linalg::SpectralNorm(GetComm(), *opK, opK->IsReal()); - } - if (normC <= 0.0) - { - normC = linalg::SpectralNorm(GetComm(), *opC, opC->IsReal()); - } - if (normM <= 0.0) - { - normM = linalg::SpectralNorm(GetComm(), *opM, opM->IsReal()); - } - PetscReal t = PetscAbsScalar(l); - return normK + t * normC + t * t * normM; -} - -// PEP specific methods - -SlepcPEPSolverBase::SlepcPEPSolverBase(MPI_Comm comm, int print, const std::string &prefix) - : SlepcEigenvalueSolver(print) -{ - PalacePetscCall(PEPCreate(comm, &pep)); - PalacePetscCall(PEPSetOptionsPrefix(pep, prefix.c_str())); - if (print > 0) - { - std::string opts = "-pep_monitor"; - if (print > 2) - { - opts.append(" -pep_view"); - } - if (prefix.length() > 0) - { - PetscOptionsPrefixPush(nullptr, prefix.c_str()); - } - PetscOptionsInsertString(nullptr, opts.c_str()); - if (prefix.length() > 0) - { - PetscOptionsPrefixPop(nullptr); - } - } - A0 = A1 = A2 = nullptr; -} - -SlepcPEPSolverBase::~SlepcPEPSolverBase() -{ - PalacePetscCall(PEPDestroy(&pep)); - PalacePetscCall(MatDestroy(&A0)); - PalacePetscCall(MatDestroy(&A1)); - PalacePetscCall(MatDestroy(&A2)); -} - -void SlepcPEPSolverBase::SetNumModes(int num_eig, int num_vec) -{ - PalacePetscCall(PEPSetDimensions(pep, num_eig, (num_vec > 0) ? num_vec : PETSC_DEFAULT, - PETSC_DEFAULT)); -} - -void SlepcPEPSolverBase::SetTol(PetscReal tol) -{ - PalacePetscCall(PEPSetTolerances(pep, tol, PETSC_DEFAULT)); - PalacePetscCall(PEPSetConvergenceTest(pep, PEP_CONV_REL)); - // PalacePetscCall(PEPSetTrackAll(pep, PETSC_TRUE)); -} - -void SlepcPEPSolverBase::SetMaxIter(int max_it) -{ - PalacePetscCall( - PEPSetTolerances(pep, PETSC_DEFAULT, (max_it > 0) ? max_it : PETSC_DEFAULT)); -} - -void SlepcPEPSolverBase::SetWhichEigenpairs(EigenvalueSolver::WhichType type) -{ - switch (type) - { - case WhichType::LARGEST_MAGNITUDE: - PalacePetscCall(PEPSetWhichEigenpairs(pep, PEP_LARGEST_MAGNITUDE)); - region = false; - break; - case WhichType::SMALLEST_MAGNITUDE: - PalacePetscCall(PEPSetWhichEigenpairs(pep, PEP_SMALLEST_MAGNITUDE)); - region = false; - break; - case WhichType::LARGEST_REAL: - PalacePetscCall(PEPSetWhichEigenpairs(pep, PEP_LARGEST_REAL)); - break; - case WhichType::SMALLEST_REAL: - PalacePetscCall(PEPSetWhichEigenpairs(pep, PEP_SMALLEST_REAL)); - break; - case WhichType::LARGEST_IMAGINARY: - PalacePetscCall(PEPSetWhichEigenpairs(pep, PEP_LARGEST_IMAGINARY)); - break; - case WhichType::SMALLEST_IMAGINARY: - PalacePetscCall(PEPSetWhichEigenpairs(pep, PEP_SMALLEST_IMAGINARY)); - break; - case WhichType::TARGET_MAGNITUDE: - PalacePetscCall(PEPSetWhichEigenpairs(pep, PEP_TARGET_MAGNITUDE)); - region = false; - break; - case WhichType::TARGET_REAL: - PalacePetscCall(PEPSetWhichEigenpairs(pep, PEP_TARGET_REAL)); - break; - case WhichType::TARGET_IMAGINARY: - PalacePetscCall(PEPSetWhichEigenpairs(pep, PEP_TARGET_IMAGINARY)); - break; - } -} - -void SlepcPEPSolverBase::SetProblemType(SlepcEigenvalueSolver::ProblemType type) -{ - switch (type) - { - case ProblemType::HERMITIAN: - case ProblemType::GEN_HERMITIAN: - PalacePetscCall(PEPSetProblemType(pep, PEP_HERMITIAN)); - break; - case ProblemType::NON_HERMITIAN: - case ProblemType::GEN_INDEFINITE: - case ProblemType::GEN_NON_HERMITIAN: - PalacePetscCall(PEPSetProblemType(pep, PEP_GENERAL)); - break; - case ProblemType::HYPERBOLIC: - PalacePetscCall(PEPSetProblemType(pep, PEP_HYPERBOLIC)); - break; - case ProblemType::GYROSCOPIC: - PalacePetscCall(PEPSetProblemType(pep, PEP_GYROSCOPIC)); - break; - } -} - -void 
SlepcPEPSolverBase::SetType(SlepcEigenvalueSolver::Type type) -{ - switch (type) - { - case Type::TOAR: - PalacePetscCall(PEPSetType(pep, PEPTOAR)); - break; - case Type::STOAR: - PalacePetscCall(PEPSetType(pep, PEPSTOAR)); - break; - case Type::QARNOLDI: - PalacePetscCall(PEPSetType(pep, PEPQARNOLDI)); - break; - case Type::JD: - PalacePetscCall(PEPSetType(pep, PEPJD)); - region = false; - break; - case Type::KRYLOVSCHUR: - case Type::POWER: - case Type::SUBSPACE: - MFEM_ABORT("Eigenvalue solver type not implemented!"); - break; - } -} - -void SlepcPEPSolverBase::SetInitialSpace(const ComplexVector &v) -{ - MFEM_VERIFY( - A0 && A1 && A2, - "Must call SetOperators before using SetInitialSpace for SLEPc eigenvalue solver!"); - if (!v0) - { - PalacePetscCall(MatCreateVecs(A0, nullptr, &v0)); - } - - PetscInt n; - PalacePetscCall(VecGetLocalSize(v0, &n)); - MFEM_VERIFY(v.Size() == n, "Invalid size mismatch for provided initial space vector!"); - - PetscScalar *pv0; - PalacePetscCall(VecGetArrayWrite(v0, &pv0)); - v.Get(pv0, n); - PalacePetscCall(VecRestoreArrayWrite(v0, &pv0)); - - Vec is[1] = {v0}; - PalacePetscCall(PEPSetInitialSpace(pep, 1, is)); -} - -void SlepcPEPSolverBase::Customize() -{ - SlepcEigenvalueSolver::Customize(); - PalacePetscCall(PEPSetTarget(pep, sigma / gamma)); - if (!cl_custom) - { - PalacePetscCall(PEPSetFromOptions(pep)); - if (print > 0) - { - PetscOptionsView(nullptr, PETSC_VIEWER_STDOUT_(GetComm())); - Mpi::Print(GetComm(), "\n"); - } - cl_custom = true; - } -} - -int SlepcPEPSolverBase::Solve() -{ - MFEM_VERIFY(A0 && A1 && A2 && opInv, "Operators are not set for SlepcPEPSolverBase!"); - - // Solve the eigenvalue problem. - PetscInt num_conv; - Customize(); - PalacePetscCall(PEPSolve(pep)); - PalacePetscCall(PEPGetConverged(pep, &num_conv)); - if (print > 0) - { - Mpi::Print(GetComm(), "\n"); - PalacePetscCall(PEPConvergedReasonView(pep, PETSC_VIEWER_STDOUT_(GetComm()))); - Mpi::Print(GetComm(), - " Total number of linear systems solved: {:d}\n" - " Total number of linear solver iterations: {:d}\n", - opInv->NumTotalMult(), opInv->NumTotalMultIterations()); - } - - // Compute and store the eigenpair residuals. 
- res = std::make_unique(num_conv); - for (int i = 0; i < num_conv; i++) - { - res.get()[i] = GetResidualNorm(i); - } - return (int)num_conv; -} - -PetscScalar SlepcPEPSolverBase::GetEigenvalue(int i) const -{ - PetscScalar l; - PalacePetscCall(PEPGetEigenpair(pep, i, &l, nullptr, nullptr, nullptr)); - return l * gamma; -} - -void SlepcPEPSolverBase::GetEigenvector(int i, ComplexVector &x) const -{ - MFEM_VERIFY( - v0, - "Must call SetOperators before using GetEigenvector for SLEPc eigenvalue solver!"); - PalacePetscCall(PEPGetEigenpair(pep, i, nullptr, nullptr, v0, nullptr)); - - PetscInt n; - PalacePetscCall(VecGetLocalSize(v0, &n)); - MFEM_VERIFY(x.Size() == n, "Invalid size mismatch for provided eigenvector!"); - - const PetscScalar *pv0; - PalacePetscCall(VecGetArrayRead(v0, &pv0)); - x.Set(pv0, n); - PalacePetscCall(VecRestoreArrayRead(v0, &pv0)); -} - -BV SlepcPEPSolverBase::GetBV() const -{ - BV bv; - PalacePetscCall(PEPGetBV(pep, &bv)); - return bv; -} - -ST SlepcPEPSolverBase::GetST() const -{ - ST st; - PalacePetscCall(PEPGetST(pep, &st)); - return st; -} - -RG SlepcPEPSolverBase::GetRG() const -{ - RG rg; - PalacePetscCall(PEPGetRG(pep, &rg)); - return rg; -} - -SlepcPEPSolver::SlepcPEPSolver(MPI_Comm comm, int print, const std::string &prefix) - : SlepcPEPSolverBase(comm, print, prefix) -{ - opK = opC = opM = nullptr; - normK = normC = normM = 0.0; -} - -void SlepcPEPSolver::SetOperators(const ComplexOperator &K, const ComplexOperator &C, - const ComplexOperator &M, - EigenvalueSolver::ScaleType type) -{ - // Construct shell matrices for the scaled operators which define the quadratic polynomial - // eigenvalue problem. - bool first = (opK == nullptr); - opK = &K; - opC = &C; - opM = &M; - - if (first) - { - PetscInt n = opK->Height(); - PalacePetscCall( - MatCreateShell(GetComm(), n, n, PETSC_DECIDE, PETSC_DECIDE, (void *)this, &A0)); - PalacePetscCall( - MatCreateShell(GetComm(), n, n, PETSC_DECIDE, PETSC_DECIDE, (void *)this, &A1)); - PalacePetscCall( - MatCreateShell(GetComm(), n, n, PETSC_DECIDE, PETSC_DECIDE, (void *)this, &A2)); - PalacePetscCall( - MatShellSetOperation(A0, MATOP_MULT, (void (*)(void))__mat_apply_PEP_A0)); - PalacePetscCall( - MatShellSetOperation(A1, MATOP_MULT, (void (*)(void))__mat_apply_PEP_A1)); - PalacePetscCall( - MatShellSetOperation(A2, MATOP_MULT, (void (*)(void))__mat_apply_PEP_A2)); - Mat A[3] = {A0, A1, A2}; - PalacePetscCall(PEPSetOperators(pep, 3, A)); - } - - if (first && type != ScaleType::NONE) - { - normK = linalg::SpectralNorm(GetComm(), *opK, opK->IsReal()); - normC = linalg::SpectralNorm(GetComm(), *opC, opC->IsReal()); - normM = linalg::SpectralNorm(GetComm(), *opM, opM->IsReal()); - MFEM_VERIFY(normK >= 0.0 && normC >= 0.0 && normM >= 0.0, - "Invalid matrix norms for PEP scaling!"); - if (normK > 0 && normC > 0.0 && normM > 0.0) - { - gamma = std::sqrt(normK / normM); - delta = 2.0 / (normK + gamma * normC); - } - } - - // Set up workspace. - if (!v0) - { - PalacePetscCall(MatCreateVecs(A0, nullptr, &v0)); - } - x.SetSize(opK->Height()); - y.SetSize(opK->Height()); - - // Configure linear solver. 
- if (first) - { - ConfigurePCShell(GetST(), (void *)this, __pc_apply_PEP); - } -} - -void SlepcPEPSolver::SetBMat(const Operator &B) -{ - SlepcEigenvalueSolver::SetBMat(B); - - PetscInt n = B.Height(); - PalacePetscCall( - MatCreateShell(GetComm(), n, n, PETSC_DECIDE, PETSC_DECIDE, (void *)this, &B0)); - PalacePetscCall(MatShellSetOperation(B0, MATOP_MULT, (void (*)(void))__mat_apply_PEP_B)); - - BV bv = GetBV(); - PalacePetscCall(BVSetMatrix(bv, B0, PETSC_FALSE)); -} - -PetscReal SlepcPEPSolver::GetResidualNorm(int i) const -{ - // Compute the i-th eigenpair residual: || P(λ) x ||₂ = || (K + λ C + λ² M) x ||₂ for - // eigenvalue λ. - PetscScalar l = GetEigenvalue(i); - GetEigenvector(i, x); - opK->Mult(x, y); - opC->AddMult(x, y, l); - opM->AddMult(x, y, l * l); - return linalg::Norml2(GetComm(), y); -} - -PetscReal SlepcPEPSolver::GetBackwardScaling(PetscScalar l) const -{ - // Make sure not to use norms from scaling as this can be confusing if they are different. - // Note that SLEPc typically uses ||.||∞, not Frobenius. - if (normK <= 0.0) - { - normK = linalg::SpectralNorm(GetComm(), *opK, opK->IsReal()); - } - if (normC <= 0.0) - { - normC = linalg::SpectralNorm(GetComm(), *opC, opC->IsReal()); - } - if (normM <= 0.0) - { - normM = linalg::SpectralNorm(GetComm(), *opM, opM->IsReal()); - } - PetscReal t = PetscAbsScalar(l); - return normK + t * normC + t * t * normM; -} - -} // namespace palace::slepc - -PetscErrorCode __mat_apply_EPS_A0(Mat A, Vec x, Vec y) -{ - PetscFunctionBeginUser; - palace::slepc::SlepcEPSSolver *ctx; - PetscCall(MatShellGetContext(A, (void **)&ctx)); - MFEM_VERIFY(ctx, "Invalid PETSc shell matrix context for SLEPc!"); - - PetscInt n; - PetscCall(VecGetLocalSize(x, &n)); - - const PetscScalar *px; - PetscCall(VecGetArrayRead(x, &px)); - ctx->x.Set(px, n); - PetscCall(VecRestoreArrayRead(x, &px)); - - ctx->opK->Mult(ctx->x, ctx->y); - ctx->y *= ctx->delta; - - PetscScalar *py; - PetscCall(VecGetArrayWrite(y, &py)); - ctx->y.Get(py, n); - PetscCall(VecRestoreArrayWrite(y, &py)); - - PetscFunctionReturn(0); -} - -PetscErrorCode __mat_apply_EPS_A1(Mat A, Vec x, Vec y) -{ - PetscFunctionBeginUser; - palace::slepc::SlepcEPSSolver *ctx; - PetscCall(MatShellGetContext(A, (void **)&ctx)); - MFEM_VERIFY(ctx, "Invalid PETSc shell matrix context for SLEPc!"); - - PetscInt n; - PetscCall(VecGetLocalSize(x, &n)); - - const PetscScalar *px; - PetscCall(VecGetArrayRead(x, &px)); - ctx->x.Set(px, n); - PetscCall(VecRestoreArrayRead(x, &px)); - - ctx->opM->Mult(ctx->x, ctx->y); - ctx->y *= ctx->delta * ctx->gamma; - - PetscScalar *py; - PetscCall(VecGetArrayWrite(y, &py)); - ctx->y.Get(py, n); - PetscCall(VecRestoreArrayWrite(y, &py)); - - PetscFunctionReturn(0); -} - -PetscErrorCode __mat_apply_EPS_B(Mat A, Vec x, Vec y) -{ - PetscFunctionBeginUser; - palace::slepc::SlepcEPSSolver *ctx; - PetscCall(MatShellGetContext(A, (void **)&ctx)); - MFEM_VERIFY(ctx, "Invalid PETSc shell matrix context for SLEPc!"); - - PetscInt n; - PetscCall(VecGetLocalSize(x, &n)); - - const PetscScalar *px; - PetscCall(VecGetArrayRead(x, &px)); - ctx->x.Set(px, n); - PetscCall(VecRestoreArrayRead(x, &px)); - - ctx->opB->Mult(ctx->x.Real(), ctx->y.Real()); - ctx->opB->Mult(ctx->x.Imag(), ctx->y.Imag()); - ctx->y *= ctx->delta * ctx->gamma; - - PetscScalar *py; - PetscCall(VecGetArrayWrite(y, &py)); - ctx->y.Get(py, n); - PetscCall(VecRestoreArrayWrite(y, &py)); - - PetscFunctionReturn(0); -} - -PetscErrorCode __pc_apply_EPS(PC pc, Vec x, Vec y) -{ - // Solve the linear system associated with the 
generalized eigenvalue problem: y = - // M⁻¹ x, or shift-and-invert spectral transformation: y = (K - σ M)⁻¹ x . Enforces the - // divergence-free constraint using the supplied projector. - PetscFunctionBeginUser; - palace::slepc::SlepcEPSSolver *ctx; - PetscCall(PCShellGetContext(pc, (void **)&ctx)); - MFEM_VERIFY(ctx, "Invalid PETSc shell PC context for SLEPc!"); - - PetscInt n; - PetscCall(VecGetLocalSize(x, &n)); - - const PetscScalar *px; - PetscCall(VecGetArrayRead(x, &px)); - ctx->x.Set(px, n); - PetscCall(VecRestoreArrayRead(x, &px)); - - ctx->opInv->Mult(ctx->x, ctx->y); - if (!ctx->sinvert) - { - ctx->y *= 1.0 / (ctx->delta * ctx->gamma); - } - else - { - ctx->y *= 1.0 / ctx->delta; - } - if (ctx->opProj) - { - // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(ctx->GetComm(), ctx->y)); - ctx->opProj->Mult(ctx->y); - // Mpi::Print(" After projection: {:e}\n", linalg::Norml2(ctx->GetComm(), ctx->y)); - } - - PetscScalar *py; - PetscCall(VecGetArrayWrite(y, &py)); - ctx->y.Get(py, n); - PetscCall(VecRestoreArrayWrite(y, &py)); - - PetscFunctionReturn(0); -} - -PetscErrorCode __mat_apply_PEPLinear_L0(Mat A, Vec x, Vec y) -{ - // Apply the linearized operator L₀ = [ 0 I ] - // [ -K -C ] . - PetscFunctionBeginUser; - palace::slepc::SlepcPEPLinearSolver *ctx; - PetscCall(MatShellGetContext(A, (void **)&ctx)); - MFEM_VERIFY(ctx, "Invalid PETSc shell matrix context for SLEPc!"); - - PetscInt n; - PetscCall(VecGetLocalSize(x, &n)); - - const PetscScalar *px; - PetscCall(VecGetArrayRead(x, &px)); - ctx->x1.Set(px, n / 2); - ctx->x2.Set(px + n / 2, n / 2); - PetscCall(VecRestoreArrayRead(x, &px)); - - ctx->y1 = ctx->x2; - ctx->opC->Mult(ctx->x2, ctx->y2); - ctx->y2 *= ctx->gamma; - ctx->opK->AddMult(ctx->x1, ctx->y2, std::complex(1.0, 0.0)); - ctx->y2 *= -ctx->delta; - - PetscScalar *py; - PetscCall(VecGetArrayWrite(y, &py)); - ctx->y1.Get(py, n / 2); - ctx->y2.Get(py + n / 2, n / 2); - PetscCall(VecRestoreArrayWrite(y, &py)); - - PetscFunctionReturn(0); -} - -PetscErrorCode __mat_apply_PEPLinear_L1(Mat A, Vec x, Vec y) -{ - // Apply the linearized operator L₁ = [ I 0 ] - // [ 0 M ] . 
- PetscFunctionBeginUser; - palace::slepc::SlepcPEPLinearSolver *ctx; - PetscCall(MatShellGetContext(A, (void **)&ctx)); - MFEM_VERIFY(ctx, "Invalid PETSc shell matrix context for SLEPc!"); - - PetscInt n; - PetscCall(VecGetLocalSize(x, &n)); - - const PetscScalar *px; - PetscCall(VecGetArrayRead(x, &px)); - ctx->x1.Set(px, n / 2); - ctx->x2.Set(px + n / 2, n / 2); - PetscCall(VecRestoreArrayRead(x, &px)); - - ctx->y1 = ctx->x1; - ctx->opM->Mult(ctx->x2, ctx->y2); - ctx->y2 *= ctx->delta * ctx->gamma * ctx->gamma; - - PetscScalar *py; - PetscCall(VecGetArrayWrite(y, &py)); - ctx->y1.Get(py, n / 2); - ctx->y2.Get(py + n / 2, n / 2); - PetscCall(VecRestoreArrayWrite(y, &py)); - - PetscFunctionReturn(0); -} - -PetscErrorCode __mat_apply_PEPLinear_B(Mat A, Vec x, Vec y) -{ - PetscFunctionBeginUser; - palace::slepc::SlepcPEPLinearSolver *ctx; - PetscCall(MatShellGetContext(A, (void **)&ctx)); - MFEM_VERIFY(ctx, "Invalid PETSc shell matrix context for SLEPc!"); - - PetscInt n; - PetscCall(VecGetLocalSize(x, &n)); - - const PetscScalar *px; - PetscCall(VecGetArrayRead(x, &px)); - ctx->x1.Set(px, n / 2); - ctx->x2.Set(px + n / 2, n / 2); - PetscCall(VecRestoreArrayRead(x, &px)); - - ctx->opB->Mult(ctx->x1.Real(), ctx->y1.Real()); - ctx->opB->Mult(ctx->x1.Imag(), ctx->y1.Imag()); - ctx->opB->Mult(ctx->x2.Real(), ctx->y2.Real()); - ctx->opB->Mult(ctx->x2.Imag(), ctx->y2.Imag()); - ctx->y1 *= ctx->delta * ctx->gamma * ctx->gamma; - ctx->y2 *= ctx->delta * ctx->gamma * ctx->gamma; - - PetscScalar *py; - PetscCall(VecGetArrayWrite(y, &py)); - ctx->y1.Get(py, n / 2); - ctx->y2.Get(py + n / 2, n / 2); - PetscCall(VecRestoreArrayWrite(y, &py)); - - PetscFunctionReturn(0); -} - -PetscErrorCode __pc_apply_PEPLinear(PC pc, Vec x, Vec y) -{ - // Solve the linear system associated with the generalized eigenvalue problem after - // linearization: y = L₁⁻¹ x, or with the shift-and-invert spectral transformation: - // y = (L₀ - σ L₁)⁻¹ x, with: - // L₀ = [ 0 I ] L₁ = [ I 0 ] - // [ -K -C ] , [ 0 M ] . - // Enforces the divergence-free constraint using the supplied projector. 
- PetscFunctionBeginUser; - palace::slepc::SlepcPEPLinearSolver *ctx; - PetscCall(PCShellGetContext(pc, (void **)&ctx)); - MFEM_VERIFY(ctx, "Invalid PETSc shell PC context for SLEPc!"); - - PetscInt n; - PetscCall(VecGetLocalSize(x, &n)); - - const PetscScalar *px; - PetscCall(VecGetArrayRead(x, &px)); - ctx->x1.Set(px, n / 2); - ctx->x2.Set(px + n / 2, n / 2); - PetscCall(VecRestoreArrayRead(x, &px)); - - if (!ctx->sinvert) - { - ctx->y1 = ctx->x1; - if (ctx->opProj) - { - // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(ctx->GetComm(), ctx->y1)); - ctx->opProj->Mult(ctx->y1); - // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(ctx->GetComm(), ctx->y1)); - } - - ctx->opInv->Mult(ctx->x2, ctx->y2); - ctx->y2 *= 1.0 / (ctx->delta * ctx->gamma * ctx->gamma); - if (ctx->opProj) - { - // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(ctx->GetComm(), ctx->y2)); - ctx->opProj->Mult(ctx->y2); - // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(ctx->GetComm(), ctx->y2)); - } - } - else - { - ctx->y1.AXPBY(-ctx->sigma / (ctx->delta * ctx->gamma), ctx->x2, 0.0); // Temporarily - ctx->opK->AddMult(ctx->x1, ctx->y1, std::complex(1.0, 0.0)); - ctx->opInv->Mult(ctx->y1, ctx->y2); - if (ctx->opProj) - { - // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(ctx->GetComm(), ctx->y2)); - ctx->opProj->Mult(ctx->y2); - // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(ctx->GetComm(), ctx->y2)); - } - - ctx->y1.AXPBYPCZ(ctx->gamma / ctx->sigma, ctx->y2, -ctx->gamma / ctx->sigma, ctx->x1, - 0.0); - if (ctx->opProj) - { - // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(ctx->GetComm(), ctx->y1)); - ctx->opProj->Mult(ctx->y1); - // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(ctx->GetComm(), ctx->y1)); - } - } - - PetscScalar *py; - PetscCall(VecGetArrayWrite(y, &py)); - ctx->y1.Get(py, n / 2); - ctx->y2.Get(py + n / 2, n / 2); - PetscCall(VecRestoreArrayWrite(y, &py)); - - PetscFunctionReturn(0); -} - -PetscErrorCode __mat_apply_PEP_A0(Mat A, Vec x, Vec y) -{ - PetscFunctionBeginUser; - palace::slepc::SlepcPEPSolver *ctx; - PetscCall(MatShellGetContext(A, (void **)&ctx)); - MFEM_VERIFY(ctx, "Invalid PETSc shell matrix context for SLEPc!"); - - PetscFunctionBeginUser; - PetscInt n; - PetscCall(VecGetLocalSize(x, &n)); - - const PetscScalar *px; - PetscCall(VecGetArrayRead(x, &px)); - ctx->x.Set(px, n); - PetscCall(VecRestoreArrayRead(x, &px)); - - ctx->opK->Mult(ctx->x, ctx->y); - - PetscScalar *py; - PetscCall(VecGetArrayWrite(y, &py)); - ctx->y.Get(py, n); - PetscCall(VecRestoreArrayWrite(y, &py)); - - PetscFunctionReturn(0); -} - -PetscErrorCode __mat_apply_PEP_A1(Mat A, Vec x, Vec y) -{ - PetscFunctionBeginUser; - palace::slepc::SlepcPEPSolver *ctx; - PetscCall(MatShellGetContext(A, (void **)&ctx)); - MFEM_VERIFY(ctx, "Invalid PETSc shell matrix context for SLEPc!"); - - PetscFunctionBeginUser; - PetscInt n; - PetscCall(VecGetLocalSize(x, &n)); - - const PetscScalar *px; - PetscCall(VecGetArrayRead(x, &px)); - ctx->x.Set(px, n); - PetscCall(VecRestoreArrayRead(x, &px)); - - ctx->opC->Mult(ctx->x, ctx->y); - - PetscScalar *py; - PetscCall(VecGetArrayWrite(y, &py)); - ctx->y.Get(py, n); - PetscCall(VecRestoreArrayWrite(y, &py)); - - PetscFunctionReturn(0); -} - -PetscErrorCode __mat_apply_PEP_A2(Mat A, Vec x, Vec y) -{ - PetscFunctionBeginUser; - palace::slepc::SlepcPEPSolver *ctx; - PetscCall(MatShellGetContext(A, (void **)&ctx)); - MFEM_VERIFY(ctx, "Invalid PETSc shell matrix context for SLEPc!"); - - PetscFunctionBeginUser; - PetscInt 
n; - PetscCall(VecGetLocalSize(x, &n)); - - const PetscScalar *px; - PetscCall(VecGetArrayRead(x, &px)); - ctx->x.Set(px, n); - PetscCall(VecRestoreArrayRead(x, &px)); - - ctx->opM->Mult(ctx->x, ctx->y); - - PetscScalar *py; - PetscCall(VecGetArrayWrite(y, &py)); - ctx->y.Get(py, n); - PetscCall(VecRestoreArrayWrite(y, &py)); - - PetscFunctionReturn(0); -} - -PetscErrorCode __mat_apply_PEP_B(Mat A, Vec x, Vec y) -{ - PetscFunctionBeginUser; - palace::slepc::SlepcPEPSolver *ctx; - PetscCall(MatShellGetContext(A, (void **)&ctx)); - MFEM_VERIFY(ctx, "Invalid PETSc shell matrix context for SLEPc!"); - - PetscInt n; - PetscCall(VecGetLocalSize(x, &n)); - - const PetscScalar *px; - PetscCall(VecGetArrayRead(x, &px)); - ctx->x.Set(px, n); - PetscCall(VecRestoreArrayRead(x, &px)); - - ctx->opB->Mult(ctx->x.Real(), ctx->y.Real()); - ctx->opB->Mult(ctx->x.Imag(), ctx->y.Imag()); - ctx->y *= ctx->delta * ctx->gamma; - - PetscScalar *py; - PetscCall(VecGetArrayWrite(y, &py)); - ctx->y.Get(py, n); - PetscCall(VecRestoreArrayWrite(y, &py)); - - PetscFunctionReturn(0); -} - -PetscErrorCode __pc_apply_PEP(PC pc, Vec x, Vec y) -{ - // Solve the linear system associated with the generalized eigenvalue problem: y = M⁻¹ x, - // or shift-and-invert spectral transformation: y = P(σ)⁻¹ x . Enforces the divergence- - // free constraint using the supplied projector. - PetscFunctionBeginUser; - palace::slepc::SlepcPEPSolver *ctx; - PetscCall(PCShellGetContext(pc, (void **)&ctx)); - MFEM_VERIFY(ctx, "Invalid PETSc shell PC context for SLEPc!"); - - PetscFunctionBeginUser; - PetscInt n; - PetscCall(VecGetLocalSize(x, &n)); - - const PetscScalar *px; - PetscCall(VecGetArrayRead(x, &px)); - ctx->x.Set(px, n); - PetscCall(VecRestoreArrayRead(x, &px)); - - ctx->opInv->Mult(ctx->x, ctx->y); - if (!ctx->sinvert) - { - ctx->y *= 1.0 / (ctx->delta * ctx->gamma * ctx->gamma); - } - else - { - ctx->y *= 1.0 / ctx->delta; - } - if (ctx->opProj) - { - // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(ctx->GetComm(), ctx->y)); - ctx->opProj->Mult(ctx->y); - // Mpi::Print(" After projection: {:e}\n", linalg::Norml2(ctx->GetComm(), ctx->y)); - } - - PetscScalar *py; - PetscCall(VecGetArrayWrite(y, &py)); - ctx->y.Get(py, n); - PetscCall(VecRestoreArrayWrite(y, &py)); - - PetscFunctionReturn(0); -} - -#endif +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#include "slepc.hpp" + +#if defined(PALACE_WITH_SLEPC) + +#include +#include +#include +#include +#include "linalg/divfree.hpp" +#include "linalg/nleps.hpp" +#include "linalg/rap.hpp" +#include "utils/communication.hpp" + +static PetscErrorCode __mat_apply_EPS_A0(Mat, Vec, Vec); +static PetscErrorCode __mat_apply_EPS_A1(Mat, Vec, Vec); +static PetscErrorCode __mat_apply_EPS_B(Mat, Vec, Vec); +static PetscErrorCode __pc_apply_EPS(PC, Vec, Vec); +static PetscErrorCode __mat_apply_PEPLinear_L0(Mat, Vec, Vec); +static PetscErrorCode __mat_apply_PEPLinear_L1(Mat, Vec, Vec); +static PetscErrorCode __mat_apply_PEPLinear_B(Mat, Vec, Vec); +static PetscErrorCode __pc_apply_PEPLinear(PC, Vec, Vec); +static PetscErrorCode __mat_apply_PEP_A0(Mat, Vec, Vec); +static PetscErrorCode __mat_apply_PEP_A1(Mat, Vec, Vec); +static PetscErrorCode __mat_apply_PEP_A2(Mat, Vec, Vec); +static PetscErrorCode __mat_apply_PEP_B(Mat, Vec, Vec); +static PetscErrorCode __pc_apply_PEP(PC, Vec, Vec); +// for NEP +static PetscErrorCode __mat_apply_NEP_A(Mat, Vec, Vec); +static PetscErrorCode __mat_apply_NEP_J(Mat, Vec, Vec); +static PetscErrorCode __mat_apply_NEP_B(Mat, Vec, Vec); +static PetscErrorCode __pc_apply_NEP(PC, Vec, Vec); +static PetscErrorCode __form_NEP_function(NEP, PetscScalar, Mat, Mat, void *); +static PetscErrorCode __form_NEP_jacobian(NEP, PetscScalar, Mat, void *); + +using namespace std::complex_literals; + +namespace +{ + +inline PetscErrorCode FromPetscVec(Vec x, palace::ComplexVector &y, int block = 0, + int nblocks = 1) +{ + PetscInt n; + const PetscScalar *px; + PetscMemType mtype; + PetscCall(VecGetLocalSize(x, &n)); + MFEM_ASSERT(y.Size() * nblocks == n, + "Invalid size mismatch for PETSc vector conversion!"); + PetscCall(VecGetArrayReadAndMemType(x, &px, &mtype)); + y.Set(px + block * n / nblocks, n / nblocks, PetscMemTypeDevice(mtype)); + PetscCall(VecRestoreArrayReadAndMemType(x, &px)); + return PETSC_SUCCESS; +} + +inline PetscErrorCode FromPetscVec(Vec x, palace::ComplexVector &y1, + palace::ComplexVector &y2) +{ + PetscInt n; + const PetscScalar *px; + PetscMemType mtype; + PetscCall(VecGetLocalSize(x, &n)); + MFEM_ASSERT(y1.Size() == n / 2 && y2.Size() == n / 2, + "Invalid size mismatch for PETSc vector conversion!"); + PetscCall(VecGetArrayReadAndMemType(x, &px, &mtype)); + y1.Set(px, n / 2, PetscMemTypeDevice(mtype)); + y2.Set(px + n / 2, n / 2, PetscMemTypeDevice(mtype)); + PetscCall(VecRestoreArrayReadAndMemType(x, &px)); + return PETSC_SUCCESS; +} + +inline PetscErrorCode ToPetscVec(const palace::ComplexVector &x, Vec y, int block = 0, + int nblocks = 1) +{ + PetscInt n; + PetscScalar *py; + PetscMemType mtype; + PetscCall(VecGetLocalSize(y, &n)); + MFEM_ASSERT(x.Size() * nblocks == n, + "Invalid size mismatch for PETSc vector conversion!"); + PetscCall(VecGetArrayWriteAndMemType(y, &py, &mtype)); + x.Get(py + block * n / nblocks, n / nblocks, PetscMemTypeDevice(mtype)); + PetscCall(VecRestoreArrayWriteAndMemType(y, &py)); + return PETSC_SUCCESS; +} + +inline PetscErrorCode ToPetscVec(const palace::ComplexVector &x1, + const palace::ComplexVector &x2, Vec y) +{ + PetscInt n; + PetscScalar *py; + PetscMemType mtype; + PetscCall(VecGetLocalSize(y, &n)); + MFEM_ASSERT(x1.Size() == n / 2 && x2.Size() == n / 2, + "Invalid size mismatch for PETSc vector conversion!"); + PetscCall(VecGetArrayWriteAndMemType(y, &py, &mtype)); + x1.Get(py, n / 2, PetscMemTypeDevice(mtype)); + x2.Get(py + n / 2, n / 2, PetscMemTypeDevice(mtype)); + 
PetscCall(VecRestoreArrayWriteAndMemType(y, &py)); + return PETSC_SUCCESS; +} + +} // namespace + +namespace palace::slepc +{ + +namespace +{ + +inline PetscErrorCode ConfigurePetscDevice() +{ + // Tell PETSc to use the same CUDA or HIP device as MFEM. + if (mfem::Device::Allows(mfem::Backend::CUDA_MASK)) + { + PetscCall(PetscOptionsSetValue(NULL, "-use_gpu_aware_mpi", + mfem::Device::GetGPUAwareMPI() ? "1" : "0")); + PetscCall(PetscOptionsSetValue(NULL, "-device_select_cuda", + std::to_string(mfem::Device::GetId()).c_str())); + // PetscCall(PetscOptionsSetValue(NULL, "-bv_type", "svec")); + // PetscCall(PetscOptionsSetValue(NULL, "-vec_type", "cuda")); + } + if (mfem::Device::Allows(mfem::Backend::HIP_MASK)) + { + PetscCall(PetscOptionsSetValue(NULL, "-use_gpu_aware_mpi", + mfem::Device::GetGPUAwareMPI() ? "1" : "0")); + PetscCall(PetscOptionsSetValue(NULL, "-device_select_hip", + std::to_string(mfem::Device::GetId()).c_str())); + // PetscCall(PetscOptionsSetValue(NULL, "-bv_type", "svec")); + // PetscCall(PetscOptionsSetValue(NULL, "-vec_type", "hip")); + } + return PETSC_SUCCESS; +} + +inline VecType PetscVecType() +{ + if (mfem::Device::Allows(mfem::Backend::CUDA_MASK)) + { + return VECCUDA; + } + if (mfem::Device::Allows(mfem::Backend::HIP_MASK)) + { + return VECHIP; + } + return VECSTANDARD; +} + +struct MatShellContext +{ + const ComplexOperator &A; + ComplexVector &x, &y; +}; + +PetscErrorCode __mat_apply_shell(Mat A, Vec x, Vec y) +{ + PetscFunctionBeginUser; + MatShellContext *ctx; + PetscCall(MatShellGetContext(A, (void **)&ctx)); + MFEM_VERIFY(ctx, "Invalid PETSc shell matrix context for SLEPc!"); + + PetscCall(FromPetscVec(x, ctx->x)); + ctx->A.Mult(ctx->x, ctx->y); + PetscCall(ToPetscVec(ctx->y, y)); + + PetscFunctionReturn(PETSC_SUCCESS); +} + +PetscErrorCode __mat_apply_transpose_shell(Mat A, Vec x, Vec y) +{ + PetscFunctionBeginUser; + MatShellContext *ctx; + PetscCall(MatShellGetContext(A, (void **)&ctx)); + MFEM_VERIFY(ctx, "Invalid PETSc shell matrix context for SLEPc!"); + + PetscCall(FromPetscVec(x, ctx->x)); + ctx->A.MultTranspose(ctx->x, ctx->y); + PetscCall(ToPetscVec(ctx->y, y)); + + PetscFunctionReturn(PETSC_SUCCESS); +} + +PetscErrorCode __mat_apply_hermitian_transpose_shell(Mat A, Vec x, Vec y) +{ + PetscFunctionBeginUser; + MatShellContext *ctx; + PetscCall(MatShellGetContext(A, (void **)&ctx)); + MFEM_VERIFY(ctx, "Invalid PETSc shell matrix context for SLEPc!"); + + PetscCall(FromPetscVec(x, ctx->x)); + ctx->A.MultHermitianTranspose(ctx->x, ctx->y); + PetscCall(ToPetscVec(ctx->y, y)); + + PetscFunctionReturn(PETSC_SUCCESS); +}; + +inline void ConfigurePCShell(ST st, void *ctx, PetscErrorCode (*__pc_apply)(PC, Vec, Vec)) +{ + KSP ksp; + PC pc; + PalacePetscCall(STGetKSP(st, &ksp)); + PalacePetscCall(KSPGetPC(ksp, &pc)); + PalacePetscCall(PCSetType(pc, PCSHELL)); + PalacePetscCall(PCShellSetContext(pc, ctx)); + PalacePetscCall(PCShellSetApply(pc, __pc_apply)); +} + +inline void ConfigureRG(RG rg, PetscReal lr, PetscReal ur, PetscReal li, PetscReal ui, + bool complement = false) +{ + PalacePetscCall(RGSetType(rg, RGINTERVAL)); + PalacePetscCall(RGIntervalSetEndpoints(rg, lr, ur, li, ui)); + if (complement) + { + PalacePetscCall(RGSetComplement(rg, PETSC_TRUE)); + } +} + +} // namespace + +void Initialize(int &argc, char **&argv, const char rc_file[], const char help[]) +{ + ConfigurePetscDevice(); + PalacePetscCall(SlepcInitialize(&argc, &argv, rc_file, help)); + + // Remove default PETSc signal handling, since it can be confusing when the errors occur + // not 
from within SLEPc/PETSc. + PalacePetscCall(PetscPopSignalHandler()); +} + +void Initialize() +{ + ConfigurePetscDevice(); + PalacePetscCall(SlepcInitializeNoArguments()); + + // Remove default PETSc signal handling, since it can be confusing when the errors occur + // not from within SLEPc/PETSc. + PalacePetscCall(PetscPopSignalHandler()); +} + +void Finalize() +{ + PalacePetscCall(SlepcFinalize()); +} + +PetscReal GetMaxSingularValue(MPI_Comm comm, const ComplexOperator &A, bool herm, + PetscReal tol, PetscInt max_it) +{ + // This method assumes the provided operator has the required operations for SLEPc's EPS + // or SVD solvers, namely MATOP_MULT and MATOP_MULT_HERMITIAN_TRANSPOSE (if the matrix + // is not Hermitian). + MFEM_VERIFY(A.Height() == A.Width(), "Spectral norm requires a square matrix!"); + const PetscInt n = A.Height(); + ComplexVector x(n), y(n); + x.UseDevice(true); + y.UseDevice(true); + MatShellContext ctx = {A, x, y}; + Mat A0; + PalacePetscCall( + MatCreateShell(comm, n, n, PETSC_DECIDE, PETSC_DECIDE, (void *)&ctx, &A0)); + PalacePetscCall(MatShellSetOperation(A0, MATOP_MULT, (void (*)(void))__mat_apply_shell)); + PalacePetscCall(MatShellSetVecType(A0, PetscVecType())); + if (herm) + { + EPS eps; + PetscInt num_conv; + PetscScalar eig; + PalacePetscCall(EPSCreate(comm, &eps)); + PalacePetscCall(EPSSetOperators(eps, A0, nullptr)); + PalacePetscCall(EPSSetProblemType(eps, EPS_HEP)); + PalacePetscCall(EPSSetWhichEigenpairs(eps, EPS_LARGEST_MAGNITUDE)); + PalacePetscCall(EPSSetDimensions(eps, 1, PETSC_DEFAULT, PETSC_DEFAULT)); + PalacePetscCall(EPSSetTolerances(eps, tol, max_it)); + PalacePetscCall(EPSSolve(eps)); + PalacePetscCall(EPSGetConverged(eps, &num_conv)); + if (num_conv < 1) + { + Mpi::Warning(comm, "SLEPc EPS solve did not converge for maximum singular value!\n"); + eig = 0.0; + } + else + { + PalacePetscCall(EPSGetEigenvalue(eps, 0, &eig, nullptr)); + MFEM_VERIFY(PetscImaginaryPart(eig) == 0.0, + "Unexpected complex eigenvalue for Hermitian matrix (λ = " << eig + << ")!"); + } + PalacePetscCall(EPSDestroy(&eps)); + PalacePetscCall(MatDestroy(&A0)); + return PetscAbsScalar(eig); + } + else + { + PalacePetscCall(MatShellSetOperation(A0, MATOP_MULT_TRANSPOSE, + (void (*)(void))__mat_apply_transpose_shell)); + PalacePetscCall( + MatShellSetOperation(A0, MATOP_MULT_HERMITIAN_TRANSPOSE, + (void (*)(void))__mat_apply_hermitian_transpose_shell)); + SVD svd; + PetscInt num_conv; + PetscReal sigma; + PalacePetscCall(SVDCreate(comm, &svd)); + PalacePetscCall(SVDSetOperators(svd, A0, nullptr)); + PalacePetscCall(SVDSetProblemType(svd, SVD_STANDARD)); + PalacePetscCall(SVDSetWhichSingularTriplets(svd, SVD_LARGEST)); + PalacePetscCall(SVDSetDimensions(svd, 1, PETSC_DEFAULT, PETSC_DEFAULT)); + PalacePetscCall(SVDSetTolerances(svd, tol, max_it)); + PalacePetscCall(SVDSolve(svd)); + PalacePetscCall(SVDGetConverged(svd, &num_conv)); + if (num_conv < 1) + { + Mpi::Warning(comm, "SLEPc SVD solve did not converge for maximum singular value!\n"); + sigma = 0.0; + } + else + { + PalacePetscCall(SVDGetSingularTriplet(svd, 0, &sigma, nullptr, nullptr)); + } + PalacePetscCall(SVDDestroy(&svd)); + PalacePetscCall(MatDestroy(&A0)); + return sigma; + } +} + +// Eigensolver base class methods. 
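// A minimal usage sketch for the GetMaxSingularValue helper above; the wrapper name
// EstimateSpectralNorm, its arguments, and the tolerance/iteration values are
// illustrative assumptions only, not values used elsewhere in this file.
inline PetscReal EstimateSpectralNorm(MPI_Comm comm, const ComplexOperator &op)
{
  // herm = false takes the SVD path, which relies on the Hermitian-transpose shell
  // operation registered above; pass herm = true for Hermitian operators to use EPS.
  return GetMaxSingularValue(comm, op, /*herm=*/false, /*tol=*/1.0e-3, /*max_it=*/100);
}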
+ +SlepcEigenvalueSolver::SlepcEigenvalueSolver(int print) : print(print) +{ + sinvert = false; + region = true; + sigma = 0.0; + gamma = delta = 1.0; + + opInv = nullptr; + opProj = nullptr; + opB = nullptr; + + B0 = nullptr; + v0 = nullptr; + + cl_custom = false; +} + +SlepcEigenvalueSolver::~SlepcEigenvalueSolver() +{ + PalacePetscCall(MatDestroy(&B0)); + PalacePetscCall(VecDestroy(&v0)); +} + +void SlepcEigenvalueSolver::SetOperators(const ComplexOperator &K, const ComplexOperator &M, + EigenvalueSolver::ScaleType type) +{ + MFEM_ABORT("SetOperators not defined for base class SlepcEigenvalueSolver!"); +} + +void SlepcEigenvalueSolver::SetOperators(const ComplexOperator &K, const ComplexOperator &C, + const ComplexOperator &M, + EigenvalueSolver::ScaleType type) +{ + MFEM_ABORT("SetOperators not defined for base class SlepcEigenvalueSolver!"); +} + +void SlepcEigenvalueSolver::SetLinearSolver(ComplexKspSolver &ksp) +{ + opInv = &ksp; +} + +void SlepcEigenvalueSolver::SetDivFreeProjector(const DivFreeSolver &divfree) +{ + opProj = &divfree; +} + +void SlepcEigenvalueSolver::SetBMat(const Operator &B) +{ + opB = &B; +} + +void SlepcEigenvalueSolver::SetShiftInvert(std::complex s, bool precond) +{ + ST st = GetST(); + if (precond) + { + PalacePetscCall(STSetType(st, STPRECOND)); + } + else + { + PalacePetscCall(STSetType(st, STSINVERT)); + } + PalacePetscCall(STSetTransform(st, PETSC_TRUE)); + PalacePetscCall(STSetMatMode(st, ST_MATMODE_SHELL)); + sigma = s; // Wait until solve time to call EPS/PEPSetTarget + sinvert = true; +} + +void SlepcEigenvalueSolver::SetOrthogonalization(bool mgs, bool cgs2) +{ + // The SLEPc default is CGS with refinement if needed. + if (mgs || cgs2) + { + BV bv = GetBV(); + BVOrthogType type; + BVOrthogRefineType refine; + if (mgs) + { + type = BV_ORTHOG_MGS; + refine = BV_ORTHOG_REFINE_NEVER; + } + else // cgs2 + { + type = BV_ORTHOG_CGS; + refine = BV_ORTHOG_REFINE_ALWAYS; + } + PalacePetscCall(BVSetOrthogonalization(bv, type, refine, 1.0, BV_ORTHOG_BLOCK_GS)); + } +} + +void SlepcEigenvalueSolver::Customize() +{ + // Configure the KSP object for non-preconditioned spectral transformations. + PetscBool precond; + ST st = GetST(); + PalacePetscCall( + PetscObjectTypeCompare(reinterpret_cast(st), STPRECOND, &precond)); + if (!precond) + { + KSP ksp; + PalacePetscCall(STGetKSP(st, &ksp)); + PalacePetscCall(KSPSetType(ksp, KSPPREONLY)); + } + + // Configure the region based on the given target if necessary. 
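// Worked example for the purely imaginary targets used in eigenmode solves: with
// σ = iω₀ (ω₀ > 0), the si > 0 branch below reduces to
//   ConfigureRG(GetRG(), -mfem::infinity(), mfem::infinity(), si / gamma, mfem::infinity());
// so the RGINTERVAL region keeps only scaled eigenvalue estimates with Im(λ/γ) ≥ ω₀/γ,
// i.e. Im(λ) ≥ ω₀ after unscaling, and candidate modes below the target frequency are
// excluded from the search.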
+ if (sinvert && region) + { + if (PetscImaginaryPart(sigma) == 0.0) + { + PetscReal sr = PetscRealPart(sigma); + if (sr > 0.0) + { + ConfigureRG(GetRG(), sr / gamma, mfem::infinity(), -mfem::infinity(), + mfem::infinity()); + } + else if (sr < 0.0) + { + ConfigureRG(GetRG(), -mfem::infinity(), sr / gamma, -mfem::infinity(), + mfem::infinity()); + } + } + else if (PetscRealPart(sigma) == 0.0) + { + PetscReal si = PetscImaginaryPart(sigma); + if (si > 0.0) + { + ConfigureRG(GetRG(), -mfem::infinity(), mfem::infinity(), si / gamma, + mfem::infinity()); + } + else if (si < 0.0) + { + ConfigureRG(GetRG(), -mfem::infinity(), mfem::infinity(), -mfem::infinity(), + si / gamma); + } + } + else + { + MFEM_ABORT("Shift-and-invert with general complex eigenvalue target is unsupported!"); + } + } +} + +PetscReal SlepcEigenvalueSolver::GetEigenvectorNorm(const ComplexVector &x, + ComplexVector &Bx) const +{ + if (opB) + { + return linalg::Norml2(GetComm(), x, *opB, Bx); + } + else + { + return linalg::Norml2(GetComm(), x); + } +} + +PetscReal SlepcEigenvalueSolver::GetError(int i, EigenvalueSolver::ErrorType type) const +{ + switch (type) + { + case ErrorType::ABSOLUTE: + return res.get()[i]; + case ErrorType::RELATIVE: + return res.get()[i] / PetscAbsScalar(GetEigenvalue(i)); + case ErrorType::BACKWARD: + return res.get()[i] / GetBackwardScaling(GetEigenvalue(i)); + } + return 0.0; +} + +void SlepcEigenvalueSolver::RescaleEigenvectors(int num_eig) +{ + res = std::make_unique(num_eig); + xscale = std::make_unique(num_eig); + for (int i = 0; i < num_eig; i++) + { + xscale.get()[i] = 0.0; + GetEigenvector(i, x1); + xscale.get()[i] = 1.0 / GetEigenvectorNorm(x1, y1); + res.get()[i] = + GetResidualNorm(GetEigenvalue(i), x1, y1) / linalg::Norml2(GetComm(), x1); + } +} + +// EPS specific methods. + +SlepcEPSSolverBase::SlepcEPSSolverBase(MPI_Comm comm, int print, const std::string &prefix) + : SlepcEigenvalueSolver(print) +{ + PalacePetscCall(EPSCreate(comm, &eps)); + PalacePetscCall(EPSSetOptionsPrefix(eps, prefix.c_str())); + if (print > 0) + { + std::string opts = "-eps_monitor"; + if (print > 2) + { + opts.append(" -eps_view"); + } + if (prefix.length() > 0) + { + PetscOptionsPrefixPush(nullptr, prefix.c_str()); + } + PetscOptionsInsertString(nullptr, opts.c_str()); + if (prefix.length() > 0) + { + PetscOptionsPrefixPop(nullptr); + } + } + A0 = A1 = nullptr; +} + +SlepcEPSSolverBase::~SlepcEPSSolverBase() +{ + PalacePetscCall(EPSDestroy(&eps)); + PalacePetscCall(MatDestroy(&A0)); + PalacePetscCall(MatDestroy(&A1)); +} + +void SlepcEPSSolverBase::SetNumModes(int num_eig, int num_vec) +{ + PalacePetscCall(EPSSetDimensions(eps, num_eig, (num_vec > 0) ? num_vec : PETSC_DEFAULT, + PETSC_DEFAULT)); +} + +void SlepcEPSSolverBase::SetTol(PetscReal tol) +{ + PalacePetscCall(EPSSetTolerances(eps, tol, PETSC_DEFAULT)); + PalacePetscCall(EPSSetConvergenceTest(eps, EPS_CONV_REL)); + // PalacePetscCall(EPSSetTrackAll(eps, PETSC_TRUE)); + // PalacePetscCall(EPSSetTrueResidual(eps, PETSC_TRUE)); +} + +void SlepcEPSSolverBase::SetMaxIter(int max_it) +{ + PalacePetscCall( + EPSSetTolerances(eps, PETSC_DEFAULT, (max_it > 0) ? 
max_it : PETSC_DEFAULT)); +} + +void SlepcEPSSolverBase::SetWhichEigenpairs(EigenvalueSolver::WhichType type) +{ + switch (type) + { + case WhichType::LARGEST_MAGNITUDE: + PalacePetscCall(EPSSetWhichEigenpairs(eps, EPS_LARGEST_MAGNITUDE)); + region = false; + break; + case WhichType::SMALLEST_MAGNITUDE: + PalacePetscCall(EPSSetWhichEigenpairs(eps, EPS_SMALLEST_MAGNITUDE)); + region = false; + break; + case WhichType::LARGEST_REAL: + PalacePetscCall(EPSSetWhichEigenpairs(eps, EPS_LARGEST_REAL)); + break; + case WhichType::SMALLEST_REAL: + PalacePetscCall(EPSSetWhichEigenpairs(eps, EPS_SMALLEST_REAL)); + break; + case WhichType::LARGEST_IMAGINARY: + PalacePetscCall(EPSSetWhichEigenpairs(eps, EPS_LARGEST_IMAGINARY)); + break; + case WhichType::SMALLEST_IMAGINARY: + PalacePetscCall(EPSSetWhichEigenpairs(eps, EPS_SMALLEST_IMAGINARY)); + break; + case WhichType::TARGET_MAGNITUDE: + PalacePetscCall(EPSSetWhichEigenpairs(eps, EPS_TARGET_MAGNITUDE)); + region = false; + break; + case WhichType::TARGET_REAL: + PalacePetscCall(EPSSetWhichEigenpairs(eps, EPS_TARGET_REAL)); + break; + case WhichType::TARGET_IMAGINARY: + PalacePetscCall(EPSSetWhichEigenpairs(eps, EPS_TARGET_IMAGINARY)); + break; + } +} + +void SlepcEPSSolverBase::SetProblemType(SlepcEigenvalueSolver::ProblemType type) +{ + switch (type) + { + case ProblemType::HERMITIAN: + PalacePetscCall(EPSSetProblemType(eps, EPS_HEP)); + break; + case ProblemType::NON_HERMITIAN: + PalacePetscCall(EPSSetProblemType(eps, EPS_NHEP)); + break; + case ProblemType::GEN_HERMITIAN: + PalacePetscCall(EPSSetProblemType(eps, EPS_GHEP)); + break; + case ProblemType::GEN_INDEFINITE: + PalacePetscCall(EPSSetProblemType(eps, EPS_GHIEP)); + break; + case ProblemType::GEN_NON_HERMITIAN: + PalacePetscCall(EPSSetProblemType(eps, EPS_GNHEP)); + // PalacePetscCall(EPSSetProblemType(eps, EPS_PGNHEP)); // If B is SPD + break; + case ProblemType::HYPERBOLIC: + case ProblemType::GYROSCOPIC: + case ProblemType::GENERAL: + MFEM_ABORT("Problem type not implemented!"); + break; + } +} + +void SlepcEPSSolverBase::SetType(SlepcEigenvalueSolver::Type type) +{ + switch (type) + { + case Type::KRYLOVSCHUR: + PalacePetscCall(EPSSetType(eps, EPSKRYLOVSCHUR)); + break; + case Type::POWER: + PalacePetscCall(EPSSetType(eps, EPSPOWER)); + break; + case Type::SUBSPACE: + PalacePetscCall(EPSSetType(eps, EPSSUBSPACE)); + break; + case Type::JD: + PalacePetscCall(EPSSetType(eps, EPSJD)); + region = false; + break; + case Type::TOAR: + case Type::STOAR: + case Type::QARNOLDI: + case Type::SLP: + case Type::NLEIGS: + MFEM_ABORT("Eigenvalue solver type not implemented!"); + break; + } +} + +void SlepcEPSSolverBase::SetInitialSpace(const ComplexVector &v) +{ + MFEM_VERIFY( + A0 && A1, + "Must call SetOperators before using SetInitialSpace for SLEPc eigenvalue solver!"); + if (!v0) + { + PalacePetscCall(MatCreateVecs(A0, nullptr, &v0)); + } + PalacePetscCall(ToPetscVec(v, v0)); + Vec is[1] = {v0}; + PalacePetscCall(EPSSetInitialSpace(eps, 1, is)); +} + +void SlepcEPSSolverBase::Customize() +{ + SlepcEigenvalueSolver::Customize(); + PalacePetscCall(EPSSetTarget(eps, sigma / gamma)); + if (!cl_custom) + { + PalacePetscCall(EPSSetFromOptions(eps)); + if (print > 0) + { + PetscOptionsView(nullptr, PETSC_VIEWER_STDOUT_(GetComm())); + Mpi::Print(GetComm(), "\n"); + } + cl_custom = true; + } +} + +int SlepcEPSSolverBase::Solve() +{ + MFEM_VERIFY(A0 && A1 && opInv, "Operators are not set for SlepcEPSSolverBase!"); + + // Solve the eigenvalue problem. 
+ PetscInt num_conv; + Customize(); + PalacePetscCall(EPSSolve(eps)); + PalacePetscCall(EPSGetConverged(eps, &num_conv)); + if (print > 0) + { + Mpi::Print(GetComm(), "\n"); + PalacePetscCall(EPSConvergedReasonView(eps, PETSC_VIEWER_STDOUT_(GetComm()))); + Mpi::Print(GetComm(), + " Total number of linear systems solved: {:d}\n" + " Total number of linear solver iterations: {:d}\n", + opInv->NumTotalMult(), opInv->NumTotalMultIterations()); + } + + // Compute and store the eigenpair residuals. + RescaleEigenvectors(num_conv); + return (int)num_conv; +} + +std::complex SlepcEPSSolverBase::GetEigenvalue(int i) const +{ + PetscScalar l; + PalacePetscCall(EPSGetEigenvalue(eps, i, &l, nullptr)); + return l * gamma; +} + +void SlepcEPSSolverBase::GetEigenvector(int i, ComplexVector &x) const +{ + MFEM_VERIFY( + v0, + "Must call SetOperators before using GetEigenvector for SLEPc eigenvalue solver!"); + PalacePetscCall(EPSGetEigenvector(eps, i, v0, nullptr)); + PalacePetscCall(FromPetscVec(v0, x)); + if (xscale.get()[i] > 0.0) + { + x *= xscale.get()[i]; + } +} + +BV SlepcEPSSolverBase::GetBV() const +{ + BV bv; + PalacePetscCall(EPSGetBV(eps, &bv)); + return bv; +} + +ST SlepcEPSSolverBase::GetST() const +{ + ST st; + PalacePetscCall(EPSGetST(eps, &st)); + return st; +} + +RG SlepcEPSSolverBase::GetRG() const +{ + RG rg; + PalacePetscCall(EPSGetRG(eps, &rg)); + return rg; +} + +SlepcEPSSolver::SlepcEPSSolver(MPI_Comm comm, int print, const std::string &prefix) + : SlepcEPSSolverBase(comm, print, prefix) +{ + opK = opM = nullptr; + normK = normM = 0.0; +} + +void SlepcEPSSolver::SetOperators(const ComplexOperator &K, const ComplexOperator &M, + EigenvalueSolver::ScaleType type) +{ + // Construct shell matrices for the scaled operators which define the generalized + // eigenvalue problem. + const bool first = (opK == nullptr); + opK = &K; + opM = &M; + + if (first) + { + const PetscInt n = opK->Height(); + PalacePetscCall( + MatCreateShell(GetComm(), n, n, PETSC_DECIDE, PETSC_DECIDE, (void *)this, &A0)); + PalacePetscCall( + MatCreateShell(GetComm(), n, n, PETSC_DECIDE, PETSC_DECIDE, (void *)this, &A1)); + PalacePetscCall( + MatShellSetOperation(A0, MATOP_MULT, (void (*)(void))__mat_apply_EPS_A0)); + PalacePetscCall( + MatShellSetOperation(A1, MATOP_MULT, (void (*)(void))__mat_apply_EPS_A1)); + PalacePetscCall(MatShellSetVecType(A0, PetscVecType())); + PalacePetscCall(MatShellSetVecType(A1, PetscVecType())); + PalacePetscCall(EPSSetOperators(eps, A0, A1)); + } + + if (first && type != ScaleType::NONE) + { + normK = linalg::SpectralNorm(GetComm(), *opK, opK->IsReal()); + normM = linalg::SpectralNorm(GetComm(), *opM, opM->IsReal()); + MFEM_VERIFY(normK >= 0.0 && normM >= 0.0, "Invalid matrix norms for EPS scaling!"); + if (normK > 0 && normM > 0.0) + { + gamma = normK / normM; // Store γ² for linear problem + delta = 2.0 / normK; + } + } + + // Set up workspace. + if (!v0) + { + PalacePetscCall(MatCreateVecs(A0, nullptr, &v0)); + } + x1.SetSize(opK->Height()); + y1.SetSize(opK->Height()); + x1.UseDevice(true); + y1.UseDevice(true); + + // Configure linear solver for generalized problem or spectral transformation. This also + // allows use of the divergence-free projector as a linear solve side-effect. 
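// What the shell preconditioner configured below computes: the shells above define the
// scaled pair A0 = δK and A1 = δγM, and Customize() sets the SLEPc target to σ/γ, so
//   A1⁻¹ = (1/(δγ)) M⁻¹                      (no shift-and-invert),
//   (A0 - (σ/γ) A1)⁻¹ = (1/δ) (K - σM)⁻¹     (shift-and-invert),
// which is why __pc_apply_EPS applies opInv (≈ M⁻¹ or (K - σM)⁻¹) and then rescales the
// result by 1/(δγ) or 1/δ before the optional divergence-free projection.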
+ if (first) + { + ConfigurePCShell(GetST(), (void *)this, __pc_apply_EPS); + } +} + +void SlepcEPSSolver::SetBMat(const Operator &B) +{ + SlepcEigenvalueSolver::SetBMat(B); + + const PetscInt n = B.Height(); + PalacePetscCall( + MatCreateShell(GetComm(), n, n, PETSC_DECIDE, PETSC_DECIDE, (void *)this, &B0)); + PalacePetscCall(MatShellSetOperation(B0, MATOP_MULT, (void (*)(void))__mat_apply_EPS_B)); + PalacePetscCall(MatShellSetVecType(B0, PetscVecType())); + + BV bv = GetBV(); + PalacePetscCall(BVSetMatrix(bv, B0, PETSC_FALSE)); +} + +PetscReal SlepcEPSSolver::GetResidualNorm(PetscScalar l, const ComplexVector &x, + ComplexVector &r) const +{ + // Compute the i-th eigenpair residual: || (K - λ M) x ||₂ for eigenvalue λ. + opK->Mult(x, r); + opM->AddMult(x, r, -l); + return linalg::Norml2(GetComm(), r); +} + +PetscReal SlepcEPSSolver::GetBackwardScaling(PetscScalar l) const +{ + // Make sure not to use norms from scaling as this can be confusing if they are different. + // Note that SLEPc typically uses ||.||∞, not the 2-norm. + if (normK <= 0.0) + { + normK = linalg::SpectralNorm(GetComm(), *opK, opK->IsReal()); + } + if (normM <= 0.0) + { + normM = linalg::SpectralNorm(GetComm(), *opM, opM->IsReal()); + } + return normK + PetscAbsScalar(l) * normM; +} + +SlepcPEPLinearSolver::SlepcPEPLinearSolver(MPI_Comm comm, int print, + const std::string &prefix) + : SlepcEPSSolverBase(comm, print, prefix) +{ + opK = opC = opM = nullptr; + normK = normC = normM = 0.0; +} + +void SlepcPEPLinearSolver::SetOperators(const ComplexOperator &K, const ComplexOperator &C, + const ComplexOperator &M, + EigenvalueSolver::ScaleType type) +{ + // Construct shell matrices for the scaled linearized operators which define the block 2x2 + // eigenvalue problem. + const bool first = (opK == nullptr); + opK = &K; + opC = &C; + opM = &M; + if (first) + { + const PetscInt n = opK->Height(); + PalacePetscCall(MatCreateShell(GetComm(), 2 * n, 2 * n, PETSC_DECIDE, PETSC_DECIDE, + (void *)this, &A0)); + PalacePetscCall(MatCreateShell(GetComm(), 2 * n, 2 * n, PETSC_DECIDE, PETSC_DECIDE, + (void *)this, &A1)); + PalacePetscCall( + MatShellSetOperation(A0, MATOP_MULT, (void (*)(void))__mat_apply_PEPLinear_L0)); + PalacePetscCall( + MatShellSetOperation(A1, MATOP_MULT, (void (*)(void))__mat_apply_PEPLinear_L1)); + PalacePetscCall(MatShellSetVecType(A0, PetscVecType())); + PalacePetscCall(MatShellSetVecType(A1, PetscVecType())); + PalacePetscCall(EPSSetOperators(eps, A0, A1)); + } + + if (first && type != ScaleType::NONE) + { + normK = linalg::SpectralNorm(GetComm(), *opK, opK->IsReal()); + normC = linalg::SpectralNorm(GetComm(), *opC, opC->IsReal()); + normM = linalg::SpectralNorm(GetComm(), *opM, opM->IsReal()); + MFEM_VERIFY(normK >= 0.0 && normC >= 0.0 && normM >= 0.0, + "Invalid matrix norms for PEP scaling!"); + if (normK > 0 && normC >= 0.0 && normM > 0.0) + { + gamma = std::sqrt(normK / normM); + delta = 2.0 / (normK + gamma * normC); + } + } + + // Set up workspace. + if (!v0) + { + PalacePetscCall(MatCreateVecs(A0, nullptr, &v0)); + } + x1.SetSize(opK->Height()); + x2.SetSize(opK->Height()); + y1.SetSize(opK->Height()); + y2.SetSize(opK->Height()); + x1.UseDevice(true); + x2.UseDevice(true); + y1.UseDevice(true); + y2.UseDevice(true); + + // Configure linear solver. 
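// Why this particular γ and δ: substituting λ = γλ' into P(λ) = K + λC + λ²M and
// multiplying by δ gives the scaled quadratic δK + λ'(δγC) + λ'²(δγ²M). With
// γ = sqrt(||K||/||M||) and δ = 2/(||K|| + γ||C||), the coefficient norms satisfy
//   δ||K|| = δγ²||M||   and   δ||K|| + δγ||C|| = 2,
// so all three scaled blocks are O(1) and the leading/trailing ones are balanced, which
// keeps the linearized pencil (L₀, L₁) handled by the preconditioner below well scaled;
// GetEigenvalue maps the computed λ' back to λ = γλ'.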
+ if (first) + { + ConfigurePCShell(GetST(), (void *)this, __pc_apply_PEPLinear); + } +} + +void SlepcPEPLinearSolver::SetBMat(const Operator &B) +{ + SlepcEigenvalueSolver::SetBMat(B); + + const PetscInt n = B.Height(); + PalacePetscCall(MatCreateShell(GetComm(), 2 * n, 2 * n, PETSC_DECIDE, PETSC_DECIDE, + (void *)this, &B0)); + PalacePetscCall( + MatShellSetOperation(B0, MATOP_MULT, (void (*)(void))__mat_apply_PEPLinear_B)); + PalacePetscCall(MatShellSetVecType(B0, PetscVecType())); + + BV bv = GetBV(); + PalacePetscCall(BVSetMatrix(bv, B0, PETSC_FALSE)); +} + +void SlepcPEPLinearSolver::SetInitialSpace(const ComplexVector &v) +{ + MFEM_VERIFY( + A0 && A1, + "Must call SetOperators before using SetInitialSpace for SLEPc eigenvalue solver!"); + if (!v0) + { + PalacePetscCall(MatCreateVecs(A0, nullptr, &v0)); + } + PalacePetscCall(ToPetscVec(v, v0, 0, 2)); + Vec is[1] = {v0}; + PalacePetscCall(EPSSetInitialSpace(eps, 1, is)); +} + +void SlepcPEPLinearSolver::GetEigenvector(int i, ComplexVector &x) const +{ + // Select the most accurate x for y = [x₁; x₂] from the linearized eigenvalue problem. Or, + // just take x = x₁. + MFEM_VERIFY( + v0, + "Must call SetOperators before using GetEigenvector for SLEPc eigenvalue solver!"); + PalacePetscCall(EPSGetEigenvector(eps, i, v0, nullptr)); + PalacePetscCall(FromPetscVec(v0, x, 0, 2)); + if (xscale.get()[i] > 0.0) + { + x *= xscale.get()[i]; + } +} + +PetscReal SlepcPEPLinearSolver::GetResidualNorm(PetscScalar l, const ComplexVector &x, + ComplexVector &r) const +{ + // Compute the i-th eigenpair residual: || P(λ) x ||₂ = || (K + λ C + λ² M) x ||₂ for + // eigenvalue λ. + opK->Mult(x, r); + if (opC) + { + opC->AddMult(x, r, l); + } + opM->AddMult(x, r, l * l); + return linalg::Norml2(GetComm(), r); +} + +PetscReal SlepcPEPLinearSolver::GetBackwardScaling(PetscScalar l) const +{ + // Make sure not to use norms from scaling as this can be confusing if they are different. + // Note that SLEPc typically uses ||.||∞, not the 2-norm. + if (normK <= 0.0) + { + normK = linalg::SpectralNorm(GetComm(), *opK, opK->IsReal()); + } + if (normC <= 0.0 && opC) + { + normC = linalg::SpectralNorm(GetComm(), *opC, opC->IsReal()); + } + if (normM <= 0.0) + { + normM = linalg::SpectralNorm(GetComm(), *opM, opM->IsReal()); + } + PetscReal t = PetscAbsScalar(l); + return normK + t * normC + t * t * normM; +} + +// PEP specific methods. + +SlepcPEPSolverBase::SlepcPEPSolverBase(MPI_Comm comm, int print, const std::string &prefix) + : SlepcEigenvalueSolver(print) +{ + PalacePetscCall(PEPCreate(comm, &pep)); + PalacePetscCall(PEPSetOptionsPrefix(pep, prefix.c_str())); + if (print > 0) + { + std::string opts = "-pep_monitor"; + if (print > 2) + { + opts.append(" -pep_view"); + } + if (prefix.length() > 0) + { + PetscOptionsPrefixPush(nullptr, prefix.c_str()); + } + PetscOptionsInsertString(nullptr, opts.c_str()); + if (prefix.length() > 0) + { + PetscOptionsPrefixPop(nullptr); + } + } + A0 = A1 = A2 = nullptr; +} + +SlepcPEPSolverBase::~SlepcPEPSolverBase() +{ + PalacePetscCall(PEPDestroy(&pep)); + PalacePetscCall(MatDestroy(&A0)); + PalacePetscCall(MatDestroy(&A1)); + PalacePetscCall(MatDestroy(&A2)); +} + +void SlepcPEPSolverBase::SetNumModes(int num_eig, int num_vec) +{ + PalacePetscCall(PEPSetDimensions(pep, num_eig, (num_vec > 0) ? 
num_vec : PETSC_DEFAULT, + PETSC_DEFAULT)); +} + +void SlepcPEPSolverBase::SetTol(PetscReal tol) +{ + PalacePetscCall(PEPSetTolerances(pep, tol, PETSC_DEFAULT)); + PalacePetscCall(PEPSetConvergenceTest(pep, PEP_CONV_REL)); + // PalacePetscCall(PEPSetTrackAll(pep, PETSC_TRUE)); +} + +void SlepcPEPSolverBase::SetMaxIter(int max_it) +{ + PalacePetscCall( + PEPSetTolerances(pep, PETSC_DEFAULT, (max_it > 0) ? max_it : PETSC_DEFAULT)); +} + +void SlepcPEPSolverBase::SetWhichEigenpairs(EigenvalueSolver::WhichType type) +{ + switch (type) + { + case WhichType::LARGEST_MAGNITUDE: + PalacePetscCall(PEPSetWhichEigenpairs(pep, PEP_LARGEST_MAGNITUDE)); + region = false; + break; + case WhichType::SMALLEST_MAGNITUDE: + PalacePetscCall(PEPSetWhichEigenpairs(pep, PEP_SMALLEST_MAGNITUDE)); + region = false; + break; + case WhichType::LARGEST_REAL: + PalacePetscCall(PEPSetWhichEigenpairs(pep, PEP_LARGEST_REAL)); + break; + case WhichType::SMALLEST_REAL: + PalacePetscCall(PEPSetWhichEigenpairs(pep, PEP_SMALLEST_REAL)); + break; + case WhichType::LARGEST_IMAGINARY: + PalacePetscCall(PEPSetWhichEigenpairs(pep, PEP_LARGEST_IMAGINARY)); + break; + case WhichType::SMALLEST_IMAGINARY: + PalacePetscCall(PEPSetWhichEigenpairs(pep, PEP_SMALLEST_IMAGINARY)); + break; + case WhichType::TARGET_MAGNITUDE: + PalacePetscCall(PEPSetWhichEigenpairs(pep, PEP_TARGET_MAGNITUDE)); + region = false; + break; + case WhichType::TARGET_REAL: + PalacePetscCall(PEPSetWhichEigenpairs(pep, PEP_TARGET_REAL)); + break; + case WhichType::TARGET_IMAGINARY: + PalacePetscCall(PEPSetWhichEigenpairs(pep, PEP_TARGET_IMAGINARY)); + break; + } +} + +void SlepcPEPSolverBase::SetProblemType(SlepcEigenvalueSolver::ProblemType type) +{ + switch (type) + { + case ProblemType::HERMITIAN: + case ProblemType::GEN_HERMITIAN: + PalacePetscCall(PEPSetProblemType(pep, PEP_HERMITIAN)); + break; + case ProblemType::NON_HERMITIAN: + case ProblemType::GEN_INDEFINITE: + case ProblemType::GEN_NON_HERMITIAN: + case ProblemType::GENERAL: + PalacePetscCall(PEPSetProblemType(pep, PEP_GENERAL)); + break; + case ProblemType::HYPERBOLIC: + PalacePetscCall(PEPSetProblemType(pep, PEP_HYPERBOLIC)); + break; + case ProblemType::GYROSCOPIC: + PalacePetscCall(PEPSetProblemType(pep, PEP_GYROSCOPIC)); + break; + } +} + +void SlepcPEPSolverBase::SetType(SlepcEigenvalueSolver::Type type) +{ + switch (type) + { + case Type::TOAR: + PalacePetscCall(PEPSetType(pep, PEPTOAR)); + break; + case Type::STOAR: + PalacePetscCall(PEPSetType(pep, PEPSTOAR)); + break; + case Type::QARNOLDI: + PalacePetscCall(PEPSetType(pep, PEPQARNOLDI)); + break; + case Type::JD: + PalacePetscCall(PEPSetType(pep, PEPJD)); + region = false; + break; + case Type::KRYLOVSCHUR: + case Type::POWER: + case Type::SUBSPACE: + case Type::SLP: + case Type::NLEIGS: + MFEM_ABORT("Eigenvalue solver type not implemented!"); + break; + } +} + +void SlepcPEPSolverBase::SetInitialSpace(const ComplexVector &v) +{ + MFEM_VERIFY( + A0 && A1 && A2, + "Must call SetOperators before using SetInitialSpace for SLEPc eigenvalue solver!"); + if (!v0) + { + PalacePetscCall(MatCreateVecs(A0, nullptr, &v0)); + } + PalacePetscCall(ToPetscVec(v, v0)); + Vec is[1] = {v0}; + PalacePetscCall(PEPSetInitialSpace(pep, 1, is)); +} + +void SlepcPEPSolverBase::Customize() +{ + SlepcEigenvalueSolver::Customize(); + PalacePetscCall(PEPSetTarget(pep, sigma / gamma)); + if (!cl_custom) + { + PalacePetscCall(PEPSetFromOptions(pep)); + if (print > 0) + { + PetscOptionsView(nullptr, PETSC_VIEWER_STDOUT_(GetComm())); + Mpi::Print(GetComm(), "\n"); + } + 
cl_custom = true; + } +} + +int SlepcPEPSolverBase::Solve() +{ + MFEM_VERIFY(A0 && A1 && A2 && opInv, "Operators are not set for SlepcPEPSolverBase!"); + + // Solve the eigenvalue problem. + PetscInt num_conv; + Customize(); + PalacePetscCall(PEPSolve(pep)); + PalacePetscCall(PEPGetConverged(pep, &num_conv)); + if (print > 0) + { + Mpi::Print(GetComm(), "\n"); + PalacePetscCall(PEPConvergedReasonView(pep, PETSC_VIEWER_STDOUT_(GetComm()))); + Mpi::Print(GetComm(), + " Total number of linear systems solved: {:d}\n" + " Total number of linear solver iterations: {:d}\n", + opInv->NumTotalMult(), opInv->NumTotalMultIterations()); + } + + // Compute and store the eigenpair residuals. + RescaleEigenvectors(num_conv); + return (int)num_conv; +} + +std::complex SlepcPEPSolverBase::GetEigenvalue(int i) const +{ + PetscScalar l; + PalacePetscCall(PEPGetEigenpair(pep, i, &l, nullptr, nullptr, nullptr)); + return l * gamma; +} + +void SlepcPEPSolverBase::GetEigenvector(int i, ComplexVector &x) const +{ + MFEM_VERIFY( + v0, + "Must call SetOperators before using GetEigenvector for SLEPc eigenvalue solver!"); + PalacePetscCall(PEPGetEigenpair(pep, i, nullptr, nullptr, v0, nullptr)); + PalacePetscCall(FromPetscVec(v0, x)); + if (xscale.get()[i] > 0.0) + { + x *= xscale.get()[i]; + } +} + +BV SlepcPEPSolverBase::GetBV() const +{ + BV bv; + PalacePetscCall(PEPGetBV(pep, &bv)); + return bv; +} + +ST SlepcPEPSolverBase::GetST() const +{ + ST st; + PalacePetscCall(PEPGetST(pep, &st)); + return st; +} + +RG SlepcPEPSolverBase::GetRG() const +{ + RG rg; + PalacePetscCall(PEPGetRG(pep, &rg)); + return rg; +} + +SlepcPEPSolver::SlepcPEPSolver(MPI_Comm comm, int print, const std::string &prefix) + : SlepcPEPSolverBase(comm, print, prefix) +{ + opK = opC = opM = nullptr; + normK = normC = normM = 0.0; +} + +void SlepcPEPSolver::SetOperators(const ComplexOperator &K, const ComplexOperator &C, + const ComplexOperator &M, + EigenvalueSolver::ScaleType type) +{ + // Construct shell matrices for the scaled operators which define the quadratic polynomial + // eigenvalue problem. 
+ const bool first = (opK == nullptr); + opK = &K; + opC = &C; + opM = &M; + + if (first) + { + const PetscInt n = opK->Height(); + PalacePetscCall( + MatCreateShell(GetComm(), n, n, PETSC_DECIDE, PETSC_DECIDE, (void *)this, &A0)); + PalacePetscCall( + MatCreateShell(GetComm(), n, n, PETSC_DECIDE, PETSC_DECIDE, (void *)this, &A1)); + PalacePetscCall( + MatCreateShell(GetComm(), n, n, PETSC_DECIDE, PETSC_DECIDE, (void *)this, &A2)); + PalacePetscCall( + MatShellSetOperation(A0, MATOP_MULT, (void (*)(void))__mat_apply_PEP_A0)); + PalacePetscCall( + MatShellSetOperation(A1, MATOP_MULT, (void (*)(void))__mat_apply_PEP_A1)); + PalacePetscCall( + MatShellSetOperation(A2, MATOP_MULT, (void (*)(void))__mat_apply_PEP_A2)); + PalacePetscCall(MatShellSetVecType(A0, PetscVecType())); + PalacePetscCall(MatShellSetVecType(A1, PetscVecType())); + PalacePetscCall(MatShellSetVecType(A2, PetscVecType())); + Mat A[3] = {A0, A1, A2}; + PalacePetscCall(PEPSetOperators(pep, 3, A)); + } + + if (first && type != ScaleType::NONE) + { + normK = linalg::SpectralNorm(GetComm(), *opK, opK->IsReal()); + normC = linalg::SpectralNorm(GetComm(), *opC, opC->IsReal()); + normM = linalg::SpectralNorm(GetComm(), *opM, opM->IsReal()); + MFEM_VERIFY(normK >= 0.0 && normC >= 0.0 && normM >= 0.0, + "Invalid matrix norms for PEP scaling!"); + if (normK > 0 && normC >= 0.0 && normM > 0.0) + { + gamma = std::sqrt(normK / normM); + delta = 2.0 / (normK + gamma * normC); + } + } + + // Set up workspace. + if (!v0) + { + PalacePetscCall(MatCreateVecs(A0, nullptr, &v0)); + } + x1.SetSize(opK->Height()); + y1.SetSize(opK->Height()); + + // Configure linear solver. + if (first) + { + ConfigurePCShell(GetST(), (void *)this, __pc_apply_PEP); + } +} + +void SlepcPEPSolver::SetBMat(const Operator &B) +{ + SlepcEigenvalueSolver::SetBMat(B); + + const PetscInt n = B.Height(); + PalacePetscCall( + MatCreateShell(GetComm(), n, n, PETSC_DECIDE, PETSC_DECIDE, (void *)this, &B0)); + PalacePetscCall(MatShellSetOperation(B0, MATOP_MULT, (void (*)(void))__mat_apply_PEP_B)); + PalacePetscCall(MatShellSetVecType(B0, PetscVecType())); + + BV bv = GetBV(); + PalacePetscCall(BVSetMatrix(bv, B0, PETSC_FALSE)); +} + +PetscReal SlepcPEPSolver::GetResidualNorm(PetscScalar l, const ComplexVector &x, + ComplexVector &r) const +{ + // Compute the i-th eigenpair residual: || P(λ) x ||₂ = || (K + λ C + λ² M) x ||₂ for + // eigenvalue λ. + opK->Mult(x, r); + if (opC) + { + opC->AddMult(x, r, l); + } + opM->AddMult(x, r, l * l); + return linalg::Norml2(GetComm(), r); +} + +PetscReal SlepcPEPSolver::GetBackwardScaling(PetscScalar l) const +{ + // Make sure not to use norms from scaling as this can be confusing if they are different. + // Note that SLEPc typically uses ||.||∞, not Frobenius. + if (normK <= 0.0) + { + normK = linalg::SpectralNorm(GetComm(), *opK, opK->IsReal()); + } + if (normC <= 0.0 && opC) + { + normC = linalg::SpectralNorm(GetComm(), *opC, opC->IsReal()); + } + if (normM <= 0.0) + { + normM = linalg::SpectralNorm(GetComm(), *opM, opM->IsReal()); + } + PetscReal t = PetscAbsScalar(l); + return normK + t * normC + t * t * normM; +} + +// NEP specific methods. 
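// The NEP solver below reuses the residual and error conventions established above: with
// eigenvectors rescaled in RescaleEigenvectors, res[i] = || P(λ) x ||₂ / || x ||₂, so
//   GetError(i, ErrorType::RELATIVE) = || P(λ) x ||₂ / (|λ| || x ||₂),
//   GetError(i, ErrorType::BACKWARD) = || P(λ) x ||₂ / ((||K|| + |λ| ||C|| + |λ|² ||M||) || x ||₂),
// where the backward-error denominator is exactly the GetBackwardScaling value computed
// just above.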
+ +SlepcNEPSolverBase::SlepcNEPSolverBase(MPI_Comm comm, int print, const std::string &prefix) + : SlepcEigenvalueSolver(print) +{ + PalacePetscCall(NEPCreate(comm, &nep)); + PalacePetscCall(NEPSetOptionsPrefix(nep, prefix.c_str())); + if (print > 0) + { + std::string opts = "-nep_monitor"; + if (print > 2) + { + opts.append(" -nep_view"); + } + if (prefix.length() > 0) + { + PetscOptionsPrefixPush(nullptr, prefix.c_str()); + } + PetscOptionsInsertString(nullptr, opts.c_str()); + if (prefix.length() > 0) + { + PetscOptionsPrefixPop(nullptr); + } + } + A = J = nullptr; +} + +SlepcNEPSolverBase::~SlepcNEPSolverBase() +{ + PalacePetscCall(NEPDestroy(&nep)); + PalacePetscCall(MatDestroy(&A)); + PalacePetscCall(MatDestroy(&J)); +} + +void SlepcNEPSolverBase::SetNumModes(int num_eig, int num_vec) +{ + PalacePetscCall(NEPSetDimensions(nep, num_eig, (num_vec > 0) ? num_vec : PETSC_DEFAULT, + PETSC_DEFAULT)); +} + +void SlepcNEPSolverBase::SetTol(PetscReal tol) +{ + PalacePetscCall(NEPSetTolerances(nep, tol, PETSC_DEFAULT)); + PalacePetscCall(NEPSetConvergenceTest(nep, NEP_CONV_REL)); +} + +void SlepcNEPSolverBase::SetMaxIter(int max_it) +{ + PalacePetscCall( + NEPSetTolerances(nep, PETSC_DEFAULT, (max_it > 0) ? max_it : PETSC_DEFAULT)); +} + +void SlepcNEPSolverBase::SetShiftInvert(std::complex s, bool precond) +{ + sigma = s; // Wait until solve time to call NEPSetTarget + sinvert = false; +} + +void SlepcNEPSolverBase::SetWhichEigenpairs(EigenvalueSolver::WhichType type) +{ + switch (type) + { + case WhichType::LARGEST_MAGNITUDE: + PalacePetscCall(NEPSetWhichEigenpairs(nep, NEP_LARGEST_MAGNITUDE)); + region = false; + break; + case WhichType::SMALLEST_MAGNITUDE: + PalacePetscCall(NEPSetWhichEigenpairs(nep, NEP_SMALLEST_MAGNITUDE)); + region = false; + break; + case WhichType::LARGEST_REAL: + PalacePetscCall(NEPSetWhichEigenpairs(nep, NEP_LARGEST_REAL)); + break; + case WhichType::SMALLEST_REAL: + PalacePetscCall(NEPSetWhichEigenpairs(nep, NEP_SMALLEST_REAL)); + break; + case WhichType::LARGEST_IMAGINARY: + PalacePetscCall(NEPSetWhichEigenpairs(nep, NEP_LARGEST_IMAGINARY)); + break; + case WhichType::SMALLEST_IMAGINARY: + PalacePetscCall(NEPSetWhichEigenpairs(nep, NEP_SMALLEST_IMAGINARY)); + break; + case WhichType::TARGET_MAGNITUDE: + PalacePetscCall(NEPSetWhichEigenpairs(nep, NEP_TARGET_MAGNITUDE)); + region = false; + break; + case WhichType::TARGET_REAL: + PalacePetscCall(NEPSetWhichEigenpairs(nep, NEP_TARGET_REAL)); + break; + case WhichType::TARGET_IMAGINARY: + PalacePetscCall(NEPSetWhichEigenpairs(nep, NEP_TARGET_IMAGINARY)); + break; + } +} + +void SlepcNEPSolverBase::SetProblemType(SlepcEigenvalueSolver::ProblemType type) +{ + switch (type) + { + case ProblemType::GENERAL: + PalacePetscCall(NEPSetProblemType(nep, NEP_GENERAL)); + break; + case ProblemType::HERMITIAN: + case ProblemType::GEN_HERMITIAN: + case ProblemType::NON_HERMITIAN: + case ProblemType::GEN_INDEFINITE: + case ProblemType::GEN_NON_HERMITIAN: + case ProblemType::HYPERBOLIC: + case ProblemType::GYROSCOPIC: + MFEM_ABORT("Problem type not implemented!"); + break; + } +} + +void SlepcNEPSolverBase::SetType(SlepcEigenvalueSolver::Type type) +{ + switch (type) + { + case Type::SLP: + PalacePetscCall(NEPSetType(nep, NEPSLP)); + break; + case Type::NLEIGS: + case Type::KRYLOVSCHUR: + case Type::POWER: + case Type::SUBSPACE: + case Type::JD: + case Type::TOAR: + case Type::STOAR: + case Type::QARNOLDI: + MFEM_ABORT("Eigenvalue solver type not implemented!"); + break; + } +} + +void SlepcNEPSolverBase::SetInitialSpace(const 
ComplexVector &v) +{ + MFEM_VERIFY( + A && J, + "Must call SetOperators before using SetInitialSpace for SLEPc eigenvalue solver!"); + if (!v0) + { + PalacePetscCall(MatCreateVecs(A, nullptr, &v0)); + } + PalacePetscCall(ToPetscVec(v, v0)); + Vec is[1] = {v0}; + PalacePetscCall(NEPSetInitialSpace(nep, 1, is)); +} + +void SlepcNEPSolverBase::Customize() +{ + // Configure the region based on the given target if necessary. + PalacePetscCall(NEPSetTarget(nep, sigma)); + if (!cl_custom) + { + PalacePetscCall(NEPSetFromOptions(nep)); + if (print > 0) + { + PetscOptionsView(nullptr, PETSC_VIEWER_STDOUT_(GetComm())); + Mpi::Print(GetComm(), "\n"); + } + cl_custom = true; + } +} + +int SlepcNEPSolverBase::Solve() +{ + MFEM_VERIFY(A && J && opInv, "Operators are not set for SlepcNEPSolverBase!"); + + // Solve the eigenvalue problem. + perm.reset(); + PetscInt num_conv; + Customize(); + PalacePetscCall(NEPSolve(nep)); + PalacePetscCall(NEPGetConverged(nep, &num_conv)); + if (print > 0) + { + Mpi::Print(GetComm(), "\n"); + PalacePetscCall(NEPConvergedReasonView(nep, PETSC_VIEWER_STDOUT_(GetComm()))); + Mpi::Print(GetComm(), + " Total number of linear systems solved: {:d}\n" + " Total number of linear solver iterations: {:d}\n", + opInv->NumTotalMult(), opInv->NumTotalMultIterations()); + } + + // Compute and store the ordered eigenpair residuals. + const int nev = (int)num_conv; + perm = std::make_unique(nev); + std::vector> eig(nev); + for (int i = 0; i < nev; i++) + { + PetscScalar l; + PalacePetscCall(NEPGetEigenpair(nep, i, &l, nullptr, nullptr, nullptr)); + eig[i] = l; + perm[i] = i; + } + // Sort by ascending imaginary component. + std::sort(perm.get(), perm.get() + nev, + [&eig](auto l, auto r) { return eig[l].imag() < eig[r].imag(); }); + RescaleEigenvectors(nev); + return nev; +} + +std::complex SlepcNEPSolverBase::GetEigenvalue(int i) const +{ + PetscScalar l; + const int &j = perm.get()[i]; + PalacePetscCall(NEPGetEigenpair(nep, j, &l, nullptr, nullptr, nullptr)); + return l; +} + +void SlepcNEPSolverBase::GetEigenvector(int i, ComplexVector &x) const +{ + MFEM_VERIFY( + v0, + "Must call SetOperators before using GetEigenvector for SLEPc eigenvalue solver!"); + const int &j = perm.get()[i]; + PalacePetscCall(NEPGetEigenpair(nep, j, nullptr, nullptr, v0, nullptr)); + PalacePetscCall(FromPetscVec(v0, x)); + if (xscale.get()[i] > 0.0) + { + x *= xscale.get()[i]; + } +} + +BV SlepcNEPSolverBase::GetBV() const +{ + BV bv; + PalacePetscCall(NEPGetBV(nep, &bv)); + return bv; +} + +ST SlepcNEPSolverBase::GetST() const +{ + ST st = nullptr; + // NEPGetST does not exist. + return st; +} + +RG SlepcNEPSolverBase::GetRG() const +{ + RG rg; + PalacePetscCall(NEPGetRG(nep, &rg)); + return rg; +} + +SlepcNEPSolver::SlepcNEPSolver(MPI_Comm comm, int print, const std::string &prefix) + : SlepcNEPSolverBase(comm, print, prefix) +{ + opK = opC = opM = nullptr; + normK = normC = normM = 0.0; +} + +void SlepcNEPSolver::SetExtraSystemMatrix( + std::function(double)> A2) +{ + funcA2 = A2; +} + +void SlepcNEPSolver::SetPreconditionerUpdate( + std::function( + std::complex, std::complex, std::complex, double)> + P) +{ + funcP = P; +} + +void SlepcNEPSolver::SetOperators(const ComplexOperator &K, const ComplexOperator &M, + EigenvalueSolver::ScaleType type) +{ + // Construct shell matrices for the scaled operators which define the quadratic polynomial + // eigenvalue problem. 
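// For the NEP variant, the shell matrices A and J feed NEPSetFunction/NEPSetJacobian
// rather than a fixed polynomial. Consistent with GetResidualNorm below, the nonlinear
// function collects the registered blocks plus the optional frequency-dependent operator
// from SetExtraSystemMatrix, roughly T(λ) = K + λC + λ²M + A2(|Im λ|) up to the γ, δ
// scaling, and the Jacobian callback presumably differentiates the polynomial part
// (≈ C + 2λM); the callback bodies live with the other shell routines in this file.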
+ const bool first = (opK == nullptr); + opK = &K; + opM = &M; + + if (first) + { + const PetscInt n = opK->Height(); + PalacePetscCall( + MatCreateShell(GetComm(), n, n, PETSC_DECIDE, PETSC_DECIDE, (void *)this, &A)); + PalacePetscCall( + MatCreateShell(GetComm(), n, n, PETSC_DECIDE, PETSC_DECIDE, (void *)this, &J)); + PalacePetscCall(MatShellSetOperation(A, MATOP_MULT, (void (*)(void))__mat_apply_NEP_A)); + PalacePetscCall(MatShellSetOperation(J, MATOP_MULT, (void (*)(void))__mat_apply_NEP_J)); + PalacePetscCall(MatShellSetVecType(A, PetscVecType())); + PalacePetscCall(MatShellSetVecType(J, PetscVecType())); + PalacePetscCall(NEPSetFunction(nep, A, A, __form_NEP_function, NULL)); + PalacePetscCall(NEPSetJacobian(nep, J, __form_NEP_jacobian, NULL)); + } + + if (first && type != ScaleType::NONE) + { + normK = linalg::SpectralNorm(GetComm(), *opK, opK->IsReal()); + normM = linalg::SpectralNorm(GetComm(), *opM, opM->IsReal()); + MFEM_VERIFY(normK >= 0.0 && normM >= 0.0, "Invalid matrix norms for NEP scaling!"); + if (normK > 0 && normM > 0.0) + { + gamma = std::sqrt(normK / normM); + delta = 2.0 / normK; + } + } + + // Set up workspace. + if (!v0) + { + PalacePetscCall(MatCreateVecs(A, nullptr, &v0)); + } + x1.SetSize(opK->Height()); + y1.SetSize(opK->Height()); + + // Configure linear solver. + if (first) + { + // SLP. + PC pc; + KSP ksp; + EPS eps; + PalacePetscCall(NEPSLPGetKSP(nep, &ksp)); + PalacePetscCall(KSPSetType(ksp, KSPPREONLY)); + PalacePetscCall(NEPSLPGetEPS(nep, &eps)); + PalacePetscCall(EPSSetType(eps, EPSKRYLOVSCHUR)); + PalacePetscCall(KSPGetPC(ksp, &pc)); + PalacePetscCall(PCSetType(pc, PCSHELL)); + PalacePetscCall(PCShellSetContext(pc, (void *)this)); + PalacePetscCall(PCShellSetApply(pc, __pc_apply_NEP)); + } +} + +void SlepcNEPSolver::SetOperators(const ComplexOperator &K, const ComplexOperator &C, + const ComplexOperator &M, + EigenvalueSolver::ScaleType type) +{ + // Construct shell matrices for the scaled operators which define the quadratic polynomial + // eigenvalue problem. + const bool first = (opK == nullptr); + opK = &K; + opC = &C; + opM = &M; + + if (first) + { + const PetscInt n = opK->Height(); + PalacePetscCall( + MatCreateShell(GetComm(), n, n, PETSC_DECIDE, PETSC_DECIDE, (void *)this, &A)); + PalacePetscCall( + MatCreateShell(GetComm(), n, n, PETSC_DECIDE, PETSC_DECIDE, (void *)this, &J)); + PalacePetscCall(MatShellSetOperation(A, MATOP_MULT, (void (*)(void))__mat_apply_NEP_A)); + PalacePetscCall(MatShellSetOperation(J, MATOP_MULT, (void (*)(void))__mat_apply_NEP_J)); + PalacePetscCall(MatShellSetVecType(A, PetscVecType())); + PalacePetscCall(MatShellSetVecType(J, PetscVecType())); + PalacePetscCall(NEPSetFunction(nep, A, A, __form_NEP_function, NULL)); + PalacePetscCall(NEPSetJacobian(nep, J, __form_NEP_jacobian, NULL)); + } + + if (first && type != ScaleType::NONE) + { + normK = linalg::SpectralNorm(GetComm(), *opK, opK->IsReal()); + normC = linalg::SpectralNorm(GetComm(), *opC, opC->IsReal()); + normM = linalg::SpectralNorm(GetComm(), *opM, opM->IsReal()); + MFEM_VERIFY(normK >= 0.0 && normC >= 0.0 && normM >= 0.0, + "Invalid matrix norms for NEP scaling!"); + if (normK > 0 && normC >= 0.0 && normM > 0.0) + { + gamma = std::sqrt(normK / normM); + delta = 2.0 / (normK + gamma * normC); + } + } + + // Set up workspace. + if (!v0) + { + PalacePetscCall(MatCreateVecs(A, nullptr, &v0)); + } + x1.SetSize(opK->Height()); + y1.SetSize(opK->Height()); + + // Configure linear solver. + if (first) + { + // SLP. 
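// Rough sketch of the SLP (successive linear problems) iteration selected here: at each
// outer step with eigenvalue estimate λ̃, solve the linear eigenproblem
//   T(λ̃) x = θ T'(λ̃) x
// for the smallest-magnitude θ and update λ̃ ← λ̃ - θ. The inner linear eigenproblem is
// handled by the Krylov-Schur EPS configured below, and the PCSHELL attached to its
// KSPPREONLY solver applies the approximate inverse from __pc_apply_NEP.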
+ PC pc; + KSP ksp; + EPS eps; + PalacePetscCall(NEPSLPGetKSP(nep, &ksp)); + PalacePetscCall(KSPSetType(ksp, KSPPREONLY)); + PalacePetscCall(NEPSLPGetEPS(nep, &eps)); + PalacePetscCall(EPSSetType(eps, EPSKRYLOVSCHUR)); + PalacePetscCall(KSPGetPC(ksp, &pc)); + PalacePetscCall(PCSetType(pc, PCSHELL)); + PalacePetscCall(PCShellSetContext(pc, (void *)this)); + PalacePetscCall(PCShellSetApply(pc, __pc_apply_NEP)); + } +} + +void SlepcNEPSolver::SetBMat(const Operator &B) +{ + SlepcEigenvalueSolver::SetBMat(B); + + const PetscInt n = B.Height(); + PalacePetscCall( + MatCreateShell(GetComm(), n, n, PETSC_DECIDE, PETSC_DECIDE, (void *)this, &B0)); + PalacePetscCall(MatShellSetOperation(B0, MATOP_MULT, (void (*)(void))__mat_apply_NEP_B)); + PalacePetscCall(MatShellSetVecType(B0, PetscVecType())); + + BV bv = GetBV(); + PalacePetscCall(BVSetMatrix(bv, B0, PETSC_FALSE)); +} + +PetscReal SlepcNEPSolver::GetResidualNorm(PetscScalar l, const ComplexVector &x, + ComplexVector &r) const +{ + // Compute the i-th eigenpair residual: || P(λ) x ||₂ = || (K + λ C + λ² M) x ||₂ for + // eigenvalue λ. + opK->Mult(x, r); + if (opC) + { + opC->AddMult(x, r, l); + } + opM->AddMult(x, r, l * l); + if (funcA2) + { + auto A2 = (*funcA2)(std::abs(l.imag())); + A2->AddMult(x, r, 1.0 + 0.0i); + } + return linalg::Norml2(GetComm(), r); +} + +PetscReal SlepcNEPSolver::GetBackwardScaling(PetscScalar l) const +{ + // Make sure not to use norms from scaling as this can be confusing if they are different. + // Note that SLEPc typically uses ||.||∞, not Frobenius. + if (normK <= 0.0) + { + normK = linalg::SpectralNorm(GetComm(), *opK, opK->IsReal()); + } + if (normC <= 0.0 && opC) + { + normC = linalg::SpectralNorm(GetComm(), *opC, opC->IsReal()); + } + if (normM <= 0.0) + { + normM = linalg::SpectralNorm(GetComm(), *opM, opM->IsReal()); + } + PetscReal t = PetscAbsScalar(l); + return normK + t * normC + t * t * normM; +} + +} // namespace palace::slepc + +PetscErrorCode __mat_apply_EPS_A0(Mat A, Vec x, Vec y) +{ + PetscFunctionBeginUser; + palace::slepc::SlepcEPSSolver *ctx; + PetscCall(MatShellGetContext(A, (void **)&ctx)); + MFEM_VERIFY(ctx, "Invalid PETSc shell matrix context for SLEPc!"); + + PetscCall(FromPetscVec(x, ctx->x1)); + ctx->opK->Mult(ctx->x1, ctx->y1); + ctx->y1 *= ctx->delta; + PetscCall(ToPetscVec(ctx->y1, y)); + + PetscFunctionReturn(PETSC_SUCCESS); +} + +PetscErrorCode __mat_apply_EPS_A1(Mat A, Vec x, Vec y) +{ + PetscFunctionBeginUser; + palace::slepc::SlepcEPSSolver *ctx; + PetscCall(MatShellGetContext(A, (void **)&ctx)); + MFEM_VERIFY(ctx, "Invalid PETSc shell matrix context for SLEPc!"); + + PetscCall(FromPetscVec(x, ctx->x1)); + ctx->opM->Mult(ctx->x1, ctx->y1); + ctx->y1 *= ctx->delta * ctx->gamma; + PetscCall(ToPetscVec(ctx->y1, y)); + + PetscFunctionReturn(PETSC_SUCCESS); +} + +PetscErrorCode __mat_apply_EPS_B(Mat A, Vec x, Vec y) +{ + PetscFunctionBeginUser; + palace::slepc::SlepcEPSSolver *ctx; + PetscCall(MatShellGetContext(A, (void **)&ctx)); + MFEM_VERIFY(ctx, "Invalid PETSc shell matrix context for SLEPc!"); + + PetscCall(FromPetscVec(x, ctx->x1)); + ctx->opB->Mult(ctx->x1.Real(), ctx->y1.Real()); + ctx->opB->Mult(ctx->x1.Imag(), ctx->y1.Imag()); + ctx->y1 *= ctx->delta * ctx->gamma; + PetscCall(ToPetscVec(ctx->y1, y)); + + PetscFunctionReturn(PETSC_SUCCESS); +} + +PetscErrorCode __pc_apply_EPS(PC pc, Vec x, Vec y) +{ + // Solve the linear system associated with the generalized eigenvalue problem: y = + // M⁻¹ x, or shift-and-invert spectral transformation: y = (K - σ M)⁻¹ x . 
Enforces the + // divergence-free constraint using the supplied projector. + PetscFunctionBeginUser; + palace::slepc::SlepcEPSSolver *ctx; + PetscCall(PCShellGetContext(pc, (void **)&ctx)); + MFEM_VERIFY(ctx, "Invalid PETSc shell PC context for SLEPc!"); + + PetscCall(FromPetscVec(x, ctx->x1)); + ctx->opInv->Mult(ctx->x1, ctx->y1); + if (!ctx->sinvert) + { + ctx->y1 *= 1.0 / (ctx->delta * ctx->gamma); + } + else + { + ctx->y1 *= 1.0 / ctx->delta; + } + if (ctx->opProj) + { + // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(ctx->GetComm(), ctx->y1)); + ctx->opProj->Mult(ctx->y1); + // Mpi::Print(" After projection: {:e}\n", linalg::Norml2(ctx->GetComm(), ctx->y1)); + } + PetscCall(ToPetscVec(ctx->y1, y)); + + PetscFunctionReturn(PETSC_SUCCESS); +} + +PetscErrorCode __mat_apply_PEPLinear_L0(Mat A, Vec x, Vec y) +{ + // Apply the linearized operator L₀ = [ 0 I ] + // [ -K -C ] . + PetscFunctionBeginUser; + palace::slepc::SlepcPEPLinearSolver *ctx; + PetscCall(MatShellGetContext(A, (void **)&ctx)); + MFEM_VERIFY(ctx, "Invalid PETSc shell matrix context for SLEPc!"); + PetscCall(FromPetscVec(x, ctx->x1, ctx->x2)); + ctx->y1 = ctx->x2; + if (ctx->opC) + { + ctx->opC->Mult(ctx->x2, ctx->y2); + } + else + { + ctx->y2 = 0.0; + } + ctx->y2 *= ctx->gamma; + ctx->opK->AddMult(ctx->x1, ctx->y2, std::complex(1.0, 0.0)); + ctx->y2 *= -ctx->delta; + PetscCall(ToPetscVec(ctx->y1, ctx->y2, y)); + + PetscFunctionReturn(PETSC_SUCCESS); +} + +PetscErrorCode __mat_apply_PEPLinear_L1(Mat A, Vec x, Vec y) +{ + // Apply the linearized operator L₁ = [ I 0 ] + // [ 0 M ] . + PetscFunctionBeginUser; + palace::slepc::SlepcPEPLinearSolver *ctx; + PetscCall(MatShellGetContext(A, (void **)&ctx)); + MFEM_VERIFY(ctx, "Invalid PETSc shell matrix context for SLEPc!"); + + PetscCall(FromPetscVec(x, ctx->x1, ctx->x2)); + ctx->y1 = ctx->x1; + ctx->opM->Mult(ctx->x2, ctx->y2); + ctx->y2 *= ctx->delta * ctx->gamma * ctx->gamma; + PetscCall(ToPetscVec(ctx->y1, ctx->y2, y)); + + PetscFunctionReturn(PETSC_SUCCESS); +} + +PetscErrorCode __mat_apply_PEPLinear_B(Mat A, Vec x, Vec y) +{ + PetscFunctionBeginUser; + palace::slepc::SlepcPEPLinearSolver *ctx; + PetscCall(MatShellGetContext(A, (void **)&ctx)); + MFEM_VERIFY(ctx, "Invalid PETSc shell matrix context for SLEPc!"); + + PetscCall(FromPetscVec(x, ctx->x1, ctx->x2)); + ctx->opB->Mult(ctx->x1.Real(), ctx->y1.Real()); + ctx->opB->Mult(ctx->x1.Imag(), ctx->y1.Imag()); + ctx->opB->Mult(ctx->x2.Real(), ctx->y2.Real()); + ctx->opB->Mult(ctx->x2.Imag(), ctx->y2.Imag()); + ctx->y1 *= ctx->delta * ctx->gamma * ctx->gamma; + ctx->y2 *= ctx->delta * ctx->gamma * ctx->gamma; + PetscCall(ToPetscVec(ctx->y1, ctx->y2, y)); + + PetscFunctionReturn(PETSC_SUCCESS); +} + +PetscErrorCode __pc_apply_PEPLinear(PC pc, Vec x, Vec y) +{ + // Solve the linear system associated with the generalized eigenvalue problem after + // linearization: y = L₁⁻¹ x, or with the shift-and-invert spectral transformation: + // y = (L₀ - σ L₁)⁻¹ x, with: + // L₀ = [ 0 I ] L₁ = [ I 0 ] + // [ -K -C ] , [ 0 M ] . + // Enforces the divergence-free constraint using the supplied projector. 
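// Illustrative sketch (not part of the patch): the first companion linearization applied
// by the L0/L1 shell matrices above. With the stacked vector y = [x; lambda*x], the pencil
//   L0 = [ 0   I ]        L1 = [ I  0 ]
//        [ -K -C ] ,           [ 0  M ]
// satisfies L0 y = lambda L1 y exactly when (K + lambda*C + lambda^2*M) x = 0. The actual
// callbacks additionally fold in the gamma/delta scaling, which this version omits. The
// MatVec callbacks below are placeholders standing in for the Palace operators and are
// expected to size their output vectors.
#include <complex>
#include <functional>
#include <vector>

using CVec = std::vector<std::complex<double>>;
using MatVec = std::function<void(const CVec &, CVec &)>;  // y = A * x

void ApplyL0(const MatVec &K, const MatVec &C, const CVec &x1, const CVec &x2, CVec &y1,
             CVec &y2)
{
  y1 = x2;  // Top block: y1 = x2.
  CVec t(x1.size());
  K(x1, y2);  // Bottom block: y2 = -(K x1 + C x2).
  C(x2, t);
  for (std::size_t i = 0; i < y2.size(); i++)
  {
    y2[i] = -(y2[i] + t[i]);
  }
}

void ApplyL1(const MatVec &M, const CVec &x1, const CVec &x2, CVec &y1, CVec &y2)
{
  y1 = x1;    // Top block: y1 = x1.
  M(x2, y2);  // Bottom block: y2 = M x2.
}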
+ PetscFunctionBeginUser; + palace::slepc::SlepcPEPLinearSolver *ctx; + PetscCall(PCShellGetContext(pc, (void **)&ctx)); + MFEM_VERIFY(ctx, "Invalid PETSc shell PC context for SLEPc!"); + + PetscCall(FromPetscVec(x, ctx->x1, ctx->x2)); + if (!ctx->sinvert) + { + ctx->y1 = ctx->x1; + if (ctx->opProj) + { + // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(ctx->GetComm(), ctx->y1)); + ctx->opProj->Mult(ctx->y1); + // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(ctx->GetComm(), ctx->y1)); + } + + ctx->opInv->Mult(ctx->x2, ctx->y2); + ctx->y2 *= 1.0 / (ctx->delta * ctx->gamma * ctx->gamma); + if (ctx->opProj) + { + // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(ctx->GetComm(), ctx->y2)); + ctx->opProj->Mult(ctx->y2); + // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(ctx->GetComm(), ctx->y2)); + } + } + else + { + ctx->y1.AXPBY(-ctx->sigma / (ctx->delta * ctx->gamma), ctx->x2, 0.0); // Temporarily + ctx->opK->AddMult(ctx->x1, ctx->y1, std::complex(1.0, 0.0)); + ctx->opInv->Mult(ctx->y1, ctx->y2); + if (ctx->opProj) + { + // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(ctx->GetComm(), ctx->y2)); + ctx->opProj->Mult(ctx->y2); + // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(ctx->GetComm(), ctx->y2)); + } + + ctx->y1.AXPBYPCZ(ctx->gamma / ctx->sigma, ctx->y2, -ctx->gamma / ctx->sigma, ctx->x1, + 0.0); + if (ctx->opProj) + { + // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(ctx->GetComm(), ctx->y1)); + ctx->opProj->Mult(ctx->y1); + // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(ctx->GetComm(), ctx->y1)); + } + } + PetscCall(ToPetscVec(ctx->y1, ctx->y2, y)); + + PetscFunctionReturn(PETSC_SUCCESS); +} + +PetscErrorCode __mat_apply_PEP_A0(Mat A, Vec x, Vec y) +{ + PetscFunctionBeginUser; + palace::slepc::SlepcPEPSolver *ctx; + PetscCall(MatShellGetContext(A, (void **)&ctx)); + MFEM_VERIFY(ctx, "Invalid PETSc shell matrix context for SLEPc!"); + + PetscCall(FromPetscVec(x, ctx->x1)); + ctx->opK->Mult(ctx->x1, ctx->y1); + PetscCall(ToPetscVec(ctx->y1, y)); + + PetscFunctionReturn(PETSC_SUCCESS); +} + +PetscErrorCode __mat_apply_PEP_A1(Mat A, Vec x, Vec y) +{ + PetscFunctionBeginUser; + palace::slepc::SlepcPEPSolver *ctx; + PetscCall(MatShellGetContext(A, (void **)&ctx)); + MFEM_VERIFY(ctx, "Invalid PETSc shell matrix context for SLEPc!"); + + PetscCall(FromPetscVec(x, ctx->x1)); + if (ctx->opC) + { + ctx->opC->Mult(ctx->x1, ctx->y1); + } + else + { + ctx->y1 = 0.0; + } + PetscCall(ToPetscVec(ctx->y1, y)); + + PetscFunctionReturn(PETSC_SUCCESS); +} + +PetscErrorCode __mat_apply_PEP_A2(Mat A, Vec x, Vec y) +{ + PetscFunctionBeginUser; + palace::slepc::SlepcPEPSolver *ctx; + PetscCall(MatShellGetContext(A, (void **)&ctx)); + MFEM_VERIFY(ctx, "Invalid PETSc shell matrix context for SLEPc!"); + + PetscCall(FromPetscVec(x, ctx->x1)); + ctx->opM->Mult(ctx->x1, ctx->y1); + PetscCall(ToPetscVec(ctx->y1, y)); + + PetscFunctionReturn(PETSC_SUCCESS); +} + +PetscErrorCode __mat_apply_PEP_B(Mat A, Vec x, Vec y) +{ + PetscFunctionBeginUser; + palace::slepc::SlepcPEPSolver *ctx; + PetscCall(MatShellGetContext(A, (void **)&ctx)); + MFEM_VERIFY(ctx, "Invalid PETSc shell matrix context for SLEPc!"); + + PetscCall(FromPetscVec(x, ctx->x1)); + ctx->opB->Mult(ctx->x1.Real(), ctx->y1.Real()); + ctx->opB->Mult(ctx->x1.Imag(), ctx->y1.Imag()); + ctx->y1 *= ctx->delta * ctx->gamma; + PetscCall(ToPetscVec(ctx->y1, y)); + + PetscFunctionReturn(PETSC_SUCCESS); +} + +PetscErrorCode __pc_apply_PEP(PC pc, Vec x, Vec y) +{ + // Solve the linear 
system associated with the generalized eigenvalue problem: y = M⁻¹ x, + // or shift-and-invert spectral transformation: y = P(σ)⁻¹ x . Enforces the divergence- + // free constraint using the supplied projector. + PetscFunctionBeginUser; + palace::slepc::SlepcPEPSolver *ctx; + PetscCall(PCShellGetContext(pc, (void **)&ctx)); + MFEM_VERIFY(ctx, "Invalid PETSc shell PC context for SLEPc!"); + + PetscCall(FromPetscVec(x, ctx->x1)); + ctx->opInv->Mult(ctx->x1, ctx->y1); + if (!ctx->sinvert) + { + ctx->y1 *= 1.0 / (ctx->delta * ctx->gamma * ctx->gamma); + } + else + { + ctx->y1 *= 1.0 / ctx->delta; + } + if (ctx->opProj) + { + // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(ctx->GetComm(), ctx->y1)); + ctx->opProj->Mult(ctx->y1); + // Mpi::Print(" After projection: {:e}\n", linalg::Norml2(ctx->GetComm(), ctx->y1)); + } + PetscCall(ToPetscVec(ctx->y1, y)); + + PetscFunctionReturn(PETSC_SUCCESS); +} + +PetscErrorCode __mat_apply_NEP_A(Mat A, Vec x, Vec y) +{ + PetscFunctionBeginUser; + palace::slepc::SlepcNEPSolver *ctx; + PetscCall(MatShellGetContext(A, (void **)&ctx)); + MFEM_VERIFY(ctx, "Invalid PETSc shell matrix context for SLEPc!"); + PetscCall(FromPetscVec(x, ctx->x1)); + ctx->opA->Mult(ctx->x1, ctx->y1); + PetscCall(ToPetscVec(ctx->y1, y)); + PetscFunctionReturn(PETSC_SUCCESS); +} + +PetscErrorCode __mat_apply_NEP_J(Mat J, Vec x, Vec y) +{ + PetscFunctionBeginUser; + palace::slepc::SlepcNEPSolver *ctx; + PetscCall(MatShellGetContext(J, (void **)&ctx)); + MFEM_VERIFY(ctx, "Invalid PETSc shell matrix context for SLEPc!"); + PetscCall(FromPetscVec(x, ctx->x1)); + ctx->opJ->Mult(ctx->x1, ctx->y1); + PetscCall(ToPetscVec(ctx->y1, y)); + PetscFunctionReturn(PETSC_SUCCESS); +} + +PetscErrorCode __mat_apply_NEP_B(Mat A, Vec x, Vec y) +{ + PetscFunctionBeginUser; + palace::slepc::SlepcNEPSolver *ctx; + PetscCall(MatShellGetContext(A, (void **)&ctx)); + MFEM_VERIFY(ctx, "Invalid PETSc shell matrix context for SLEPc!"); + PetscCall(FromPetscVec(x, ctx->x1)); + ctx->opB->Mult(ctx->x1.Real(), ctx->y1.Real()); + ctx->opB->Mult(ctx->x1.Imag(), ctx->y1.Imag()); + PetscCall(ToPetscVec(ctx->y1, y)); + PetscFunctionReturn(PETSC_SUCCESS); +} + +PetscErrorCode __pc_apply_NEP(PC pc, Vec x, Vec y) +{ + PetscFunctionBeginUser; + palace::slepc::SlepcNEPSolver *ctx; + PetscCall(PCShellGetContext(pc, (void **)&ctx)); + MFEM_VERIFY(ctx, "Invalid PETSc shell PC context for SLEPc!"); + PetscCall(FromPetscVec(x, ctx->x1)); + // Updating PC for new λ is needed for SLP, but should not be done for NLEIGS. 
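// Illustrative sketch (not part of the patch): the context-pointer "trampoline" used by all
// of the shell matrix/PC callbacks above (MatShellGetContext / PCShellGetContext in the
// real code). A C-style callback receives an opaque void* context, casts it back to the
// owning C++ solver object, and forwards the work to a member function. All names below
// are hypothetical stand-ins.
#include <cassert>
#include <vector>

struct ShellContextExample
{
  std::vector<double> diag;  // Stand-in for the wrapped operator data.
  void Apply(const std::vector<double> &x, std::vector<double> &y) const
  {
    y.resize(x.size());
    for (std::size_t i = 0; i < x.size(); i++)
    {
      y[i] = diag[i] * x[i];
    }
  }
};

// The C-compatible callback registered with the library: recover the object and forward.
int ShellApplyCallback(void *ctx, const std::vector<double> &x, std::vector<double> &y)
{
  auto *solver = static_cast<ShellContextExample *>(ctx);
  assert(solver && "Invalid shell context!");
  solver->Apply(x, y);
  return 0;  // Success, mirroring the PetscErrorCode convention.
}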
+ if (ctx->new_lambda && !ctx->first_pc) + { + if (ctx->lambda.imag() == 0.0) + ctx->lambda = ctx->sigma; + ctx->opA2_pc = (*ctx->funcA2)(std::abs(ctx->lambda.imag())); + ctx->opA_pc = palace::BuildParSumOperator( + {1.0 + 0.0i, ctx->lambda, ctx->lambda * ctx->lambda, 1.0 + 0.0i}, + {ctx->opK, ctx->opC, ctx->opM, ctx->opA2_pc.get()}, true); + ctx->opP_pc = (*ctx->funcP)(std::complex(1.0, 0.0), ctx->lambda, + ctx->lambda * ctx->lambda, ctx->lambda.imag()); + ctx->opInv->SetOperators(*ctx->opA_pc, *ctx->opP_pc); + ctx->new_lambda = false; + } + else if (ctx->first_pc) + { + ctx->first_pc = false; + ctx->new_lambda = false; + } + ctx->opInv->Mult(ctx->x1, ctx->y1); + if (ctx->opProj) + { + // Mpi::Print(" Before projection: {:e}\n", linalg::Norml2(ctx->GetComm(), ctx->y1)); + ctx->opProj->Mult(ctx->y1); + // Mpi::Print(" After projection: {:e}\n", linalg::Norml2(ctx->GetComm(), ctx->y1)); + } + PetscCall(ToPetscVec(ctx->y1, y)); + PetscFunctionReturn(PETSC_SUCCESS); +} + +PetscErrorCode __form_NEP_function(NEP nep, PetscScalar lambda, Mat fun, Mat B, void *ctx) +{ + PetscFunctionBeginUser; + palace::slepc::SlepcNEPSolver *ctxF; + PetscCall(MatShellGetContext(fun, (void **)&ctxF)); + // A(λ) = K + λ C + λ² M + A2(Im{λ}). + ctxF->opA2 = (*ctxF->funcA2)(std::abs(lambda.imag())); + ctxF->opA = palace::BuildParSumOperator( + {1.0 + 0.0i, lambda, lambda * lambda, 1.0 + 0.0i}, + {ctxF->opK, ctxF->opC, ctxF->opM, ctxF->opA2.get()}, true); + ctxF->lambda = lambda; + ctxF->new_lambda = true; // flag to update the preconditioner in SLP + PetscFunctionReturn(PETSC_SUCCESS); +} + +PetscErrorCode __form_NEP_jacobian(NEP nep, PetscScalar lambda, Mat fun, void *ctx) +{ + PetscFunctionBeginUser; + palace::slepc::SlepcNEPSolver *ctxF; + PetscCall(MatShellGetContext(fun, (void **)&ctxF)); + // A(λ) = K + λ C + λ² M + A2(Im{λ}). + // J(λ) = C + 2 λ M + A2'(Im{λ}). + ctxF->opA2 = (*ctxF->funcA2)(std::abs(lambda.imag())); + const auto eps = std::sqrt(std::numeric_limits::epsilon()); + ctxF->opA2p = (*ctxF->funcA2)(std::abs(lambda.imag()) * (1.0 + eps)); + std::complex denom = std::complex(0.0, eps * std::abs(lambda.imag())); + ctxF->opAJ = palace::BuildParSumOperator({1.0 / denom, -1.0 / denom}, + {ctxF->opA2p.get(), ctxF->opA2.get()}, true); + ctxF->opJ = palace::BuildParSumOperator( + {0.0 + 0.0i, 1.0 + 0.0i, 2.0 * lambda, 1.0 + 0.0i}, + {ctxF->opK, ctxF->opC, ctxF->opM, ctxF->opAJ.get()}, true); + PetscFunctionReturn(PETSC_SUCCESS); +} + +#endif diff --git a/palace/linalg/slepc.hpp b/palace/linalg/slepc.hpp index a6f3a2ba6f..e207c70d20 100644 --- a/palace/linalg/slepc.hpp +++ b/palace/linalg/slepc.hpp @@ -1,413 +1,544 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LINALG_SLEPC_HPP -#define PALACE_LINALG_SLEPC_HPP - -#if defined(PALACE_WITH_SLEPC) - -#include "linalg/petsc.hpp" - -#if !defined(PETSC_USE_COMPLEX) -#error "SLEPc interface requires PETSc compiled with complex scalars!" -#endif - -#include -#include -#include -#include "linalg/eps.hpp" -#include "linalg/ksp.hpp" -#include "linalg/operator.hpp" -#include "linalg/vector.hpp" - -// Forward declarations of SLEPc objects. -typedef struct _p_EPS *EPS; -typedef struct _p_PEP *PEP; -typedef struct _p_BV *BV; -typedef struct _p_ST *ST; -typedef struct _p_RG *RG; - -namespace palace -{ - -class DivFreeSolver; - -namespace slepc -{ - -// Wrappers for SlepcInitialize/SlepcInitializeNoArguments/SlepcFinalize. 
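// Illustrative sketch (not part of the patch): the numerical derivative used by
// __form_NEP_jacobian() above for the frequency-dependent block A2. With
// omega = |Im(lambda)| and a relative step eps = sqrt(machine epsilon), the code forms
//   dA2/dlambda ~= (A2(omega*(1+eps)) - A2(omega)) / (i*eps*omega),
// a one-sided difference in omega divided by i, since dlambda = i*domega along the
// imaginary axis; the analytic terms contribute C + 2*lambda*M. Scalar stand-in below,
// where f plays the role of a single entry of A2(omega); it requires Im(lambda) != 0.
#include <cmath>
#include <complex>
#include <functional>
#include <limits>

std::complex<double>
DerivativeWrtLambda(const std::function<std::complex<double>(double)> &f,
                    std::complex<double> lambda)
{
  const double eps = std::sqrt(std::numeric_limits<double>::epsilon());
  const double omega = std::abs(lambda.imag());
  const std::complex<double> denom(0.0, eps * omega);  // i * eps * omega
  return (f(omega * (1.0 + eps)) - f(omega)) / denom;
}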
-void Initialize(int &argc, char **&argv, const char rc_file[], const char help[]); -void Initialize(); -void Finalize(); - -// Compute and return the maximum singular value of the given operator, σₙ² = λₙ(Aᴴ A) . -PetscReal GetMaxSingularValue(MPI_Comm comm, const ComplexOperator &A, bool herm = false, - PetscReal tol = PETSC_DEFAULT, - PetscInt max_it = PETSC_DEFAULT); - -// -// A wrapper for the SLEPc library for generalized linear eigenvalue problems or quadratic -// polynomial eigenvalue problems. Shift-and-invert spectral transformations can be used to -// compute interior eigenvalues. -// -class SlepcEigenvalueSolver : public EigenvalueSolver -{ -public: - enum class ProblemType - { - HERMITIAN, - NON_HERMITIAN, - GEN_HERMITIAN, - GEN_INDEFINITE, - GEN_NON_HERMITIAN, - HYPERBOLIC, - GYROSCOPIC - }; - - enum class Type - { - KRYLOVSCHUR, - POWER, - SUBSPACE, - TOAR, - STOAR, - QARNOLDI, - JD - }; - -protected: - // Control print level for debugging. - int print; - - // Variables for scaling, from Higham et al., IJNME 2008. - PetscReal gamma, delta; - - // Parameters defining the spectral transformation. - PetscScalar sigma; - bool sinvert, region; - - // Storage for computed residual norms. - std::unique_ptr res; - - // Reference to linear solver used for operator action for M⁻¹ (with no spectral - // transformation) or (K - σ M)⁻¹ (generalized EVP with shift-and- invert) or P(σ)⁻¹ - // (polynomial with shift-and-invert) (not owned). - const ComplexKspSolver *opInv; - - // Reference to solver for projecting an intermediate vector onto a divergence-free space - // (not owned). - const DivFreeSolver *opProj; - - // Reference to matrix used for weighted inner products (not owned). May be nullptr, in - // which case identity is used. - const Operator *opB; - - // Workspace objects for eigenvalue calculations. - Mat B0; - Vec v0; - - // Boolean to handle SetFromOptions calls. - mutable bool cl_custom; - - // Customize object with command line options set. - virtual void Customize(); - - // Helper routine for computing the eigenpair residual. - virtual PetscReal GetResidualNorm(int i) const = 0; - - // Helper routine for computing the backward error. - virtual PetscReal GetBackwardScaling(PetscScalar l) const = 0; - -public: - SlepcEigenvalueSolver(int print); - ~SlepcEigenvalueSolver() override; - - // Set operators for the generalized eigenvalue problem or for the quadratic polynomial - // eigenvalue problem. - void SetOperators(const ComplexOperator &K, const ComplexOperator &M, - ScaleType type) override; - void SetOperators(const ComplexOperator &K, const ComplexOperator &C, - const ComplexOperator &M, ScaleType type) override; - - // For the linear generalized case, the linear solver should be configured to compute the - // action of M⁻¹ (with no spectral transformation) or (K - σ M)⁻¹. For the quadratic - // case, the linear solver should be configured to compute the action of M⁻¹ (with no - // spectral transformation) or P(σ)⁻¹. - void SetLinearSolver(const ComplexKspSolver &ksp) override; - - // Set the projection operator for enforcing the divergence-free constraint. - void SetDivFreeProjector(const DivFreeSolver &divfree) override; - - // Set optional B matrix used for weighted inner products. This must be set explicitly - // even for generalized problems, otherwise the identity will be used. - void SetBMat(const Operator &B) override; - - // Get scaling factors used by the solver. 
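// Illustrative sketch (not part of the patch): the shift-and-invert transformation
// mentioned in the class comment above for computing interior eigenvalues. For
// K x = lambda M x, the transformed operator (K - sigma M)^{-1} M has eigenvalues
// theta = 1/(lambda - sigma), so eigenvalues of the original pencil closest to the target
// sigma become the dominant theta and are recovered by the back-transformation below
// (handled internally by SLEPc's ST object).
#include <complex>

std::complex<double> BackTransformShiftInvert(std::complex<double> theta,
                                              std::complex<double> sigma)
{
  return sigma + 1.0 / theta;
}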
- PetscReal GetScalingGamma() const override { return gamma; } - PetscReal GetScalingDelta() const override { return delta; } - - // Set shift-and-invert spectral transformation. - void SetShiftInvert(PetscScalar s, bool precond = false) override; - - // Set problem type. - virtual void SetProblemType(ProblemType type) = 0; - - // Set eigenvalue solver. - virtual void SetType(Type type) = 0; - - // Configure the basis vectors object associated with the eigenvalue solver. - void SetOrthogonalization(bool mgs, bool cgs2); - - // Get the corresponding eigenpair error. - PetscReal GetError(int i, ErrorType type) const override; - - // Get the basis vectors object. - virtual BV GetBV() const = 0; - - // Get the spectral transformation object. - virtual ST GetST() const = 0; - - // Get the filtering region object. - virtual RG GetRG() const = 0; - - // Get the associated MPI communicator. - virtual MPI_Comm GetComm() const = 0; - - // Conversion function to PetscObject. - virtual operator PetscObject() const = 0; -}; - -// Base class for SLEPc's EPS problem type. -class SlepcEPSSolverBase : public SlepcEigenvalueSolver -{ -protected: - // SLEPc eigensolver object. Polynomial problems are handled using linearization. - EPS eps; - - // Shell matrices for the generalized eigenvalue problem. - Mat A0, A1; - - void Customize() override; - -public: - // Calls SLEPc's EPSCreate. Expects SLEPc to be initialized/finalized externally. - SlepcEPSSolverBase(MPI_Comm comm, int print, const std::string &prefix = std::string()); - - // Call's SLEPc's EPSDestroy. - ~SlepcEPSSolverBase() override; - - // Conversion function to SLEPc's EPS type. - operator EPS() const { return eps; } - - void SetNumModes(int num_eig, int num_vec = 0) override; - - void SetTol(PetscReal tol) override; - - void SetMaxIter(int max_it) override; - - void SetWhichEigenpairs(WhichType type) override; - - void SetProblemType(ProblemType type) override; - - void SetType(Type type) override; - - void SetInitialSpace(const ComplexVector &v) override; - - int Solve() override; - - PetscScalar GetEigenvalue(int i) const override; - - void GetEigenvector(int i, ComplexVector &x) const override; - - BV GetBV() const override; - - ST GetST() const override; - - RG GetRG() const override; - - MPI_Comm GetComm() const override - { - return eps ? PetscObjectComm(reinterpret_cast(eps)) : MPI_COMM_NULL; - } - - operator PetscObject() const override { return reinterpret_cast(eps); }; -}; - -// Generalized eigenvalue problem solver: K x = λ M x . -class SlepcEPSSolver : public SlepcEPSSolverBase -{ -public: - using SlepcEigenvalueSolver::delta; - using SlepcEigenvalueSolver::gamma; - using SlepcEigenvalueSolver::opB; - using SlepcEigenvalueSolver::opInv; - using SlepcEigenvalueSolver::opProj; - using SlepcEigenvalueSolver::sigma; - using SlepcEigenvalueSolver::sinvert; - - // References to matrices defining the generalized eigenvalue problem (not owned). - const ComplexOperator *opK, *opM; - - // Workspace vector for operator applications. - mutable ComplexVector x, y; - -private: - // Operator norms for scaling. 
- mutable PetscReal normK, normM; - -protected: - PetscReal GetResidualNorm(int i) const override; - - PetscReal GetBackwardScaling(PetscScalar l) const override; - -public: - SlepcEPSSolver(MPI_Comm comm, int print, const std::string &prefix = std::string()); - - void SetOperators(const ComplexOperator &K, const ComplexOperator &M, - ScaleType type) override; - - void SetBMat(const Operator &B) override; -}; - -// Quadratic eigenvalue problem solver: P(λ) x = (K + λ C + λ² M) x = 0 , solved via -// linearization: L₀ y = λ L₁ y . -class SlepcPEPLinearSolver : public SlepcEPSSolverBase -{ -public: - using SlepcEigenvalueSolver::delta; - using SlepcEigenvalueSolver::gamma; - using SlepcEigenvalueSolver::opB; - using SlepcEigenvalueSolver::opInv; - using SlepcEigenvalueSolver::opProj; - using SlepcEigenvalueSolver::sigma; - using SlepcEigenvalueSolver::sinvert; - - // References to matrices defining the quadratic polynomial eigenvalue problem - // (not owned). - const ComplexOperator *opK, *opC, *opM; - - // Workspace vectors for operator applications. - mutable ComplexVector x1, x2, y1, y2; - -private: - // Operator norms for scaling. - mutable PetscReal normK, normC, normM; - -protected: - PetscReal GetResidualNorm(int i) const override; - - PetscReal GetBackwardScaling(PetscScalar l) const override; - -public: - SlepcPEPLinearSolver(MPI_Comm comm, int print, const std::string &prefix = std::string()); - - void SetOperators(const ComplexOperator &K, const ComplexOperator &C, - const ComplexOperator &M, ScaleType type) override; - - void SetBMat(const Operator &B) override; - - void SetInitialSpace(const ComplexVector &v) override; - - void GetEigenvector(int i, ComplexVector &x) const override; -}; - -// Base class for SLEPc's PEP problem type. -class SlepcPEPSolverBase : public SlepcEigenvalueSolver -{ -protected: - // SLEPc eigensolver object. - PEP pep; - - // Shell matrices for the quadratic polynomial eigenvalue problem - Mat A0, A1, A2; - - void Customize() override; - -public: - // Calls SLEPc's PEPCreate. Expects SLEPc to be initialized/finalized externally. - SlepcPEPSolverBase(MPI_Comm comm, int print, const std::string &prefix = std::string()); - - // Call's SLEPc's PEPDestroy. - ~SlepcPEPSolverBase() override; - - // Conversion function to SLEPc's PEP type. - operator PEP() const { return pep; } - - void SetNumModes(int num_eig, int num_vec = 0) override; - - void SetTol(PetscReal tol) override; - - void SetMaxIter(int max_it) override; - - void SetWhichEigenpairs(WhichType type) override; - - void SetProblemType(ProblemType type) override; - - void SetType(Type type) override; - - void SetInitialSpace(const ComplexVector &v) override; - - int Solve() override; - - PetscScalar GetEigenvalue(int i) const override; - - void GetEigenvector(int i, ComplexVector &x) const override; - - BV GetBV() const override; - - ST GetST() const override; - - RG GetRG() const override; - - MPI_Comm GetComm() const override - { - return pep ? PetscObjectComm(reinterpret_cast(pep)) : MPI_COMM_NULL; - } - - operator PetscObject() const override { return reinterpret_cast(pep); }; -}; - -// Quadratic eigenvalue problem solver: P(λ) x = (K + λ C + λ² M) x = 0 . 
-class SlepcPEPSolver : public SlepcPEPSolverBase -{ -public: - using SlepcEigenvalueSolver::delta; - using SlepcEigenvalueSolver::gamma; - using SlepcEigenvalueSolver::opB; - using SlepcEigenvalueSolver::opInv; - using SlepcEigenvalueSolver::opProj; - using SlepcEigenvalueSolver::sigma; - using SlepcEigenvalueSolver::sinvert; - - // References to matrices defining the quadratic polynomial eigenvalue problem - // (not owned). - const ComplexOperator *opK, *opC, *opM; - - // Workspace vector for operator applications. - mutable ComplexVector x, y; - -private: - // Operator norms for scaling. - mutable PetscReal normK, normC, normM; - -protected: - PetscReal GetResidualNorm(int i) const override; - - PetscReal GetBackwardScaling(PetscScalar l) const override; - -public: - SlepcPEPSolver(MPI_Comm comm, int print, const std::string &prefix = std::string()); - - void SetOperators(const ComplexOperator &K, const ComplexOperator &C, - const ComplexOperator &M, ScaleType type) override; - - void SetBMat(const Operator &B) override; -}; - -} // namespace slepc - -} // namespace palace - -#endif - -#endif // PALACE_LINALG_SLEPC_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LINALG_SLEPC_HPP +#define PALACE_LINALG_SLEPC_HPP + +#if defined(PALACE_WITH_SLEPC) + +#include "linalg/petsc.hpp" + +#if !defined(PETSC_USE_COMPLEX) +#error "SLEPc interface requires PETSc compiled with complex scalars!" +#endif + +#include +#include +#include +#include +#include +#include "linalg/eps.hpp" +#include "linalg/ksp.hpp" +#include "linalg/operator.hpp" +#include "linalg/vector.hpp" + +// Forward declarations of SLEPc objects. +typedef struct _p_EPS *EPS; +typedef struct _p_PEP *PEP; +typedef struct _p_NEP *NEP; +typedef struct _p_BV *BV; +typedef struct _p_ST *ST; +typedef struct _p_RG *RG; + +namespace palace +{ + +namespace slepc +{ + +// Wrappers for SlepcInitialize/SlepcInitializeNoArguments/SlepcFinalize. +void Initialize(int &argc, char **&argv, const char rc_file[], const char help[]); +void Initialize(); +void Finalize(); + +// Compute and return the maximum singular value of the given operator, σₙ² = λₙ(Aᴴ A) . +PetscReal GetMaxSingularValue(MPI_Comm comm, const ComplexOperator &A, bool herm = false, + PetscReal tol = PETSC_DEFAULT, + PetscInt max_it = PETSC_DEFAULT); + +// +// A wrapper for the SLEPc library for generalized linear eigenvalue problems or quadratic +// polynomial eigenvalue problems. Shift-and-invert spectral transformations can be used to +// compute interior eigenvalues. +// +class SlepcEigenvalueSolver : public EigenvalueSolver +{ +public: + enum class ProblemType + { + HERMITIAN, + NON_HERMITIAN, + GEN_HERMITIAN, + GEN_INDEFINITE, + GEN_NON_HERMITIAN, + HYPERBOLIC, + GYROSCOPIC, + GENERAL, + }; + + enum class Type + { + KRYLOVSCHUR, + POWER, + SUBSPACE, + TOAR, + STOAR, + QARNOLDI, + JD, + SLP, + NLEIGS + }; + + // Workspace vector for operator applications. + mutable ComplexVector x1, y1; + + // References to matrices defining the (possibly quadratic) eigenvalue problem + // (not owned). + const ComplexOperator *opK, *opC, *opM; + +protected: + // Control print level for debugging. + int print; + + // Variables for scaling, from Higham et al., IJNME 2008. + PetscReal gamma, delta; + + // Parameters defining the spectral transformation. + PetscScalar sigma; + bool sinvert, region; + + // Storage for computed residual norms and eigenvector normalizations. 
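// Illustrative sketch (not part of the patch): the quantity returned by
// GetMaxSingularValue() declared above, sigma_max(A) = sqrt(lambda_max(A^H A)), estimated
// here by a plain power iteration on A^H A over a small dense matrix (assumed nonempty).
// The real implementation delegates to SLEPc and can exploit a Hermitian operator (cf. the
// herm flag); this is only to make the definition concrete.
#include <cmath>
#include <complex>
#include <vector>

using CMat = std::vector<std::vector<std::complex<double>>>;
using CVec = std::vector<std::complex<double>>;

double EstimateMaxSingularValue(const CMat &A, int max_it = 100)
{
  const std::size_t m = A.size(), n = A.front().size();
  CVec v(n, {1.0, 0.0}), Av(m), w(n);
  double sigma = 0.0;
  for (int it = 0; it < max_it; it++)
  {
    // Av = A v, then w = A^H (A v).
    for (std::size_t i = 0; i < m; i++)
    {
      Av[i] = {0.0, 0.0};
      for (std::size_t j = 0; j < n; j++)
      {
        Av[i] += A[i][j] * v[j];
      }
    }
    double norm2 = 0.0;
    for (std::size_t j = 0; j < n; j++)
    {
      w[j] = {0.0, 0.0};
      for (std::size_t i = 0; i < m; i++)
      {
        w[j] += std::conj(A[i][j]) * Av[i];
      }
      norm2 += std::norm(w[j]);
    }
    const double norm = std::sqrt(norm2);  // ||A^H A v|| -> lambda_max(A^H A) for unit v.
    if (norm == 0.0)
    {
      return 0.0;  // A = 0.
    }
    sigma = std::sqrt(norm);
    for (std::size_t j = 0; j < n; j++)
    {
      v[j] = w[j] / norm;  // Normalize for the next iterate.
    }
  }
  return sigma;
}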
+ std::unique_ptr res, xscale; + + // Reference to linear solver used for operator action for M⁻¹ (with no spectral + // transformation) or (K - σ M)⁻¹ (generalized EVP with shift-and- invert) or P(σ)⁻¹ + // (polynomial with shift-and-invert) (not owned). + ComplexKspSolver *opInv; + + // Reference to solver for projecting an intermediate vector onto a divergence-free space + // (not owned). + const DivFreeSolver *opProj; + + // Reference to matrix used for weighted inner products (not owned). May be nullptr, in + // which case identity is used. + const Operator *opB; + + // Workspace objects for eigenvalue calculations. + Mat B0; + Vec v0; + + // Boolean to handle SetFromOptions calls. + mutable bool cl_custom; + + // Customize object with command line options set. + virtual void Customize(); + + // Helper routine for computing the eigenvector normalization. + PetscReal GetEigenvectorNorm(const ComplexVector &x, ComplexVector &Bx) const; + + // Helper routine for computing the eigenpair residual. + virtual PetscReal GetResidualNorm(PetscScalar l, const ComplexVector &x, + ComplexVector &r) const = 0; + + // Helper routine for computing the backward error. + virtual PetscReal GetBackwardScaling(PetscScalar l) const = 0; + +public: + SlepcEigenvalueSolver(int print); + ~SlepcEigenvalueSolver() override; + + // Set operators for the generalized eigenvalue problem, the quadratic polynomial + // eigenvalue problem, or the nonlinear eigenvalue problem. + void SetOperators(const ComplexOperator &K, const ComplexOperator &M, + ScaleType type) override; + void SetOperators(const ComplexOperator &K, const ComplexOperator &C, + const ComplexOperator &M, ScaleType type) override; + + // For the linear generalized case, the linear solver should be configured to compute the + // action of M⁻¹ (with no spectral transformation) or (K - σ M)⁻¹. For the quadratic + // case, the linear solver should be configured to compute the action of M⁻¹ (with no + // spectral transformation) or P(σ)⁻¹. + void SetLinearSolver(ComplexKspSolver &ksp) override; + + // Set the projection operator for enforcing the divergence-free constraint. + void SetDivFreeProjector(const DivFreeSolver &divfree) override; + + // Set optional B matrix used for weighted inner products. This must be set explicitly + // even for generalized problems, otherwise the identity will be used. + void SetBMat(const Operator &B) override; + + // Get scaling factors used by the solver. + PetscReal GetScalingGamma() const override { return gamma; } + PetscReal GetScalingDelta() const override { return delta; } + + // Set shift-and-invert spectral transformation. + void SetShiftInvert(std::complex s, bool precond = false) override; + + // Set problem type. + virtual void SetProblemType(ProblemType type) = 0; + + // Set eigenvalue solver. + virtual void SetType(Type type) = 0; + + // Configure the basis vectors object associated with the eigenvalue solver. + void SetOrthogonalization(bool mgs, bool cgs2); + + // Get the corresponding eigenpair error. + PetscReal GetError(int i, ErrorType type) const override; + + // Re-normalize the given number of eigenvectors, for example if the matrix B for weighted + // inner products has changed. This does not perform re-orthogonalization with respect to + // the new matrix, only normalization. + void RescaleEigenvectors(int num_eig) override; + + // Get the basis vectors object. + virtual BV GetBV() const = 0; + + // Get the spectral transformation object. 
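// Illustrative sketch (not part of the patch): the bookkeeping behind GetResidualNorm(),
// GetBackwardScaling(), and GetError() declared above. For a normalized eigenpair
// (lambda, x) of the quadratic problem, the relative backward error reported for the
// backward-error ErrorType is
//   eta(lambda, x) = ||(K + lambda C + lambda^2 M) x||_2
//                      / (||K|| + |lambda| ||C|| + |lambda|^2 ||M||),
// where the arguments below are the already-computed residual and operator norms.
#include <cmath>
#include <complex>

double BackwardError(double residual_norm, std::complex<double> lambda, double norm_K,
                     double norm_C, double norm_M)
{
  const double t = std::abs(lambda);
  return residual_norm / (norm_K + t * norm_C + t * t * norm_M);
}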
+ virtual ST GetST() const = 0; + + // Get the filtering region object. + virtual RG GetRG() const = 0; + + // Get the associated MPI communicator. + virtual MPI_Comm GetComm() const = 0; + + // Conversion function to PetscObject. + virtual operator PetscObject() const = 0; +}; + +// Base class for SLEPc's EPS problem type. +class SlepcEPSSolverBase : public SlepcEigenvalueSolver +{ +protected: + // SLEPc eigensolver object. Polynomial problems are handled using linearization. + EPS eps; + + // Shell matrices for the generalized eigenvalue problem. + Mat A0, A1; + + void Customize() override; + +public: + // Calls SLEPc's EPSCreate. Expects SLEPc to be initialized/finalized externally. + SlepcEPSSolverBase(MPI_Comm comm, int print, const std::string &prefix = std::string()); + + // Call's SLEPc's EPSDestroy. + ~SlepcEPSSolverBase() override; + + // Conversion function to SLEPc's EPS type. + operator EPS() const { return eps; } + + void SetNumModes(int num_eig, int num_vec = 0) override; + + void SetTol(PetscReal tol) override; + + void SetMaxIter(int max_it) override; + + void SetWhichEigenpairs(WhichType type) override; + + void SetProblemType(ProblemType type) override; + + void SetType(Type type) override; + + void SetInitialSpace(const ComplexVector &v) override; + + int Solve() override; + + std::complex GetEigenvalue(int i) const override; + + void GetEigenvector(int i, ComplexVector &x) const override; + + BV GetBV() const override; + + ST GetST() const override; + + RG GetRG() const override; + + MPI_Comm GetComm() const override + { + return eps ? PetscObjectComm(reinterpret_cast(eps)) : MPI_COMM_NULL; + } + + operator PetscObject() const override { return reinterpret_cast(eps); }; +}; + +// Generalized eigenvalue problem solver: K x = λ M x . +class SlepcEPSSolver : public SlepcEPSSolverBase +{ +public: + using SlepcEigenvalueSolver::delta; + using SlepcEigenvalueSolver::gamma; + using SlepcEigenvalueSolver::opB; + using SlepcEigenvalueSolver::opInv; + using SlepcEigenvalueSolver::opProj; + using SlepcEigenvalueSolver::sigma; + using SlepcEigenvalueSolver::sinvert; + +private: + // Operator norms for scaling. + mutable PetscReal normK, normM; + +protected: + PetscReal GetResidualNorm(PetscScalar l, const ComplexVector &x, + ComplexVector &r) const override; + + PetscReal GetBackwardScaling(PetscScalar l) const override; + +public: + SlepcEPSSolver(MPI_Comm comm, int print, const std::string &prefix = std::string()); + + using SlepcEigenvalueSolver::SetOperators; + void SetOperators(const ComplexOperator &K, const ComplexOperator &M, + ScaleType type) override; + + void SetBMat(const Operator &B) override; +}; + +// Quadratic eigenvalue problem solver: P(λ) x = (K + λ C + λ² M) x = 0 , solved via +// linearization: L₀ y = λ L₁ y . +class SlepcPEPLinearSolver : public SlepcEPSSolverBase +{ +public: + using SlepcEigenvalueSolver::delta; + using SlepcEigenvalueSolver::gamma; + using SlepcEigenvalueSolver::opB; + using SlepcEigenvalueSolver::opInv; + using SlepcEigenvalueSolver::opProj; + using SlepcEigenvalueSolver::sigma; + using SlepcEigenvalueSolver::sinvert; + + // Workspace vectors for operator applications. + mutable ComplexVector x2, y2; + +private: + // Operator norms for scaling. 
+ mutable PetscReal normK, normC, normM; + +protected: + PetscReal GetResidualNorm(PetscScalar l, const ComplexVector &x, + ComplexVector &r) const override; + + PetscReal GetBackwardScaling(PetscScalar l) const override; + +public: + SlepcPEPLinearSolver(MPI_Comm comm, int print, const std::string &prefix = std::string()); + + using SlepcEigenvalueSolver::SetOperators; + void SetOperators(const ComplexOperator &K, const ComplexOperator &C, + const ComplexOperator &M, ScaleType type) override; + + void SetBMat(const Operator &B) override; + + void SetInitialSpace(const ComplexVector &v) override; + + void GetEigenvector(int i, ComplexVector &x) const override; +}; + +// Base class for SLEPc's PEP problem type. +class SlepcPEPSolverBase : public SlepcEigenvalueSolver +{ +protected: + // SLEPc eigensolver object. + PEP pep; + + // Shell matrices for the quadratic polynomial eigenvalue problem. + Mat A0, A1, A2; + + void Customize() override; + +public: + // Calls SLEPc's PEPCreate. Expects SLEPc to be initialized/finalized externally. + SlepcPEPSolverBase(MPI_Comm comm, int print, const std::string &prefix = std::string()); + + // Call's SLEPc's PEPDestroy. + ~SlepcPEPSolverBase() override; + + // Conversion function to SLEPc's PEP type. + operator PEP() const { return pep; } + + void SetNumModes(int num_eig, int num_vec = 0) override; + + void SetTol(PetscReal tol) override; + + void SetMaxIter(int max_it) override; + + void SetWhichEigenpairs(WhichType type) override; + + void SetProblemType(ProblemType type) override; + + void SetType(Type type) override; + + void SetInitialSpace(const ComplexVector &v) override; + + int Solve() override; + + std::complex GetEigenvalue(int i) const override; + + void GetEigenvector(int i, ComplexVector &x) const override; + + BV GetBV() const override; + + ST GetST() const override; + + RG GetRG() const override; + + MPI_Comm GetComm() const override + { + return pep ? PetscObjectComm(reinterpret_cast(pep)) : MPI_COMM_NULL; + } + + operator PetscObject() const override { return reinterpret_cast(pep); }; +}; + +// Quadratic eigenvalue problem solver: P(λ) x = (K + λ C + λ² M) x = 0 . +class SlepcPEPSolver : public SlepcPEPSolverBase +{ +public: + using SlepcEigenvalueSolver::delta; + using SlepcEigenvalueSolver::gamma; + using SlepcEigenvalueSolver::opB; + using SlepcEigenvalueSolver::opInv; + using SlepcEigenvalueSolver::opProj; + using SlepcEigenvalueSolver::sigma; + using SlepcEigenvalueSolver::sinvert; + +private: + // Operator norms for scaling. + mutable PetscReal normK, normC, normM; + +protected: + PetscReal GetResidualNorm(PetscScalar l, const ComplexVector &x, + ComplexVector &r) const override; + + PetscReal GetBackwardScaling(PetscScalar l) const override; + +public: + SlepcPEPSolver(MPI_Comm comm, int print, const std::string &prefix = std::string()); + + using SlepcEigenvalueSolver::SetOperators; + void SetOperators(const ComplexOperator &K, const ComplexOperator &C, + const ComplexOperator &M, ScaleType type) override; + void SetBMat(const Operator &B) override; +}; + +// Base class for SLEPc's NEP problem type +class SlepcNEPSolverBase : public SlepcEigenvalueSolver +{ +protected: + // SLEPc eigensolver object. + NEP nep; + + // Shell matrices for the nonlinear eigenvalue problem. + Mat A, J; + + // Order of sorted eigenvalues. + std::unique_ptr perm; + + void Customize() override; + +public: + // Calls SLEPc's NEPCreate. Expects SLEPc to be initialized/finalized externally. 
+ SlepcNEPSolverBase(MPI_Comm comm, int print, const std::string &prefix = std::string()); + + // Call's SLEPc's NEPDestroy. + ~SlepcNEPSolverBase() override; + + // Conversion function to SLEPc's PEP type. + operator NEP() const { return nep; } + + void SetNumModes(int num_eig, int num_vec = 0) override; + + void SetTol(PetscReal tol) override; + + void SetMaxIter(int max_it) override; + + void SetWhichEigenpairs(WhichType type) override; + + void SetShiftInvert(std::complex s, bool precond = false) override; + + void SetProblemType(ProblemType type) override; + + void SetType(Type type) override; + + void SetInitialSpace(const ComplexVector &v) override; + + int Solve() override; + + std::complex GetEigenvalue(int i) const override; + + void GetEigenvector(int i, ComplexVector &x) const override; + + BV GetBV() const override; + + ST GetST() const override; + + RG GetRG() const override; + + MPI_Comm GetComm() const override + { + return nep ? PetscObjectComm(reinterpret_cast(nep)) : MPI_COMM_NULL; + } + + operator PetscObject() const override { return reinterpret_cast(nep); }; +}; + +// Nonlinear eigenvalue problem solver: T(λ) x = (K + λ C + λ² M + A2(λ)) x = 0. +class SlepcNEPSolver : public SlepcNEPSolverBase +{ +public: + using SlepcEigenvalueSolver::delta; + using SlepcEigenvalueSolver::gamma; + using SlepcEigenvalueSolver::opB; + using SlepcEigenvalueSolver::opInv; + using SlepcEigenvalueSolver::opProj; + using SlepcEigenvalueSolver::sigma; + using SlepcEigenvalueSolver::sinvert; + + // Operators for the nonlinear eigenvalue problem. + std::unique_ptr opA2, opA2p, opJ, opA, opAJ, opA2_pc, opA_pc, opP_pc; + + // Function to compute the A2 operator. + std::optional(double)>> funcA2; + + // Function to compute the preconditioner matrix. + std::optional( + std::complex, std::complex, std::complex, double)>> + funcP; + + // Eigenvalue estimate at current iteration. + PetscScalar lambda; + + // Boolean flag to identify new λ estimate requiring a preconditioner update. + bool new_lambda = true; + + // Boolean flag to avoid modifying an unused preconditioner. + bool first_pc = true; + +private: + // Operator norms for scaling. + mutable PetscReal normK, normC, normM; + +protected: + PetscReal GetResidualNorm(PetscScalar l, const ComplexVector &x, + ComplexVector &r) const override; + + PetscReal GetBackwardScaling(PetscScalar l) const override; + +public: + SlepcNEPSolver(MPI_Comm comm, int print, const std::string &prefix = std::string()); + + using SlepcEigenvalueSolver::SetOperators; + void SetOperators(const ComplexOperator &K, const ComplexOperator &M, + ScaleType type) override; + void SetOperators(const ComplexOperator &K, const ComplexOperator &C, + const ComplexOperator &M, ScaleType type) override; + void SetBMat(const Operator &B) override; + + // Set the frequency-dependent A2 matrix function. + void SetExtraSystemMatrix( + std::function(double)>) override; + + // Set the preconditioner update function. 
+ void SetPreconditionerUpdate(std::function( + std::complex, std::complex, + std::complex, double)>) override; +}; + +} // namespace slepc + +} // namespace palace + +#endif + +#endif // PALACE_LINALG_SLEPC_HPP diff --git a/palace/linalg/solver.cpp b/palace/linalg/solver.cpp index fcecbe39be..357b524a49 100644 --- a/palace/linalg/solver.cpp +++ b/palace/linalg/solver.cpp @@ -3,44 +3,204 @@ #include "solver.hpp" +#include "linalg/mumps.hpp" +#include "linalg/rap.hpp" + namespace palace { template <> -void WrapperSolver::SetOperator(const Operator &op) +void MfemWrapperSolver::SetOperator(const Operator &op) { - pc->SetOperator(op); + // Operator is always assembled as a HypreParMatrix. + if (const auto *hA = dynamic_cast(&op)) + { + pc->SetOperator(*hA); + } + else + { + const auto *PtAP = dynamic_cast(&op); + MFEM_VERIFY(PtAP, + "MfemWrapperSolver must be able to construct a HypreParMatrix operator!"); + pc->SetOperator(!save_assembled ? *PtAP->StealParallelAssemble() + : PtAP->ParallelAssemble()); + } this->height = op.Height(); this->width = op.Width(); } template <> -void WrapperSolver::SetOperator(const ComplexOperator &op) +void MfemWrapperSolver::SetOperator(const ComplexOperator &op) { - MFEM_VERIFY(op.IsReal() && op.HasReal(), - "WrapperSolver::SetOperator requires an operator which is purely real for " - "mfem::Solver!"); - pc->SetOperator(*op.Real()); + // Assemble the real and imaginary parts, then add. + // XX TODO: Test complex matrix assembly if coarse solve supports it. + const mfem::HypreParMatrix *hAr = dynamic_cast(op.Real()); + const mfem::HypreParMatrix *hAi = dynamic_cast(op.Imag()); + const ParOperator *PtAPr = nullptr, *PtAPi = nullptr; + if (op.Real() && !hAr) + { + PtAPr = dynamic_cast(op.Real()); + MFEM_VERIFY(PtAPr, + "MfemWrapperSolver must be able to construct a HypreParMatrix operator!"); + hAr = &PtAPr->ParallelAssemble(); + } + if (op.Imag() && !hAi) + { + PtAPi = dynamic_cast(op.Imag()); + MFEM_VERIFY(PtAPi, + "MfemWrapperSolver must be able to construct a HypreParMatrix operator!"); + hAi = &PtAPi->ParallelAssemble(); + } + if (hAr && hAi) + { + if (complex_matrix) + { + // A = [Ar, -Ai] + // [Ai, Ar] + mfem::Array2D blocks(2, 2); + mfem::Array2D block_coeffs(2, 2); + blocks(0, 0) = hAr; + blocks(0, 1) = hAi; + blocks(1, 0) = hAi; + blocks(1, 1) = hAr; + block_coeffs(0, 0) = 1.0; + block_coeffs(0, 1) = -1.0; + block_coeffs(1, 0) = 1.0; + block_coeffs(1, 1) = 1.0; + A.reset(mfem::HypreParMatrixFromBlocks(blocks, &block_coeffs)); + } + else + { + // A = Ar + Ai. 
+ A.reset(mfem::Add(1.0, *hAr, 1.0, *hAi)); + } + if (PtAPr) + { + PtAPr->StealParallelAssemble(); + } + if (PtAPi) + { + PtAPi->StealParallelAssemble(); + } + if (drop_small_entries) + { + DropSmallEntries(); + } + pc->SetOperator(*A); + if (!save_assembled) + { + A.reset(); + } + } + else if (hAr) + { + if (drop_small_entries) + { + A = std::make_unique(*hAr); + DropSmallEntries(); + pc->SetOperator(*A); + } + else + { + pc->SetOperator(*hAr); + } + if (PtAPr && !save_assembled) + { + PtAPr->StealParallelAssemble(); + } + } + else if (hAi) + { + if (drop_small_entries) + { + A = std::make_unique(*hAi); + DropSmallEntries(); + pc->SetOperator(*A); + } + else + { + pc->SetOperator(*hAi); + } + if (PtAPi && !save_assembled) + { + PtAPi->StealParallelAssemble(); + } + } + else + { + MFEM_ABORT("Empty ComplexOperator for MfemWrapperSolver!"); + } this->height = op.Height(); this->width = op.Width(); } template <> -void WrapperSolver::Mult(const Vector &x, Vector &y) const +void MfemWrapperSolver::Mult(const Vector &x, Vector &y) const { pc->Mult(x, y); } template <> -void WrapperSolver::Mult(const ComplexVector &x, ComplexVector &y) const +void MfemWrapperSolver::Mult(const ComplexVector &x, + ComplexVector &y) const +{ + if (pc->Height() == x.Size()) + { + mfem::Array X(2); + mfem::Array Y(2); + X[0] = &x.Real(); + X[1] = &x.Imag(); + Y[0] = &y.Real(); + Y[1] = &y.Imag(); + pc->ArrayMult(X, Y); + } + else + { + const int Nx = x.Size(), Ny = y.Size(); + Vector X(2 * Nx), Y(2 * Ny), yr, yi; + X.UseDevice(true); + Y.UseDevice(true); + yr.UseDevice(true); + yi.UseDevice(true); + linalg::SetSubVector(X, 0, x.Real()); + linalg::SetSubVector(X, Nx, x.Imag()); + pc->Mult(X, Y); + Y.ReadWrite(); + yr.MakeRef(Y, 0, Ny); + yi.MakeRef(Y, Ny, Ny); + y.Real() = yr; + y.Imag() = yi; + } +} + +template +void MfemWrapperSolver::DropSmallEntries() { - mfem::Array X(2); - mfem::Array Y(2); - X[0] = &x.Real(); - X[1] = &x.Imag(); - Y[0] = &y.Real(); - Y[1] = &y.Imag(); - pc->ArrayMult(X, Y); + const auto nnz_before = A->NNZ(); + A->DropSmallEntries(std::pow(std::numeric_limits::epsilon(), 2)); + const auto nnz_after = A->NNZ(); +#if defined(MFEM_USE_MUMPS) + if (auto *mumps = dynamic_cast(pc.get())) + { + if (reorder_reuse && (num_dropped_entries != 0) && + (num_dropped_entries != (nnz_before - nnz_after))) + { + // MUMPS errors out if there are any changes to the symmetry pattern after the first + // factorization so we don't reuse the reordering if the number of dropped entries has + // changed. + mumps->SetReorderReuse(false); + } + else if (reorder_reuse && (num_dropped_entries == (nnz_before - nnz_after))) + { + // Reuse the column ordering if the number of dropped entries has not changed. + mumps->SetReorderReuse(true); + } + } +#endif + num_dropped_entries = nnz_before - nnz_after; + Mpi::Print(" Dropping {} small entries in sparse matrix out of {} ({:.1f}%)\n", + num_dropped_entries, nnz_before, + (double)(num_dropped_entries) / nnz_before * 100.0); } } // namespace palace diff --git a/palace/linalg/solver.hpp b/palace/linalg/solver.hpp index 82755a1f2e..c9a8797c54 100644 --- a/palace/linalg/solver.hpp +++ b/palace/linalg/solver.hpp @@ -1,81 +1,134 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
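// Illustrative sketch (not part of the patch): the real-equivalent block form assembled in
// MfemWrapperSolver::SetOperator() above. For A = Ar + i*Ai and x = xr + i*xi,
//   [ Ar -Ai ] [ xr ]   [ Re(A x) ]
//   [ Ai  Ar ] [ xi ] = [ Im(A x) ] ,
// so the 2x2 block matrix applied to the stacked real/imaginary parts reproduces the
// complex product exactly, while Ar + Ai (the complex_matrix = false path) is only an
// approximation suitable for preconditioning. Scalar check:
#include <cassert>
#include <cmath>
#include <complex>

void CheckRealEquivalentForm()
{
  const double ar = 2.0, ai = -3.0, xr = 0.5, xi = 1.25;
  const std::complex<double> ax =
      std::complex<double>(ar, ai) * std::complex<double>(xr, xi);
  const double yr = ar * xr - ai * xi;  // Top block row:    [Ar -Ai].
  const double yi = ai * xr + ar * xi;  // Bottom block row: [Ai  Ar].
  assert(std::abs(yr - ax.real()) < 1e-14 && std::abs(yi - ax.imag()) < 1e-14);
}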
-// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LINALG_SOLVER_HPP -#define PALACE_LINALG_SOLVER_HPP - -#include -#include -#include "linalg/operator.hpp" -#include "linalg/vector.hpp" - -namespace palace -{ - -// -// The base Solver class is a templated version of mfem::Solver for operation with -// real- or complex-valued operators. -// - -// Abstract base class for real-valued or complex-valued solvers. -template -class Solver : public OperType -{ - static_assert(std::is_same::value || - std::is_same::value, - "Solver can only be defined for OperType = Operator or ComplexOperator!"); - -protected: - using VecType = typename std::conditional::value, - ComplexVector, Vector>::type; - - // Whether or not to use the second argument of Mult() as an initial guess. - bool initial_guess; - -public: - Solver(bool initial_guess = false) : OperType(), initial_guess(initial_guess) {} - virtual ~Solver() = default; - - // Configure whether or not to use an initial guess when applying the solver. - virtual void SetInitialGuess(bool guess) { initial_guess = guess; } - - // Set the operator associated with the solver, or update it if called repeatedly. - virtual void SetOperator(const OperType &op) = 0; - - // Apply the solver for the transpose problem. - void MultTranspose(const VecType &x, VecType &y) const override - { - MFEM_ABORT("MultTranspose() is not implemented for base class Solver!"); - } -}; - -// This solver wraps a real-valued mfem::Solver for application to complex-valued problems -// as a preconditioner inside of a Solver -template -class WrapperSolver : public Solver -{ - using VecType = typename Solver::VecType; - -protected: - std::unique_ptr pc; - -public: - WrapperSolver(std::unique_ptr &&pc) - : Solver(pc->iterative_mode), pc(std::move(pc)) - { - } - - void SetInitialGuess(bool guess) override - { - Solver::SetInitialGuess(guess); - pc->iterative_mode = guess; - } - - void SetOperator(const OperType &op) override; - - void Mult(const VecType &x, VecType &y) const override; -}; - -} // namespace palace - -#endif // PALACE_LINALG_SOLVER_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LINALG_SOLVER_HPP +#define PALACE_LINALG_SOLVER_HPP + +#include +#include +#include "linalg/operator.hpp" +#include "linalg/vector.hpp" + +namespace palace +{ + +// +// The base Solver class is a templated version of mfem::Solver for operation with +// real- or complex-valued operators. +// + +// Abstract base class for real-valued or complex-valued solvers. +template +class Solver : public OperType +{ + static_assert(std::is_same::value || + std::is_same::value, + "Solver can only be defined for OperType = Operator or ComplexOperator!"); + +protected: + using VecType = typename std::conditional::value, + ComplexVector, Vector>::type; + + // Whether or not to use the second argument of Mult() as an initial guess. + bool initial_guess; + +public: + Solver(bool initial_guess = false) : OperType(), initial_guess(initial_guess) {} + virtual ~Solver() = default; + + // Configure whether or not to use an initial guess when applying the solver. + virtual void SetInitialGuess(bool guess) { initial_guess = guess; } + + // Set the operator associated with the solver, or update it if called repeatedly. + virtual void SetOperator(const OperType &op) = 0; + + // Apply the solver for the transpose problem. 
+ void MultTranspose(const VecType &x, VecType &y) const override + { + MFEM_ABORT("MultTranspose() is not implemented for base class Solver!"); + } + + // Apply the solver with a preallocated temporary storage vector. + virtual void Mult2(const VecType &x, VecType &y, VecType &r) const + { + MFEM_ABORT("Mult2() with temporary storage vector is not implemented for base class " + "Solver!"); + } + + // Apply the solver for the transpose problem with a preallocated temporary storage + // vector. + virtual void MultTranspose2(const VecType &x, VecType &y, VecType &r) const + { + MFEM_ABORT("MultTranspose2() with temporary storage vector is not implemented for base " + "class Solver!"); + } +}; + +// This solver wraps a real-valued mfem::Solver for application to complex-valued problems +// as a preconditioner inside of a Solver or for assembling the matrix-free +// preconditioner operator as an mfem::HypreParMatrix. +template +class MfemWrapperSolver : public Solver +{ + using VecType = typename Solver::VecType; + +private: + // The actual mfem::Solver. + std::unique_ptr pc; + + // System matrix A in parallel assembled form. + std::unique_ptr A; + + // Whether or not to save the parallel assembled matrix after calling + // mfem::Solver::SetOperator (some solvers copy their input). + bool save_assembled; + + // Whether to use the exact complex-valued system matrix or the real-valued + // approximation A = Ar + Ai. + bool complex_matrix = true; + + // Whether to drop small entries (< ε) in the sparse system matrix. + bool drop_small_entries = true; + + // Whether to reuse the column reordering of previous factorizations. + bool reorder_reuse = true; + + // Number of small entries dropped by the most recent DropSmallEntries() call. + int num_dropped_entries = 0; + + // Drop small entries. + void DropSmallEntries(); + +public: + MfemWrapperSolver(std::unique_ptr &&pc, bool save_assembled = true, + bool complex_matrix = true, bool drop_small_entries = true, + bool reorder_reuse = true) + : Solver(pc->iterative_mode), pc(std::move(pc)), + save_assembled(save_assembled), complex_matrix(complex_matrix), + drop_small_entries(drop_small_entries), reorder_reuse(reorder_reuse) + { + } + + // Access the underlying solver. + const mfem::Solver &GetSolver() { return *pc; } + + // Configure whether or not to save the assembled operator. + void SetSaveAssembled(bool save) { save_assembled = save; } + + // Configure whether or not to drop small entries in the system matrix. + void SetDropSmallEntries(bool drop) { drop_small_entries = drop; } + + void SetInitialGuess(bool guess) override + { + Solver::SetInitialGuess(guess); + pc->iterative_mode = guess; + } + + void SetOperator(const OperType &op) override; + + void Mult(const VecType &x, VecType &y) const override; +}; + +} // namespace palace + +#endif // PALACE_LINALG_SOLVER_HPP diff --git a/palace/linalg/strumpack.cpp b/palace/linalg/strumpack.cpp index 21b2dc32b9..2ca467a0a2 100644 --- a/palace/linalg/strumpack.cpp +++ b/palace/linalg/strumpack.cpp @@ -1,163 +1,172 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
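// Illustrative sketch (not part of the patch): the intent behind the drop_small_entries /
// reorder_reuse / num_dropped_entries members declared above. A direct solver that caches
// its symbolic factorization (column reordering) may only reuse it when the sparsity
// pattern is unchanged, so reuse is keyed on the number of dropped entries staying
// constant between SetOperator() calls; the real code additionally leaves the setting
// untouched on the very first call. Names below are stand-ins for the member variables.
bool ShouldReuseReordering(bool reorder_reuse, long long previous_dropped,
                           long long current_dropped)
{
  // Same drop count as last time means the same sparsity pattern, so reuse is safe.
  return reorder_reuse && (previous_dropped == current_dropped);
}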
-// SPDX-License-Identifier: Apache-2.0 - -#include "strumpack.hpp" - -#if defined(MFEM_USE_STRUMPACK) - -#include "linalg/rap.hpp" - -namespace palace -{ - -namespace -{ - -strumpack::CompressionType -GetCompressionType(config::LinearSolverData::CompressionType type) -{ - switch (type) - { - case config::LinearSolverData::CompressionType::HSS: - return strumpack::CompressionType::HSS; - case config::LinearSolverData::CompressionType::BLR: - return strumpack::CompressionType::BLR; - case config::LinearSolverData::CompressionType::HODLR: - return strumpack::CompressionType::HODLR; - case config::LinearSolverData::CompressionType::ZFP: - return strumpack::CompressionType::LOSSY; - case config::LinearSolverData::CompressionType::BLR_HODLR: - return strumpack::CompressionType::BLR_HODLR; - break; - case config::LinearSolverData::CompressionType::ZFP_BLR_HODLR: - return strumpack::CompressionType::ZFP_BLR_HODLR; - break; - case config::LinearSolverData::CompressionType::NONE: - return strumpack::CompressionType::NONE; - } - return strumpack::CompressionType::NONE; // For compiler warning -} - -} // namespace - -template -StrumpackSolverBase::StrumpackSolverBase( - MPI_Comm comm, config::LinearSolverData::SymFactType reorder, - config::LinearSolverData::CompressionType compression, double lr_tol, int butterfly_l, - int lossy_prec, int print) - : StrumpackSolverType(comm), comm(comm) -{ - // Configure the solver. - this->SetPrintFactorStatistics(print > 1); - this->SetPrintSolveStatistics(print > 1); - this->SetKrylovSolver(strumpack::KrylovSolver::DIRECT); // Always as a preconditioner or - // direct solver - this->SetMatching(strumpack::MatchingJob::NONE); - if (reorder == config::LinearSolverData::SymFactType::METIS) - { - this->SetReorderingStrategy(strumpack::ReorderingStrategy::METIS); - } - else if (reorder == config::LinearSolverData::SymFactType::PARMETIS) - { - this->SetReorderingStrategy(strumpack::ReorderingStrategy::PARMETIS); - } - else if (reorder == config::LinearSolverData::SymFactType::SCOTCH) - { - this->SetReorderingStrategy(strumpack::ReorderingStrategy::SCOTCH); - } - else if (reorder == config::LinearSolverData::SymFactType::PTSCOTCH) - { - this->SetReorderingStrategy(strumpack::ReorderingStrategy::PTSCOTCH); - } - else - { - // Use default - } - this->SetReorderingReuse(true); // Repeated calls use same sparsity pattern - - // Configure compression. - this->SetCompression(GetCompressionType(compression)); - switch (compression) - { - case config::LinearSolverData::CompressionType::ZFP: - if (lossy_prec <= 0) - { - this->SetCompression(strumpack::CompressionType::LOSSLESS); - } - else - { - this->SetCompressionLossyPrecision(lossy_prec); - } - break; - case config::LinearSolverData::CompressionType::ZFP_BLR_HODLR: - this->SetCompressionLossyPrecision(lossy_prec); - case config::LinearSolverData::CompressionType::HODLR: - case config::LinearSolverData::CompressionType::BLR_HODLR: - this->SetCompressionButterflyLevels(butterfly_l); - case config::LinearSolverData::CompressionType::HSS: - case config::LinearSolverData::CompressionType::BLR: - this->SetCompressionRelTol(lr_tol); - break; - case config::LinearSolverData::CompressionType::NONE: - break; - } -} - -template -void StrumpackSolverBase::SetOperator(const Operator &op) -{ - // Convert the input operator to a distributed STRUMPACK matrix (always assume a symmetric - // sparsity pattern). 
This is very similar to the MFEM STRUMPACKRowLocMatrix from a - // HypreParMatrix but avoids using the communicator from the Hypre matrix in the case that - // the solver is constructed on a different communicator. - const mfem::HypreParMatrix *hypA; - const auto *PtAP = dynamic_cast(&op); - if (PtAP) - { - hypA = &PtAP->ParallelAssemble(); - } - else - { - hypA = dynamic_cast(&op); - MFEM_VERIFY(hypA, "StrumpackSolver requires a HypreParMatrix operator!"); - } - auto *parcsr = (hypre_ParCSRMatrix *)const_cast(*hypA); - hypA->HostRead(); - hypre_CSRMatrix *csr = hypre_MergeDiagAndOffd(parcsr); - hypA->HypreRead(); - - // Create the STRUMPACKRowLocMatrix by taking the internal data from a hypre_CSRMatrix. - HYPRE_Int n_loc = csr->num_rows; - HYPRE_BigInt first_row = parcsr->first_row_index; - HYPRE_Int *I = csr->i; - HYPRE_BigInt *J = csr->big_j; - double *data = csr->data; - - // Safe to delete the matrix since STRUMPACK copies it on input. Also clean up the Hypre - // data structure once we are done with it. -#if !defined(HYPRE_BIGINT) - mfem::STRUMPACKRowLocMatrix A(comm, n_loc, first_row, hypA->GetGlobalNumRows(), - hypA->GetGlobalNumCols(), I, J, data, true); -#else - int n_loc_int = static_cast(n_loc); - MFEM_ASSERT(n_loc == (HYPRE_Int)n_loc_int, - "Overflow error for local sparse matrix size!"); - mfem::Array II(n_loc_int + 1); - for (int i = 0; i <= n_loc_int; i++) - { - II[i] = static_cast(I[i]); - MFEM_ASSERT(I[i] == (HYPRE_Int)II[i], "Overflow error for local sparse matrix index!"); - } - mfem::STRUMPACKRowLocMatrix A(comm, n_loc_int, first_row, hypA->GetGlobalNumRows(), - hypA->GetGlobalNumCols(), II, J, data, true); -#endif - StrumpackSolverType::SetOperator(A); - hypre_CSRMatrixDestroy(csr); -} - -template class StrumpackSolverBase; -template class StrumpackSolverBase; - -} // namespace palace - -#endif +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "strumpack.hpp" + +#if defined(MFEM_USE_STRUMPACK) + +namespace palace +{ + +namespace +{ + +strumpack::CompressionType GetCompressionType(SparseCompression type) +{ + switch (type) + { + case SparseCompression::HSS: + return strumpack::CompressionType::HSS; + case SparseCompression::BLR: + return strumpack::CompressionType::BLR; + case SparseCompression::HODLR: + return strumpack::CompressionType::HODLR; + case SparseCompression::ZFP: + return strumpack::CompressionType::LOSSY; + case SparseCompression::BLR_HODLR: + return strumpack::CompressionType::BLR_HODLR; + break; + case SparseCompression::ZFP_BLR_HODLR: + return strumpack::CompressionType::ZFP_BLR_HODLR; + break; + case SparseCompression::NONE: + return strumpack::CompressionType::NONE; + } + return strumpack::CompressionType::NONE; // For compiler warning +} + +} // namespace + +template +StrumpackSolverBase::StrumpackSolverBase( + MPI_Comm comm, SymbolicFactorization reorder, SparseCompression compression, + double lr_tol, int butterfly_l, int lossy_prec, bool reorder_reuse, int print) + : StrumpackSolverType(comm), comm(comm) +{ + // Configure the solver. 
+ this->SetPrintFactorStatistics(print > 1); + this->SetPrintSolveStatistics(print > 1); + this->SetKrylovSolver(strumpack::KrylovSolver::DIRECT); // Always as a preconditioner or + // direct solver + this->SetMatching(strumpack::MatchingJob::NONE); + switch (reorder) + { + case SymbolicFactorization::METIS: + this->SetReorderingStrategy(strumpack::ReorderingStrategy::METIS); + // this->SetReorderingStrategy(strumpack::ReorderingStrategy::AND); + break; + case SymbolicFactorization::PARMETIS: + this->SetReorderingStrategy(strumpack::ReorderingStrategy::PARMETIS); + break; + case SymbolicFactorization::SCOTCH: + this->SetReorderingStrategy(strumpack::ReorderingStrategy::SCOTCH); + break; + case SymbolicFactorization::PTSCOTCH: + this->SetReorderingStrategy(strumpack::ReorderingStrategy::PTSCOTCH); + break; + case SymbolicFactorization::AMD: + this->SetReorderingStrategy(strumpack::ReorderingStrategy::AMD); + // this->SetReorderingStrategy(strumpack::ReorderingStrategy::MMD); + break; + case SymbolicFactorization::RCM: + this->SetReorderingStrategy(strumpack::ReorderingStrategy::RCM); + case SymbolicFactorization::PORD: + case SymbolicFactorization::DEFAULT: + // Should have good default. + break; + } + this->SetReorderingReuse( + reorder_reuse); // If true repeated calls use same sparsity pattern + + // Configure compression. + this->SetCompression(GetCompressionType(compression)); + switch (compression) + { + case SparseCompression::ZFP: + if (lossy_prec <= 0) + { + this->SetCompression(strumpack::CompressionType::LOSSLESS); + } + else + { + this->SetCompressionLossyPrecision(lossy_prec); + } + break; + case SparseCompression::ZFP_BLR_HODLR: + this->SetCompressionLossyPrecision(lossy_prec); + case SparseCompression::HODLR: + case SparseCompression::BLR_HODLR: + this->SetCompressionButterflyLevels(butterfly_l); + case SparseCompression::HSS: + case SparseCompression::BLR: + this->SetCompressionRelTol(lr_tol); + break; + case SparseCompression::NONE: + break; + } + // if (mfem::Device::Allows(mfem::Backend::DEVICE_MASK)) + // { + // this->EnableGPU(); // XX TODO: GPU support disabled for now + // } + // else + { + this->DisableGPU(); + } +} + +template +void StrumpackSolverBase::SetOperator(const Operator &op) +{ + // Convert the input operator to a distributed STRUMPACK matrix (always assume a symmetric + // sparsity pattern). This is very similar to the MFEM's STRUMPACKRowLocMatrix from a + // HypreParMatrix but avoids using the communicator from the Hypre matrix in the case that + // the solver is constructed on a different communicator. + const auto *hA = dynamic_cast(&op); + MFEM_VERIFY(hA && hA->GetGlobalNumRows() == hA->GetGlobalNumCols(), + "StrumpackSolver requires a square HypreParMatrix operator!"); + auto *parcsr = (hypre_ParCSRMatrix *)const_cast(*hA); + hypre_CSRMatrix *csr = hypre_MergeDiagAndOffd(parcsr); + hypre_CSRMatrixMigrate(csr, HYPRE_MEMORY_HOST); + + // Create the STRUMPACKRowLocMatrix by taking the internal data from a hypre_CSRMatrix. + HYPRE_BigInt glob_n = hypre_ParCSRMatrixGlobalNumRows(parcsr); + HYPRE_BigInt first_row = hypre_ParCSRMatrixFirstRowIndex(parcsr); + HYPRE_Int n_loc = hypre_CSRMatrixNumRows(csr); + HYPRE_Int *I = hypre_CSRMatrixI(csr); + HYPRE_BigInt *J = hypre_CSRMatrixBigJ(csr); + double *data = hypre_CSRMatrixData(csr); + + // Safe to delete the matrix since STRUMPACK copies it on input. Also clean up the Hypre + // data structure once we are done with it. 
+#if !defined(HYPRE_BIGINT) + mfem::STRUMPACKRowLocMatrix A(comm, n_loc, first_row, glob_n, glob_n, I, J, data, true); +#else + int n_loc_int = static_cast(n_loc); + MFEM_ASSERT(n_loc == (HYPRE_Int)n_loc_int, + "Overflow error for local sparse matrix size!"); + mfem::Array II(n_loc_int + 1); + for (int i = 0; i <= n_loc_int; i++) + { + II[i] = static_cast(I[i]); + MFEM_ASSERT(I[i] == (HYPRE_Int)II[i], "Overflow error for local sparse matrix index!"); + } + mfem::STRUMPACKRowLocMatrix A(comm, n_loc_int, first_row, glob_n, glob_n, II.HostRead(), + J, data, true); +#endif + StrumpackSolverType::SetOperator(A); + hypre_CSRMatrixDestroy(csr); +} + +template +void StrumpackSolverBase::SetReorderReuse(bool reorder_reuse) +{ + StrumpackSolverType::SetReorderingReuse( + reorder_reuse); // If true repeated calls use same sparsity pattern +} + +template class StrumpackSolverBase; +template class StrumpackSolverBase; + +} // namespace palace + +#endif diff --git a/palace/linalg/strumpack.hpp b/palace/linalg/strumpack.hpp index f1d17c9794..fb18be95c5 100644 --- a/palace/linalg/strumpack.hpp +++ b/palace/linalg/strumpack.hpp @@ -1,52 +1,55 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LINALG_STRUMPACK_HPP -#define PALACE_LINALG_STRUMPACK_HPP - -#include - -#if defined(MFEM_USE_STRUMPACK) - -#include "linalg/operator.hpp" -#include "utils/iodata.hpp" - -namespace palace -{ - -// -// A wrapper for the STRUMPACK direct solver package. -// -template -class StrumpackSolverBase : public StrumpackSolverType -{ -private: - MPI_Comm comm; - -public: - StrumpackSolverBase(MPI_Comm comm, config::LinearSolverData::SymFactType reorder, - config::LinearSolverData::CompressionType compression, double lr_tol, - int butterfly_l, int lossy_prec, int print); - - StrumpackSolverBase(MPI_Comm comm, const IoData &iodata, int print) - : StrumpackSolverBase(comm, iodata.solver.linear.sym_fact_type, - iodata.solver.linear.strumpack_compression_type, - iodata.solver.linear.strumpack_lr_tol, - iodata.solver.linear.strumpack_butterfly_l, - iodata.solver.linear.strumpack_lossy_precision, print) - { - } - - void SetOperator(const Operator &op) override; -}; - -using StrumpackSolver = StrumpackSolverBase; - -using StrumpackMixedPrecisionSolver = - StrumpackSolverBase; - -} // namespace palace - -#endif - -#endif // PALACE_LINALG_STRUMPACK_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LINALG_STRUMPACK_HPP +#define PALACE_LINALG_STRUMPACK_HPP + +#include + +#if defined(MFEM_USE_STRUMPACK) + +#include "linalg/operator.hpp" +#include "utils/iodata.hpp" + +namespace palace +{ + +// +// A wrapper for the STRUMPACK direct solver package. 
+// +template +class StrumpackSolverBase : public StrumpackSolverType +{ +private: + MPI_Comm comm; + +public: + StrumpackSolverBase(MPI_Comm comm, SymbolicFactorization reorder, + SparseCompression compression, double lr_tol, int butterfly_l, + int lossy_prec, bool reorder_reuse, int print); + + StrumpackSolverBase(const IoData &iodata, MPI_Comm comm, int print) + : StrumpackSolverBase(comm, iodata.solver.linear.sym_factorization, + iodata.solver.linear.strumpack_compression_type, + iodata.solver.linear.strumpack_lr_tol, + iodata.solver.linear.strumpack_butterfly_l, + iodata.solver.linear.strumpack_lossy_precision, + iodata.solver.linear.reorder_reuse, print) + { + } + + void SetOperator(const Operator &op) override; + + void SetReorderReuse(bool reorder_reuse); +}; + +using StrumpackSolver = StrumpackSolverBase; + +using StrumpackMixedPrecisionSolver = + StrumpackSolverBase; + +} // namespace palace + +#endif + +#endif // PALACE_LINALG_STRUMPACK_HPP diff --git a/palace/linalg/superlu.cpp b/palace/linalg/superlu.cpp index 66cd59d335..114308efa8 100644 --- a/palace/linalg/superlu.cpp +++ b/palace/linalg/superlu.cpp @@ -1,137 +1,135 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "superlu.hpp" - -#if defined(MFEM_USE_SUPERLU) - -#include "linalg/rap.hpp" -#include "utils/communication.hpp" - -namespace palace -{ - -namespace -{ - -int GetNpDep(int np, bool use_3d) -{ - // Return heuristic choice of 3D processor grid depth based on communicator - // size. Performance doesn't matter here. - if (!use_3d) - { - return 1; - } - else - { - int npdep = (int)std::pow(2, std::floor(std::log2(std::cbrt(np)))); - while (npdep > 1 && np % npdep > 0) - { - npdep /= 2; - } - return npdep; - } -} - -} // namespace - -SuperLUSolver::SuperLUSolver(MPI_Comm comm, config::LinearSolverData::SymFactType reorder, - bool use_3d, int print) - : mfem::Solver(), comm(comm), A(nullptr), solver(comm, GetNpDep(Mpi::Size(comm), use_3d)) -{ - // Configure the solver. - if (print > 1) - { - if (solver.npdep_ > 1) - { - Mpi::Print(comm, " SuperLUSolver: Using 3D processor grid {:d} x {:d} x {:d}\n", - solver.nprow_, solver.npcol_, solver.npdep_); - } - else - { - Mpi::Print(comm, " SuperLUSolver: Using 2D processor grid {:d} x {:d}\n", - solver.nprow_, solver.npcol_); - } - } - solver.SetPrintStatistics(print > 1); - solver.SetEquilibriate(false); - solver.SetReplaceTinyPivot(false); - if (reorder == config::LinearSolverData::SymFactType::METIS) - { - solver.SetColumnPermutation(mfem::superlu::METIS_AT_PLUS_A); - } - else if (reorder == config::LinearSolverData::SymFactType::PARMETIS) - { - solver.SetColumnPermutation(mfem::superlu::PARMETIS); - } - else - { - // Use default - } - // solver.SetRowPermutation(mfem::superlu::NOROWPERM); - solver.SetIterativeRefine(mfem::superlu::NOREFINE); - solver.SetSymmetricPattern(true); // Always symmetric sparsity pattern -} - -void SuperLUSolver::SetOperator(const Operator &op) -{ - // For repeated factorizations, always reuse the sparsity pattern. This is very similar to - // the MFEM SuperLURowLocMatrix from a HypreParMatrix but avoids using the communicator - // from the Hypre matrix in the case that the solver is constructed on a different - // communicator. 
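Not part of the patch: a sketch of constructing the STRUMPACK wrapper with the new enum-based options (SymbolicFactorization, SparseCompression) that replace the old config::LinearSolverData nested types. The enumerator names are taken from the switch statements above, the enums are assumed to live in the palace namespace, and A is assumed to be a square mfem::HypreParMatrix.

#include <mfem.hpp>
#include "linalg/strumpack.hpp"

// Hypothetical helper for illustration only.
void DirectSolveWithStrumpack(MPI_Comm comm, const mfem::HypreParMatrix &A,
                              const mfem::Vector &b, mfem::Vector &x)
{
  palace::StrumpackSolver strumpack(comm, palace::SymbolicFactorization::METIS,
                                    palace::SparseCompression::NONE,
                                    /*lr_tol*/ 1.0e-3, /*butterfly_l*/ 1,
                                    /*lossy_prec*/ 16, /*reorder_reuse*/ true,
                                    /*print*/ 0);
  strumpack.SetOperator(A);  // Converts A to a distributed STRUMPACKRowLocMatrix
  strumpack.Mult(b, x);      // The wrapper always runs STRUMPACK as a direct solver
}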
- if (A) - { - solver.SetFact(mfem::superlu::SamePattern_SameRowPerm); - } - const mfem::HypreParMatrix *hypA; - const auto *PtAP = dynamic_cast(&op); - if (PtAP) - { - hypA = &PtAP->ParallelAssemble(); - } - else - { - hypA = dynamic_cast(&op); - MFEM_VERIFY(hypA, "SuperLUSolver requires a HypreParMatrix operator!"); - } - auto *parcsr = (hypre_ParCSRMatrix *)const_cast(*hypA); - hypA->HostRead(); - hypre_CSRMatrix *csr = hypre_MergeDiagAndOffd(parcsr); - hypA->HypreRead(); - - // Create the SuperLURowLocMatrix by taking the internal data from a hypre_CSRMatrix. - HYPRE_Int n_loc = csr->num_rows; - HYPRE_BigInt first_row = parcsr->first_row_index; - HYPRE_Int *I = csr->i; - HYPRE_BigInt *J = csr->big_j; - double *data = csr->data; - - // We need to save A because SuperLU does not copy the input matrix. Also clean up the - // Hypre data structure once we are done with it. -#if !defined(HYPRE_BIGINT) - A = std::make_unique(comm, n_loc, first_row, - hypA->GetGlobalNumRows(), - hypA->GetGlobalNumCols(), I, J, data); -#else - int n_loc_int = static_cast(n_loc); - MFEM_ASSERT(n_loc == (HYPRE_Int)n_loc_int, - "Overflow error for local sparse matrix size!"); - mfem::Array II(n_loc_int + 1); - for (int i = 0; i <= n_loc_int; i++) - { - II[i] = static_cast(I[i]); - MFEM_ASSERT(I[i] == (HYPRE_Int)II[i], "Overflow error for local sparse matrix index!"); - } - A = std::make_unique(comm, n_loc_int, first_row, - hypA->GetGlobalNumRows(), - hypA->GetGlobalNumCols(), II, J, data); -#endif - solver.SetOperator(*A); - height = solver.Height(); - width = solver.Width(); - hypre_CSRMatrixDestroy(csr); -} - -} // namespace palace - -#endif +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "superlu.hpp" + +#if defined(MFEM_USE_SUPERLU) + +#include "utils/communication.hpp" + +namespace palace +{ + +namespace +{ + +int GetNpDep(int np, bool use_3d) +{ + // Return heuristic choice of 3D processor grid depth based on communicator + // size. Performance doesn't matter here. + if (!use_3d) + { + return 1; + } + else + { + int npdep = (int)std::pow(2, std::floor(std::log2(std::cbrt(np)))); + while (npdep > 1 && np % npdep > 0) + { + npdep /= 2; + } + return npdep; + } +} + +} // namespace + +SuperLUSolver::SuperLUSolver(MPI_Comm comm, SymbolicFactorization reorder, bool use_3d, + bool reorder_reuse, int print) + : mfem::Solver(), comm(comm), A(nullptr), solver(comm, GetNpDep(Mpi::Size(comm), use_3d)), + reorder_reuse(reorder_reuse) +{ + // Configure the solver. + if (print > 1) + { + if (solver.npdep_ > 1) + { + Mpi::Print(comm, " SuperLUSolver: Using 3D processor grid {:d} x {:d} x {:d}\n", + solver.nprow_, solver.npcol_, solver.npdep_); + } + else + { + Mpi::Print(comm, " SuperLUSolver: Using 2D processor grid {:d} x {:d}\n", + solver.nprow_, solver.npcol_); + } + } + solver.SetPrintStatistics(print > 1); + solver.SetEquilibriate(false); + solver.SetReplaceTinyPivot(false); + switch (reorder) + { + case SymbolicFactorization::METIS: + solver.SetColumnPermutation(mfem::superlu::METIS_AT_PLUS_A); + break; + case SymbolicFactorization::PARMETIS: + solver.SetColumnPermutation(mfem::superlu::PARMETIS); + break; + case SymbolicFactorization::AMD: + case SymbolicFactorization::RCM: + solver.SetColumnPermutation(mfem::superlu::MMD_AT_PLUS_A); + break; + case SymbolicFactorization::SCOTCH: + case SymbolicFactorization::PTSCOTCH: + case SymbolicFactorization::PORD: + case SymbolicFactorization::DEFAULT: + // Should have good default. 
+ break; + } + // solver.SetRowPermutation(mfem::superlu::NOROWPERM); + solver.SetIterativeRefine(mfem::superlu::NOREFINE); + solver.SetSymmetricPattern(true); // Always symmetric sparsity pattern +} + +void SuperLUSolver::SetOperator(const Operator &op) +{ + // For repeated factorizations, always reuse the sparsity pattern. + if (A && reorder_reuse) + { + solver.SetFact(mfem::superlu::SamePattern_SameRowPerm); + } + + // This is very similar to the MFEM SuperLURowLocMatrix from a HypreParMatrix but avoids + // using the communicator from the Hypre matrix in the case that the solver is + // constructed on a different communicator. + const auto *hA = dynamic_cast(&op); + MFEM_VERIFY(hA && hA->GetGlobalNumRows() == hA->GetGlobalNumCols(), + "SuperLUSolver requires a square HypreParMatrix operator!"); + auto *parcsr = (hypre_ParCSRMatrix *)const_cast(*hA); + hypre_CSRMatrix *csr = hypre_MergeDiagAndOffd(parcsr); + hypre_CSRMatrixMigrate(csr, HYPRE_MEMORY_HOST); + + // Create the SuperLURowLocMatrix by taking the internal data from a hypre_CSRMatrix. + HYPRE_BigInt glob_n = hypre_ParCSRMatrixGlobalNumRows(parcsr); + HYPRE_BigInt first_row = hypre_ParCSRMatrixFirstRowIndex(parcsr); + HYPRE_Int n_loc = hypre_CSRMatrixNumRows(csr); + HYPRE_Int *I = hypre_CSRMatrixI(csr); + HYPRE_BigInt *J = hypre_CSRMatrixBigJ(csr); + double *data = hypre_CSRMatrixData(csr); + + // We need to save A because SuperLU does not copy the input matrix. Also clean up the + // Hypre data structure once we are done with it. +#if !defined(HYPRE_BIGINT) + A = std::make_unique(comm, n_loc, first_row, glob_n, glob_n, I, + J, data); +#else + int n_loc_int = static_cast(n_loc); + MFEM_ASSERT(n_loc == (HYPRE_Int)n_loc_int, + "Overflow error for local sparse matrix size!"); + mfem::Array II(n_loc_int + 1); + for (int i = 0; i <= n_loc_int; i++) + { + II[i] = static_cast(I[i]); + MFEM_ASSERT(I[i] == (HYPRE_Int)II[i], "Overflow error for local sparse matrix index!"); + } + A = std::make_unique(comm, n_loc_int, first_row, glob_n, + glob_n, II.HostRead(), J, data); +#endif + solver.SetOperator(*A); + height = solver.Height(); + width = solver.Width(); + hypre_CSRMatrixDestroy(csr); +} + +} // namespace palace + +#endif diff --git a/palace/linalg/superlu.hpp b/palace/linalg/superlu.hpp index 51febe601b..c5ad81da13 100644 --- a/palace/linalg/superlu.hpp +++ b/palace/linalg/superlu.hpp @@ -1,63 +1,67 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LINALG_SUPERLU_HPP -#define PALACE_LINALG_SUPERLU_HPP - -#include - -#if defined(MFEM_USE_SUPERLU) - -#include -#include "linalg/operator.hpp" -#include "linalg/vector.hpp" -#include "utils/iodata.hpp" - -namespace palace -{ - -// -// A wrapper for the SuperLU_DIST direct solver package. 
-// -class SuperLUSolver : public mfem::Solver -{ -private: - MPI_Comm comm; - std::unique_ptr A; - mfem::SuperLUSolver solver; - -public: - SuperLUSolver(MPI_Comm comm, config::LinearSolverData::SymFactType reorder, bool use_3d, - int print); - SuperLUSolver(MPI_Comm comm, const IoData &iodata, int print) - : SuperLUSolver(comm, iodata.solver.linear.sym_fact_type, - iodata.solver.linear.superlu_3d, print) - { - } - - mfem::SuperLUSolver &GetSolver() { return solver; } - - void SetOperator(const Operator &op) override; - - void Mult(const Vector &x, Vector &y) const override { solver.Mult(x, y); } - void ArrayMult(const mfem::Array &X, - mfem::Array &Y) const override - { - solver.ArrayMult(X, Y); - } - void MultTranspose(const Vector &x, Vector &y) const override - { - solver.MultTranspose(x, y); - } - void ArrayMultTranspose(const mfem::Array &X, - mfem::Array &Y) const override - { - solver.ArrayMultTranspose(X, Y); - } -}; - -} // namespace palace - -#endif - -#endif // PALACE_LINALG_SUPERLU_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LINALG_SUPERLU_HPP +#define PALACE_LINALG_SUPERLU_HPP + +#include + +#if defined(MFEM_USE_SUPERLU) + +#include +#include "linalg/operator.hpp" +#include "linalg/vector.hpp" +#include "utils/iodata.hpp" + +namespace palace +{ + +// +// A wrapper for the SuperLU_DIST direct solver package. +// +class SuperLUSolver : public mfem::Solver +{ +private: + MPI_Comm comm; + std::unique_ptr A; + mfem::SuperLUSolver solver; + bool reorder_reuse; + +public: + SuperLUSolver(MPI_Comm comm, SymbolicFactorization reorder, bool use_3d, + bool reorder_reuse, int print); + SuperLUSolver(const IoData &iodata, MPI_Comm comm, int print) + : SuperLUSolver(comm, iodata.solver.linear.sym_factorization, + iodata.solver.linear.superlu_3d, iodata.solver.linear.reorder_reuse, + print) + { + } + + mfem::SuperLUSolver &GetSolver() { return solver; } + + void SetOperator(const Operator &op) override; + + void Mult(const Vector &x, Vector &y) const override { solver.Mult(x, y); } + void ArrayMult(const mfem::Array &X, + mfem::Array &Y) const override + { + solver.ArrayMult(X, Y); + } + void MultTranspose(const Vector &x, Vector &y) const override + { + solver.MultTranspose(x, y); + } + void ArrayMultTranspose(const mfem::Array &X, + mfem::Array &Y) const override + { + solver.ArrayMultTranspose(X, Y); + } + + void SetReorderReuse(bool reorder_reuse_) { reorder_reuse = reorder_reuse_; } +}; + +} // namespace palace + +#endif + +#endif // PALACE_LINALG_SUPERLU_HPP diff --git a/palace/linalg/vector.cpp b/palace/linalg/vector.cpp index 5f4b57f340..dc23027c0b 100644 --- a/palace/linalg/vector.cpp +++ b/palace/linalg/vector.cpp @@ -1,531 +1,785 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
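Not part of the patch: a usage sketch for the updated SuperLU wrapper above, exercising the direct constructor with the new reorder_reuse flag (note that the IoData-based constructor now takes the IoData first). SymbolicFactorization is assumed to be in the palace namespace as above, and A is assumed to be a square mfem::HypreParMatrix provided by the caller.

#include <mfem.hpp>
#include "linalg/superlu.hpp"

// Hypothetical helper for illustration only.
void DirectSolveWithSuperLU(MPI_Comm comm, const mfem::HypreParMatrix &A,
                            const mfem::Vector &b, mfem::Vector &x)
{
  palace::SuperLUSolver superlu(comm, palace::SymbolicFactorization::METIS,
                                /*use_3d*/ true, /*reorder_reuse*/ true, /*print*/ 0);
  superlu.SetOperator(A);          // Builds and keeps the SuperLURowLocMatrix (SuperLU does not copy it)
  superlu.Mult(b, x);              // Forward solve through the wrapped mfem::SuperLUSolver
  superlu.SetReorderReuse(false);  // Recompute the column permutation on the next SetOperator call
}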
-// SPDX-License-Identifier: Apache-2.0 - -#include "vector.hpp" - -#include -#include -#include - -namespace palace -{ - -ComplexVector::ComplexVector(int n) : x(2 * n), xr(x, 0, n), xi(x, n, n) {} - -ComplexVector::ComplexVector(const ComplexVector &y) : ComplexVector(y.Size()) -{ - Set(y); -} - -ComplexVector::ComplexVector(const Vector &yr, const Vector &yi) : ComplexVector(yr.Size()) -{ - MFEM_VERIFY(yr.Size() == yi.Size(), - "Mismatch in dimension of real and imaginary matrix parts in ComplexVector!"); - Set(yr, yi); -} - -ComplexVector::ComplexVector(const std::complex *py, int n) : ComplexVector(n) -{ - Set(py, n); -} - -void ComplexVector::SetSize(int n) -{ - x.SetSize(2 * n); - xr.MakeRef(x, 0, n); - xi.MakeRef(x, n, n); -} - -void ComplexVector::Set(const Vector &yr, const Vector &yi) -{ - MFEM_VERIFY(yr.Size() == yi.Size() && yr.Size() == Size(), - "Mismatch in dimension of real and imaginary matrix parts in ComplexVector!"); - Real() = yr; - Imag() = yi; -} - -void ComplexVector::Set(const std::complex *py, int n) -{ - MFEM_VERIFY(n == Size(), - "Mismatch in dimension for array of std::complex in ComplexVector!"); - Vector y(reinterpret_cast(const_cast *>(py)), 2 * n); - const int N = n; - const auto *Y = y.Read(); - auto *XR = Real().Write(); - auto *XI = Imag().Write(); - mfem::forall(N, - [=] MFEM_HOST_DEVICE(int i) - { - XR[i] = Y[2 * i]; - XI[i] = Y[2 * i + 1]; - }); -} - -void ComplexVector::Get(std::complex *py, int n) const -{ - MFEM_VERIFY(n == Size(), - "Mismatch in dimension for array of std::complex in ComplexVector!"); - Vector y(reinterpret_cast(py), 2 * n); - const int N = n; - const auto *XR = Real().Read(); - const auto *XI = Imag().Read(); - auto *Y = y.Write(); - mfem::forall(N, - [=] MFEM_HOST_DEVICE(int i) - { - Y[2 * i] = XR[i]; - Y[2 * i + 1] = XI[i]; - }); - y.HostReadWrite(); -} - -ComplexVector &ComplexVector::operator=(std::complex s) -{ - Real() = s.real(); - Imag() = s.imag(); - return *this; -} - -ComplexVector &ComplexVector::operator*=(std::complex s) -{ - const double sr = s.real(); - const double si = s.imag(); - if (si == 0.0) - { - Real() *= sr; - Imag() *= sr; - } - else - { - const int N = Size(); - auto *XR = Real().ReadWrite(); - auto *XI = Imag().ReadWrite(); - mfem::forall(N, - [=] MFEM_HOST_DEVICE(int i) - { - const double t = si * XR[i] + sr * XI[i]; - XR[i] = sr * XR[i] - si * XI[i]; - XI[i] = t; - }); - } - return *this; -} - -void ComplexVector::Conj() -{ - Imag() *= -1.0; -} - -void ComplexVector::Abs() -{ - const int N = Size(); - auto *XR = Real().ReadWrite(); - auto *XI = Imag().ReadWrite(); - mfem::forall(N, - [=] MFEM_HOST_DEVICE(int i) - { - XR[i] = std::sqrt(XR[i] * XR[i] + XI[i] * XI[i]); - XI[i] = 0.0; - }); -} - -void ComplexVector::Reciprocal() -{ - const int N = Size(); - auto *XR = Real().ReadWrite(); - auto *XI = Imag().ReadWrite(); - mfem::forall(N, - [=] MFEM_HOST_DEVICE(int i) - { - const std::complex t = 1.0 / std::complex(XR[i], XI[i]); - XR[i] = t.real(); - XI[i] = t.imag(); - }); -} - -std::complex ComplexVector::Dot(const ComplexVector &y) const -{ - return {(Real() * y.Real()) + (Imag() * y.Imag()), - (Imag() * y.Real()) - (Real() * y.Imag())}; -} - -std::complex ComplexVector::TransposeDot(const ComplexVector &y) const -{ - return {(Real() * y.Real()) - (Imag() * y.Imag()), - (Imag() * y.Real()) + (Real() * y.Imag())}; -} - -void ComplexVector::AXPY(std::complex alpha, const ComplexVector &x) -{ - const int N = Size(); - const double ar = alpha.real(); - const double ai = alpha.imag(); - const auto *XR = 
x.Real().Read(); - const auto *XI = x.Imag().Read(); - auto *YR = Real().ReadWrite(); - auto *YI = Imag().ReadWrite(); - if (ai == 0.0) - { - mfem::forall(N, [=] MFEM_HOST_DEVICE(int i) { YR[i] += ar * XR[i]; }); - mfem::forall(N, [=] MFEM_HOST_DEVICE(int i) { YI[i] += ar * XI[i]; }); - } - else - { - mfem::forall(N, - [=] MFEM_HOST_DEVICE(int i) - { - YR[i] += ar * XR[i] - ai * XI[i]; - YI[i] += ai * XR[i] + ar * XI[i]; - }); - } -} - -void ComplexVector::AXPBY(std::complex alpha, const ComplexVector &x, - std::complex beta) -{ - const int N = Size(); - const double ar = alpha.real(); - const double ai = alpha.imag(); - const auto *XR = x.Real().Read(); - const auto *XI = x.Imag().Read(); - auto *YR = Real().ReadWrite(); - auto *YI = Imag().ReadWrite(); - if (beta == 0.0) - { - if (ai == 0.0) - { - mfem::forall(N, [=] MFEM_HOST_DEVICE(int i) { YR[i] = ar * XR[i]; }); - mfem::forall(N, [=] MFEM_HOST_DEVICE(int i) { YI[i] = ar * XI[i]; }); - } - else - { - mfem::forall(N, - [=] MFEM_HOST_DEVICE(int i) - { - YR[i] = ar * XR[i] - ai * XI[i]; - YI[i] = ai * XR[i] + ar * XI[i]; - }); - } - } - else - { - const double br = beta.real(); - const double bi = beta.imag(); - if (ai == 0.0 && bi == 0.0) - { - mfem::forall(N, [=] MFEM_HOST_DEVICE(int i) { YR[i] = ar * XR[i] + br * YR[i]; }); - mfem::forall(N, [=] MFEM_HOST_DEVICE(int i) { YI[i] = ar * XI[i] + br * YI[i]; }); - } - else - { - mfem::forall(N, - [=] MFEM_HOST_DEVICE(int i) - { - const double t = bi * YR[i] + br * YI[i]; - YR[i] = ar * XR[i] - ai * XI[i] + br * YR[i] - bi * YI[i]; - YI[i] = ai * XR[i] + ar * XI[i] + t; - }); - } - } -} - -void ComplexVector::AXPBYPCZ(std::complex alpha, const ComplexVector &x, - std::complex beta, const ComplexVector &y, - std::complex gamma) -{ - const int N = Size(); - const double ar = alpha.real(); - const double ai = alpha.imag(); - const double br = beta.real(); - const double bi = beta.imag(); - const auto *XR = x.Real().Read(); - const auto *XI = x.Imag().Read(); - const auto *YR = y.Real().Read(); - const auto *YI = y.Imag().Read(); - auto *ZR = Real().Write(); - auto *ZI = Imag().Write(); - if (gamma == 0.0) - { - if (ai == 0.0 && bi == 0.0) - { - mfem::forall(N, [=] MFEM_HOST_DEVICE(int i) { ZR[i] = ar * XR[i] + br * YR[i]; }); - mfem::forall(N, [=] MFEM_HOST_DEVICE(int i) { ZI[i] = ar * XI[i] + br * YI[i]; }); - } - else - { - mfem::forall(N, - [=] MFEM_HOST_DEVICE(int i) - { - ZR[i] = ar * XR[i] - ai * XI[i] + br * YR[i] - bi * YI[i]; - ZI[i] = ai * XR[i] + ar * XI[i] + bi * YR[i] + br * YI[i]; - }); - } - } - else - { - const double gr = gamma.real(); - const double gi = gamma.imag(); - if (ai == 0.0 && bi == 0.0 && gi == 0.0) - { - mfem::forall(N, [=] MFEM_HOST_DEVICE(int i) - { ZR[i] = ar * XR[i] + br * YR[i] + gr * ZR[i]; }); - mfem::forall(N, [=] MFEM_HOST_DEVICE(int i) - { ZI[i] = ar * XI[i] + br * YI[i] + gr * ZI[i]; }); - } - else - { - mfem::forall(N, - [=] MFEM_HOST_DEVICE(int i) - { - const double t = gi * ZR[i] + gr * ZI[i]; - ZR[i] = ar * XR[i] - ai * XI[i] + br * YR[i] - bi * YI[i] + - gr * ZR[i] - gi * ZI[i]; - ZI[i] = ai * XR[i] + ar * XI[i] + bi * YR[i] + br * YI[i] + t; - }); - } - } -} - -namespace linalg -{ - -template <> -void SetRandom(MPI_Comm comm, Vector &x, int seed) -{ - if (seed == 0) - { - std::vector seeds(1); - std::seed_seq seed_gen{Mpi::Rank(comm)}; - seed_gen.generate(seeds.begin(), seeds.end()); - seed = static_cast(seeds[0]); - } - x.Randomize(seed); -} - -template <> -void SetRandomReal(MPI_Comm comm, Vector &x, int seed) -{ - SetRandom(comm, x, seed); -} - 
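Not part of the patch: a small sketch of the AXPBYPCZ update z = alpha x + beta y + gamma z, whose semantics are unchanged between the removed and added implementations; x, y, and z are assumed to be ComplexVectors of equal size.

#include <complex>
#include "linalg/vector.hpp"

// Hypothetical helper for illustration only.
void BlendVectors(const palace::ComplexVector &x, const palace::ComplexVector &y,
                  palace::ComplexVector &z)
{
  const std::complex<double> alpha(1.0, 0.0), beta(0.0, 1.0), gamma(0.5, 0.0);
  z.AXPBYPCZ(alpha, x, beta, y, gamma);  // z <- x + i y + 0.5 z
  // The same update is available through the free function specialization:
  // palace::linalg::AXPBYPCZ(alpha, x, beta, y, gamma, z);
}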
-template <> -void SetRandomSign(MPI_Comm comm, Vector &x, int seed) -{ - SetRandom(comm, x, seed); - const int N = x.Size(); - auto *X = x.ReadWrite(); - mfem::forall(N, [=] MFEM_HOST_DEVICE(int i) - { X[i] = (X[i] > 0.0) ? 1.0 : ((X[i] < 0.0) ? -1.0 : 0.0); }); -} - -template <> -void SetRandom(MPI_Comm comm, ComplexVector &x, int seed) -{ - if (seed == 0) - { - std::vector seeds(2); - std::seed_seq seed_gen{2 * Mpi::Rank(comm), 2 * Mpi::Rank(comm) + 1}; - seed_gen.generate(seeds.begin(), seeds.end()); - SetRandom(comm, x.Real(), static_cast(seeds[0])); - SetRandom(comm, x.Imag(), static_cast(seeds[1])); - } - else - { - SetRandom(comm, x.Real(), seed); - SetRandom(comm, x.Imag(), seed); - } -} - -template <> -void SetRandomReal(MPI_Comm comm, ComplexVector &x, int seed) -{ - SetRandom(comm, x.Real(), seed); - x.Imag() = 0.0; -} - -template <> -void SetRandomSign(MPI_Comm comm, ComplexVector &x, int seed) -{ - SetRandom(comm, x, seed); - const int N = x.Size(); - auto *XR = x.Real().ReadWrite(); - auto *XI = x.Imag().ReadWrite(); - mfem::forall(N, [=] MFEM_HOST_DEVICE(int i) - { XR[i] = (XR[i] > 0.0) ? 1.0 : ((XR[i] < 0.0) ? -1.0 : 0.0); }); - mfem::forall(N, [=] MFEM_HOST_DEVICE(int i) - { XI[i] = (XI[i] > 0.0) ? 1.0 : ((XI[i] < 0.0) ? -1.0 : 0.0); }); -} - -template <> -void SetSubVector(Vector &x, const mfem::Array &rows, double s) -{ - const int N = rows.Size(); - const double sr = s; - const auto *idx = rows.Read(); - auto *X = x.ReadWrite(); - mfem::forall(N, - [=] MFEM_HOST_DEVICE(int i) - { - const int id = idx[i]; - X[id] = sr; - }); -} - -template <> -void SetSubVector(ComplexVector &x, const mfem::Array &rows, double s) -{ - const int N = rows.Size(); - const double sr = s; - const auto *idx = rows.Read(); - auto *XR = x.Real().ReadWrite(); - auto *XI = x.Imag().ReadWrite(); - mfem::forall(N, - [=] MFEM_HOST_DEVICE(int i) - { - const int id = idx[i]; - XR[id] = sr; - }); - mfem::forall(N, - [=] MFEM_HOST_DEVICE(int i) - { - const int id = idx[i]; - XI[id] = 0.0; - }); -} - -template <> -void SetSubVector(Vector &x, const mfem::Array &rows, const Vector &y) -{ - const int N = rows.Size(); - const auto *idx = rows.Read(); - const auto *Y = y.Read(); - auto *X = x.ReadWrite(); - mfem::forall(N, - [=] MFEM_HOST_DEVICE(int i) - { - const int id = idx[i]; - X[id] = Y[id]; - }); -} - -template <> -void SetSubVector(ComplexVector &x, const mfem::Array &rows, const ComplexVector &y) -{ - const int N = rows.Size(); - const auto *idx = rows.Read(); - const auto *YR = y.Real().Read(); - const auto *YI = y.Imag().Read(); - auto *XR = x.Real().ReadWrite(); - auto *XI = x.Imag().ReadWrite(); - mfem::forall(N, - [=] MFEM_HOST_DEVICE(int i) - { - const int id = idx[i]; - XR[id] = YR[id]; - }); - mfem::forall(N, - [=] MFEM_HOST_DEVICE(int i) - { - const int id = idx[i]; - XI[id] = YI[id]; - }); -} - -template <> -double Norml2(MPI_Comm comm, const Vector &x, const Operator &B, Vector &Bx) -{ - B.Mult(x, Bx); - double dot = Dot(comm, Bx, x); - MFEM_ASSERT(dot > 0.0, - "Non-positive vector norm in normalization (dot = " << dot << ")!"); - return std::sqrt(dot); -} - -template <> -double Norml2(MPI_Comm comm, const ComplexVector &x, const Operator &B, ComplexVector &Bx) -{ - // For SPD B, xᴴ B x is real. 
- B.Mult(x.Real(), Bx.Real()); - B.Mult(x.Imag(), Bx.Imag()); - std::complex dot = Dot(comm, Bx, x); - MFEM_ASSERT(dot.real() > 0.0 && std::abs(dot.imag()) < 1.0e-9 * dot.real(), - "Non-positive vector norm in normalization (dot = " << dot << ")!"); - return std::sqrt(dot.real()); -} - -template <> -void AXPY(double alpha, const Vector &x, Vector &y) -{ - if (alpha == 1.0) - { - y += x; - } - else - { - y.Add(alpha, x); - } -} - -template <> -void AXPY(double alpha, const ComplexVector &x, ComplexVector &y) -{ - y.AXPY(alpha, x); -} - -template <> -void AXPY(std::complex alpha, const ComplexVector &x, ComplexVector &y) -{ - y.AXPY(alpha, x); -} - -template <> -void AXPBY(double alpha, const Vector &x, double beta, Vector &y) -{ - add(alpha, x, beta, y, y); -} - -template <> -void AXPBY(std::complex alpha, const ComplexVector &x, std::complex beta, - ComplexVector &y) -{ - y.AXPBY(alpha, x, beta); -} - -template <> -void AXPBY(double alpha, const ComplexVector &x, double beta, ComplexVector &y) -{ - y.AXPBY(alpha, x, beta); -} - -template <> -void AXPBYPCZ(double alpha, const Vector &x, double beta, const Vector &y, double gamma, - Vector &z) -{ - if (gamma == 0.0) - { - add(alpha, x, beta, y, z); - } - else - { - AXPBY(alpha, x, gamma, z); - z.Add(beta, y); - } -} - -template <> -void AXPBYPCZ(std::complex alpha, const ComplexVector &x, std::complex beta, - const ComplexVector &y, std::complex gamma, ComplexVector &z) -{ - z.AXPBYPCZ(alpha, x, beta, y, gamma); -} - -template <> -void AXPBYPCZ(double alpha, const ComplexVector &x, double beta, const ComplexVector &y, - double gamma, ComplexVector &z) -{ - z.AXPBYPCZ(alpha, x, beta, y, gamma); -} - -} // namespace linalg - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#include "vector.hpp" + +#include +#include +#include +#include "linalg/hypre.hpp" +#include "utils/omp.hpp" + +namespace palace +{ + +ComplexVector::ComplexVector(int size) : xr(size), xi(size) {} + +ComplexVector::ComplexVector(const ComplexVector &y) : ComplexVector(y.Size()) +{ + UseDevice(y.UseDevice()); + Set(y); +} + +ComplexVector::ComplexVector(const Vector &yr, const Vector &yi) : ComplexVector(yr.Size()) +{ + MFEM_ASSERT(yr.Size() == yi.Size(), + "Mismatch in dimension of real and imaginary parts in ComplexVector!"); + UseDevice(yr.UseDevice() || yi.UseDevice()); + Set(yr, yi); +} + +ComplexVector::ComplexVector(const std::complex *py, int size, bool on_dev) + : ComplexVector(size) +{ + Set(py, size, on_dev); +} + +ComplexVector::ComplexVector(Vector &y, int offset, int size) +{ + MakeRef(y, offset, size); +} + +void ComplexVector::UseDevice(bool use_dev) +{ + xr.UseDevice(use_dev); + xi.UseDevice(use_dev); +} + +void ComplexVector::SetSize(int size) +{ + xr.SetSize(size); + xi.SetSize(size); +} + +void ComplexVector::MakeRef(Vector &y, int offset, int size) +{ + MFEM_ASSERT(y.Size() >= offset + 2 * size, + "Insufficient storage for ComplexVector alias reference of the given size!"); + y.ReadWrite(); // Ensure memory is allocated on device before aliasing + xr.MakeRef(y, offset, size); + xi.MakeRef(y, offset + size, size); +} + +void ComplexVector::Set(const ComplexVector &y) +{ + MFEM_ASSERT(y.Size() == Size(), + "Mismatch in dimension of provided parts in ComplexVector!"); + Real() = y.Real(); + Imag() = y.Imag(); +} + +void ComplexVector::Set(const Vector &yr, const Vector &yi) +{ + MFEM_ASSERT(yr.Size() == yi.Size() && yr.Size() == Size(), + "Mismatch in dimension of real and imaginary parts in ComplexVector!"); + Real() = yr; + Imag() = yi; +} + +void ComplexVector::Set(const std::complex *py, int size, bool on_dev) +{ + MFEM_ASSERT(size == Size(), + "Mismatch in dimension for array of std::complex in ComplexVector!"); + auto SetImpl = [this](const double *Y, const int N, bool use_dev) + { + auto *XR = Real().Write(use_dev); + auto *XI = Imag().Write(use_dev); + mfem::forall_switch(use_dev, N, + [=] MFEM_HOST_DEVICE(int i) + { + XR[i] = Y[2 * i]; + XI[i] = Y[2 * i + 1]; + }); + }; + const bool use_dev = UseDevice(); + if (((!use_dev || !mfem::Device::Allows(mfem::Backend::DEVICE_MASK)) && !on_dev) || + (use_dev && mfem::Device::Allows(mfem::Backend::DEVICE_MASK) && on_dev)) + { + // No copy (host pointer and not using device, or device pointer and using device). + SetImpl(reinterpret_cast(py), size, use_dev); + } + else if (!on_dev) + { + // Need copy from host to device (host pointer but using device). 
+ Vector y(2 * size); + y.UseDevice(true); + { + auto *Y = y.HostWrite(); + PalacePragmaOmp(parallel for schedule(static)) + for (int i = 0; i < size; i++) + { + Y[2 * i] = py[i].real(); + Y[2 * i + 1] = py[i].imag(); + } + } + SetImpl(y.Read(use_dev), size, use_dev); + } + else + { + MFEM_ABORT("ComplexVector::Set using a device pointer is not implemented when MFEM is " + "not configured to use the device!"); + } +} + +void ComplexVector::Get(std::complex *py, int size, bool on_dev) const +{ + MFEM_ASSERT(size == Size(), + "Mismatch in dimension for array of std::complex in ComplexVector!"); + auto GetImpl = [this](double *Y, const int N, bool use_dev) + { + const auto *XR = Real().Read(use_dev); + const auto *XI = Imag().Read(use_dev); + mfem::forall_switch(use_dev, N, + [=] MFEM_HOST_DEVICE(int i) + { + Y[2 * i] = XR[i]; + Y[2 * i + 1] = XI[i]; + }); + }; + const bool use_dev = UseDevice(); + if (((!use_dev || !mfem::Device::Allows(mfem::Backend::DEVICE_MASK)) && !on_dev) || + (use_dev && mfem::Device::Allows(mfem::Backend::DEVICE_MASK) && on_dev)) + { + // No copy (host pointer and not using device, or device pointer and using device). + GetImpl(reinterpret_cast(py), size, use_dev); + } + else if (!on_dev) + { + // Need copy from device to host (host pointer but using device). + const auto *XR = Real().HostRead(); + const auto *XI = Imag().HostRead(); + PalacePragmaOmp(parallel for schedule(static)) + for (int i = 0; i < size; i++) + { + py[i].real(XR[i]); + py[i].imag(XI[i]); + } + } + else + { + MFEM_ABORT("ComplexVector::Get using a device pointer is not implemented when MFEM is " + "not configured to use the device!"); + } +} + +ComplexVector &ComplexVector::operator=(std::complex s) +{ + Real() = s.real(); + Imag() = s.imag(); + return *this; +} + +void ComplexVector::SetBlocks(const std::vector &y, + const std::vector> &s) +{ + MFEM_ASSERT(s.empty() || y.size() == s.size(), + "Mismatch in dimension of vector blocks and scaling coefficients!"); + auto *XR = Real().Write(); + auto *XI = Imag().Write(); + int offset = 0; + for (std::size_t b = 0; b < y.size(); b++) + { + MFEM_VERIFY(y[b] && ((b < y.size() - 1 && offset + y[b]->Size() < Size()) || + (b == y.size() - 1 && offset + y[b]->Size() == Size())), + "Mismatch between sum of block dimensions and parent vector dimension!"); + const double sr = s.empty() ? 1.0 : s[b].real(); + const double si = s.empty() ? 
0.0 : s[b].imag(); + const bool use_dev = UseDevice() || y[b]->UseDevice(); + const int N = y[b]->Size(); + const auto *YR = y[b]->Real().Read(); + const auto *YI = y[b]->Imag().Read(); + mfem::forall_switch(use_dev, N, + [=] MFEM_HOST_DEVICE(int i) + { + XR[i] = sr * YR[i] - si * YI[i]; + XI[i] = si * YR[i] + sr * YI[i]; + }); + XR += N; + XI += N; + offset += N; + } +} + +ComplexVector &ComplexVector::operator*=(std::complex s) +{ + const double sr = s.real(); + const double si = s.imag(); + if (si == 0.0) + { + Real() *= sr; + Imag() *= sr; + } + else + { + const bool use_dev = UseDevice(); + const int N = Size(); + auto *XR = Real().ReadWrite(use_dev); + auto *XI = Imag().ReadWrite(use_dev); + mfem::forall_switch(use_dev, N, + [=] MFEM_HOST_DEVICE(int i) + { + const auto t = si * XR[i] + sr * XI[i]; + XR[i] = sr * XR[i] - si * XI[i]; + XI[i] = t; + }); + } + return *this; +} + +void ComplexVector::Conj() +{ + Imag() *= -1.0; +} + +void ComplexVector::Abs() +{ + const bool use_dev = UseDevice(); + const int N = Size(); + auto *XR = Real().ReadWrite(use_dev); + auto *XI = Imag().ReadWrite(use_dev); + mfem::forall_switch(use_dev, N, + [=] MFEM_HOST_DEVICE(int i) + { + XR[i] = std::sqrt(XR[i] * XR[i] + XI[i] * XI[i]); + XI[i] = 0.0; + }); +} + +void ComplexVector::Reciprocal() +{ + const bool use_dev = UseDevice(); + const int N = Size(); + auto *XR = Real().ReadWrite(use_dev); + auto *XI = Imag().ReadWrite(use_dev); + mfem::forall_switch(use_dev, N, + [=] MFEM_HOST_DEVICE(int i) + { + const auto s = 1.0 / (XR[i] * XR[i] + XI[i] * XI[i]); + XR[i] *= s; + XI[i] *= -s; + }); +} + +std::complex ComplexVector::Dot(const ComplexVector &y) const +{ + return {(Real() * y.Real()) + (Imag() * y.Imag()), + (this == &y) ? 0.0 : ((Imag() * y.Real()) - (Real() * y.Imag()))}; +} + +std::complex ComplexVector::TransposeDot(const ComplexVector &y) const +{ + return {(Real() * y.Real()) - (Imag() * y.Imag()), + (this == &y) ? 
(2.0 * (Imag() * y.Real())) + : ((Imag() * y.Real()) + (Real() * y.Imag()))}; +} + +void ComplexVector::AXPY(std::complex alpha, const ComplexVector &x) +{ + AXPY(alpha, x.Real(), x.Imag(), Real(), Imag()); +} + +void ComplexVector::AXPY(std::complex alpha, const Vector &xr, const Vector &xi, + Vector &yr, Vector &yi) +{ + const bool use_dev = yr.UseDevice() || xr.UseDevice(); + const int N = yr.Size(); + const double ar = alpha.real(); + const double ai = alpha.imag(); + const auto *XR = xr.Read(use_dev); + const auto *XI = xi.Read(use_dev); + auto *YR = yr.ReadWrite(use_dev); + auto *YI = yi.ReadWrite(use_dev); + if (ai == 0.0) + { + mfem::forall_switch(use_dev, N, + [=] MFEM_HOST_DEVICE(int i) + { + YR[i] += ar * XR[i]; + YI[i] += ar * XI[i]; + }); + } + else + { + mfem::forall_switch(use_dev, N, + [=] MFEM_HOST_DEVICE(int i) + { + const auto t = ai * XR[i] + ar * XI[i]; + YR[i] += ar * XR[i] - ai * XI[i]; + YI[i] += t; + }); + } +} + +void ComplexVector::AXPBY(std::complex alpha, const ComplexVector &x, + std::complex beta) +{ + AXPBY(alpha, x.Real(), x.Imag(), beta, Real(), Imag()); +} + +void ComplexVector::AXPBY(std::complex alpha, const Vector &xr, const Vector &xi, + std::complex beta, Vector &yr, Vector &yi) +{ + const bool use_dev = yr.UseDevice() || xr.UseDevice(); + const int N = yr.Size(); + const double ar = alpha.real(); + const double ai = alpha.imag(); + const auto *XR = xr.Read(use_dev); + const auto *XI = xi.Read(use_dev); + if (beta == 0.0) + { + auto *YR = yr.Write(use_dev); + auto *YI = yi.Write(use_dev); + if (ai == 0.0) + { + mfem::forall_switch(use_dev, N, + [=] MFEM_HOST_DEVICE(int i) + { + YR[i] = ar * XR[i]; + YI[i] = ar * XI[i]; + }); + } + else + { + mfem::forall_switch(use_dev, N, + [=] MFEM_HOST_DEVICE(int i) + { + const auto t = ai * XR[i] + ar * XI[i]; + YR[i] = ar * XR[i] - ai * XI[i]; + YI[i] = t; + }); + } + } + else + { + const double br = beta.real(); + const double bi = beta.imag(); + auto *YR = yr.ReadWrite(use_dev); + auto *YI = yi.ReadWrite(use_dev); + if (ai == 0.0 && bi == 0.0) + { + mfem::forall_switch(use_dev, N, + [=] MFEM_HOST_DEVICE(int i) + { + YR[i] = ar * XR[i] + br * YR[i]; + YI[i] = ar * XI[i] + br * YI[i]; + }); + } + else + { + mfem::forall_switch(use_dev, N, + [=] MFEM_HOST_DEVICE(int i) + { + const auto t = + ai * XR[i] + ar * XI[i] + bi * YR[i] + br * YI[i]; + YR[i] = ar * XR[i] - ai * XI[i] + br * YR[i] - bi * YI[i]; + YI[i] = t; + }); + } + } +} + +void ComplexVector::AXPBYPCZ(std::complex alpha, const ComplexVector &x, + std::complex beta, const ComplexVector &y, + std::complex gamma) +{ + AXPBYPCZ(alpha, x.Real(), x.Imag(), beta, y.Real(), y.Imag(), gamma, Real(), Imag()); +} + +void ComplexVector::AXPBYPCZ(std::complex alpha, const Vector &xr, const Vector &xi, + std::complex beta, const Vector &yr, const Vector &yi, + std::complex gamma, Vector &zr, Vector &zi) +{ + const bool use_dev = zr.UseDevice() || xr.UseDevice() || yr.UseDevice(); + const int N = zr.Size(); + const double ar = alpha.real(); + const double ai = alpha.imag(); + const double br = beta.real(); + const double bi = beta.imag(); + const auto *XR = xr.Read(use_dev); + const auto *XI = xi.Read(use_dev); + const auto *YR = yr.Read(use_dev); + const auto *YI = yi.Read(use_dev); + if (gamma == 0.0) + { + auto *ZR = zr.Write(use_dev); + auto *ZI = zi.Write(use_dev); + if (ai == 0.0 && bi == 0.0) + { + mfem::forall_switch(use_dev, N, + [=] MFEM_HOST_DEVICE(int i) + { + ZR[i] = ar * XR[i] + br * YR[i]; + ZI[i] = ar * XI[i] + br * YI[i]; + }); + } + else + { + 
mfem::forall_switch(use_dev, N, + [=] MFEM_HOST_DEVICE(int i) + { + const auto t = + ai * XR[i] + ar * XI[i] + bi * YR[i] + br * YI[i]; + ZR[i] = ar * XR[i] - ai * XI[i] + br * YR[i] - bi * YI[i]; + ZI[i] = t; + }); + } + } + else + { + const double gr = gamma.real(); + const double gi = gamma.imag(); + auto *ZR = zr.ReadWrite(use_dev); + auto *ZI = zi.ReadWrite(use_dev); + if (ai == 0.0 && bi == 0.0 && gi == 0.0) + { + mfem::forall_switch(use_dev, N, + [=] MFEM_HOST_DEVICE(int i) + { + ZR[i] = ar * XR[i] + br * YR[i] + gr * ZR[i]; + ZI[i] = ar * XI[i] + br * YI[i] + gr * ZI[i]; + }); + } + else + { + mfem::forall_switch(use_dev, N, + [=] MFEM_HOST_DEVICE(int i) + { + const auto t = ai * XR[i] + ar * XI[i] + bi * YR[i] + + br * YI[i] + gi * ZR[i] + gr * ZI[i]; + ZR[i] = ar * XR[i] - ai * XI[i] + br * YR[i] - bi * YI[i] + + gr * ZR[i] - gi * ZI[i]; + ZI[i] = t; + }); + } + } +} + +namespace linalg +{ + +template <> +void SetSubVector(Vector &x, const mfem::Array &rows, double s) +{ + const bool use_dev = x.UseDevice(); + const int N = rows.Size(); + const double sr = s; + const auto *idx = rows.Read(use_dev); + auto *X = x.ReadWrite(use_dev); + mfem::forall_switch(use_dev, N, + [=] MFEM_HOST_DEVICE(int i) + { + const auto id = idx[i]; + X[id] = sr; + }); +} + +template <> +void SetSubVector(ComplexVector &x, const mfem::Array &rows, double s) +{ + const bool use_dev = x.UseDevice(); + const int N = rows.Size(); + const double sr = s; + const auto *idx = rows.Read(use_dev); + auto *XR = x.Real().ReadWrite(use_dev); + auto *XI = x.Imag().ReadWrite(use_dev); + mfem::forall_switch(use_dev, N, + [=] MFEM_HOST_DEVICE(int i) + { + const int id = idx[i]; + XR[id] = sr; + XI[id] = 0.0; + }); +} + +template <> +void SetSubVector(Vector &x, const mfem::Array &rows, const Vector &y) +{ + const bool use_dev = x.UseDevice(); + const int N = rows.Size(); + const auto *idx = rows.Read(use_dev); + const auto *Y = y.Read(use_dev); + auto *X = x.ReadWrite(use_dev); + mfem::forall_switch(use_dev, N, + [=] MFEM_HOST_DEVICE(int i) + { + const int id = idx[i]; + X[id] = Y[id]; + }); +} + +template <> +void SetSubVector(ComplexVector &x, const mfem::Array &rows, const ComplexVector &y) +{ + const bool use_dev = x.UseDevice(); + const int N = rows.Size(); + const auto *idx = rows.Read(use_dev); + const auto *YR = y.Real().Read(use_dev); + const auto *YI = y.Imag().Read(use_dev); + auto *XR = x.Real().ReadWrite(use_dev); + auto *XI = x.Imag().ReadWrite(use_dev); + mfem::forall_switch(use_dev, N, + [=] MFEM_HOST_DEVICE(int i) + { + const int id = idx[i]; + XR[id] = YR[id]; + XI[id] = YI[id]; + }); +} + +template <> +void SetSubVector(Vector &x, int start, const Vector &y) +{ + const bool use_dev = x.UseDevice(); + const int N = y.Size(); + MFEM_ASSERT(start >= 0 && start + N <= x.Size(), "Invalid range for SetSubVector!"); + const auto *Y = y.Read(use_dev); + auto *X = x.ReadWrite(use_dev); + mfem::forall_switch(use_dev, N, + [=] MFEM_HOST_DEVICE(int i) + { + const int id = start + i; + X[id] = Y[i]; + }); +} + +template <> +void SetSubVector(ComplexVector &x, int start, const ComplexVector &y) +{ + const bool use_dev = x.UseDevice(); + const int N = y.Size(); + MFEM_ASSERT(start >= 0 && start + N <= x.Size(), "Invalid range for SetSubVector!"); + const auto *YR = y.Real().Read(use_dev); + const auto *YI = y.Imag().Read(use_dev); + auto *XR = x.Real().ReadWrite(use_dev); + auto *XI = x.Imag().ReadWrite(use_dev); + mfem::forall_switch(use_dev, N, + [=] MFEM_HOST_DEVICE(int i) + { + const int id = start + i; + XR[id] = 
YR[i]; + XI[id] = YI[i]; + }); +} + +template <> +void SetSubVector(Vector &x, int start, int end, double s) +{ + const bool use_dev = x.UseDevice(); + MFEM_ASSERT(start >= 0 && end <= x.Size() && start <= end, + "Invalid range for SetSubVector!"); + const int N = end - start; + const double sr = s; + auto *X = x.ReadWrite(use_dev) + start; + mfem::forall_switch(use_dev, N, [=] MFEM_HOST_DEVICE(int i) { X[i] = sr; }); +} + +template <> +void SetSubVector(ComplexVector &x, int start, int end, double s) +{ + const bool use_dev = x.UseDevice(); + MFEM_ASSERT(start >= 0 && end <= x.Size() && start <= end, + "Invalid range for SetSubVector!"); + const int N = end - start; + const double sr = s; + auto *XR = x.Real().ReadWrite(use_dev) + start; + auto *XI = x.Imag().ReadWrite(use_dev) + start; + mfem::forall_switch(use_dev, N, + [=] MFEM_HOST_DEVICE(int i) + { + XR[i] = sr; + XI[i] = 0.0; + }); +} + +template <> +void SetRandom(MPI_Comm comm, Vector &x, int seed) +{ + if (seed == 0) + { + std::vector seeds(1); + std::seed_seq seed_gen{Mpi::Rank(comm)}; + seed_gen.generate(seeds.begin(), seeds.end()); + seed = static_cast(seeds[0]); + } + x.Randomize(seed); // On host always +} + +template <> +void SetRandomReal(MPI_Comm comm, Vector &x, int seed) +{ + SetRandom(comm, x, seed); +} + +template <> +void SetRandomSign(MPI_Comm comm, Vector &x, int seed) +{ + SetRandom(comm, x, seed); + const bool use_dev = x.UseDevice(); + const int N = x.Size(); + auto *X = x.ReadWrite(use_dev); + mfem::forall_switch(use_dev, N, [=] MFEM_HOST_DEVICE(int i) + { X[i] = (X[i] > 0.0) ? 1.0 : ((X[i] < 0.0) ? -1.0 : 0.0); }); +} + +template <> +void SetRandom(MPI_Comm comm, ComplexVector &x, int seed) +{ + if (seed == 0) + { + std::vector seeds(2); + std::seed_seq seed_gen{2 * Mpi::Rank(comm), 2 * Mpi::Rank(comm) + 1}; + seed_gen.generate(seeds.begin(), seeds.end()); + SetRandom(comm, x.Real(), static_cast(seeds[0])); + SetRandom(comm, x.Imag(), static_cast(seeds[1])); + } + else + { + SetRandom(comm, x.Real(), seed); + SetRandom(comm, x.Imag(), seed); + } +} + +template <> +void SetRandomReal(MPI_Comm comm, ComplexVector &x, int seed) +{ + SetRandom(comm, x.Real(), seed); + x.Imag() = 0.0; +} + +template <> +void SetRandomSign(MPI_Comm comm, ComplexVector &x, int seed) +{ + SetRandom(comm, x, seed); + const bool use_dev = x.UseDevice(); + const int N = x.Size(); + auto *XR = x.Real().ReadWrite(use_dev); + auto *XI = x.Imag().ReadWrite(use_dev); + mfem::forall_switch(use_dev, N, + [=] MFEM_HOST_DEVICE(int i) + { + XR[i] = (XR[i] > 0.0) ? 1.0 : ((XR[i] < 0.0) ? -1.0 : 0.0); + XI[i] = (XI[i] > 0.0) ? 1.0 : ((XI[i] < 0.0) ? -1.0 : 0.0); + }); +} + +double LocalDot(const Vector &x, const Vector &y) +{ + static hypre::HypreVector X, Y; + MFEM_ASSERT(x.Size() == y.Size(), "Size mismatch for vector inner product!"); + X.Update(x); + Y.Update(y); + return hypre_SeqVectorInnerProd(X, Y); +} + +std::complex LocalDot(const ComplexVector &x, const ComplexVector &y) +{ + if (&x == &y) + { + return {LocalDot(x.Real(), y.Real()) + LocalDot(x.Imag(), y.Imag()), 0.0}; + } + else + { + return {LocalDot(x.Real(), y.Real()) + LocalDot(x.Imag(), y.Imag()), + LocalDot(x.Imag(), y.Real()) - LocalDot(x.Real(), y.Imag())}; + } +} + +// We implement LocalSum using Hypre instead of using MFEM's Sum because it is +// more efficient on GPUs. 
TODO: Verify this +double LocalSum(const Vector &x) +{ + static hypre::HypreVector X; + X.Update(x); + return hypre_SeqVectorSumElts(X); +} + +std::complex LocalSum(const ComplexVector &x) +{ + return {LocalSum(x.Real()), LocalSum(x.Imag())}; +} + +template <> +void AXPY(double alpha, const Vector &x, Vector &y) +{ + if (alpha == 1.0) + { + y += x; + } + else + { + y.Add(alpha, x); + } +} + +template <> +void AXPY(double alpha, const ComplexVector &x, ComplexVector &y) +{ + y.AXPY(alpha, x); +} + +template <> +void AXPY(std::complex alpha, const ComplexVector &x, ComplexVector &y) +{ + y.AXPY(alpha, x); +} + +template <> +void AXPBY(double alpha, const Vector &x, double beta, Vector &y) +{ + add(alpha, x, beta, y, y); +} + +template <> +void AXPBY(std::complex alpha, const ComplexVector &x, std::complex beta, + ComplexVector &y) +{ + y.AXPBY(alpha, x, beta); +} + +template <> +void AXPBY(double alpha, const ComplexVector &x, double beta, ComplexVector &y) +{ + y.AXPBY(alpha, x, beta); +} + +template <> +void AXPBYPCZ(double alpha, const Vector &x, double beta, const Vector &y, double gamma, + Vector &z) +{ + if (gamma == 0.0) + { + add(alpha, x, beta, y, z); + } + else + { + AXPBY(alpha, x, gamma, z); + z.Add(beta, y); + } +} + +template <> +void AXPBYPCZ(std::complex alpha, const ComplexVector &x, std::complex beta, + const ComplexVector &y, std::complex gamma, ComplexVector &z) +{ + z.AXPBYPCZ(alpha, x, beta, y, gamma); +} + +template <> +void AXPBYPCZ(double alpha, const ComplexVector &x, double beta, const ComplexVector &y, + double gamma, ComplexVector &z) +{ + z.AXPBYPCZ(alpha, x, beta, y, gamma); +} + +void Sqrt(Vector &x, double s) +{ + const bool use_dev = x.UseDevice(); + const int N = x.Size(); + auto *X = x.ReadWrite(use_dev); + mfem::forall_switch(use_dev, N, + [=] MFEM_HOST_DEVICE(int i) { X[i] = std::sqrt(X[i] * s); }); +} + +} // namespace linalg + +} // namespace palace diff --git a/palace/linalg/vector.hpp b/palace/linalg/vector.hpp index cb002f3a5f..ade6519e2f 100644 --- a/palace/linalg/vector.hpp +++ b/palace/linalg/vector.hpp @@ -1,205 +1,343 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_LINALG_VECTOR_HPP -#define PALACE_LINALG_VECTOR_HPP - -#include -#include -#include "utils/communication.hpp" - -namespace palace -{ - -using Operator = mfem::Operator; -using Vector = mfem::Vector; - -// -// Functionality extending mfem::Vector from MFEM, including basic functions for parallel -// vectors distributed across MPI processes. -// - -// A complex-valued vector represented as two real vectors, one for each component. -class ComplexVector -{ -private: - Vector x, xr, xi; - -public: - // Create a vector with the given size. - ComplexVector(int n = 0); - - // Copy constructor. - ComplexVector(const ComplexVector &y); - - // Copy constructor from separately provided real and imaginary parts. - ComplexVector(const Vector &yr, const Vector &yi); - - // Copy constructor from an array of complex values. - ComplexVector(const std::complex *py, int n); - - // Return the size of the vector. - int Size() const { return x.Size() / 2; } - - // Set the size of the vector. See the notes for Vector::SetSize for behavior in the cases - // where n is less than or greater than Size() or Capacity(). - void SetSize(int n); - - // Get access to the real and imaginary vector parts. 
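Not part of the patch: a sketch of how the new rank-local helpers compose into a global reduction, assuming Mpi::GlobalSum keeps the (count, pointer, communicator) signature used elsewhere in these files and that LocalDot is exposed alongside the other linalg helpers.

#include <mpi.h>
#include "linalg/vector.hpp"
#include "utils/communication.hpp"

// Hypothetical helper for illustration only.
inline double GlobalDot(MPI_Comm comm, const palace::Vector &x, const palace::Vector &y)
{
  double dot = palace::linalg::LocalDot(x, y);  // Rank-local inner product through Hypre
  palace::Mpi::GlobalSum(1, &dot, comm);        // Accumulate the contributions across MPI processes
  return dot;
}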
- const Vector &Real() const { return xr; } - Vector &Real() { return xr; } - const Vector &Imag() const { return xi; } - Vector &Imag() { return xi; } - - // Set from a ComplexVector, without resizing. - ComplexVector &operator=(const ComplexVector &y) { return Set(y); } - ComplexVector &Set(const ComplexVector &y) - { - Set(y.Real(), y.Imag()); - return *this; - } - - // Set from separately provided real and imaginary parts, without resizing. - void Set(const Vector &yr, const Vector &yi); - - // Set from an array of complex values, without resizing. - void Set(const std::complex *py, int n); - - // Copy the vector into an array of complex values. - void Get(std::complex *py, int n) const; - - // Set all entries equal to s. - ComplexVector &operator=(std::complex s); - ComplexVector &operator=(double s) - { - *this = std::complex(s, 0.0); - return *this; - } - - // Scale all entries by s. - ComplexVector &operator*=(std::complex s); - - // Replace entries with their complex conjugate. - void Conj(); - - // Replace entries with their absolute value. - void Abs(); - - // Set all entries to their reciprocal. - void Reciprocal(); - - // Vector dot product (yᴴ x) or indefinite dot product (yᵀ x) for complex vectors. - std::complex Dot(const ComplexVector &y) const; - std::complex TransposeDot(const ComplexVector &y) const; - std::complex operator*(const ComplexVector &y) const { return Dot(y); } - - // In-place addition (*this) += alpha * x. - void AXPY(std::complex alpha, const ComplexVector &x); - void Add(std::complex alpha, const ComplexVector &x) { AXPY(alpha, x); } - ComplexVector &operator+=(const ComplexVector &x) - { - AXPY(1.0, x); - return *this; - } - - // In-place addition (*this) = alpha * x + beta * (*this). - void AXPBY(std::complex alpha, const ComplexVector &x, std::complex beta); - - // In-place addition (*this) = alpha * x + beta * y + gamma * (*this). - void AXPBYPCZ(std::complex alpha, const ComplexVector &x, - std::complex beta, const ComplexVector &y, - std::complex gamma); -}; - -namespace linalg -{ - -// Returns the global vector size. -template -inline HYPRE_BigInt GlobalSize(MPI_Comm comm, const VecType &x) -{ - HYPRE_BigInt N = x.Size(); - Mpi::GlobalSum(1, &N, comm); - return N; -} - -// Returns the global vector size for two vectors. -template -inline std::pair GlobalSize2(MPI_Comm comm, const VecType1 &x1, - const VecType2 &x2) -{ - HYPRE_BigInt N[2] = {x1.Size(), x2.Size()}; - Mpi::GlobalSum(2, N, comm); - return {N[0], N[1]}; -} - -// Sets all entries of the vector corresponding to the given indices to the given (real) -// value. -template -void SetSubVector(VecType &x, const mfem::Array &rows, double s); -template -void SetSubVector(VecType &x, const mfem::Array &rows, const VecType &y); - -// Sets all entries of the vector to random numbers sampled from the [-1, 1] or [-1 - 1i, -// 1 + 1i] for complex-valued vectors. -template -void SetRandom(MPI_Comm comm, VecType &x, int seed = 0); -template -void SetRandomReal(MPI_Comm comm, VecType &x, int seed = 0); -template -void SetRandomSign(MPI_Comm comm, VecType &x, int seed = 0); - -// Calculate the inner product yᴴ x or yᵀ x. -template -inline auto Dot(MPI_Comm comm, const VecType &x, const VecType &y) -{ - auto dot = x * y; - Mpi::GlobalSum(1, &dot, comm); - return dot; -} - -// Calculate the vector 2-norm. 
-template -inline double Norml2(MPI_Comm comm, const VecType &x) -{ - return std::sqrt(std::abs(Dot(comm, x, x))); -} -template -double Norml2(MPI_Comm comm, const VecType &x, const Operator &B, VecType &Bx); - -// Normalize the vector, possibly with respect to an SPD matrix B. -template -inline double Normalize(MPI_Comm comm, VecType &x) -{ - double norm = Norml2(comm, x); - MFEM_ASSERT(norm > 0.0, "Zero vector norm in normalization!"); - x *= 1.0 / norm; - return norm; -} -template -inline double Normalize(MPI_Comm comm, VecType &x, const Operator &B, VecType &Bx) -{ - double norm = Norml2(comm, x, B, Bx); - MFEM_ASSERT(norm > 0.0, "Zero vector norm in normalization!"); - x *= 1.0 / norm; - return norm; -} - -// Addition y += alpha * x. -template -void AXPY(ScalarType alpha, const VecType &x, VecType &y); - -// Addition y = alpha * x + beta * y. -template -void AXPBY(ScalarType alpha, const VecType &x, ScalarType beta, VecType &y); - -// Addition z = alpha * x + beta * y + gamma * z. -template -void AXPBYPCZ(ScalarType alpha, const VecType &x, ScalarType beta, const VecType &y, - ScalarType gamma, VecType &z); - -} // namespace linalg - -} // namespace palace - -#endif // PALACE_LINALG_VECTOR_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_LINALG_VECTOR_HPP +#define PALACE_LINALG_VECTOR_HPP + +#include +#include +#include +#include "utils/communication.hpp" + +namespace palace +{ + +// +// Functionality extending mfem::Vector from MFEM, including basic functions for parallel +// vectors distributed across MPI processes. +// + +using Vector = mfem::Vector; + +// A complex-valued vector represented as two real vectors, one for each component. +class ComplexVector +{ +private: + Vector xr, xi; + +public: + // Create a vector with the given size. + ComplexVector(int size = 0); + + // Copy constructor. + ComplexVector(const ComplexVector &y); + + // Copy constructor from separately provided real and imaginary parts. + ComplexVector(const Vector &yr, const Vector &yi); + + // Copy constructor from an array of complex values. + ComplexVector(const std::complex *py, int size, bool on_dev); + + // Create a vector referencing the memory of another vector, at the given base offset and + // size. + ComplexVector(Vector &y, int offset, int size); + + // Flag for runtime execution on the mfem::Device. See the documentation for mfem::Vector. + void UseDevice(bool use_dev); + bool UseDevice() const { return xr.UseDevice(); } + + // Return the size of the vector. + int Size() const { return xr.Size(); } + + // Set the size of the vector. See the notes for Vector::SetSize for behavior in the cases + // where the new size is less than or greater than Size() or Capacity(). + void SetSize(int size); + + // Set this vector to reference the memory of another vector, at the given base offset and + // size. + void MakeRef(Vector &y, int offset, int size); + + // Get access to the real and imaginary vector parts. + const Vector &Real() const { return xr; } + Vector &Real() { return xr; } + const Vector &Imag() const { return xi; } + Vector &Imag() { return xi; } + + // Set from a ComplexVector, without resizing. + void Set(const ComplexVector &y); + ComplexVector &operator=(const ComplexVector &y) + { + Set(y); + return *this; + } + + // Set from separately provided real and imaginary parts, without resizing. + void Set(const Vector &yr, const Vector &yi); + + // Set from an array of complex values, without resizing. 
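A brief usage sketch for the new block-view constructor and MakeRef follows (the header continues below). The layout of the referenced block, real part first and then imaginary part, is an assumption inferred from the offset/size arguments rather than stated in the header.

```cpp
#include <complex>
#include <mfem.hpp>
#include "linalg/vector.hpp"

void ComplexViewExample()
{
  mfem::Vector storage(2 * 100);
  storage = 0.0;
  // View into the existing storage, no copy (assumed layout: [real | imag]).
  palace::ComplexVector z(storage, 0, 100);
  z.Real() = 1.0;
  z.Imag() = -1.0;
  z *= std::complex<double>(0.0, 1.0);  // multiply by i: (1 - i) becomes (1 + i)
}
```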
+ void Set(const std::complex *py, int size, bool on_dev); + + // Copy the vector into an array of complex values. + void Get(std::complex *py, int size, bool on_dev) const; + + // Set all entries equal to s. + ComplexVector &operator=(std::complex s); + ComplexVector &operator=(double s) + { + *this = std::complex(s, 0.0); + return *this; + } + + // Set the vector from an array of blocks and coefficients, without resizing. + void SetBlocks(const std::vector &y, + const std::vector> &s); + + // Scale all entries by s. + ComplexVector &operator*=(std::complex s); + + // Replace entries with their complex conjugate. + void Conj(); + + // Replace entries with their absolute value. + void Abs(); + + // Set all entries to their reciprocal. + void Reciprocal(); + + // Vector dot product (yᴴ x) or indefinite dot product (yᵀ x) for complex vectors. + std::complex Dot(const ComplexVector &y) const; + std::complex TransposeDot(const ComplexVector &y) const; + std::complex operator*(const ComplexVector &y) const { return Dot(y); } + + // In-place addition (*this) += alpha * x. + void AXPY(std::complex alpha, const ComplexVector &x); + void Add(std::complex alpha, const ComplexVector &x) { AXPY(alpha, x); } + void Subtract(std::complex alpha, const ComplexVector &x) { AXPY(-alpha, x); } + ComplexVector &operator+=(const ComplexVector &x) + { + AXPY(1.0, x); + return *this; + } + ComplexVector &operator-=(const ComplexVector &x) + { + AXPY(-1.0, x); + return *this; + } + + // In-place addition (*this) = alpha * x + beta * (*this). + void AXPBY(std::complex alpha, const ComplexVector &x, std::complex beta); + + // In-place addition (*this) = alpha * x + beta * y + gamma * (*this). + void AXPBYPCZ(std::complex alpha, const ComplexVector &x, + std::complex beta, const ComplexVector &y, + std::complex gamma); + + static void AXPY(std::complex alpha, const Vector &xr, const Vector &xi, + Vector &yr, Vector &yi); + + static void AXPBY(std::complex alpha, const Vector &xr, const Vector &xi, + std::complex beta, Vector &yr, Vector &yi); + + static void AXPBYPCZ(std::complex alpha, const Vector &xr, const Vector &xi, + std::complex beta, const Vector &yr, const Vector &yi, + std::complex gamma, Vector &zr, Vector &zi); +}; + +// A stack-allocated vector with compile-time fixed size. +// +// StaticVector provides a Vector interface backed by stack memory instead of +// heap allocation. The size N is fixed at compile time, making it suitable for +// small vectors where performance and avoiding dynamic allocation are +// important. +// +// Template parameters: +// - N: The fixed size of the vector (number of elements) +// +// Notes: +// - Inherits from mfem::Vector, so can be used anywhere Vector is expected. +// - Memory is automatically managed (no new/delete needed). +// - Faster than dynamic Vector for small sizes due to stack allocation. +// +// Example usage: +// +// StaticVector<3> vec; // 3D vector on stack +// vec[0] = 1.0; +// vec[1] = 2.0; +// vec[2] = 3.0; +// +// vec.Sum(); +// +// You can also create StaticComplexVectors: +// +// StaticVector<3> vec_real, vec_imag; +// ComplexVector complex_vec(vec_real, vec_imag); +template +class StaticVector : public Vector +{ +private: + double buff[N]; + +public: + StaticVector() : Vector() { SetDataAndSize(buff, N); } + + ~StaticVector() + { + MFEM_ASSERT(GetData() == buff, + "Buffer of StaticVector changed. This indicates a possible bug."); + MFEM_ASSERT(Size() == N, "Size of StaticVector changed. 
This indicates a possible bug.") + } + + using Vector::operator=; // Extend the implicitly defined assignment operators +}; + +namespace linalg +{ + +// Returns the global vector size. +template +inline HYPRE_BigInt GlobalSize(MPI_Comm comm, const VecType &x) +{ + HYPRE_BigInt N = x.Size(); + Mpi::GlobalSum(1, &N, comm); + return N; +} + +// Returns the global vector size for two vectors. +template +inline std::pair GlobalSize2(MPI_Comm comm, const VecType1 &x1, + const VecType2 &x2) +{ + HYPRE_BigInt N[2] = {x1.Size(), x2.Size()}; + Mpi::GlobalSum(2, N, comm); + return {N[0], N[1]}; +} + +// Sets all entries of the vector corresponding to the given indices to the given (real) +// value or vector of values. +template +void SetSubVector(VecType &x, const mfem::Array &rows, double s); +template +void SetSubVector(VecType &x, const mfem::Array &rows, const VecType &y); + +// Sets contiguous entries from start to the given vector. +template +void SetSubVector(VecType &x, int start, const VecType &y); + +// Sets all entries in the range [start, end) to the given value. +template +void SetSubVector(VecType &x, int start, int end, double s); + +// Sets all entries of the vector to random numbers sampled from the [-1, 1] or [-1 - 1i, +// 1 + 1i] for complex-valued vectors. +template +void SetRandom(MPI_Comm comm, VecType &x, int seed = 0); +template +void SetRandomReal(MPI_Comm comm, VecType &x, int seed = 0); +template +void SetRandomSign(MPI_Comm comm, VecType &x, int seed = 0); + +// Calculate the local inner product yᴴ x or yᵀ x. +double LocalDot(const Vector &x, const Vector &y); +std::complex LocalDot(const ComplexVector &x, const ComplexVector &y); + +// Calculate the parallel inner product yᴴ x or yᵀ x. +template +inline auto Dot(MPI_Comm comm, const VecType &x, const VecType &y) +{ + auto dot = LocalDot(x, y); + Mpi::GlobalSum(1, &dot, comm); + return dot; +} + +// Calculate the vector 2-norm. +template +inline auto Norml2(MPI_Comm comm, const VecType &x) +{ + return std::sqrt(std::abs(Dot(comm, x, x))); +} + +// Normalize the vector, possibly with respect to an SPD matrix B. +template +inline auto Normalize(MPI_Comm comm, VecType &x) +{ + auto norm = Norml2(comm, x); + MFEM_ASSERT(norm > 0.0, "Zero vector norm in normalization!"); + x *= 1.0 / norm; + return norm; +} + +// Calculate the local sum of all elements in the vector. +double LocalSum(const Vector &x); +std::complex LocalSum(const ComplexVector &x); + +// Calculate the sum of all elements in the vector. +template +inline auto Sum(MPI_Comm comm, const VecType &x) +{ + auto sum = LocalSum(x); + Mpi::GlobalSum(1, &sum, comm); + return sum; +} + +// Calculate the mean of all elements in the vector. +template +inline auto Mean(MPI_Comm comm, const VecType &x) +{ + using ScalarType = typename std::conditional::value, + std::complex, double>::type; + ScalarType sum[2] = {LocalSum(x), ScalarType(x.Size())}; + Mpi::GlobalSum(2, sum, comm); + return sum[0] / sum[1]; +} + +// Normalize a complex vector so its mean is on the positive real axis. +// Returns the original mean phase. +inline double NormalizePhase(MPI_Comm comm, ComplexVector &x) +{ + std::complex mean = Mean(comm, x); + x *= std::conj(mean) / std::abs(mean); + return std::atan2(mean.imag(), mean.real()); +} + +// Addition y += alpha * x. +template +void AXPY(ScalarType alpha, const VecType &x, VecType &y); + +// Addition y = alpha * x + beta * y. 
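The Mean/NormalizePhase helpers added further down in this header rotate a complex vector by the conjugate phase of its mean so that the mean lands on the positive real axis. A small self-contained sketch of that arithmetic, using plain std::complex and independent of Palace:

```cpp
#include <cmath>
#include <complex>
#include <iostream>
#include <numeric>
#include <vector>

int main()
{
  std::vector<std::complex<double>> x = {{0.0, 1.0}, {0.0, 2.0}, {0.0, 3.0}};
  std::complex<double> mean =
      std::accumulate(x.begin(), x.end(), std::complex<double>(0.0)) / double(x.size());
  const std::complex<double> rot = std::conj(mean) / std::abs(mean);
  for (auto &v : x)
  {
    v *= rot;  // after this, the mean of x equals |mean| (purely real, positive)
  }
  std::cout << "original mean phase = " << std::atan2(mean.imag(), mean.real()) << "\n";
  return 0;
}
```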
+template +void AXPBY(ScalarType alpha, const VecType &x, ScalarType beta, VecType &y); + +// Addition z = alpha * x + beta * y + gamma * z. +template +void AXPBYPCZ(ScalarType alpha, const VecType &x, ScalarType beta, const VecType &y, + ScalarType gamma, VecType &z); + +// Compute element-wise square root, optionally with scaling (multiplied before the square +// root). +void Sqrt(Vector &x, double s = 1.0); + +// Compute the 3D Cartesian product between A and B and store the result in C. +// If add is true, accumulate the result to C instead of overwriting its +// content. +template +void Cross3(const VecTypeA &A, const VecTypeB &B, VecTypeC &C, bool add = false) +{ + if (add) + { + C[0] += A[1] * B[2] - A[2] * B[1]; + C[1] += A[2] * B[0] - A[0] * B[2]; + C[2] += A[0] * B[1] - A[1] * B[0]; + } + else + { + C[0] = A[1] * B[2] - A[2] * B[1]; + C[1] = A[2] * B[0] - A[0] * B[2]; + C[2] = A[0] * B[1] - A[1] * B[0]; + } +} + +} // namespace linalg + +} // namespace palace + +#endif // PALACE_LINALG_VECTOR_HPP diff --git a/palace/main.cpp b/palace/main.cpp index faa186868b..0cc27bb284 100644 --- a/palace/main.cpp +++ b/palace/main.cpp @@ -1,304 +1,304 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include -#include -#include -#include -#include -#include -#include -#include "drivers/drivensolver.hpp" -#include "drivers/eigensolver.hpp" -#include "drivers/electrostaticsolver.hpp" -#include "drivers/magnetostaticsolver.hpp" -#include "drivers/transientsolver.hpp" -#include "fem/errorindicator.hpp" -#include "fem/libceed/utils.hpp" -#include "linalg/slepc.hpp" -#include "utils/communication.hpp" -#include "utils/geodata.hpp" -#include "utils/iodata.hpp" -#include "utils/timer.hpp" - -#if defined(MFEM_USE_OPENMP) -#include -#endif - -using namespace palace; - -static const char *GetPalaceGitTag() -{ -#if defined(PALACE_GIT_COMMIT) - static const char *commit = PALACE_GIT_COMMIT_ID; -#else - static const char *commit = "UNKNOWN"; -#endif - return commit; -} - -static const char *GetPalaceCeedJitSourceDir() -{ -#if defined(PALACE_LIBCEED_JIT_SOURCE) - static const char *path = PALACE_LIBCEED_JIT_SOURCE_DIR; -#else - static const char *path = ""; -#endif - return path; -} - -static int ConfigureOmp() -{ -#if defined(MFEM_USE_OPENMP) - int nt; - const char *env = std::getenv("OMP_NUM_THREADS"); - if (env) - { - std::sscanf(env, "%d", &nt); - } - else - { - nt = 1; - omp_set_num_threads(nt); - } - omp_set_dynamic(0); - return nt; -#else - return 0; -#endif -} - -static int GetDeviceId(MPI_Comm comm) -{ - // Assign devices round-robin over MPI ranks if GPU support is enabled. 
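Returning briefly to the StaticVector and Cross3 additions in vector.hpp above, a minimal usage sketch (assumes the Palace headers are available; values are illustrative):

```cpp
#include "linalg/vector.hpp"

void CrossExample()
{
  palace::StaticVector<3> a, b, c;  // stack-allocated 3-vectors
  a[0] = 1.0; a[1] = 0.0; a[2] = 0.0;
  b[0] = 0.0; b[1] = 1.0; b[2] = 0.0;
  palace::linalg::Cross3(a, b, c);  // c = a x b = (0, 0, 1)
}
```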
-#if defined(MFEM_USE_CUDA) || defined(MFEM_USE_HIP) - MPI_Comm node_comm; - MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, Mpi::Rank(comm), MPI_INFO_NULL, - &node_comm); - int node_size = Mpi::Rank(node_comm); - MPI_Comm_free(&node_comm); - return node_size % mfem::Device::GetNumGPU(); -#else - return 0; -#endif -} - -static std::string ConfigureDeviceAndBackend(config::SolverData::Device device, - const std::string &ceed_backend) -{ - // Configure - std::string device_str, default_ceed_backend; - switch (device) - { - case config::SolverData::Device::CPU: - device_str = "cpu"; - default_ceed_backend = "/cpu/self"; - break; - case config::SolverData::Device::GPU: -#if defined(MFEM_USE_CUDA) - device_str = "cuda"; - default_ceed_backend = "/gpu/cuda/magma"; -#elif defined(MFEM_USE_HIP) - device_str = "hip"; - default_ceed_backend = "/gpu/hip/magma"; -#else - MFEM_ABORT( - "Palace must be built with either CUDA or HIP support for GPU device usage!"); -#endif - break; - case config::SolverData::Device::DEBUG: - device_str = "cpu,debug"; - default_ceed_backend = "/cpu/self/ref"; - break; - } -#if defined(MFEM_USE_OPENMP) - device_str += ",omp"; -#endif - - // Initialize libCEED. - const std::string &backend = - !ceed_backend.empty() ? ceed_backend.c_str() : default_ceed_backend.c_str(); - ceed::Initialize(backend.c_str(), GetPalaceCeedJitSourceDir()); - - // Check that the provided resource matches the requested one. - std::string ceed_resource = ceed::Print(); - if (backend.compare(0, backend.length(), ceed_resource, 0, backend.length())) - { - Mpi::Warning( - "libCEED is not using the requested backend (requested \"{}\", got \"{}\")!\n", - backend, ceed_resource); - } - - return device_str; -} - -static void PrintPalaceBanner(MPI_Comm comm) -{ - Mpi::Print(comm, "_____________ _______\n" - "_____ __ \\____ __ /____ ____________\n" - "____ /_/ / __ ` / / __ ` / ___/ _ \\\n" - "___ _____/ /_/ / / /_/ / /__/ ___/\n" - " /__/ \\___,__/__/\\___,__/\\_____\\_____/\n\n"); -} - -static void PrintPalaceInfo(MPI_Comm comm, int np, int nt, mfem::Device &device) -{ - if (std::strcmp(GetPalaceGitTag(), "UNKNOWN")) - { - Mpi::Print(comm, "Git changeset ID: {}\n", GetPalaceGitTag()); - } - Mpi::Print(comm, "Running with {:d} MPI process{}", np, (np > 1) ? "es" : ""); - if (nt > 0) - { - Mpi::Print(comm, ", {:d} OpenMP thread{}", nt, (nt > 1) ? "s" : ""); - } -#if defined(MFEM_USE_CUDA) || defined(MFEM_USE_HIP) - int ngpu = mfem::Device::GetNumGPU(); -#if defined(MFEM_USE_CUDA) - const char *device_name = "CUDA"; -#else - const char *device_name = "HIP"; -#endif - Mpi::Print(comm, "\n{:d} detected {} device{}{}", ngpu, device_name, - (ngpu > 1) ? "s" : "", - mfem::Device::GetGPUAwareMPI() ? " (MPI is GPU aware)" : ""); -#endif - std::ostringstream resource(std::stringstream::out); - resource << "\n"; - device.Print(resource); - resource << "libCEED backend: " << ceed::Print(); - Mpi::Print(comm, "{}\n\n", resource.str()); - Mpi::Barrier(comm); -} - -int main(int argc, char *argv[]) -{ - // Initialize the timer. - BlockTimer bt(Timer::INIT); - - // Initialize MPI. - Mpi::Init(argc, argv); - MPI_Comm world_comm = Mpi::World(); - bool world_root = Mpi::Root(world_comm); - int world_size = Mpi::Size(world_comm); - Mpi::Print(world_comm, "\n"); - - // Parse command-line options. 
- std::vector argv_sv(argv, argv + argc); - bool dryrun = false; - auto Help = [executable_path = argv_sv[0], &world_comm]() - { - Mpi::Print(world_comm, - "Usage: {} [OPTIONS] CONFIG_FILE\n\n" - "Options:\n" - " -h, --help Show this help message and exit\n" - " -dry-run, --dry-run Parse configuration file for errors and exit\n\n", - executable_path.substr(executable_path.find_last_of('/') + 1)); - }; - for (int i = 1; i < argc; i++) - { - std::string_view argv_i = argv_sv.at(i); - if ((argv_i == "-h") || (argv_i == "--help")) - { - Help(); - return 0; - } - if ((argv_i == "-dry-run") || (argv_i == "--dry-run")) - { - dryrun = true; - continue; - } - } - if (argc < 2) - { - Mpi::Print(world_comm, "Error: Invalid usage!\n\n"); - Help(); - return 1; - } - - // Perform dry run: Parse configuration file for errors and exit. - if (dryrun) - { - if (Mpi::Root(world_comm)) - { - IoData iodata(argv[argc - 1], false); - } - Mpi::Print(world_comm, "Dry-run: No errors detected in configuration file \"{}\"\n\n", - argv[argc - 1]); - return 0; - } - - // Parse configuration file. - PrintPalaceBanner(world_comm); - IoData iodata(argv[1], false); - - // Initialize the MFEM device and configure libCEED backend. - int omp_threads = ConfigureOmp(), device_id = GetDeviceId(world_comm); - mfem::Device device( - ConfigureDeviceAndBackend(iodata.solver.device, iodata.solver.ceed_backend), - device_id); -#if defined(HYPRE_WITH_GPU_AWARE_MPI) - device.SetGPUAwareMPI(true); -#endif - - // Initialize Hypre and, optionally, SLEPc/PETSc. - mfem::Hypre::Init(); -#if defined(PALACE_WITH_SLEPC) - slepc::Initialize(argc, argv, nullptr, nullptr); - if (PETSC_COMM_WORLD != world_comm) - { - Mpi::Print(world_comm, "Error: Problem during MPI initialization!\n\n"); - return 1; - } -#endif - - // Initialize the problem driver. - PrintPalaceInfo(world_comm, world_size, omp_threads, device); - const auto solver = [&]() -> std::unique_ptr - { - switch (iodata.problem.type) - { - case config::ProblemData::Type::DRIVEN: - return std::make_unique(iodata, world_root, world_size, omp_threads, - GetPalaceGitTag()); - case config::ProblemData::Type::EIGENMODE: - return std::make_unique(iodata, world_root, world_size, omp_threads, - GetPalaceGitTag()); - case config::ProblemData::Type::ELECTROSTATIC: - return std::make_unique(iodata, world_root, world_size, - omp_threads, GetPalaceGitTag()); - case config::ProblemData::Type::MAGNETOSTATIC: - return std::make_unique(iodata, world_root, world_size, - omp_threads, GetPalaceGitTag()); - case config::ProblemData::Type::TRANSIENT: - return std::make_unique(iodata, world_root, world_size, - omp_threads, GetPalaceGitTag()); - } - return nullptr; - }(); - - // Read the mesh from file, refine, partition, and distribute it. Then nondimensionalize - // it and the input parameters. - std::vector> mesh; - mesh.push_back(mesh::ReadMesh(world_comm, iodata, false, true, true, false)); - iodata.NondimensionalizeInputs(*mesh[0]); - mesh::RefineMesh(iodata, mesh); - - // Run the problem driver. - solver->SolveEstimateMarkRefine(mesh); - - // Print timing summary. - BlockTimer::Print(world_comm); - solver->SaveMetadata(BlockTimer::GlobalTimer()); - Mpi::Print(world_comm, "\n"); - - // Finalize libCEED. - ceed::Finalize(); - - // Finalize SLEPc/PETSc. -#if defined(PALACE_WITH_SLEPC) - slepc::Finalize(); -#endif - - return 0; -} +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include +#include +#include +#include "drivers/drivensolver.hpp" +#include "drivers/eigensolver.hpp" +#include "drivers/electrostaticsolver.hpp" +#include "drivers/magnetostaticsolver.hpp" +#include "drivers/transientsolver.hpp" +#include "fem/errorindicator.hpp" +#include "fem/libceed/ceed.hpp" +#include "fem/mesh.hpp" +#include "linalg/hypre.hpp" +#include "linalg/slepc.hpp" +#include "utils/communication.hpp" +#include "utils/device.hpp" +#include "utils/geodata.hpp" +#include "utils/iodata.hpp" +#include "utils/omp.hpp" +#include "utils/outputdir.hpp" +#include "utils/timer.hpp" + +#if defined(MFEM_USE_STRUMPACK) +#include +#endif + +using namespace palace; + +static const char *GetPalaceGitTag() +{ +#if defined(PALACE_GIT_COMMIT) + static const char *commit = PALACE_GIT_COMMIT_ID; +#else + static const char *commit = "UNKNOWN"; +#endif + return commit; +} + +static const char *GetPalaceCeedJitSourceDir() +{ +#if defined(PALACE_LIBCEED_JIT_SOURCE) + static const char *path = PALACE_LIBCEED_JIT_SOURCE_DIR; +#else + static const char *path = ""; +#endif + return path; +} + +static std::string ConfigureDevice(Device device) +{ + std::string device_str; + switch (device) + { + case Device::CPU: + device_str = "cpu"; + break; + case Device::GPU: +#if defined(MFEM_USE_CUDA) + device_str = "cuda"; +#elif defined(MFEM_USE_HIP) + device_str = "hip"; +#else + Mpi::Warning("Palace must be built with either CUDA or HIP support for GPU device " + "usage, reverting to CPU!\n"); + device_str = "cpu"; +#endif + break; + case Device::DEBUG: + device_str = "cpu,debug"; + break; + } +#if defined(MFEM_USE_OPENMP) + device_str += ",omp"; +#endif + return device_str; +} + +static void ConfigureCeedBackend(const std::string &ceed_backend) +{ + // Initialize libCEED (only after MFEM device is configured). + std::string default_ceed_backend; + if (mfem::Device::Allows(mfem::Backend::CUDA_MASK)) + { + default_ceed_backend = "/gpu/cuda/magma"; + } + else if (mfem::Device::Allows(mfem::Backend::HIP_MASK)) + { + default_ceed_backend = "/gpu/hip/magma"; + } + else if (mfem::Device::Allows(mfem::Backend::DEBUG_DEVICE)) + { + default_ceed_backend = "/cpu/self/ref/serial"; + } + else + { + default_ceed_backend = "/cpu/self"; + } + const std::string &backend = + !ceed_backend.empty() ? ceed_backend.c_str() : default_ceed_backend.c_str(); + ceed::Initialize(backend.c_str(), GetPalaceCeedJitSourceDir()); + + // Check that the provided resource matches the requested one. + std::string ceed_resource = ceed::Print(); + if (backend.compare(0, backend.length(), ceed_resource, 0, backend.length())) + { + Mpi::Warning( + "libCEED is not using the requested backend!\nRequested \"{}\", got \"{}\"!\n", + backend, ceed_resource); + } +} + +static void PrintPalaceBanner(MPI_Comm comm) +{ + Mpi::Print(comm, "_____________ _______\n" + "_____ __ \\____ __ /____ ____________\n" + "____ /_/ / __ ` / / __ ` / ___/ _ \\\n" + "___ _____/ /_/ / / /_/ / /__/ ___/\n" + " /__/ \\___,__/__/\\___,__/\\_____\\_____/\n\n"); +} + +static void PrintPalaceInfo(MPI_Comm comm, int np, int nt, int ngpu, mfem::Device &device) +{ + if (std::strcmp(GetPalaceGitTag(), "UNKNOWN")) + { + Mpi::Print(comm, "Git changeset ID: {}\n", GetPalaceGitTag()); + } + Mpi::Print(comm, "Running with {:d} MPI process{}", np, (np > 1) ? "es" : ""); + if (nt > 0) + { + Mpi::Print(comm, ", {:d} OpenMP thread{}", nt, (nt > 1) ? 
"s" : ""); + } +#if defined(MFEM_USE_CUDA) || defined(MFEM_USE_HIP) +#if defined(MFEM_USE_CUDA) + const char *device_name = "CUDA"; +#else + const char *device_name = "HIP"; +#endif + Mpi::Print(comm, "\nDetected {:d} {} device{}{}", ngpu, device_name, + (ngpu != 1) ? "s" : "", + mfem::Device::GetGPUAwareMPI() ? " (MPI is GPU aware)" : ""); +#endif + std::ostringstream resource(std::stringstream::out); + resource << "\n"; + device.Print(resource); + resource << "libCEED backend: " << ceed::Print(); + Mpi::Print(comm, "{}\n\n", resource.str()); + Mpi::Barrier(comm); +} + +int main(int argc, char *argv[]) +{ + // Initialize MPI. +#if defined(MFEM_USE_STRUMPACK) && \ + (defined(STRUMPACK_USE_PTSCOTCH) || defined(STRUMPACK_USE_SLATE_SCALAPACK)) + Mpi::default_thread_required = MPI_THREAD_MULTIPLE; +#endif + Mpi::Init(argc, argv); + MPI_Comm world_comm = Mpi::World(); + bool world_root = Mpi::Root(world_comm); + int world_size = Mpi::Size(world_comm); + Mpi::Print(world_comm, "\n"); + + // Initialize the timer. + BlockTimer bt(Timer::INIT); + + // Parse command-line options. + std::vector argv_sv(argv, argv + argc); + bool dryrun = false; + auto Help = [executable_path = argv_sv[0], &world_comm]() + { + Mpi::Print(world_comm, + "Usage: {} [OPTIONS] CONFIG_FILE\n\n" + "Options:\n" + " -h, --help Show this help message and exit\n" + " --version Show version information and exit\n" + " -dry-run, --dry-run Parse configuration file for errors and exit\n\n", + executable_path.substr(executable_path.find_last_of('/') + 1)); + }; + for (int i = 1; i < argc; i++) + { + std::string_view argv_i = argv_sv.at(i); + if ((argv_i == "-h") || (argv_i == "--help")) + { + Help(); + return 0; + } + if (argv_i == "--version") + { + Mpi::Print(world_comm, "Palace version: {}\n", GetPalaceGitTag()); + return 0; + } + if ((argv_i == "-dry-run") || (argv_i == "--dry-run")) + { + dryrun = true; + continue; + } + } + if (argc < 2) + { + Mpi::Print(world_comm, "Error: Invalid usage!\n\n"); + Help(); + return 1; + } + + // Perform dry run: Parse configuration file for errors and exit. + if (dryrun) + { + if (Mpi::Root(world_comm)) + { + IoData iodata(argv[argc - 1], false); + } + Mpi::Print(world_comm, "Dry-run: No errors detected in configuration file \"{}\"\n\n", + argv[argc - 1]); + return 0; + } + + // Parse configuration file. + PrintPalaceBanner(world_comm); + IoData iodata(argv[1], false); + MakeOutputFolder(iodata, world_comm); + + BlockTimer bt1(Timer::INIT); + // Initialize the MFEM device and configure libCEED backend. + int omp_threads = utils::ConfigureOmp(), ngpu = utils::GetDeviceCount(); + mfem::Device device(ConfigureDevice(iodata.solver.device), + utils::GetDeviceId(world_comm, ngpu)); + ConfigureCeedBackend(iodata.solver.ceed_backend); +#if defined(PALACE_WITH_GPU_AWARE_MPI) + device.SetGPUAwareMPI(true); +#endif + + // Initialize Hypre and, optionally, SLEPc/PETSc. + hypre::Initialize(); +#if defined(PALACE_WITH_SLEPC) + slepc::Initialize(argc, argv, nullptr, nullptr); + if (PETSC_COMM_WORLD != world_comm) + { + Mpi::Print(world_comm, "Error: Problem during MPI initialization!\n\n"); + return 1; + } +#endif + + // Initialize the problem driver. 
+ PrintPalaceInfo(world_comm, world_size, omp_threads, ngpu, device); + const auto solver = [&]() -> std::unique_ptr + { + switch (iodata.problem.type) + { + case ProblemType::DRIVEN: + return std::make_unique(iodata, world_root, world_size, omp_threads, + GetPalaceGitTag()); + case ProblemType::EIGENMODE: + return std::make_unique(iodata, world_root, world_size, omp_threads, + GetPalaceGitTag()); + case ProblemType::ELECTROSTATIC: + return std::make_unique(iodata, world_root, world_size, + omp_threads, GetPalaceGitTag()); + case ProblemType::MAGNETOSTATIC: + return std::make_unique(iodata, world_root, world_size, + omp_threads, GetPalaceGitTag()); + case ProblemType::TRANSIENT: + return std::make_unique(iodata, world_root, world_size, + omp_threads, GetPalaceGitTag()); + } + return nullptr; + }(); + + // Read the mesh from file, refine, partition, and distribute it. Then nondimensionalize + // it and the input parameters. + std::vector> mesh; + { + std::vector> mfem_mesh; + mfem_mesh.push_back(mesh::ReadMesh(iodata, world_comm)); + iodata.NondimensionalizeInputs(*mfem_mesh[0]); + mesh::RefineMesh(iodata, mfem_mesh); + for (auto &m : mfem_mesh) + { + mesh.push_back(std::make_unique(std::move(m))); + } + } + + // Run the problem driver. + solver->SolveEstimateMarkRefine(mesh); + + // Print timing summary. + BlockTimer::Print(world_comm); + solver->SaveMetadata(BlockTimer::GlobalTimer()); + Mpi::Print(world_comm, "\n"); + + // Finalize libCEED. + ceed::Finalize(); + + // Finalize SLEPc/PETSc. +#if defined(PALACE_WITH_SLEPC) + slepc::Finalize(); +#endif + + return 0; +} diff --git a/palace/models/CMakeLists.txt b/palace/models/CMakeLists.txt index 516afab657..8b1f470aa2 100644 --- a/palace/models/CMakeLists.txt +++ b/palace/models/CMakeLists.txt @@ -1,25 +1,28 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -# -# Add source files and subdirectories. -# - -target_sources(${LIB_TARGET_NAME} - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/curlcurloperator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/domainpostoperator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/farfieldboundaryoperator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/laplaceoperator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/lumpedportoperator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/materialoperator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/postoperator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/romoperator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/spaceoperator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/surfaceconductivityoperator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/surfacecurrentoperator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/surfaceimpedanceoperator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/surfacepostoperator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/timeoperator.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/waveportoperator.cpp -) +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + +# +# Add source files and subdirectories. 
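Stepping back to ConfigureCeedBackend above: the requested backend is accepted when it is a prefix of the resource string libCEED actually reports. A standalone sketch of that prefix comparison (plain standard library, independent of libCEED):

```cpp
#include <iostream>
#include <string>

// Returns true when `requested` is a prefix of `resource`, mirroring the
// std::string::compare call used in ConfigureCeedBackend above.
bool BackendMatches(const std::string &requested, const std::string &resource)
{
  return requested.compare(0, requested.length(), resource, 0, requested.length()) == 0;
}

int main()
{
  std::cout << BackendMatches("/gpu/cuda", "/gpu/cuda/magma/det") << "\n";  // 1
  std::cout << BackendMatches("/cpu/self", "/gpu/cuda/magma") << "\n";      // 0
  return 0;
}
```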
+# + +target_sources(${LIB_TARGET_NAME} + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/curlcurloperator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/domainpostoperator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/farfieldboundaryoperator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/laplaceoperator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/lumpedportoperator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/materialoperator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/postoperator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/postoperatorcsv.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/portexcitations.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/romoperator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/strattonchu.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/spaceoperator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/surfaceconductivityoperator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/surfacecurrentoperator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/surfaceimpedanceoperator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/surfacepostoperator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/timeoperator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/waveportoperator.cpp +) diff --git a/palace/models/curlcurloperator.cpp b/palace/models/curlcurloperator.cpp index 4aba3849ae..d13f8cf7d6 100644 --- a/palace/models/curlcurloperator.cpp +++ b/palace/models/curlcurloperator.cpp @@ -1,182 +1,222 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "curlcurloperator.hpp" - -#include "fem/bilinearform.hpp" -#include "fem/coefficient.hpp" -#include "fem/integrator.hpp" -#include "fem/multigrid.hpp" -#include "linalg/rap.hpp" -#include "utils/communication.hpp" -#include "utils/geodata.hpp" -#include "utils/iodata.hpp" -#include "utils/prettyprint.hpp" - -namespace palace -{ - -namespace -{ - -mfem::Array SetUpBoundaryProperties(const IoData &iodata, const mfem::ParMesh &mesh) -{ - int bdr_attr_max = mesh.bdr_attributes.Size() ? mesh.bdr_attributes.Max() : 0; - if (!iodata.boundaries.pec.empty()) - { - // Check that boundary attributes have been specified correctly. - mfem::Array bdr_attr_marker(bdr_attr_max); - bdr_attr_marker = 0; - for (auto attr : mesh.bdr_attributes) - { - bdr_attr_marker[attr - 1] = 1; - } - bool first = true; - for (auto attr : iodata.boundaries.pec.attributes) - { - // MFEM_VERIFY(attr > 0 && attr <= bdr_attr_max, - // "PEC boundary attribute tags must be non-negative and correspond to " - // "attributes in the mesh!"); - // MFEM_VERIFY(bdr_attr_marker[attr-1], - // "Unknown PEC boundary attribute " << attr << "!"); - if (attr <= 0 || attr > bdr_attr_marker.Size() || !bdr_attr_marker[attr - 1]) - { - if (first) - { - Mpi::Print("\n"); - first = false; - } - Mpi::Warning("Unknown PEC boundary attribute {:d}!\nSolver will just ignore it!\n", - attr); - } - } - } - - // Mark selected boundary attributes from the mesh as essential (Dirichlet). 
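The boundary setup above repeatedly converts a list of 1-based attribute tags into a 0/1 marker array (mesh::AttrToMarker). The sketch below shows that conversion pattern only; Palace's own helper lives in utils/geodata and may differ in details such as out-of-range handling.

```cpp
#include <mfem.hpp>

// Build a marker array of length bdr_attr_max with 1 at each listed attribute.
mfem::Array<int> AttrToMarkerSketch(int bdr_attr_max, const mfem::Array<int> &attrs)
{
  mfem::Array<int> marker(bdr_attr_max);
  marker = 0;
  for (auto attr : attrs)
  {
    if (attr > 0 && attr <= bdr_attr_max)  // silently skip out-of-range tags, as above
    {
      marker[attr - 1] = 1;
    }
  }
  return marker;
}
```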
- mfem::Array dbc_bcs, dbc_marker; - dbc_bcs.Reserve(static_cast(iodata.boundaries.pec.attributes.size())); - for (auto attr : iodata.boundaries.pec.attributes) - { - if (attr <= 0 || attr > bdr_attr_max) - { - continue; // Can just ignore if wrong - } - dbc_bcs.Append(attr); - } - mesh::AttrToMarker(bdr_attr_max, dbc_bcs, dbc_marker); - return dbc_marker; -} - -} // namespace - -CurlCurlOperator::CurlCurlOperator(const IoData &iodata, - const std::vector> &mesh) - : pa_order_threshold(iodata.solver.pa_order_threshold), skip_zeros(false), - print_hdr(true), dbc_marker(SetUpBoundaryProperties(iodata, *mesh.back())), - nd_fecs(fem::ConstructFECollections( - iodata.solver.order, mesh.back()->Dimension(), iodata.solver.linear.mg_max_levels, - iodata.solver.linear.mg_coarsen_type, false)), - h1_fecs(fem::ConstructFECollections( - iodata.solver.order, mesh.back()->Dimension(), iodata.solver.linear.mg_max_levels, - iodata.solver.linear.mg_coarsen_type, false)), - rt_fec(std::make_unique(iodata.solver.order - 1, - mesh.back()->Dimension())), - nd_fespaces(fem::ConstructFiniteElementSpaceHierarchy( - iodata.solver.linear.mg_max_levels, mesh, nd_fecs, &dbc_marker, &dbc_tdof_lists)), - h1_fespaces(fem::ConstructAuxiliaryFiniteElementSpaceHierarchy( - nd_fespaces, h1_fecs)), - rt_fespace(nd_fespaces.GetFinestFESpace(), mesh.back().get(), rt_fec.get()), - mat_op(iodata, *mesh.back()), surf_j_op(iodata, GetH1Space()) -{ - // Finalize setup. - CheckBoundaryProperties(); - - // Print essential BC information. - if (dbc_marker.Size() && dbc_marker.Max() > 0) - { - Mpi::Print("\nConfiguring Dirichlet BC at attributes:\n"); - utils::PrettyPrintMarker(dbc_marker); - } -} - -void CurlCurlOperator::CheckBoundaryProperties() -{ - // A final check that no boundary attribute is assigned multiple boundary conditions. - const auto &surf_j_marker = surf_j_op.GetMarker(); - for (int i = 0; i < dbc_marker.Size(); i++) - { - MFEM_VERIFY(dbc_marker[i] + surf_j_marker[i] <= 1, - "Boundary attributes should not be specified with multiple BC!"); - } -} - -std::unique_ptr CurlCurlOperator::GetStiffnessMatrix() -{ - if (print_hdr) - { - Mpi::Print("\nAssembling system matrices, number of global unknowns:\n" - " H1: {:d}, ND: {:d}, RT: {:d}\n Operator assembly level: {}\n", - GetH1Space().GlobalTrueVSize(), GetNDSpace().GlobalTrueVSize(), - GetRTSpace().GlobalTrueVSize(), - GetNDSpace().GetMaxElementOrder() > pa_order_threshold ? "Partial" : "Full"); - Mpi::Print("\nAssembling multigrid hierarchy:\n"); - } - auto K = std::make_unique(GetNDSpaces().GetNumLevels()); - for (std::size_t l = 0; l < GetNDSpaces().GetNumLevels(); l++) - { - // Force coarse level operator to be fully assembled always. - const auto &nd_fespace_l = GetNDSpaces().GetFESpaceAtLevel(l); - if (print_hdr) - { - Mpi::Print(" Level {:d} (p = {:d}): {:d} unknowns", l, - nd_fespace_l.GetMaxElementOrder(), nd_fespace_l.GlobalTrueVSize()); - } - constexpr auto MatType = MaterialPropertyType::INV_PERMEABILITY; - MaterialPropertyCoefficient muinv_func(mat_op); - BilinearForm k(nd_fespace_l); - k.AddDomainIntegrator(muinv_func); - auto K_l = std::make_unique( - k.Assemble((l > 0) ? 
pa_order_threshold : 99, skip_zeros), nd_fespace_l); - if (print_hdr) - { - if (const auto *k_spm = - dynamic_cast(&K_l->LocalOperator())) - { - HYPRE_BigInt nnz = k_spm->NumNonZeroElems(); - Mpi::GlobalSum(1, &nnz, nd_fespace_l.GetComm()); - Mpi::Print(", {:d} NNZ\n", nnz); - } - else - { - Mpi::Print("\n"); - } - } - K_l->SetEssentialTrueDofs(dbc_tdof_lists[l], Operator::DiagonalPolicy::DIAG_ONE); - K->AddOperator(std::move(K_l)); - } - print_hdr = false; - return K; -} - -void CurlCurlOperator::GetExcitationVector(int idx, Vector &RHS) -{ - // Assemble the surface current excitation +J. The SurfaceCurrentOperator assembles -J - // (meant for time or frequency domain Maxwell discretization, so we multiply by -1 to - // retrieve +J). - SumVectorCoefficient fb(GetNDSpace().GetParMesh()->SpaceDimension()); - surf_j_op.AddExcitationBdrCoefficients(idx, fb); - RHS.SetSize(GetNDSpace().GetTrueVSize()); - RHS = 0.0; - if (fb.empty()) - { - return; - } - mfem::LinearForm rhs(&GetNDSpace()); - rhs.AddBoundaryIntegrator(new VectorFEBoundaryLFIntegrator(fb)); - rhs.UseFastAssembly(false); - rhs.Assemble(); - GetNDSpace().GetProlongationMatrix()->AddMultTranspose(rhs, RHS, -1.0); - linalg::SetSubVector(RHS, dbc_tdof_lists.back(), 0.0); -} - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "curlcurloperator.hpp" + +#include +#include "fem/bilinearform.hpp" +#include "fem/coefficient.hpp" +#include "fem/integrator.hpp" +#include "fem/mesh.hpp" +#include "fem/multigrid.hpp" +#include "linalg/hypre.hpp" +#include "linalg/rap.hpp" +#include "utils/communication.hpp" +#include "utils/geodata.hpp" +#include "utils/iodata.hpp" +#include "utils/prettyprint.hpp" + +namespace palace +{ + +CurlCurlOperator::CurlCurlOperator(const IoData &iodata, + const std::vector> &mesh) + : print_hdr(true), dbc_attr(SetUpBoundaryProperties(iodata, *mesh.back())), + nd_fecs(fem::ConstructFECollections( + iodata.solver.order, mesh.back()->Dimension(), iodata.solver.linear.mg_max_levels, + iodata.solver.linear.mg_coarsening, false)), + h1_fecs(fem::ConstructFECollections( + iodata.solver.order, mesh.back()->Dimension(), iodata.solver.linear.mg_max_levels, + iodata.solver.linear.mg_coarsening, false)), + rt_fec(std::make_unique(iodata.solver.order - 1, + mesh.back()->Dimension())), + nd_fespaces(fem::ConstructFiniteElementSpaceHierarchy( + iodata.solver.linear.mg_max_levels, mesh, nd_fecs, &dbc_attr, &dbc_tdof_lists)), + h1_fespaces(fem::ConstructFiniteElementSpaceHierarchy( + iodata.solver.linear.mg_max_levels, mesh, h1_fecs)), + rt_fespace(*mesh.back(), rt_fec.get()), mat_op(iodata, *mesh.back()), + surf_j_op(iodata, *mesh.back()) +{ + // Finalize setup. + CheckBoundaryProperties(); + + // Print essential BC information. + if (dbc_attr.Size()) + { + Mpi::Print("\nConfiguring Dirichlet BC at attributes:\n"); + utils::PrettyPrint(dbc_attr); + } +} + +mfem::Array CurlCurlOperator::SetUpBoundaryProperties(const IoData &iodata, + const mfem::ParMesh &mesh) +{ + // Check that boundary attributes have been specified correctly. + int bdr_attr_max = mesh.bdr_attributes.Size() ? 
mesh.bdr_attributes.Max() : 0; + mfem::Array bdr_attr_marker; + if (!iodata.boundaries.pec.empty()) + { + bdr_attr_marker.SetSize(bdr_attr_max); + bdr_attr_marker = 0; + for (auto attr : mesh.bdr_attributes) + { + bdr_attr_marker[attr - 1] = 1; + } + std::set bdr_warn_list; + for (auto attr : iodata.boundaries.pec.attributes) + { + // MFEM_VERIFY(attr > 0 && attr <= bdr_attr_max, + // "PEC boundary attribute tags must be non-negative and correspond to " + // "attributes in the mesh!"); + // MFEM_VERIFY(bdr_attr_marker[attr - 1], + // "Unknown PEC boundary attribute " << attr << "!"); + if (attr <= 0 || attr > bdr_attr_max || !bdr_attr_marker[attr - 1]) + { + bdr_warn_list.insert(attr); + } + } + if (!bdr_warn_list.empty()) + { + Mpi::Print("\n"); + Mpi::Warning("Unknown PEC boundary attributes!\nSolver will just ignore them!"); + utils::PrettyPrint(bdr_warn_list, "Boundary attribute list:"); + Mpi::Print("\n"); + } + } + + // Mark selected boundary attributes from the mesh as essential (Dirichlet). + mfem::Array dbc_bcs; + dbc_bcs.Reserve(static_cast(iodata.boundaries.pec.attributes.size())); + for (auto attr : iodata.boundaries.pec.attributes) + { + if (attr <= 0 || attr > bdr_attr_max || !bdr_attr_marker[attr - 1]) + { + continue; // Can just ignore if wrong + } + dbc_bcs.Append(attr); + } + return dbc_bcs; +} + +void CurlCurlOperator::CheckBoundaryProperties() +{ + // A final check that no boundary attribute is assigned multiple boundary conditions. + const mfem::ParMesh &mesh = GetMesh(); + int bdr_attr_max = mesh.bdr_attributes.Size() ? mesh.bdr_attributes.Max() : 0; + const auto dbc_marker = mesh::AttrToMarker(bdr_attr_max, dbc_attr); + const auto surf_j_marker = mesh::AttrToMarker(bdr_attr_max, surf_j_op.GetAttrList()); + for (int i = 0; i < dbc_marker.Size(); i++) + { + MFEM_VERIFY(dbc_marker[i] + surf_j_marker[i] <= 1, + "Boundary attributes should not be specified with multiple BC!"); + } +} + +namespace +{ + +void PrintHeader(const mfem::ParFiniteElementSpace &h1_fespace, + const mfem::ParFiniteElementSpace &nd_fespace, + const mfem::ParFiniteElementSpace &rt_fespace, bool &print_hdr) +{ + if (print_hdr) + { + Mpi::Print("\nAssembling system matrices, number of global unknowns:\n" + " H1 (p = {:d}): {:d}, ND (p = {:d}): {:d}, RT (p = {:d}): {:d}\n Operator " + "assembly level: {}\n", + h1_fespace.GetMaxElementOrder(), h1_fespace.GlobalTrueVSize(), + nd_fespace.GetMaxElementOrder(), nd_fespace.GlobalTrueVSize(), + rt_fespace.GetMaxElementOrder(), rt_fespace.GlobalTrueVSize(), + (nd_fespace.GetMaxElementOrder() >= BilinearForm::pa_order_threshold) + ? "Partial" + : "Full"); + + const auto &mesh = *nd_fespace.GetParMesh(); + const auto geom_types = mesh::CheckElements(mesh).GetGeomTypes(); + Mpi::Print(" Mesh geometries:\n"); + for (auto geom : geom_types) + { + const auto *fe = nd_fespace.FEColl()->FiniteElementForGeometry(geom); + MFEM_VERIFY(fe, "MFEM does not support ND spaces on geometry = " + << mfem::Geometry::Name[geom] << "!"); + const int q_order = fem::DefaultIntegrationOrder::Get(mesh, geom); + Mpi::Print(" {}: P = {:d}, Q = {:d} (quadrature order = {:d}){}\n", + mfem::Geometry::Name[geom], fe->GetDof(), + mfem::IntRules.Get(geom, q_order).GetNPoints(), q_order, + (geom == geom_types.back()) ? 
"" : ","); + } + + Mpi::Print("\nAssembling multigrid hierarchy:\n"); + } +} + +} // namespace + +std::unique_ptr CurlCurlOperator::GetStiffnessMatrix() +{ + // When partially assembled, the coarse operators can reuse the fine operator quadrature + // data if the spaces correspond to the same mesh. + PrintHeader(GetH1Space(), GetNDSpace(), GetRTSpace(), print_hdr); + + constexpr bool skip_zeros = false; + MaterialPropertyCoefficient muinv_func(mat_op.GetAttributeToMaterial(), + mat_op.GetInvPermeability()); + BilinearForm k(GetNDSpace()); + k.AddDomainIntegrator(muinv_func); + // k.AssembleQuadratureData(); + auto k_vec = k.Assemble(GetNDSpaces(), skip_zeros); + auto K = std::make_unique(GetNDSpaces().GetNumLevels()); + for (std::size_t l = 0; l < GetNDSpaces().GetNumLevels(); l++) + { + const auto &nd_fespace_l = GetNDSpaces().GetFESpaceAtLevel(l); + if (print_hdr) + { + Mpi::Print(" Level {:d} (p = {:d}): {:d} unknowns", l, + nd_fespace_l.GetMaxElementOrder(), nd_fespace_l.GlobalTrueVSize()); + if (const auto *k_spm = dynamic_cast(k_vec[l].get())) + { + HYPRE_BigInt nnz = k_spm->NNZ(); + Mpi::GlobalSum(1, &nnz, nd_fespace_l.GetComm()); + Mpi::Print(", {:d} NNZ\n", nnz); + } + else + { + Mpi::Print("\n"); + } + } + auto K_l = std::make_unique(std::move(k_vec[l]), nd_fespace_l); + K_l->SetEssentialTrueDofs(dbc_tdof_lists[l], Operator::DiagonalPolicy::DIAG_ONE); + K->AddOperator(std::move(K_l)); + } + + print_hdr = false; + return K; +} + +void CurlCurlOperator::GetExcitationVector(int idx, Vector &RHS) +{ + // Assemble the surface current excitation +J. The SurfaceCurrentOperator assembles -J + // (meant for time or frequency domain Maxwell discretization, so we multiply by -1 to + // retrieve +J). + SumVectorCoefficient fb(GetMesh().SpaceDimension()); + surf_j_op.AddExcitationBdrCoefficients(idx, fb); + RHS.SetSize(GetNDSpace().GetTrueVSize()); + RHS.UseDevice(true); + RHS = 0.0; + int empty = (fb.empty()); + Mpi::GlobalMin(1, &empty, GetComm()); + if (empty) + { + return; + } + mfem::LinearForm rhs(&GetNDSpace().Get()); + rhs.AddBoundaryIntegrator(new VectorFEBoundaryLFIntegrator(fb)); + rhs.UseFastAssembly(false); + rhs.UseDevice(false); + rhs.Assemble(); + rhs.UseDevice(true); + GetNDSpace().GetProlongationMatrix()->AddMultTranspose(rhs, RHS, -1.0); + linalg::SetSubVector(RHS, dbc_tdof_lists.back(), 0.0); +} + +} // namespace palace diff --git a/palace/models/curlcurloperator.hpp b/palace/models/curlcurloperator.hpp index 7393ab0db1..3dc613d239 100644 --- a/palace/models/curlcurloperator.hpp +++ b/palace/models/curlcurloperator.hpp @@ -1,96 +1,100 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_MODELS_CURL_CURL_OPERATOR_HPP -#define PALACE_MODELS_CURL_CURL_OPERATOR_HPP - -#include -#include -#include -#include "fem/fespace.hpp" -#include "linalg/operator.hpp" -#include "linalg/vector.hpp" -#include "models/materialoperator.hpp" -#include "models/surfacecurrentoperator.hpp" - -namespace palace -{ - -class IoData; - -// -// A class handling discretization of curl-curl problems for magnetostatics. -// -class CurlCurlOperator -{ -private: - const int pa_order_threshold; // Order above which to use partial assembly vs. full - const bool skip_zeros; // Skip zeros during full assembly of matrices - - // Helper variable for log file printing. - bool print_hdr; - - // Essential boundary condition markers. 
- mfem::Array dbc_marker; - std::vector> dbc_tdof_lists; - void CheckBoundaryProperties(); - - // Objects defining the finite element spaces for the magnetic vector potential - // (Nedelec) and magnetic flux density (Raviart-Thomas) on the given mesh. The H1 spaces - // are used for various purposes throughout the code including postprocessing. - std::vector> nd_fecs; - std::vector> h1_fecs; - std::unique_ptr rt_fec; - FiniteElementSpaceHierarchy nd_fespaces; - AuxiliaryFiniteElementSpaceHierarchy h1_fespaces; - AuxiliaryFiniteElementSpace rt_fespace; - - // Operator for domain material properties. - MaterialOperator mat_op; - - // Operator for source current excitation. - SurfaceCurrentOperator surf_j_op; - -public: - CurlCurlOperator(const IoData &iodata, - const std::vector> &mesh); - - // Return material operator for postprocessing. - const MaterialOperator &GetMaterialOp() const { return mat_op; } - - // Access to underlying BC operator objects for postprocessing. - const auto &GetSurfaceCurrentOp() const { return surf_j_op; } - - // Return the parallel finite element space objects. - auto &GetNDSpaces() { return nd_fespaces; } - const auto &GetNDSpaces() const { return nd_fespaces; } - auto &GetNDSpace() { return nd_fespaces.GetFinestFESpace(); } - const auto &GetNDSpace() const { return nd_fespaces.GetFinestFESpace(); } - auto &GetH1Spaces() { return h1_fespaces; } - const auto &GetH1Spaces() const { return h1_fespaces; } - auto &GetH1Space() { return h1_fespaces.GetFinestFESpace(); } - const auto &GetH1Space() const { return h1_fespaces.GetFinestFESpace(); } - auto &GetRTSpace() { return rt_fespace; } - const auto &GetRTSpace() const { return rt_fespace; } - - // Return the number of true (conforming) dofs on the finest ND space. - auto GlobalTrueVSize() { return GetNDSpace().GlobalTrueVSize(); } - - // Construct and return system matrix representing discretized curl-curl operator for - // Ampere's law. - std::unique_ptr GetStiffnessMatrix(); - - // Construct and return the discrete curl matrix. - const Operator &GetCurlMatrix() const { return GetRTSpace().GetDiscreteInterpolator(); } - - // Assemble the right-hand side source term vector for a current source applied on - // specified excited boundaries. - void GetExcitationVector(int idx, Vector &RHS); - - // Get the associated MPI communicator. - MPI_Comm GetComm() const { return GetNDSpace().GetComm(); } -}; - -} // namespace palace - -#endif // PALACE_MODELS_CURL_CURL_OPERATOR_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_MODELS_CURL_CURL_OPERATOR_HPP +#define PALACE_MODELS_CURL_CURL_OPERATOR_HPP + +#include +#include +#include +#include "fem/fespace.hpp" +#include "linalg/operator.hpp" +#include "linalg/vector.hpp" +#include "models/materialoperator.hpp" +#include "models/surfacecurrentoperator.hpp" + +namespace palace +{ + +class IoData; +class Mesh; + +// +// A class handling discretization of curl-curl problems for magnetostatics. +// +class CurlCurlOperator +{ +private: + // Helper variable for log file printing. + bool print_hdr; + + // Essential boundary condition attributes. + mfem::Array dbc_attr; + std::vector> dbc_tdof_lists; + + // Objects defining the finite element spaces for the magnetic vector potential + // (Nedelec) and magnetic flux density (Raviart-Thomas) on the given mesh. The H1 spaces + // are used for various purposes throughout the code including postprocessing. 
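GetExcitationVector above assembles the boundary linear form on local dofs and then restricts it to true dofs with the transpose of the prolongation operator, flipping the sign of the assembled surface current. A generic sketch of that restriction step using plain MFEM types (not Palace's wrapped FiniteElementSpace):

```cpp
#include <mfem.hpp>

// RHS_tdof = -P^T * rhs_ldof, where P is the prolongation from true to local dofs.
void RestrictToTrueDofs(const mfem::ParFiniteElementSpace &fespace,
                        const mfem::LinearForm &rhs_ldof, mfem::Vector &RHS_tdof)
{
  RHS_tdof.SetSize(fespace.GetTrueVSize());
  RHS_tdof = 0.0;
  fespace.GetProlongationMatrix()->AddMultTranspose(rhs_ldof, RHS_tdof, -1.0);
}
```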
+ std::vector> nd_fecs; + std::vector> h1_fecs; + std::unique_ptr rt_fec; + FiniteElementSpaceHierarchy nd_fespaces, h1_fespaces; + FiniteElementSpace rt_fespace; + + // Operator for domain material properties. + MaterialOperator mat_op; + + // Operator for source current excitation. + SurfaceCurrentOperator surf_j_op; + + mfem::Array SetUpBoundaryProperties(const IoData &iodata, const mfem::ParMesh &mesh); + void CheckBoundaryProperties(); + +public: + CurlCurlOperator(const IoData &iodata, const std::vector> &mesh); + + // Return material operator for postprocessing. + const MaterialOperator &GetMaterialOp() const { return mat_op; } + + // Access to underlying BC operator objects for postprocessing. + const auto &GetSurfaceCurrentOp() const { return surf_j_op; } + + // Return the parallel finite element space objects. + auto &GetNDSpaces() { return nd_fespaces; } + const auto &GetNDSpaces() const { return nd_fespaces; } + auto &GetNDSpace() { return nd_fespaces.GetFinestFESpace(); } + const auto &GetNDSpace() const { return nd_fespaces.GetFinestFESpace(); } + auto &GetH1Spaces() { return h1_fespaces; } + const auto &GetH1Spaces() const { return h1_fespaces; } + auto &GetH1Space() { return h1_fespaces.GetFinestFESpace(); } + const auto &GetH1Space() const { return h1_fespaces.GetFinestFESpace(); } + auto &GetRTSpace() { return rt_fespace; } + const auto &GetRTSpace() const { return rt_fespace; } + + // Access the underlying mesh object. + const auto &GetMesh() const { return GetNDSpace().GetMesh(); } + + // Return the number of true (conforming) dofs on the finest ND space. + auto GlobalTrueVSize() const { return GetNDSpace().GlobalTrueVSize(); } + + // Construct and return system matrix representing discretized curl-curl operator for + // Ampere's law. + std::unique_ptr GetStiffnessMatrix(); + + // Construct and return the discrete curl matrix. + const Operator &GetCurlMatrix() const + { + return GetRTSpace().GetDiscreteInterpolator(GetNDSpace()); + } + + // Assemble the right-hand side source term vector for a current source applied on + // specified excited boundaries. + void GetExcitationVector(int idx, Vector &RHS); + + // Get the associated MPI communicator. + MPI_Comm GetComm() const { return GetNDSpace().GetComm(); } +}; + +} // namespace palace + +#endif // PALACE_MODELS_CURL_CURL_OPERATOR_HPP diff --git a/palace/models/domainpostoperator.cpp b/palace/models/domainpostoperator.cpp index 9cc9b2a36f..5e2d902b63 100644 --- a/palace/models/domainpostoperator.cpp +++ b/palace/models/domainpostoperator.cpp @@ -1,192 +1,224 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "domainpostoperator.hpp" - -#include "fem/bilinearform.hpp" -#include "fem/coefficient.hpp" -#include "fem/integrator.hpp" -#include "models/materialoperator.hpp" -#include "utils/communication.hpp" -#include "utils/iodata.hpp" - -namespace palace -{ - -DomainPostOperator::DomainPostOperator(const IoData &iodata, const MaterialOperator &mat_op, - const mfem::ParFiniteElementSpace *nd_fespace, - const mfem::ParFiniteElementSpace *rt_fespace) -{ - // Mass operators are always partially assembled. - if (nd_fespace) - { - // Construct ND mass matrix to compute the electric field energy integral as: - // E_elec = 1/2 Re{∫_Ω Dᴴ E dV} as (M_eps * e)ᴴ e. - // Only the real part of the permeability contributes to the energy (imaginary part - // cancels out in the inner product due to symmetry). 
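The energy expressions in the comments above reduce to one half of a global inner product between the field vector and a mass-matrix product. A minimal sketch of that evaluation, with a raw MPI reduction standing in for Mpi::GlobalSum:

```cpp
#include <mfem.hpp>
#include <mpi.h>

// E = 1/2 e^H M e for a real field vector e and an SPD mass operator M.
double FieldEnergy(MPI_Comm comm, const mfem::Operator &M, const mfem::Vector &e)
{
  mfem::Vector Me(M.Height());
  M.Mult(e, Me);
  double dot = mfem::InnerProduct(e, Me);  // local contribution
  MPI_Allreduce(MPI_IN_PLACE, &dot, 1, MPI_DOUBLE, MPI_SUM, comm);
  return 0.5 * dot;
}
```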
- constexpr auto MatTypeEpsReal = MaterialPropertyType::PERMITTIVITY_REAL; - constexpr auto MatTypeEpsImag = MaterialPropertyType::PERMITTIVITY_IMAG; - MaterialPropertyCoefficient epsilon_func(mat_op); - BilinearForm m_nd(*nd_fespace); - m_nd.AddDomainIntegrator(epsilon_func); - M_ND = m_nd.Assemble(); - D.SetSize(M_ND->Height()); - - // Use the provided domain postprocessing indices to group for postprocessing bulk - // dielectric loss. - int attr_max = nd_fespace->GetParMesh()->attributes.Max(); - for (const auto &[idx, data] : iodata.domains.postpro.dielectric) - { - mfem::Array attr_marker(attr_max); - attr_marker = 0; - for (auto attr : data.attributes) - { - attr_marker[attr - 1] = 1; - } - SumMatrixCoefficient epsilon_func_r(nd_fespace->GetParMesh()->SpaceDimension()); - SumMatrixCoefficient epsilon_func_i(nd_fespace->GetParMesh()->SpaceDimension()); - epsilon_func_r.AddCoefficient( - std::make_unique>(mat_op), - attr_marker); - epsilon_func_i.AddCoefficient( - std::make_unique>(mat_op, -1.0), - attr_marker); - BilinearForm mr_nd(*nd_fespace), mi_nd(*nd_fespace); - mr_nd.AddDomainIntegrator(epsilon_func_r); - mi_nd.AddDomainIntegrator(epsilon_func_i); - M_NDi.emplace(idx, std::make_pair(mr_nd.Assemble(), mi_nd.Assemble())); - } - } - - if (rt_fespace) - { - // Construct RT mass matrix to compute the magnetic field energy integral as: - // E_mag = 1/2 Re{∫_Ω Bᴴ H dV} as (M_muinv * b)ᴴ b. - constexpr auto MatTypeMuInv = MaterialPropertyType::INV_PERMEABILITY; - MaterialPropertyCoefficient muinv_func(mat_op); - BilinearForm m_rt(*rt_fespace); - m_rt.AddDomainIntegrator(muinv_func); - M_RT = m_rt.Assemble(); - H.SetSize(M_RT->Height()); - } -} - -double -DomainPostOperator::GetElectricFieldEnergy(const mfem::ParComplexGridFunction &E) const -{ - if (M_ND) - { - M_ND->Mult(E.real(), D); - double res = mfem::InnerProduct(E.real(), D); - M_ND->Mult(E.imag(), D); - res += mfem::InnerProduct(E.imag(), D); - Mpi::GlobalSum(1, &res, E.ParFESpace()->GetComm()); - return 0.5 * res; - } - MFEM_ABORT( - "Domain postprocessing is not configured for electric field energy calculation!"); - return 0.0; -} - -double DomainPostOperator::GetElectricFieldEnergy(const mfem::ParGridFunction &E) const -{ - if (M_ND) - { - M_ND->Mult(E, D); - double res = mfem::InnerProduct(E, D); - Mpi::GlobalSum(1, &res, E.ParFESpace()->GetComm()); - return 0.5 * res; - } - MFEM_ABORT( - "Domain postprocessing is not configured for electric field energy calculation!"); - return 0.0; -} - -double -DomainPostOperator::GetMagneticFieldEnergy(const mfem::ParComplexGridFunction &B) const -{ - if (M_RT) - { - M_RT->Mult(B.real(), H); - double res = mfem::InnerProduct(B.real(), H); - M_RT->Mult(B.imag(), H); - res += mfem::InnerProduct(B.imag(), H); - Mpi::GlobalSum(1, &res, B.ParFESpace()->GetComm()); - return 0.5 * res; - } - MFEM_ABORT( - "Domain postprocessing is not configured for magnetic field energy calculation!"); - return 0.0; -} - -double DomainPostOperator::GetMagneticFieldEnergy(const mfem::ParGridFunction &B) const -{ - if (M_RT) - { - M_RT->Mult(B, H); - double res = mfem::InnerProduct(B, H); - Mpi::GlobalSum(1, &res, B.ParFESpace()->GetComm()); - return 0.5 * res; - } - MFEM_ABORT( - "Domain postprocessing is not configured for magnetic field energy calculation!"); - return 0.0; -} - -double DomainPostOperator::GetDomainElectricFieldEnergy( - int idx, const mfem::ParComplexGridFunction &E) const -{ - // Compute the electric field energy integral for only a portion of the domain. 
- auto it = M_NDi.find(idx); - MFEM_VERIFY(it != M_NDi.end(), - "Invalid domain index when postprocessing bulk dielectric loss!"); - it->second.first->Mult(E.real(), D); - double res = mfem::InnerProduct(E.real(), D); - it->second.first->Mult(E.imag(), D); - res += mfem::InnerProduct(E.imag(), D); - Mpi::GlobalSum(1, &res, E.ParFESpace()->GetComm()); - return 0.5 * res; -} - -double -DomainPostOperator::GetDomainElectricFieldEnergy(int idx, - const mfem::ParGridFunction &E) const -{ - auto it = M_NDi.find(idx); - MFEM_VERIFY(it != M_NDi.end(), - "Invalid domain index when postprocessing bulk dielectric loss!"); - it->second.first->Mult(E, D); - double res = mfem::InnerProduct(E, D); - Mpi::GlobalSum(1, &res, E.ParFESpace()->GetComm()); - return 0.5 * res; -} - -double DomainPostOperator::GetDomainElectricFieldEnergyLoss( - int idx, const mfem::ParComplexGridFunction &E) const -{ - // Compute the electric field energy integral for only a portion of the domain. - auto it = M_NDi.find(idx); - MFEM_VERIFY(it != M_NDi.end(), - "Invalid domain index when postprocessing bulk dielectric loss!"); - it->second.second->Mult(E.real(), D); - double res = mfem::InnerProduct(E.real(), D); - it->second.second->Mult(E.imag(), D); - res += mfem::InnerProduct(E.imag(), D); - Mpi::GlobalSum(1, &res, E.ParFESpace()->GetComm()); - return 0.5 * res; -} - -double -DomainPostOperator::GetDomainElectricFieldEnergyLoss(int idx, - const mfem::ParGridFunction &E) const -{ - auto it = M_NDi.find(idx); - MFEM_VERIFY(it != M_NDi.end(), - "Invalid domain index when postprocessing bulk dielectric loss!"); - it->second.second->Mult(E, D); - double res = mfem::InnerProduct(E, D); - Mpi::GlobalSum(1, &res, E.ParFESpace()->GetComm()); - return 0.5 * res; -} - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "domainpostoperator.hpp" + +#include +#include "fem/bilinearform.hpp" +#include "fem/fespace.hpp" +#include "fem/gridfunction.hpp" +#include "fem/integrator.hpp" +#include "models/materialoperator.hpp" +#include "utils/communication.hpp" +#include "utils/iodata.hpp" + +namespace palace +{ + +DomainPostOperator::DomainPostOperator(const IoData &iodata, const MaterialOperator &mat_op, + const FiniteElementSpace &nd_fespace, + const FiniteElementSpace &rt_fespace) +{ + // Mass operators are always partially assembled. + MFEM_VERIFY(nd_fespace.GetFEColl().GetMapType(nd_fespace.Dimension()) == + mfem::FiniteElement::H_CURL && + rt_fespace.GetFEColl().GetMapType(nd_fespace.Dimension()) == + mfem::FiniteElement::H_DIV, + "Unexpected finite element space types for domain energy postprocessing!"); + { + // Construct ND mass matrix to compute the electric field energy integral as: + // E_elec = 1/2 Re{∫_Ω Dᴴ E dV} as (M_eps * e)ᴴ e. + // Only the real part of the permeability contributes to the energy (imaginary part + // cancels out in the inner product due to symmetry). + MaterialPropertyCoefficient epsilon_func(mat_op.GetAttributeToMaterial(), + mat_op.GetPermittivityReal()); + BilinearForm m(nd_fespace); + m.AddDomainIntegrator(epsilon_func); + M_elec = m.PartialAssemble(); + D.SetSize(M_elec->Height()); + D.UseDevice(true); + } + { + // Construct RT mass matrix to compute the magnetic field energy integral as: + // E_mag = 1/2 Re{∫_Ω Hᴴ B dV} as (M_muinv * b)ᴴ b. 
+ MaterialPropertyCoefficient muinv_func(mat_op.GetAttributeToMaterial(), + mat_op.GetInvPermeability()); + BilinearForm m(rt_fespace); + m.AddDomainIntegrator(muinv_func); + M_mag = m.PartialAssemble(); + H.SetSize(M_mag->Height()); + H.UseDevice(true); + } + + // Use the provided domain postprocessing indices for postprocessing the electric and + // magnetic field energy in specific regions of the domain. + for (const auto &[idx, data] : iodata.domains.postpro.energy) + { + std::unique_ptr M_elec_i, M_mag_i; + { + MaterialPropertyCoefficient epsilon_func(mat_op.GetAttributeToMaterial(), + mat_op.GetPermittivityReal()); + epsilon_func.RestrictCoefficient(mat_op.GetCeedAttributes(data.attributes)); + BilinearForm m(nd_fespace); + m.AddDomainIntegrator(epsilon_func); + M_elec_i = m.PartialAssemble(); + } + { + MaterialPropertyCoefficient muinv_func(mat_op.GetAttributeToMaterial(), + mat_op.GetInvPermeability()); + muinv_func.RestrictCoefficient(mat_op.GetCeedAttributes(data.attributes)); + BilinearForm m(rt_fespace); + m.AddDomainIntegrator(muinv_func); + M_mag_i = m.PartialAssemble(); + } + M_i.emplace(idx, std::make_pair(std::move(M_elec_i), std::move(M_mag_i))); + } +} + +DomainPostOperator::DomainPostOperator(const IoData &iodata, const MaterialOperator &mat_op, + const FiniteElementSpace &fespace) +{ + const auto map_type = fespace.GetFEColl().GetMapType(fespace.Dimension()); + if (map_type == mfem::FiniteElement::VALUE) + { + // H1 space for voltage and electric field energy. + { + MaterialPropertyCoefficient epsilon_func(mat_op.GetAttributeToMaterial(), + mat_op.GetPermittivityReal()); + BilinearForm m(fespace); + m.AddDomainIntegrator(epsilon_func); + M_elec = m.PartialAssemble(); + D.SetSize(M_elec->Height()); + D.UseDevice(true); + } + + for (const auto &[idx, data] : iodata.domains.postpro.energy) + { + std::unique_ptr M_elec_i; + { + MaterialPropertyCoefficient epsilon_func(mat_op.GetAttributeToMaterial(), + mat_op.GetPermittivityReal()); + epsilon_func.RestrictCoefficient(mat_op.GetCeedAttributes(data.attributes)); + BilinearForm m(fespace); + m.AddDomainIntegrator(epsilon_func); + M_elec_i = m.PartialAssemble(); + } + M_i.emplace(idx, std::make_pair(std::move(M_elec_i), nullptr)); + } + } + else if (map_type == mfem::FiniteElement::H_CURL) + { + // H(curl) space for magnetic vector potential and magnetic field energy. 
+ { + MaterialPropertyCoefficient muinv_func(mat_op.GetAttributeToMaterial(), + mat_op.GetInvPermeability()); + BilinearForm m(fespace); + m.AddDomainIntegrator(muinv_func); + M_mag = m.PartialAssemble(); + H.SetSize(M_mag->Height()); + H.UseDevice(true); + } + + for (const auto &[idx, data] : iodata.domains.postpro.energy) + { + std::unique_ptr M_mag_i; + { + MaterialPropertyCoefficient muinv_func(mat_op.GetAttributeToMaterial(), + mat_op.GetInvPermeability()); + muinv_func.RestrictCoefficient(mat_op.GetCeedAttributes(data.attributes)); + BilinearForm m(fespace); + m.AddDomainIntegrator(muinv_func); + M_mag_i = m.PartialAssemble(); + } + M_i.emplace(idx, std::make_pair(nullptr, std::move(M_mag_i))); + } + } + else + { + MFEM_ABORT("Unexpected finite element space type for domain energy postprocessing!"); + } +} + +double DomainPostOperator::GetElectricFieldEnergy(const GridFunction &E) const +{ + if (M_elec) + { + M_elec->Mult(E.Real(), D); + double dot = linalg::LocalDot(E.Real(), D); + if (E.HasImag()) + { + M_elec->Mult(E.Imag(), D); + dot += linalg::LocalDot(E.Imag(), D); + } + Mpi::GlobalSum(1, &dot, E.GetComm()); + return 0.5 * dot; + } + MFEM_ABORT( + "Domain postprocessing is not configured for electric field energy calculation!"); + return 0.0; +} + +double DomainPostOperator::GetMagneticFieldEnergy(const GridFunction &B) const +{ + if (M_mag) + { + M_mag->Mult(B.Real(), H); + double dot = linalg::LocalDot(B.Real(), H); + if (B.HasImag()) + { + M_mag->Mult(B.Imag(), H); + dot += linalg::LocalDot(B.Imag(), H); + } + Mpi::GlobalSum(1, &dot, B.GetComm()); + return 0.5 * dot; + } + MFEM_ABORT( + "Domain postprocessing is not configured for magnetic field energy calculation!"); + return 0.0; +} + +double DomainPostOperator::GetDomainElectricFieldEnergy(int idx, + const GridFunction &E) const +{ + // Compute the electric field energy integral for only a portion of the domain. + auto it = M_i.find(idx); + MFEM_VERIFY(it != M_i.end(), + "Invalid domain index when postprocessing domain electric field energy!"); + if (!it->second.first) + { + return 0.0; + } + it->second.first->Mult(E.Real(), D); + double dot = linalg::LocalDot(E.Real(), D); + if (E.HasImag()) + { + it->second.first->Mult(E.Imag(), D); + dot += linalg::LocalDot(E.Imag(), D); + } + Mpi::GlobalSum(1, &dot, E.GetComm()); + return 0.5 * dot; +} + +double DomainPostOperator::GetDomainMagneticFieldEnergy(int idx, + const GridFunction &B) const +{ + // Compute the magnetic field energy integral for only a portion of the domain. + auto it = M_i.find(idx); + MFEM_VERIFY(it != M_i.end(), + "Invalid domain index when postprocessing domain magnetic field energy!"); + if (!it->second.second) + { + return 0.0; + } + it->second.second->Mult(B.Real(), H); + double dot = linalg::LocalDot(B.Real(), H); + if (B.HasImag()) + { + it->second.second->Mult(B.Imag(), H); + dot += linalg::LocalDot(B.Imag(), H); + } + Mpi::GlobalSum(1, &dot, B.GetComm()); + return 0.5 * dot; +} + +} // namespace palace diff --git a/palace/models/domainpostoperator.hpp b/palace/models/domainpostoperator.hpp index 7c31014cba..5460313493 100644 --- a/palace/models/domainpostoperator.hpp +++ b/palace/models/domainpostoperator.hpp @@ -1,55 +1,56 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
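Taking stock of the domainpostoperator.cpp changes above: each energy is now a weighted inner product with a partially assembled mass operator, with E_elec = 1/2 Re{∫_Ω Dᴴ E dV} evaluated as 0.5 * (eᴴ M_eps e) and the real and imaginary phasor parts accumulated separately. A minimal caller-side sketch follows, assuming the operators and field solutions already exist; the ReportEnergies helper and its printing are illustrative only, not part of this change.

#include "fem/gridfunction.hpp"
#include "models/domainpostoperator.hpp"
#include "utils/communication.hpp"

namespace palace
{

// Hypothetical postprocessing helper: dom_post_op, E, and B are provided by the caller.
void ReportEnergies(const DomainPostOperator &dom_post_op, const GridFunction &E,
                    const GridFunction &B)
{
  // Total field energies over the whole computational domain (already MPI-reduced).
  const double E_elec = dom_post_op.GetElectricFieldEnergy(E);
  const double E_mag = dom_post_op.GetMagneticFieldEnergy(B);
  Mpi::Print(" Total: E_elec = {:.3e}, E_mag = {:.3e}\n", E_elec, E_mag);

  // Energies restricted to the configured domain postprocessing indices; the keys of
  // M_i are exactly those indices.
  for (const auto &[idx, M] : dom_post_op.M_i)
  {
    Mpi::Print(" Domain {:d}: E_elec = {:.3e}, E_mag = {:.3e}\n", idx,
               dom_post_op.GetDomainElectricFieldEnergy(idx, E),
               dom_post_op.GetDomainMagneticFieldEnergy(idx, B));
  }
}

}  // namespace palace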
-// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_MODELS_DOMAIN_POST_OPERATOR_HPP -#define PALACE_MODELS_DOMAIN_POST_OPERATOR_HPP - -#include -#include -#include -#include -#include "linalg/operator.hpp" - -namespace palace -{ - -class IoData; -class MaterialOperator; - -// -// A class handling domain postprocessing. -// -class DomainPostOperator -{ -private: - // Bilinear forms for computing field energy integrals over domains. - std::unique_ptr M_ND, M_RT; - std::map, std::unique_ptr>> M_NDi; - - // Temporary vectors for inner product calculations. - mutable mfem::Vector D, H; - -public: - DomainPostOperator(const IoData &iodata, const MaterialOperator &mat_op, - const mfem::ParFiniteElementSpace *nd_fespace, - const mfem::ParFiniteElementSpace *rt_fespace); - - // Access underlying bulk loss postprocessing data structures (for keys). - const auto &GetEps() const { return M_NDi; } - auto SizeEps() const { return M_NDi.size(); } - - // Get volume integrals computing bulk electric or magnetic field energy. - double GetElectricFieldEnergy(const mfem::ParComplexGridFunction &E) const; - double GetElectricFieldEnergy(const mfem::ParGridFunction &E) const; - double GetMagneticFieldEnergy(const mfem::ParComplexGridFunction &B) const; - double GetMagneticFieldEnergy(const mfem::ParGridFunction &B) const; - double GetDomainElectricFieldEnergy(int idx, const mfem::ParComplexGridFunction &E) const; - double GetDomainElectricFieldEnergy(int idx, const mfem::ParGridFunction &E) const; - double GetDomainElectricFieldEnergyLoss(int idx, - const mfem::ParComplexGridFunction &E) const; - double GetDomainElectricFieldEnergyLoss(int idx, const mfem::ParGridFunction &E) const; -}; - -} // namespace palace - -#endif // PALACE_MODELS_DOMAIN_POST_OPERATOR_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_MODELS_DOMAIN_POST_OPERATOR_HPP +#define PALACE_MODELS_DOMAIN_POST_OPERATOR_HPP + +#include +#include +#include +#include "linalg/operator.hpp" +#include "linalg/vector.hpp" + +namespace palace +{ + +class GridFunction; +class FiniteElementSpace; +class IoData; +class MaterialOperator; + +// +// Class to handle domain energy postprocessing. We use a leading factor of 1/2 instead of +// 1/4 even though the eigenmodes are peak phasors and not RMS normalized because the same +// peak phasors are used to compute the voltages/currents which are 2x the time-averaged +// values. This correctly yields an EPR of 1 in cases where expected. +// +class DomainPostOperator +{ +public: + // Temporary vectors for inner product calculations. + mutable Vector D, H; + + // Bilinear forms for computing field energy integrals over domains. + std::unique_ptr M_elec, M_mag; + std::map, std::unique_ptr>> M_i; + + DomainPostOperator(const IoData &iodata, const MaterialOperator &mat_op, + const FiniteElementSpace &nd_fespace, + const FiniteElementSpace &rt_fespace); + DomainPostOperator(const IoData &iodata, const MaterialOperator &mat_op, + const FiniteElementSpace &fespace); + + // Get volume integrals computing the electric or magnetic field energy in the entire + // domain. + double GetElectricFieldEnergy(const GridFunction &E) const; + double GetMagneticFieldEnergy(const GridFunction &B) const; + + // Get volume integrals for the electric or magnetic field energy in a portion of the + // domain. 
+ double GetDomainElectricFieldEnergy(int idx, const GridFunction &E) const; + double GetDomainMagneticFieldEnergy(int idx, const GridFunction &B) const; +}; + +} // namespace palace + +#endif // PALACE_MODELS_DOMAIN_POST_OPERATOR_HPP diff --git a/palace/models/farfieldboundaryoperator.cpp b/palace/models/farfieldboundaryoperator.cpp index 3a8b5e6245..d17d794547 100644 --- a/palace/models/farfieldboundaryoperator.cpp +++ b/palace/models/farfieldboundaryoperator.cpp @@ -1,102 +1,134 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "farfieldboundaryoperator.hpp" - -#include "fem/coefficient.hpp" -#include "models/materialoperator.hpp" -#include "utils/communication.hpp" -#include "utils/geodata.hpp" -#include "utils/iodata.hpp" -#include "utils/prettyprint.hpp" - -namespace palace -{ - -FarfieldBoundaryOperator::FarfieldBoundaryOperator(const IoData &iodata, - const MaterialOperator &mat, - const mfem::ParMesh &mesh) - : mat_op(mat) -{ - // Set up impedance boundary conditions. - SetUpBoundaryProperties(iodata, mesh); - - // Print out BC info for all farfield attributes. - if (farfield_marker.Size() && farfield_marker.Max() > 0) - { - Mpi::Print("\nConfiguring Robin absorbing BC (order {:d}) at attributes:\n", order); - utils::PrettyPrintMarker(farfield_marker); - } -} - -void FarfieldBoundaryOperator::SetUpBoundaryProperties(const IoData &iodata, - const mfem::ParMesh &mesh) -{ - // Check that impedance boundary attributes have been specified correctly. - int bdr_attr_max = mesh.bdr_attributes.Size() ? mesh.bdr_attributes.Max() : 0; - if (!iodata.boundaries.farfield.empty()) - { - mfem::Array bdr_attr_marker(bdr_attr_max); - bdr_attr_marker = 0; - for (auto attr : mesh.bdr_attributes) - { - bdr_attr_marker[attr - 1] = 1; - } - for (auto attr : iodata.boundaries.farfield.attributes) - { - MFEM_VERIFY(attr > 0 && attr <= bdr_attr_max, - "Absorbing boundary attribute tags must be non-negative and correspond " - "to attributes in the mesh!"); - MFEM_VERIFY(bdr_attr_marker[attr - 1], - "Unknown absorbing boundary attribute " << attr << "!"); - } - } - - // Set the order of the farfield boundary condition. - order = iodata.boundaries.farfield.order; - - // Mark selected boundary attributes from the mesh as farfield. - MFEM_VERIFY(iodata.boundaries.farfield.attributes.empty() || order < 2 || - iodata.problem.type == config::ProblemData::Type::DRIVEN, - "Second-order farfield boundaries are only available for frequency " - "domain driven simulations!"); - mesh::AttrToMarker(bdr_attr_max, iodata.boundaries.farfield.attributes, farfield_marker); -} - -void FarfieldBoundaryOperator::AddDampingBdrCoefficients(double coef, - SumMatrixCoefficient &fb) -{ - // First-order absorbing boundary condition. - if (farfield_marker.Size() && farfield_marker.Max() > 0) - { - constexpr auto MatType = MaterialPropertyType::INV_Z0; - constexpr auto ElemType = MeshElementType::BDR_ELEMENT; - fb.AddCoefficient( - std::make_unique>(mat_op, coef), - farfield_marker); - } -} - -void FarfieldBoundaryOperator::AddExtraSystemBdrCoefficients(double omega, - SumCoefficient &dfbr, - SumCoefficient &dfbi) -{ - // Contribution for second-order absorbing BC. See Jin Section 9.3 for reference. The β - // coefficient for the second-order ABC is 1/(2ik+2/r). Taking the radius of curvature as - // infinity (plane wave scattering), the r-dependence vanishes and the contribution is - // purely imaginary. 
Multiplying through by μ⁻¹ we get the material coefficient to ω as
-  // 1 / (μ √με). Also, this implementation ignores the divergence term ∇⋅Eₜ, as COMSOL
-  // does as well.
-  if (farfield_marker.Size() && farfield_marker.Max() > 0 && order > 1)
-  {
-    constexpr auto MatType = MaterialPropertyType::INV_PERMEABILITY_C0;
-    constexpr auto ElemType = MeshElementType::BDR_ELEMENT;
-    dfbi.AddCoefficient(
-        std::make_unique(
-            std::make_unique>(mat_op,
-                                            0.5 / omega)),
-        farfield_marker);
-  }
-}
-
-} // namespace palace
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "farfieldboundaryoperator.hpp"
+
+#include 
+#include "linalg/densematrix.hpp"
+#include "models/materialoperator.hpp"
+#include "utils/communication.hpp"
+#include "utils/geodata.hpp"
+#include "utils/iodata.hpp"
+#include "utils/prettyprint.hpp"
+
+namespace palace
+{
+
+FarfieldBoundaryOperator::FarfieldBoundaryOperator(const IoData &iodata,
+                                                   const MaterialOperator &mat_op,
+                                                   const mfem::ParMesh &mesh)
+  : mat_op(mat_op), farfield_attr(SetUpBoundaryProperties(iodata, mesh))
+{
+  // Print out BC info for all farfield attributes.
+  if (farfield_attr.Size())
+  {
+    Mpi::Print("\nConfiguring Robin absorbing BC (order {:d}) at attributes:\n", order);
+    std::sort(farfield_attr.begin(), farfield_attr.end());
+    utils::PrettyPrint(farfield_attr);
+  }
+}
+
+mfem::Array
+FarfieldBoundaryOperator::SetUpBoundaryProperties(const IoData &iodata,
+                                                  const mfem::ParMesh &mesh)
+{
+  // Check that impedance boundary attributes have been specified correctly.
+  int bdr_attr_max = mesh.bdr_attributes.Size() ? mesh.bdr_attributes.Max() : 0;
+  mfem::Array bdr_attr_marker;
+  if (!iodata.boundaries.farfield.empty())
+  {
+    bdr_attr_marker.SetSize(bdr_attr_max);
+    bdr_attr_marker = 0;
+    for (auto attr : mesh.bdr_attributes)
+    {
+      bdr_attr_marker[attr - 1] = 1;
+    }
+    std::set bdr_warn_list;
+    for (auto attr : iodata.boundaries.farfield.attributes)
+    {
+      // MFEM_VERIFY(attr > 0 && attr <= bdr_attr_max,
+      //             "Absorbing boundary attribute tags must be non-negative and correspond
+      //             " "to attributes in the mesh!");
+      // MFEM_VERIFY(bdr_attr_marker[attr - 1],
+      //             "Unknown absorbing boundary attribute " << attr << "!");
+      if (attr <= 0 || attr > bdr_attr_max || !bdr_attr_marker[attr - 1])
+      {
+        bdr_warn_list.insert(attr);
+      }
+    }
+    if (!bdr_warn_list.empty())
+    {
+      Mpi::Print("\n");
+      Mpi::Warning(
+          "Unknown absorbing boundary attributes!\nSolver will just ignore them!");
+      utils::PrettyPrint(bdr_warn_list, "Boundary attribute list:");
+      Mpi::Print("\n");
+    }
+  }
+
+  // Set the order of the farfield boundary condition.
+  order = iodata.boundaries.farfield.order;
+
+  // Mark selected boundary attributes from the mesh as farfield.
+  mfem::Array farfield_bcs;
+  farfield_bcs.Reserve(static_cast(iodata.boundaries.farfield.attributes.size()));
+  for (auto attr : iodata.boundaries.farfield.attributes)
+  {
+    if (attr <= 0 || attr > bdr_attr_max || !bdr_attr_marker[attr - 1])
+    {
+      continue; // Can just ignore if wrong
+    }
+    farfield_bcs.Append(attr);
+  }
+  MFEM_VERIFY(farfield_bcs.Size() == 0 || order < 2 ||
+                  iodata.problem.type == ProblemType::DRIVEN ||
+                  iodata.problem.type == ProblemType::EIGENMODE,
+              "Second-order farfield boundaries are only available for frequency "
+              "domain simulations!");
+  return farfield_bcs;
+}
+
+void FarfieldBoundaryOperator::AddDampingBdrCoefficients(double coeff,
+                                                         MaterialPropertyCoefficient &fb)
+{
+  // First-order absorbing boundary condition.
+  if (farfield_attr.Size())
+  {
+    MaterialPropertyCoefficient invz0_func(mat_op.GetBdrAttributeToMaterial(),
+                                           mat_op.GetInvImpedance());
+    invz0_func.RestrictCoefficient(mat_op.GetCeedBdrAttributes(farfield_attr));
+    fb.AddCoefficient(invz0_func.GetAttributeToMaterial(),
+                      invz0_func.GetMaterialProperties(), coeff);
+  }
+}
+
+void FarfieldBoundaryOperator::AddExtraSystemBdrCoefficients(
+    double omega, MaterialPropertyCoefficient &dfbr, MaterialPropertyCoefficient &dfbi)
+{
+  // Contribution for second-order absorbing BC. See Jin Section 9.3 for reference. The β
+  // coefficient for the second-order ABC is 1/(2ik+2/r). Taking the radius of curvature
+  // as infinity (plane wave scattering), the r-dependence vanishes and the contribution
+  // is purely imaginary. Multiplying through by μ⁻¹ we get the material coefficient to ω
+  // as 1 / (μ √(με)). Also, this implementation ignores the divergence term ∇⋅Eₜ, as
+  // COMSOL does as well.
+  if (farfield_attr.Size() && order > 1)
+  {
+    mfem::DenseTensor muinvc0 =
+        linalg::Mult(mat_op.GetInvPermeability(), mat_op.GetLightSpeed());
+    MaterialPropertyCoefficient muinvc0_func(mat_op.GetBdrAttributeToMaterial(), muinvc0);
+    muinvc0_func.RestrictCoefficient(mat_op.GetCeedBdrAttributes(farfield_attr));
+
+    // Instead of getting the correct normal of farfield boundary elements, just pick the
+    // first element normal. This is fine as long as the farfield material properties
+    // are not anisotropic.
+    mfem::Vector normal(mat_op.SpaceDimension());
+    normal = 0.0;
+    normal(0) = 1.0;
+    muinvc0_func.NormalProjectedCoefficient(normal);
+
+    dfbi.AddCoefficient(muinvc0_func.GetAttributeToMaterial(),
+                        muinvc0_func.GetMaterialProperties(), 0.5 / omega);
+  }
+}
+
+} // namespace palace
diff --git a/palace/models/farfieldboundaryoperator.hpp b/palace/models/farfieldboundaryoperator.hpp
index d9e068339f..584b33edee 100644
--- a/palace/models/farfieldboundaryoperator.hpp
+++ b/palace/models/farfieldboundaryoperator.hpp
@@ -1,52 +1,52 @@
-// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
-// SPDX-License-Identifier: Apache-2.0
-
-#ifndef PALACE_MODELS_FARFIELD_BOUNDARY_OPERATOR_HPP
-#define PALACE_MODELS_FARFIELD_BOUNDARY_OPERATOR_HPP
-
-#include 
-
-namespace palace
-{
-
-class IoData;
-class MaterialOperator;
-class SumCoefficient;
-class SumMatrixCoefficient;
-
-//
-// A class handling farfield, or absorbing, boundaries.
-//
-class FarfieldBoundaryOperator
-{
-private:
-  // Reference to input data (not owned).
-  const MaterialOperator &mat_op;
-
-  // First- or second-order absorbing boundary condition.
-  int order;
-
-  // Marker for all absorbing boundary condition attributes.
-  mfem::Array farfield_marker;
-  void SetUpBoundaryProperties(const IoData &iodata, const mfem::ParMesh &mesh);
-
-public:
-  FarfieldBoundaryOperator(const IoData &iodata, const MaterialOperator &mat,
-                           const mfem::ParMesh &mesh);
-
-  // Returns order of absorbing BC approximation.
-  int GetOrder() const { return order; }
-
-  // Returns array marking farfield BC attributes.
-  const mfem::Array &GetMarker() const { return farfield_marker; }
-
-  // Add contributions to system matrices from first- or second-order absorbing boundary
-  // condition.
-  void AddDampingBdrCoefficients(double coef, SumMatrixCoefficient &fb);
-  void AddExtraSystemBdrCoefficients(double omega, SumCoefficient &dfbr,
-                                     SumCoefficient &dfbi);
-};
-
-} // namespace palace
-
-#endif // PALACE_MODELS_FARFIELD_BOUNDARY_OPERATOR_HPP
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
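As a reading aid for AddExtraSystemBdrCoefficients() above, the purely imaginary coefficient it assembles can be written out step by step (editorial sketch; the overall sign depends on the assumed time-harmonic convention):

// Second-order ABC coefficient (Jin, Section 9.3), worked out for r → ∞:
//   β = 1 / (2ik + 2/r)  →  β = 1 / (2ik) = -i / (2k),   with k = ω √(με).
// Multiplying through by μ⁻¹ gives the purely imaginary material coefficient
//   μ⁻¹ β = -i / (2 ω μ √(με)) = -i c / (2 ω μ),
// which is why the implementation forms muinvc0 = μ⁻¹ ⋅ c from GetInvPermeability() and
// GetLightSpeed(), and adds it to the imaginary-part coefficient dfbi scaled by 0.5 / ω.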
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_MODELS_FARFIELD_BOUNDARY_OPERATOR_HPP +#define PALACE_MODELS_FARFIELD_BOUNDARY_OPERATOR_HPP + +#include + +namespace palace +{ + +class IoData; +class MaterialOperator; +class MaterialPropertyCoefficient; + +// +// A class handling farfield, or absorbing, boundaries. +// +class FarfieldBoundaryOperator +{ +private: + // Reference to material property data (not owned). + const MaterialOperator &mat_op; + + // List of all absorbing boundary condition attributes. + mfem::Array farfield_attr; + + // First- or second-order absorbing boundary condition. + int order; + + mfem::Array SetUpBoundaryProperties(const IoData &iodata, const mfem::ParMesh &mesh); + +public: + FarfieldBoundaryOperator(const IoData &iodata, const MaterialOperator &mat_op, + const mfem::ParMesh &mesh); + + // Returns array of farfield BC attributes. + const auto &GetAttrList() const { return farfield_attr; } + + // Returns order of absorbing BC approximation. + int GetOrder() const { return order; } + + // Add contributions to system matrices from first- or second-order absorbing boundary + // condition. + void AddDampingBdrCoefficients(double coeff, MaterialPropertyCoefficient &fb); + void AddExtraSystemBdrCoefficients(double omega, MaterialPropertyCoefficient &dfbr, + MaterialPropertyCoefficient &dfbi); +}; + +} // namespace palace + +#endif // PALACE_MODELS_FARFIELD_BOUNDARY_OPERATOR_HPP diff --git a/palace/models/laplaceoperator.cpp b/palace/models/laplaceoperator.cpp index f7b89645bc..70a74f92cf 100644 --- a/palace/models/laplaceoperator.cpp +++ b/palace/models/laplaceoperator.cpp @@ -1,227 +1,275 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "laplaceoperator.hpp" - -#include "fem/bilinearform.hpp" -#include "fem/coefficient.hpp" -#include "fem/integrator.hpp" -#include "fem/multigrid.hpp" -#include "linalg/rap.hpp" -#include "utils/communication.hpp" -#include "utils/geodata.hpp" -#include "utils/iodata.hpp" -#include "utils/prettyprint.hpp" - -namespace palace -{ - -namespace -{ - -mfem::Array SetUpBoundaryProperties(const IoData &iodata, const mfem::ParMesh &mesh) -{ - int bdr_attr_max = mesh.bdr_attributes.Size() ? mesh.bdr_attributes.Max() : 0; - if (!iodata.boundaries.pec.empty() || !iodata.boundaries.lumpedport.empty()) - { - // Check that boundary attributes have been specified correctly. 
- mfem::Array bdr_attr_marker(bdr_attr_max); - bdr_attr_marker = 0; - for (auto attr : mesh.bdr_attributes) - { - bdr_attr_marker[attr - 1] = 1; - } - bool first = true; - for (auto attr : iodata.boundaries.pec.attributes) - { - // MFEM_VERIFY(attr > 0 && attr <= bdr_attr_max, - // "Ground boundary attribute tags must be non-negative and correspond to - // " attributes in the mesh!"); - // MFEM_VERIFY(bdr_attr_marker[attr-1], - // "Unknown ground boundary attribute " << attr << "!"); - if (attr <= 0 || attr > bdr_attr_marker.Size() || !bdr_attr_marker[attr - 1]) - { - if (first) - { - Mpi::Print("\n"); - first = false; - } - Mpi::Warning( - "Unknown ground boundary attribute {:d}!\nSolver will just ignore it!\n", attr); - } - } - for (const auto &[idx, data] : iodata.boundaries.lumpedport) - { - for (const auto &elem : data.elements) - { - for (auto attr : elem.attributes) - { - MFEM_VERIFY( - attr > 0 && attr <= bdr_attr_max, - "Terminal boundary attribute tags must be non-negative and correspond to " - "attributes in the mesh!"); - MFEM_VERIFY(bdr_attr_marker[attr - 1] > 0, - "Unknown terminal boundary attribute " << attr << "!"); - } - } - } - } - - // Mark selected boundary attributes from the mesh as essential (Dirichlet). - mfem::Array dbc_bcs, dbc_marker; - for (auto attr : iodata.boundaries.pec.attributes) - { - if (attr <= 0 || attr > bdr_attr_max) - { - continue; // Can just ignore if wrong - } - dbc_bcs.Append(attr); - } - for (const auto &[idx, data] : iodata.boundaries.lumpedport) - { - for (const auto &elem : data.elements) - { - for (auto attr : elem.attributes) - { - dbc_bcs.Append(attr); - } - } - } - MFEM_VERIFY(dbc_bcs.Size() > 0, - "Electrostatic problem is ill-posed without any Dirichlet boundaries!"); - mesh::AttrToMarker(bdr_attr_max, dbc_bcs, dbc_marker); - return dbc_marker; -} - -std::map> ConstructSources(const IoData &iodata) -{ - // Construct mapping from terminal index to list of associated attributes. - std::map> source_attr_lists; - for (const auto &[idx, data] : iodata.boundaries.lumpedport) - { - mfem::Array &attr_list = source_attr_lists[idx]; - for (const auto &elem : data.elements) - { - for (auto attr : elem.attributes) - { - attr_list.Append(attr); - } - } - } - return source_attr_lists; -} - -std::map ConstructBCValues(const IoData &iodata) -{ - std::map values; - for (const auto &[idx, data] : iodata.boundaries.lumpedport) - { - values[idx] = data.voltage; - } - return values; -} - -} // namespace - -LaplaceOperator::LaplaceOperator(const IoData &iodata, - const std::vector> &mesh) - : pa_order_threshold(iodata.solver.pa_order_threshold), skip_zeros(false), - print_hdr(true), dbc_marker(SetUpBoundaryProperties(iodata, *mesh.back())), - h1_fecs(fem::ConstructFECollections( - iodata.solver.order, mesh.back()->Dimension(), iodata.solver.linear.mg_max_levels, - iodata.solver.linear.mg_coarsen_type, false)), - nd_fec(std::make_unique(iodata.solver.order, - mesh.back()->Dimension())), - h1_fespaces(fem::ConstructFiniteElementSpaceHierarchy( - iodata.solver.linear.mg_max_levels, mesh, h1_fecs, &dbc_marker, &dbc_tdof_lists)), - nd_fespace(h1_fespaces.GetFinestFESpace(), mesh.back().get(), nd_fec.get()), - mat_op(iodata, *mesh.back()), source_attr_lists(ConstructSources(iodata)), dbc_values(ConstructBCValues(iodata)) -{ - // Print essential BC information. 
- if (dbc_marker.Size() && dbc_marker.Max() > 0) - { - Mpi::Print("\nConfiguring Dirichlet BC at attributes:\n"); - utils::PrettyPrintMarker(dbc_marker); - } -} - -std::unique_ptr LaplaceOperator::GetStiffnessMatrix() -{ - if (print_hdr) - { - Mpi::Print("\nAssembling system matrices, number of global unknowns:\n" - " H1: {:d}, ND: {:d}\n Operator assembly level: {}\n", - GetH1Space().GlobalTrueVSize(), GetNDSpace().GlobalTrueVSize(), - GetH1Space().GetMaxElementOrder() > pa_order_threshold ? "Partial" : "Full"); - Mpi::Print("\nAssembling multigrid hierarchy:\n"); - } - auto K = std::make_unique(GetH1Spaces().GetNumLevels()); - for (std::size_t l = 0; l < GetH1Spaces().GetNumLevels(); l++) - { - // Force coarse level operator to be fully assembled always. - const auto &h1_fespace_l = GetH1Spaces().GetFESpaceAtLevel(l); - if (print_hdr) - { - Mpi::Print(" Level {:d} (p = {:d}): {:d} unknowns", l, - h1_fespace_l.GetMaxElementOrder(), h1_fespace_l.GlobalTrueVSize()); - } - constexpr auto MatType = MaterialPropertyType::PERMITTIVITY_REAL; - MaterialPropertyCoefficient epsilon_func(mat_op); - BilinearForm k(h1_fespace_l); - k.AddDomainIntegrator(epsilon_func); - auto K_l = std::make_unique( - k.Assemble((l > 0) ? pa_order_threshold : 99, skip_zeros), h1_fespace_l); - if (print_hdr) - { - if (const auto *k_spm = - dynamic_cast(&K_l->LocalOperator())) - { - HYPRE_BigInt nnz = k_spm->NumNonZeroElems(); - Mpi::GlobalSum(1, &nnz, h1_fespace_l.GetComm()); - Mpi::Print(", {:d} NNZ\n", nnz); - } - else - { - Mpi::Print("\n"); - } - } - K_l->SetEssentialTrueDofs(dbc_tdof_lists[l], Operator::DiagonalPolicy::DIAG_ONE); - K->AddOperator(std::move(K_l)); - } - print_hdr = false; - return K; -} - -void LaplaceOperator::GetExcitationVector(int idx, const Operator &K, Vector &X, - Vector &RHS) -{ - // Apply the Dirichlet BCs to the solution vector: V = 1 on terminal boundaries with the - // given index, V = 0 on all ground and other terminal boundaries. - mfem::ParGridFunction x(&GetH1Space()); - x = 0.0; - - // Get a marker of all boundary attributes with the given source surface index. - mfem::Array source_marker; - const mfem::Array &source_list = source_attr_lists[idx]; - mesh::AttrToMarker(dbc_marker.Size(), source_list, source_marker); - mfem::ConstantCoefficient dbc_val(1.0); - std::map::iterator it = dbc_values.find(idx); - if (it != dbc_values.end()) { - dbc_val = mfem::ConstantCoefficient(it->second); - } - x.ProjectBdrCoefficient(dbc_val, source_marker); // Values are only correct on master - - // Eliminate the essential BC to get the RHS vector. - X.SetSize(GetH1Space().GetTrueVSize()); - RHS.SetSize(GetH1Space().GetTrueVSize()); - X = 0.0; - RHS = 0.0; - x.ParallelProject(X); // Restrict to the true dofs - const auto *mg_K = dynamic_cast(&K); - const auto *PtAP_K = mg_K ? dynamic_cast(&mg_K->GetFinestOperator()) - : dynamic_cast(&K); - MFEM_VERIFY(PtAP_K, "LaplaceOperator requires ParOperator for RHS elimination!"); - PtAP_K->EliminateRHS(X, RHS); -} - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
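The new laplaceoperator.cpp below pairs GetStiffnessMatrix() with per-terminal excitation vectors. A minimal sketch of the intended call sequence, assuming a KspSolver (or an equivalent linear solver) that solves K X = RHS; the SolveTerminals helper, the ksp argument, and the linalg/ksp.hpp include are assumptions for illustration, not part of this change.

#include "linalg/ksp.hpp"
#include "models/laplaceoperator.hpp"

namespace palace
{

// Hypothetical driver: laplace_op and ksp are provided by the caller, and the solver is
// assumed to have its operators configured elsewhere.
void SolveTerminals(LaplaceOperator &laplace_op, KspSolver &ksp)
{
  auto K = laplace_op.GetStiffnessMatrix();
  Vector X, RHS;  // Sized inside GetExcitationVector to the finest H1 true-dof count.
  for (const auto &[idx, attr_list] : laplace_op.GetSources())
  {
    // V = GetBCValues().at(idx) on terminal idx, V = 0 on ground and all other
    // terminals, with the essential BCs eliminated into RHS. The attribute list itself
    // is not needed here; the excitation is selected by terminal index.
    laplace_op.GetExcitationVector(idx, *K, X, RHS);
    ksp.Mult(RHS, X);
  }
}

}  // namespace palace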
+// SPDX-License-Identifier: Apache-2.0 + +#include "laplaceoperator.hpp" + +#include +#include "fem/bilinearform.hpp" +#include "fem/integrator.hpp" +#include "fem/mesh.hpp" +#include "fem/multigrid.hpp" +#include "linalg/hypre.hpp" +#include "linalg/rap.hpp" +#include "utils/communication.hpp" +#include "utils/geodata.hpp" +#include "utils/iodata.hpp" +#include "utils/prettyprint.hpp" + +namespace palace +{ + +LaplaceOperator::LaplaceOperator(const IoData &iodata, + const std::vector> &mesh) + : print_hdr(true), dbc_attr(SetUpBoundaryProperties(iodata, *mesh.back())), + h1_fecs(fem::ConstructFECollections( + iodata.solver.order, mesh.back()->Dimension(), iodata.solver.linear.mg_max_levels, + iodata.solver.linear.mg_coarsening, false)), + nd_fec(std::make_unique(iodata.solver.order, + mesh.back()->Dimension())), + rt_fecs(fem::ConstructFECollections( + iodata.solver.order - 1, mesh.back()->Dimension(), + iodata.solver.linear.estimator_mg ? iodata.solver.linear.mg_max_levels : 1, + iodata.solver.linear.mg_coarsening, false)), + h1_fespaces(fem::ConstructFiniteElementSpaceHierarchy( + iodata.solver.linear.mg_max_levels, mesh, h1_fecs, &dbc_attr, &dbc_tdof_lists)), + nd_fespace(*mesh.back(), nd_fec.get()), + rt_fespaces(fem::ConstructFiniteElementSpaceHierarchy( + iodata.solver.linear.estimator_mg ? iodata.solver.linear.mg_max_levels : 1, mesh, + rt_fecs)), + mat_op(iodata, *mesh.back()), source_attr_lists(ConstructSources(iodata)), dbc_values(ConstructBCValues(iodata)) +{ + // Print essential BC information. + if (dbc_attr.Size()) + { + Mpi::Print("\nConfiguring Dirichlet BC at attributes:\n"); + utils::PrettyPrint(dbc_attr); + } +} + +mfem::Array LaplaceOperator::SetUpBoundaryProperties(const IoData &iodata, + const mfem::ParMesh &mesh) +{ + // Check that boundary attributes have been specified correctly. + int bdr_attr_max = mesh.bdr_attributes.Size() ? mesh.bdr_attributes.Max() : 0; + mfem::Array bdr_attr_marker; + if (!iodata.boundaries.pec.empty() || !iodata.boundaries.lumpedport.empty()) + { + bdr_attr_marker.SetSize(bdr_attr_max); + bdr_attr_marker = 0; + for (auto attr : mesh.bdr_attributes) + { + bdr_attr_marker[attr - 1] = 1; + } + std::set bdr_warn_list; + for (auto attr : iodata.boundaries.pec.attributes) + { + // MFEM_VERIFY(attr > 0 && attr <= bdr_attr_max, + // "Ground boundary attribute tags must be non-negative and correspond to + // " attributes in the mesh!"); + // MFEM_VERIFY(bdr_attr_marker[attr - 1], + // "Unknown ground boundary attribute " << attr << "!"); + if (attr <= 0 || attr > bdr_attr_max || !bdr_attr_marker[attr - 1]) + { + bdr_warn_list.insert(attr); + } + } + if (!bdr_warn_list.empty()) + { + Mpi::Print("\n"); + Mpi::Warning("Unknown ground boundary attributes!\nSolver will just ignore them!"); + utils::PrettyPrint(bdr_warn_list, "Boundary attribute list:"); + Mpi::Print("\n"); + } + for (const auto &[idx, data] : iodata.boundaries.lumpedport) + { + for (const auto &elem : data.elements) + { + for (auto attr : elem.attributes) + { + MFEM_VERIFY( + attr > 0 && attr <= bdr_attr_max, + "Terminal boundary attribute tags must be non-negative and correspond to " + "attributes in the mesh!"); + MFEM_VERIFY(bdr_attr_marker[attr - 1] > 0, + "Unknown terminal boundary attribute " << attr << "!"); + } + } + } + } + + // Mark selected boundary attributes from the mesh as essential (Dirichlet). 
+ mfem::Array dbc_bcs; + dbc_bcs.Reserve(static_cast(iodata.boundaries.pec.attributes.size()) + + static_cast(iodata.boundaries.lumpedport.size())); + for (auto attr : iodata.boundaries.pec.attributes) + { + if (attr <= 0 || attr > bdr_attr_max || !bdr_attr_marker[attr - 1]) + { + continue; // Can just ignore if wrong + } + dbc_bcs.Append(attr); + } + for (const auto &[idx, data] : iodata.boundaries.lumpedport) + { + for (const auto &elem : data.elements) + { + for (auto attr : elem.attributes) + { + dbc_bcs.Append(attr); + } + } + } + MFEM_VERIFY(dbc_bcs.Size() > 0, + "Electrostatic problem is ill-posed without any Dirichlet boundaries!"); + return dbc_bcs; +} + +std::map> LaplaceOperator::ConstructSources(const IoData &iodata) +{ + // Construct mapping from terminal index to list of associated attributes. + std::map> attr_lists; + for (const auto &[idx, data] : iodata.boundaries.lumpedport) + { + mfem::Array &attr_list = attr_lists[idx]; + attr_list.Reserve( + static_cast(data.elements.size())); // Average one attribute per element + for (const auto &elem : data.elements) + { + for (auto attr : elem.attributes) + { + attr_list.Append(attr); + } + } + } + return attr_lists; +} + +std::map LaplaceOperator::ConstructBCValues(const IoData &iodata) +{ + std::map values; + for (const auto &[idx, data] : iodata.boundaries.lumpedport) + { + values[idx] = data.voltage; + } + return values; +} + +namespace +{ + +void PrintHeader(const mfem::ParFiniteElementSpace &h1_fespace, + const mfem::ParFiniteElementSpace &nd_fespace, + const mfem::ParFiniteElementSpace &rt_fespace, bool &print_hdr) +{ + if (print_hdr) + { + Mpi::Print("\nAssembling system matrices, number of global unknowns:\n" + " H1 (p = {:d}): {:d}, ND (p = {:d}): {:d}, RT (p = {:d}): {:d}\n Operator " + "assembly level: {}\n", + h1_fespace.GetMaxElementOrder(), h1_fespace.GlobalTrueVSize(), + nd_fespace.GetMaxElementOrder(), nd_fespace.GlobalTrueVSize(), + rt_fespace.GetMaxElementOrder(), rt_fespace.GlobalTrueVSize(), + (h1_fespace.GetMaxElementOrder() >= BilinearForm::pa_order_threshold) + ? "Partial" + : "Full"); + + const auto &mesh = *h1_fespace.GetParMesh(); + const auto geom_types = mesh::CheckElements(mesh).GetGeomTypes(); + Mpi::Print(" Mesh geometries:\n"); + for (auto geom : geom_types) + { + const auto *fe = nd_fespace.FEColl()->FiniteElementForGeometry(geom); + MFEM_VERIFY(fe, "MFEM does not support ND spaces on geometry = " + << mfem::Geometry::Name[geom] << "!"); + const int q_order = fem::DefaultIntegrationOrder::Get(mesh, geom); + Mpi::Print(" {}: P = {:d}, Q = {:d} (quadrature order = {:d}){}\n", + mfem::Geometry::Name[geom], fe->GetDof(), + mfem::IntRules.Get(geom, q_order).GetNPoints(), q_order, + (geom == geom_types.back()) ? "" : ","); + } + + Mpi::Print("\nAssembling multigrid hierarchy:\n"); + } +} + +} // namespace + +std::unique_ptr LaplaceOperator::GetStiffnessMatrix() +{ + // When partially assembled, the coarse operators can reuse the fine operator quadrature + // data if the spaces correspond to the same mesh. 
+ PrintHeader(GetH1Space(), GetNDSpace(), GetRTSpace(), print_hdr); + + constexpr bool skip_zeros = false; + MaterialPropertyCoefficient epsilon_func(mat_op.GetAttributeToMaterial(), + mat_op.GetPermittivityReal()); + BilinearForm k(GetH1Space()); + k.AddDomainIntegrator(epsilon_func); + // k.AssembleQuadratureData(); + auto k_vec = k.Assemble(GetH1Spaces(), skip_zeros); + auto K = std::make_unique(GetH1Spaces().GetNumLevels()); + for (std::size_t l = 0; l < GetH1Spaces().GetNumLevels(); l++) + { + const auto &h1_fespace_l = GetH1Spaces().GetFESpaceAtLevel(l); + if (print_hdr) + { + Mpi::Print(" Level {:d} (p = {:d}): {:d} unknowns", l, + h1_fespace_l.GetMaxElementOrder(), h1_fespace_l.GlobalTrueVSize()); + if (const auto *k_spm = dynamic_cast(k_vec[l].get())) + { + HYPRE_BigInt nnz = k_spm->NNZ(); + Mpi::GlobalSum(1, &nnz, h1_fespace_l.GetComm()); + Mpi::Print(", {:d} NNZ\n", nnz); + } + else + { + Mpi::Print("\n"); + } + } + auto K_l = std::make_unique(std::move(k_vec[l]), h1_fespace_l); + K_l->SetEssentialTrueDofs(dbc_tdof_lists[l], Operator::DiagonalPolicy::DIAG_ONE); + K->AddOperator(std::move(K_l)); + } + + print_hdr = false; + return K; +} + +void LaplaceOperator::GetExcitationVector(int idx, const Operator &K, Vector &X, + Vector &RHS) +{ + // Apply the Dirichlet BCs to the solution vector: V = 1 on terminal boundaries with the + // given index, V = 0 on all ground and other terminal boundaries. + mfem::ParGridFunction x(&GetH1Space().Get()); + x = 0.0; + + // Get a marker of all boundary attributes with the given source surface index. + const mfem::ParMesh &mesh = GetMesh(); + int bdr_attr_max = mesh.bdr_attributes.Size() ? mesh.bdr_attributes.Max() : 0; + mfem::Array source_marker = mesh::AttrToMarker(bdr_attr_max, source_attr_lists[idx]); + mfem::ConstantCoefficient dbc_val(1.0); + std::map::iterator it = dbc_values.find(idx); + if (it != dbc_values.end()) + { + dbc_val = mfem::ConstantCoefficient(it->second); + } + x.ProjectBdrCoefficient(dbc_val, source_marker); // Values are only correct on master + + + + // Eliminate the essential BC to get the RHS vector. + X.SetSize(GetH1Space().GetTrueVSize()); + RHS.SetSize(GetH1Space().GetTrueVSize()); + X.UseDevice(true); + RHS.UseDevice(true); + X = 0.0; + RHS = 0.0; + x.ParallelProject(X); // Restrict to the true dofs + const auto *mg_K = dynamic_cast(&K); + const auto *PtAP_K = mg_K ? dynamic_cast(&mg_K->GetFinestOperator()) + : dynamic_cast(&K); + MFEM_VERIFY(PtAP_K, "LaplaceOperator requires ParOperator for RHS elimination!"); + PtAP_K->EliminateRHS(X, RHS); +} + +} // namespace palace diff --git a/palace/models/laplaceoperator.hpp b/palace/models/laplaceoperator.hpp index 4fe18a6128..74e1a8de8c 100644 --- a/palace/models/laplaceoperator.hpp +++ b/palace/models/laplaceoperator.hpp @@ -1,91 +1,104 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_MODELS_LAPLACE_OPERATOR_HPP -#define PALACE_MODELS_LAPLACE_OPERATOR_HPP - -#include -#include -#include -#include -#include "fem/fespace.hpp" -#include "linalg/operator.hpp" -#include "linalg/vector.hpp" -#include "models/materialoperator.hpp" - -namespace palace -{ - -class IoData; - -// -// A class handling discretization of Laplace problems for electrostatics. -// -class LaplaceOperator -{ -private: - const int pa_order_threshold; // Order above which to use partial assembly vs. full - const bool skip_zeros; // Skip zeros during full assembly of matrices - - // Helper variable for log file printing. 
- bool print_hdr; - - // Essential boundary condition markers. - mfem::Array dbc_marker; - std::vector> dbc_tdof_lists; - - // Objects defining the finite element spaces for the electrostatic potential (H1) and - // electric field (Nedelec) on the given mesh. - std::vector> h1_fecs; - std::unique_ptr nd_fec; - FiniteElementSpaceHierarchy h1_fespaces; - AuxiliaryFiniteElementSpace nd_fespace; - - // Operator for domain material properties. - MaterialOperator mat_op; - - // Boundary attributes for each terminal index. - std::map> source_attr_lists; - std::map dbc_values; - - -public: - LaplaceOperator(const IoData &iodata, - const std::vector> &mesh); - - // Return material operator for postprocessing. - const MaterialOperator &GetMaterialOp() const { return mat_op; } - - // Access source attribute lists. - const auto &GetSources() const { return source_attr_lists; } - const auto &GetBCValues() const { return dbc_values; } - - // Return the parallel finite element space objects. - auto &GetH1Spaces() { return h1_fespaces; } - const auto &GetH1Spaces() const { return h1_fespaces; } - auto &GetH1Space() { return h1_fespaces.GetFinestFESpace(); } - const auto &GetH1Space() const { return h1_fespaces.GetFinestFESpace(); } - auto &GetNDSpace() { return nd_fespace; } - const auto &GetNDSpace() const { return nd_fespace; } - - // Return the number of true (conforming) dofs on the finest H1 space. - auto GlobalTrueVSize() { return GetH1Space().GlobalTrueVSize(); } - - // Construct and return system matrix representing discretized Laplace operator for - // Gauss's law. - std::unique_ptr GetStiffnessMatrix(); - - // Construct and return the discrete gradient matrix. - const Operator &GetGradMatrix() const { return GetNDSpace().GetDiscreteInterpolator(); } - - // Assemble the solution boundary conditions and right-hand side vector for a nonzero - // prescribed voltage on the specified surface index. - void GetExcitationVector(int idx, const Operator &K, Vector &X, Vector &RHS); - - // Get the associated MPI communicator. - MPI_Comm GetComm() const { return GetH1Space().GetComm(); } -}; - -} // namespace palace - -#endif // PALACE_MODELS_LAPLACE_OPERATOR_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_MODELS_LAPLACE_OPERATOR_HPP +#define PALACE_MODELS_LAPLACE_OPERATOR_HPP + +#include +#include +#include +#include +#include "fem/fespace.hpp" +#include "linalg/operator.hpp" +#include "linalg/vector.hpp" +#include "models/materialoperator.hpp" + +namespace palace +{ + +class IoData; +class Mesh; + +// +// A class handling discretization of Laplace problems for electrostatics. +// +class LaplaceOperator +{ +private: + // Helper variable for log file printing. + bool print_hdr; + + // Essential boundary condition markers. + mfem::Array dbc_attr; + std::vector> dbc_tdof_lists; + + // Objects defining the finite element spaces for the electrostatic potential (H1) and + // electric field (Nedelec) on the given mesh. The RT spaces are used for error + // estimation. + std::vector> h1_fecs; + std::unique_ptr nd_fec; + std::vector> rt_fecs; + FiniteElementSpaceHierarchy h1_fespaces; + FiniteElementSpace nd_fespace; + FiniteElementSpaceHierarchy rt_fespaces; + + // Operator for domain material properties. + MaterialOperator mat_op; + + // Boundary attributes for each terminal index. 
+ std::map> source_attr_lists; + std::map dbc_values; + + mfem::Array SetUpBoundaryProperties(const IoData &iodata, const mfem::ParMesh &mesh); + std::map> ConstructSources(const IoData &iodata); + std::map ConstructBCValues(const IoData &iodata); + +public: + LaplaceOperator(const IoData &iodata, const std::vector> &mesh); + + // Return material operator for postprocessing. + const MaterialOperator &GetMaterialOp() const { return mat_op; } + + // Access source attribute lists. + const auto &GetSources() const { return source_attr_lists; } + const auto &GetBCValues() const { return dbc_values; } + + // Return the parallel finite element space objects. + auto &GetH1Spaces() { return h1_fespaces; } + const auto &GetH1Spaces() const { return h1_fespaces; } + auto &GetH1Space() { return h1_fespaces.GetFinestFESpace(); } + const auto &GetH1Space() const { return h1_fespaces.GetFinestFESpace(); } + auto &GetNDSpace() { return nd_fespace; } + const auto &GetNDSpace() const { return nd_fespace; } + auto &GetRTSpaces() { return rt_fespaces; } + const auto &GetRTSpaces() const { return rt_fespaces; } + auto &GetRTSpace() { return rt_fespaces.GetFinestFESpace(); } + const auto &GetRTSpace() const { return rt_fespaces.GetFinestFESpace(); } + + // Access the underlying mesh object. + const auto &GetMesh() const { return GetH1Space().GetMesh(); } + + // Return the number of true (conforming) dofs on the finest H1 space. + auto GlobalTrueVSize() const { return GetH1Space().GlobalTrueVSize(); } + + // Construct and return system matrix representing discretized Laplace operator for + // Gauss's law. + std::unique_ptr GetStiffnessMatrix(); + + // Construct and return the discrete gradient matrix. + const Operator &GetGradMatrix() const + { + return GetNDSpace().GetDiscreteInterpolator(GetH1Space()); + } + + // Assemble the solution boundary conditions and right-hand side vector for a nonzero + // prescribed voltage on the specified surface index. + void GetExcitationVector(int idx, const Operator &K, Vector &X, Vector &RHS); + + // Get the associated MPI communicator. + MPI_Comm GetComm() const { return GetH1Space().GetComm(); } +}; + +} // namespace palace + +#endif // PALACE_MODELS_LAPLACE_OPERATOR_HPP diff --git a/palace/models/lumpedportoperator.cpp b/palace/models/lumpedportoperator.cpp index eedf5a0e2e..45d5ba1e8e 100644 --- a/palace/models/lumpedportoperator.cpp +++ b/palace/models/lumpedportoperator.cpp @@ -1,580 +1,632 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "lumpedportoperator.hpp" - -#include -#include "fem/coefficient.hpp" -#include "fem/integrator.hpp" -#include "fem/lumpedelement.hpp" -#include "models/materialoperator.hpp" -#include "utils/communication.hpp" -#include "utils/geodata.hpp" -#include "utils/iodata.hpp" - -namespace palace -{ - -using namespace std::complex_literals; - -LumpedPortData::LumpedPortData(const config::LumpedPortData &data, - mfem::ParFiniteElementSpace &h1_fespace) - : excitation(data.excitation), s(nullptr), v(nullptr) -{ - // Check inputs. Only one of the circuit or per square properties should be specified - // for the port boundary. 
- bool has_circ = (std::abs(data.R) + std::abs(data.L) + std::abs(data.C) > 0.0); - bool has_surf = (std::abs(data.Rs) + std::abs(data.Ls) + std::abs(data.Cs) > 0.0); - MFEM_VERIFY(has_circ || has_surf, - "Lumped port boundary has no R/L/C or Rs/Ls/Cs defined, needs " - "at least one!"); - MFEM_VERIFY(!(has_circ && has_surf), - "Lumped port boundary has both R/L/C and Rs/Ls/Cs defined, " - "should only use one!"); - if (excitation) - { - if (has_circ) - { - MFEM_VERIFY(data.R > 0.0, "Excited lumped port must have nonzero resistance!"); - MFEM_VERIFY(data.C == 0.0 && data.L == 0.0, - "Lumped port excitations do not support nonzero reactance!"); - } - else - { - MFEM_VERIFY(data.Rs > 0.0, "Excited lumped port must have nonzero resistance!"); - MFEM_VERIFY(data.Cs == 0.0 && data.Ls == 0.0, - "Lumped port excitations do not support nonzero reactance!"); - } - } - - // Construct the port elements allowing for a possible multielement lumped port. - for (const auto &elem : data.elements) - { - mfem::Array attr_marker; - mesh::AttrToMarker(h1_fespace.GetParMesh()->bdr_attributes.Size() - ? h1_fespace.GetParMesh()->bdr_attributes.Max() - : 0, - elem.attributes, attr_marker); - switch (elem.coordinate_system) - { - case config::internal::ElementData::CoordinateSystem::CYLINDRICAL: - elems.push_back( - std::make_unique(elem.direction, attr_marker, h1_fespace)); - break; - case config::internal::ElementData::CoordinateSystem::CARTESIAN: - elems.push_back( - std::make_unique(elem.direction, attr_marker, h1_fespace)); - break; - } - } - - // Populate the property data for the lumped port. - if (std::abs(data.Rs) + std::abs(data.Ls) + std::abs(data.Cs) == 0.0) - { - R = data.R; - L = data.L; - C = data.C; - } - else - { - // If defined by surface properties, need to compute circuit properties for the - // multielement port. - double ooR = 0.0, ooL = 0.0; - R = L = C = 0.0; - for (const auto &elem : elems) - { - const double sq = elem->GetGeometryWidth() / elem->GetGeometryLength(); - if (std::abs(data.Rs) > 0.0) - { - ooR += sq / data.Rs; - } - if (std::abs(data.Ls) > 0.0) - { - ooL += sq / data.Ls; - } - if (std::abs(data.Cs) > 0.0) - { - C += sq * data.Cs; - } - } - if (std::abs(ooR) > 0.0) - { - R = 1.0 / ooR; - } - if (std::abs(ooL) > 0.0) - { - L = 1.0 / ooL; - } - } -} - -std::complex LumpedPortData::GetCharacteristicImpedance(double omega) const -{ - MFEM_VERIFY((L == 0.0 && C == 0.0) || omega > 0.0, - "Lumped port with nonzero reactance requires frequency in order to define " - "characteristic impedance!"); - std::complex Y = 0.0; - if (std::abs(R) > 0.0) - { - Y += 1.0 / R; - } - if (std::abs(L) > 0.0) - { - Y += 1.0 / (1i * omega * L); - } - Y += 1i * omega * C; - MFEM_VERIFY(std::abs(Y) > 0.0, - "Characteristic impedance requested for lumped port with zero admittance!") - return 1.0 / Y; -} - -double LumpedPortData::GetExcitationPower() const -{ - // The lumped port excitation is normalized such that the power integrated over the port - // is 1: ∫ (E_inc x H_inc) ⋅ n dS = 1. - return excitation ? 1.0 : 0.0; -} - -double LumpedPortData::GetExcitationVoltage() const -{ - // Incident voltage should be the same across all elements of an excited lumped port. 
- if (excitation) - { - double Vinc = 0.0; - for (const auto &elem : elems) - { - const double Rs = R * GetToSquare(*elem); - const double Einc = std::sqrt( - Rs / (elem->GetGeometryWidth() * elem->GetGeometryLength() * elems.size())); - Vinc += Einc * elem->GetGeometryLength() / elems.size(); - } - return Vinc; - } - else - { - return 0.0; - } -} - -void LumpedPortData::InitializeLinearForms(mfem::ParFiniteElementSpace &nd_fespace) const -{ - // The port S-parameter, or the projection of the field onto the port mode, is computed - // as: (E x H_inc) ⋅ n = E ⋅ (E_inc / Z_s), integrated over the port surface. - if (!s) - { - SumVectorCoefficient fb(nd_fespace.GetParMesh()->SpaceDimension()); - for (const auto &elem : elems) - { - const double Rs = R * GetToSquare(*elem); - const double Hinc = 1.0 / std::sqrt(Rs * elem->GetGeometryWidth() * - elem->GetGeometryLength() * elems.size()); - fb.AddCoefficient(elem->GetModeCoefficient(Hinc), elem->GetMarker()); - } - s = std::make_unique(&nd_fespace); - s->AddBoundaryIntegrator(new VectorFEBoundaryLFIntegrator(fb)); - s->UseFastAssembly(false); - s->Assemble(); - } - - // The voltage across a port is computed using the electric field solution. - // We have: - // V = ∫ E ⋅ l̂ dl = 1/w ∫ E ⋅ l̂ dS (for rectangular ports) - // or, - // V = 1/(2π) ∫ E ⋅ r̂ / r dS (for coaxial ports). - // We compute the surface integral via an inner product between the linear form with the - // averaging function as a vector coefficient and the solution expansion coefficients. - if (!v) - { - SumVectorCoefficient fb(nd_fespace.GetParMesh()->SpaceDimension()); - for (const auto &elem : elems) - { - fb.AddCoefficient( - elem->GetModeCoefficient(1.0 / (elem->GetGeometryWidth() * elems.size())), - elem->GetMarker()); - } - v = std::make_unique(&nd_fespace); - v->AddBoundaryIntegrator(new VectorFEBoundaryLFIntegrator(fb)); - v->UseFastAssembly(false); - v->Assemble(); - } -} - -std::complex LumpedPortData::GetSParameter(mfem::ParComplexGridFunction &E) const -{ - // Compute port S-parameter, or the projection of the field onto the port mode. - InitializeLinearForms(*E.ParFESpace()); - std::complex dot((*s) * E.real(), (*s) * E.imag()); - Mpi::GlobalSum(1, &dot, E.ParFESpace()->GetComm()); - return dot; -} - -double LumpedPortData::GetPower(mfem::ParGridFunction &E, mfem::ParGridFunction &B, - const MaterialOperator &mat_op) const -{ - // Compute port power, (E x H) ⋅ n = E ⋅ (-n x H), integrated over the port surface - // using the computed E and H = μ⁻¹ B fields. The linear form is reconstructed from - // scratch each time due to changing H. The BdrCurrentVectorCoefficient computes -n x H, - // where n is an outward normal. - auto &nd_fespace = *E.ParFESpace(); - SumVectorCoefficient fb(nd_fespace.GetParMesh()->SpaceDimension()); - for (const auto &elem : elems) - { - fb.AddCoefficient(std::make_unique(B, mat_op), - elem->GetMarker()); - } - mfem::LinearForm p(&nd_fespace); - p.AddBoundaryIntegrator(new VectorFEBoundaryLFIntegrator(fb)); - p.UseFastAssembly(false); - p.Assemble(); - double dot = p * E; - Mpi::GlobalSum(1, &dot, E.ParFESpace()->GetComm()); - return dot; -} - -std::complex LumpedPortData::GetPower(mfem::ParComplexGridFunction &E, - mfem::ParComplexGridFunction &B, - const MaterialOperator &mat_op) const -{ - // Compute port power, (E x H⋆) ⋅ n = E ⋅ (-n x H⋆), integrated over the port surface - // using the computed E and H = μ⁻¹ B fields. The linear form is reconstructed from - // scratch each time due to changing H. 
The BdrCurrentVectorCoefficient computes -n x H, - // where n is an outward normal. - auto &nd_fespace = *E.ParFESpace(); - SumVectorCoefficient fbr(nd_fespace.GetParMesh()->SpaceDimension()); - SumVectorCoefficient fbi(nd_fespace.GetParMesh()->SpaceDimension()); - for (const auto &elem : elems) - { - fbr.AddCoefficient(std::make_unique(B.real(), mat_op), - elem->GetMarker()); - fbi.AddCoefficient(std::make_unique(B.imag(), mat_op), - elem->GetMarker()); - } - mfem::LinearForm pr(&nd_fespace), pi(&nd_fespace); - pr.AddBoundaryIntegrator(new VectorFEBoundaryLFIntegrator(fbr)); - pi.AddBoundaryIntegrator(new VectorFEBoundaryLFIntegrator(fbi)); - pr.UseFastAssembly(false); - pi.UseFastAssembly(false); - pr.Assemble(); - pi.Assemble(); - std::complex dot((pr * E.real()) + (pi * E.imag()), - (pr * E.imag()) - (pi * E.real())); - Mpi::GlobalSum(1, &dot, E.ParFESpace()->GetComm()); - return dot; -} - -double LumpedPortData::GetVoltage(mfem::ParGridFunction &E) const -{ - // Compute the average voltage across the port. - InitializeLinearForms(*E.ParFESpace()); - double dot = (*v) * E; - Mpi::GlobalSum(1, &dot, E.ParFESpace()->GetComm()); - return dot; -} - -std::complex LumpedPortData::GetVoltage(mfem::ParComplexGridFunction &E) const -{ - // Compute the average voltage across the port. - InitializeLinearForms(*E.ParFESpace()); - std::complex dot((*v) * E.real(), (*v) * E.imag()); - Mpi::GlobalSum(1, &dot, E.ParFESpace()->GetComm()); - return dot; -} - -LumpedPortOperator::LumpedPortOperator(const IoData &iodata, - mfem::ParFiniteElementSpace &h1_fespace) -{ - // Set up lumped port boundary conditions. - SetUpBoundaryProperties(iodata, h1_fespace); - PrintBoundaryInfo(iodata, *h1_fespace.GetParMesh()); -} - -void LumpedPortOperator::SetUpBoundaryProperties(const IoData &iodata, - mfem::ParFiniteElementSpace &h1_fespace) -{ - // Check that lumped port boundary attributes have been specified correctly. - int bdr_attr_max = h1_fespace.GetParMesh()->bdr_attributes.Size() - ? h1_fespace.GetParMesh()->bdr_attributes.Max() - : 0; - if (!iodata.boundaries.lumpedport.empty()) - { - mfem::Array bdr_attr_marker(bdr_attr_max); - bdr_attr_marker = 0; - for (auto attr : h1_fespace.GetParMesh()->bdr_attributes) - { - bdr_attr_marker[attr - 1] = 1; - } - for (const auto &[idx, data] : iodata.boundaries.lumpedport) - { - for (const auto &elem : data.elements) - { - for (auto attr : elem.attributes) - { - MFEM_VERIFY(attr > 0 && attr <= bdr_attr_max, - "Port boundary attribute tags must be non-negative and correspond to " - "boundaries in the mesh!"); - MFEM_VERIFY(bdr_attr_marker[attr - 1], - "Unknown port boundary attribute " << attr << "!"); - } - } - } - } - - // Set up lumped port data structures. - for (const auto &[idx, data] : iodata.boundaries.lumpedport) - { - ports.try_emplace(idx, data, h1_fespace); - } - - // Mark selected boundary attributes from the mesh for lumped ports. 
- port_marker.SetSize(bdr_attr_max); - port_Rs_marker.SetSize(bdr_attr_max); - port_Ls_marker.SetSize(bdr_attr_max); - port_Cs_marker.SetSize(bdr_attr_max); - port_marker = 0; - port_Rs_marker = 0; - port_Ls_marker = 0; - port_Cs_marker = 0; - for (const auto &[idx, data] : ports) - { - for (const auto &elem : data.GetElements()) - { - for (int i = 0; i < elem->GetMarker().Size(); i++) - { - MFEM_VERIFY(!(port_marker[i] && elem->GetMarker()[i]), - "Boundary attribute is assigned to more than one lumped port!"); - port_marker[i] = port_marker[i] || elem->GetMarker()[i]; - if (std::abs(data.GetR()) > 0.0) - { - port_Rs_marker[i] = port_Rs_marker[i] || elem->GetMarker()[i]; - } - if (std::abs(data.GetL()) > 0.0) - { - port_Ls_marker[i] = port_Ls_marker[i] || elem->GetMarker()[i]; - } - if (std::abs(data.GetC()) > 0.0) - { - port_Cs_marker[i] = port_Cs_marker[i] || elem->GetMarker()[i]; - } - } - } - } -} - -void LumpedPortOperator::PrintBoundaryInfo(const IoData &iodata, mfem::ParMesh &mesh) -{ - // Print out BC info for all port attributes. - if (ports.empty()) - { - return; - } - Mpi::Print("\nConfiguring Robin impedance BC for lumped ports at attributes:\n"); - for (const auto &[idx, data] : ports) - { - for (const auto &elem : data.GetElements()) - { - for (int i = 0; i < elem->GetMarker().Size(); i++) - { - if (!elem->GetMarker()[i]) - { - continue; - } - const int attr = i + 1; - mfem::Vector nor; - mesh::GetSurfaceNormal(mesh, attr, nor); - const double Rs = data.GetR() * data.GetToSquare(*elem); - const double Ls = data.GetL() * data.GetToSquare(*elem); - const double Cs = data.GetC() / data.GetToSquare(*elem); - bool comma = false; - Mpi::Print(" {:d}:", attr); - if (std::abs(Rs) > 0.0) - { - Mpi::Print(" Rs = {:.3e} Ω/sq", - iodata.DimensionalizeValue(IoData::ValueType::IMPEDANCE, Rs)); - comma = true; - } - if (std::abs(Ls) > 0.0) - { - if (comma) - { - Mpi::Print(","); - } - Mpi::Print(" Ls = {:.3e} H/sq", - iodata.DimensionalizeValue(IoData::ValueType::INDUCTANCE, Ls)); - comma = true; - } - if (std::abs(Cs) > 0.0) - { - if (comma) - { - Mpi::Print(","); - } - Mpi::Print(" Cs = {:.3e} F/sq", - iodata.DimensionalizeValue(IoData::ValueType::CAPACITANCE, Cs)); - comma = true; - } - if (comma) - { - Mpi::Print(","); - } - if (mesh.SpaceDimension() == 3) - { - Mpi::Print(" n = ({:+.1f}, {:+.1f}, {:+.1f})", nor(0), nor(1), nor(2)); - } - else - { - Mpi::Print(" n = ({:+.1f}, {:+.1f})", nor(0), nor(1)); - } - Mpi::Print("\n"); - } - } - } - - // Print out port info for all ports. - Mpi::Print("\nConfiguring lumped port circuit properties:\n"); - for (const auto &[idx, data] : ports) - { - bool comma = false; - Mpi::Print(" Index = {:d}:", idx); - if (std::abs(data.GetR()) > 0.0) - { - Mpi::Print(" R = {:.3e} Ω", - iodata.DimensionalizeValue(IoData::ValueType::IMPEDANCE, data.GetR())); - comma = true; - } - if (std::abs(data.GetL()) > 0.0) - { - if (comma) - { - Mpi::Print(","); - } - Mpi::Print(" L = {:.3e} H", - iodata.DimensionalizeValue(IoData::ValueType::INDUCTANCE, data.GetL())); - comma = true; - } - if (std::abs(data.GetC()) > 0.0) - { - if (comma) - { - Mpi::Print(","); - } - Mpi::Print(" C = {:.3e} F", - iodata.DimensionalizeValue(IoData::ValueType::CAPACITANCE, data.GetC())); - } - Mpi::Print("\n"); - } - - // Print some information for excited lumped ports. 
- bool first = true; - for (const auto &[idx, data] : ports) - { - if (!data.IsExcited()) - { - continue; - } - if (first) - { - Mpi::Print("\nConfiguring lumped port excitation source term at attributes:\n"); - first = false; - } - for (const auto &elem : data.GetElements()) - { - for (int i = 0; i < elem->GetMarker().Size(); i++) - { - if (elem->GetMarker()[i]) - { - Mpi::Print(" {:d}: Index = {:d}\n", i + 1, idx); - } - } - } - } -} - -const LumpedPortData &LumpedPortOperator::GetPort(int idx) const -{ - auto it = ports.find(idx); - MFEM_VERIFY(it != ports.end(), "Unknown lumped port index requested!"); - return it->second; -} - -void LumpedPortOperator::AddStiffnessBdrCoefficients(double coef, SumMatrixCoefficient &fb) -{ - // Add lumped inductor boundaries to the bilinear form. - for (const auto &[idx, data] : ports) - { - if (data.GetL() == 0.0) - { - continue; - } - for (const auto &elem : data.GetElements()) - { - const double Ls = data.GetL() * data.GetToSquare(*elem); - fb.AddCoefficient(std::make_unique(coef / Ls), - elem->GetMarker()); - } - } -} - -void LumpedPortOperator::AddMassBdrCoefficients(double coef, SumMatrixCoefficient &fb) -{ - // Add lumped mass boundaries to the bilinear form. - for (const auto &[idx, data] : ports) - { - if (data.GetC() == 0.0) - { - continue; - } - for (const auto &elem : data.GetElements()) - { - const double Cs = data.GetC() / data.GetToSquare(*elem); - fb.AddCoefficient(std::make_unique(coef * Cs), - elem->GetMarker()); - } - } -} - -void LumpedPortOperator::AddDampingBdrCoefficients(double coef, SumMatrixCoefficient &fb) -{ - // Add lumped resistor boundaries to the bilinear form. - for (const auto &[idx, data] : ports) - { - if (data.GetR() == 0.0) - { - continue; - } - for (const auto &elem : data.GetElements()) - { - const double Rs = data.GetR() * data.GetToSquare(*elem); - fb.AddCoefficient(std::make_unique(coef / Rs), - elem->GetMarker()); - } - } -} - -void LumpedPortOperator::AddExcitationBdrCoefficients(SumVectorCoefficient &fb) -{ - // Construct the RHS source term for lumped port boundaries, which looks like -U_inc = - // +2 iω/Z_s E_inc for a port boundary with an incident field E_inc. The chosen incident - // field magnitude corresponds to a unit incident power over the full port boundary. See - // p. 49 and p. 82 of the COMSOL RF Module manual for more detail. - // Note: The real RHS returned here does not yet have the factor of (iω) included, so - // works for time domain simulations requiring RHS -U_inc(t). - for (const auto &[idx, data] : ports) - { - if (!data.IsExcited()) - { - continue; - } - MFEM_VERIFY(std::abs(data.GetR()) > 0.0, - "Unexpected zero resistance in excited lumped port!"); - for (const auto &elem : data.GetElements()) - { - const double Rs = data.GetR() * data.GetToSquare(*elem); - const double Hinc = - 1.0 / std::sqrt(Rs * elem->GetGeometryWidth() * elem->GetGeometryLength() * - data.GetElements().size()); - fb.AddCoefficient(elem->GetModeCoefficient(2.0 * Hinc), elem->GetMarker()); - } - } -} - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#include "lumpedportoperator.hpp" + +#include +#include "fem/coefficient.hpp" +#include "fem/gridfunction.hpp" +#include "fem/integrator.hpp" +#include "models/materialoperator.hpp" +#include "utils/communication.hpp" +#include "utils/geodata.hpp" +#include "utils/iodata.hpp" + +namespace palace +{ + +using namespace std::complex_literals; + +LumpedPortData::LumpedPortData(const config::LumpedPortData &data, + const MaterialOperator &mat_op, const mfem::ParMesh &mesh) + : mat_op(mat_op), excitation(data.excitation), active(data.active) +{ + // Check inputs. Only one of the circuit or per square properties should be specified + // for the port boundary. + bool has_circ = (std::abs(data.R) + std::abs(data.L) + std::abs(data.C) > 0.0); + bool has_surf = (std::abs(data.Rs) + std::abs(data.Ls) + std::abs(data.Cs) > 0.0); + MFEM_VERIFY(has_circ || has_surf, + "Lumped port boundary has no R/L/C or Rs/Ls/Cs defined, needs " + "at least one!"); + MFEM_VERIFY(!(has_circ && has_surf), + "Lumped port boundary has both R/L/C and Rs/Ls/Cs defined, " + "should only use one!"); + + if (HasExcitation()) + { + if (has_circ) + { + MFEM_VERIFY(data.R > 0.0, "Excited lumped port must have nonzero resistance!"); + MFEM_VERIFY(data.C == 0.0 && data.L == 0.0, + "Lumped port excitations do not support nonzero reactance!"); + } + else + { + MFEM_VERIFY(data.Rs > 0.0, "Excited lumped port must have nonzero resistance!"); + MFEM_VERIFY(data.Cs == 0.0 && data.Ls == 0.0, + "Lumped port excitations do not support nonzero reactance!"); + } + } + + // Construct the port elements allowing for a possible multielement lumped port. + for (const auto &elem : data.elements) + { + mfem::Array attr_list; + attr_list.Append(elem.attributes.data(), elem.attributes.size()); + switch (elem.coordinate_system) + { + case CoordinateSystem::CYLINDRICAL: + elems.push_back( + std::make_unique(elem.direction, attr_list, mesh)); + break; + case CoordinateSystem::CARTESIAN: + elems.push_back( + std::make_unique(elem.direction, attr_list, mesh)); + break; + } + } + + // Populate the property data for the lumped port. + if (std::abs(data.Rs) + std::abs(data.Ls) + std::abs(data.Cs) == 0.0) + { + R = data.R; + L = data.L; + C = data.C; + } + else + { + // If defined by surface properties, need to compute circuit properties for the + // multielement port. 
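    // Worked example (hypothetical values, for illustration only): two identical elements,
    // each with aspect ratio w/l = 2 and Rs = 50 Ω/sq, give ooR = 2 * (2 / 50) = 0.08, so
    // the equivalent circuit resistance computed below is R = 1 / 0.08 = 12.5 Ω. This
    // matches two 25 Ω elements (Rs * l / w each) combined in parallel. Inductances
    // combine the same way, while per-square capacitances scale by the aspect ratio and
    // add directly.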
+ double ooR = 0.0, ooL = 0.0; + R = L = C = 0.0; + for (const auto &elem : elems) + { + const double sq = elem->GetGeometryWidth() / elem->GetGeometryLength(); + if (std::abs(data.Rs) > 0.0) + { + ooR += sq / data.Rs; + } + if (std::abs(data.Ls) > 0.0) + { + ooL += sq / data.Ls; + } + if (std::abs(data.Cs) > 0.0) + { + C += sq * data.Cs; + } + } + if (std::abs(ooR) > 0.0) + { + R = 1.0 / ooR; + } + if (std::abs(ooL) > 0.0) + { + L = 1.0 / ooL; + } + } +} + +std::complex +LumpedPortData::GetCharacteristicImpedance(double omega, + LumpedPortData::Branch branch) const +{ + MFEM_VERIFY((L == 0.0 && C == 0.0) || branch == Branch::R || omega > 0.0, + "Lumped port with nonzero reactance requires frequency in order to define " + "characteristic impedance!"); + std::complex Y = 0.0; + if (std::abs(R) > 0.0 && (branch == Branch::TOTAL || branch == Branch::R)) + { + Y += 1.0 / R; + } + if (std::abs(L) > 0.0 && (branch == Branch::TOTAL || branch == Branch::L)) + { + Y += 1.0 / (1i * omega * L); + } + if (std::abs(C) > 0.0 && (branch == Branch::TOTAL || branch == Branch::C)) + { + Y += 1i * omega * C; + } + MFEM_VERIFY(std::abs(Y) > 0.0, + "Characteristic impedance requested for lumped port with zero admittance!") + return 1.0 / Y; +} + +double LumpedPortData::GetExcitationPower() const +{ + // The lumped port excitation is normalized such that the power integrated over the port + // is 1: ∫ (E_inc x H_inc) ⋅ n dS = 1. + return HasExcitation() ? 1.0 : 0.0; +} + +double LumpedPortData::GetExcitationVoltage() const +{ + // Incident voltage should be the same across all elements of an excited lumped port. + if (HasExcitation()) + { + double V_inc = 0.0; + for (const auto &elem : elems) + { + const double Rs = R * GetToSquare(*elem); + const double E_inc = std::sqrt( + Rs / (elem->GetGeometryWidth() * elem->GetGeometryLength() * elems.size())); + V_inc += E_inc * elem->GetGeometryLength() / elems.size(); + } + return V_inc; + } + else + { + return 0.0; + } +} + +void LumpedPortData::InitializeLinearForms(mfem::ParFiniteElementSpace &nd_fespace) const +{ + const auto &mesh = *nd_fespace.GetParMesh(); + mfem::Array attr_marker; + if (!s || !v) + { + mfem::Array attr_list; + for (const auto &elem : elems) + { + attr_list.Append(elem->GetAttrList()); + } + int bdr_attr_max = mesh.bdr_attributes.Size() ? mesh.bdr_attributes.Max() : 0; + mesh::AttrToMarker(bdr_attr_max, attr_list, attr_marker); + } + + // The port S-parameter, or the projection of the field onto the port mode, is computed + // as: (E x H_inc) ⋅ n = E ⋅ (E_inc / Z_s), integrated over the port surface. + if (!s) + { + SumVectorCoefficient fb(mesh.SpaceDimension()); + for (const auto &elem : elems) + { + const double Rs = R * GetToSquare(*elem); + const double Hinc = (std::abs(Rs) > 0.0) + ? 1.0 / std::sqrt(Rs * elem->GetGeometryWidth() * + elem->GetGeometryLength() * elems.size()) + : 0.0; + fb.AddCoefficient(elem->GetModeCoefficient(Hinc)); + } + s = std::make_unique(&nd_fespace); + s->AddBoundaryIntegrator(new VectorFEBoundaryLFIntegrator(fb), attr_marker); + s->UseFastAssembly(false); + s->UseDevice(false); + s->Assemble(); + s->UseDevice(true); + } + + // The voltage across a port is computed using the electric field solution. + // We have: + // V = ∫ E ⋅ l̂ dl = 1/w ∫ E ⋅ l̂ dS (for rectangular ports) + // or, + // V = 1/(2π) ∫ E ⋅ r̂ / r dS (for coaxial ports). + // We compute the surface integral via an inner product between the linear form with the + // averaging function as a vector coefficient and the solution expansion coefficients. 
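    // Worked example (assumed uniform field, for illustration only): for a single
    // rectangular element with a uniform tangential field E = E0 l̂, the averaging
    // coefficient 1/w gives (1/w) ∫ E ⋅ l̂ dS = (E0 / w) * (w * l) = E0 * l, which is
    // exactly the line-integral voltage ∫ E ⋅ l̂ dl. For multielement ports, the extra
    // factor of 1/elems.size() below averages the per-element voltages.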
+ if (!v) + { + SumVectorCoefficient fb(mesh.SpaceDimension()); + for (const auto &elem : elems) + { + fb.AddCoefficient( + elem->GetModeCoefficient(1.0 / (elem->GetGeometryWidth() * elems.size()))); + } + v = std::make_unique(&nd_fespace); + v->AddBoundaryIntegrator(new VectorFEBoundaryLFIntegrator(fb), attr_marker); + v->UseFastAssembly(false); + v->UseDevice(false); + v->Assemble(); + v->UseDevice(true); + } +} + +std::complex LumpedPortData::GetPower(GridFunction &E, GridFunction &B) const +{ + // Compute port power, (E x H) ⋅ n = E ⋅ (-n x H), integrated over the port surface using + // the computed E and H = μ⁻¹ B fields, where +n is the direction of propagation (into the + // domain). The BdrSurfaceCurrentVectorCoefficient computes -n x H for an outward normal, + // so we multiply by -1. The linear form is reconstructed from scratch each time due to + // changing H. + MFEM_VERIFY((E.HasImag() && B.HasImag()) || (!E.HasImag() && !B.HasImag()), + "Mismatch between real- and complex-valued E and B fields in port power " + "calculation!"); + const bool has_imag = E.HasImag(); + auto &nd_fespace = *E.ParFESpace(); + const auto &mesh = *nd_fespace.GetParMesh(); + SumVectorCoefficient fbr(mesh.SpaceDimension()), fbi(mesh.SpaceDimension()); + mfem::Array attr_list; + for (const auto &elem : elems) + { + fbr.AddCoefficient( + std::make_unique>( + elem->GetAttrList(), B.Real(), mat_op)); + if (has_imag) + { + fbi.AddCoefficient( + std::make_unique>( + elem->GetAttrList(), B.Imag(), mat_op)); + } + attr_list.Append(elem->GetAttrList()); + } + int bdr_attr_max = mesh.bdr_attributes.Size() ? mesh.bdr_attributes.Max() : 0; + mfem::Array attr_marker = mesh::AttrToMarker(bdr_attr_max, attr_list); + std::complex dot; + { + mfem::LinearForm pr(&nd_fespace); + pr.AddBoundaryIntegrator(new VectorFEBoundaryLFIntegrator(fbr), attr_marker); + pr.UseFastAssembly(false); + pr.UseDevice(false); + pr.Assemble(); + pr.UseDevice(true); + dot = -(pr * E.Real()) + (has_imag ? -1i * (pr * E.Imag()) : 0.0); + } + if (has_imag) + { + mfem::LinearForm pi(&nd_fespace); + pi.AddBoundaryIntegrator(new VectorFEBoundaryLFIntegrator(fbi), attr_marker); + pi.UseFastAssembly(false); + pi.UseDevice(false); + pi.Assemble(); + pi.UseDevice(true); + dot += -(pi * E.Imag()) + 1i * (pi * E.Real()); + Mpi::GlobalSum(1, &dot, E.ParFESpace()->GetComm()); + return dot; + } + else + { + double rdot = dot.real(); + Mpi::GlobalSum(1, &rdot, E.ParFESpace()->GetComm()); + return rdot; + } +} + +std::complex LumpedPortData::GetSParameter(GridFunction &E) const +{ + // Compute port S-parameter, or the projection of the field onto the port mode. + InitializeLinearForms(*E.ParFESpace()); + std::complex dot((*s) * E.Real(), 0.0); + if (E.HasImag()) + { + dot.imag((*s) * E.Imag()); + } + Mpi::GlobalSum(1, &dot, E.GetComm()); + return dot; +} + +std::complex LumpedPortData::GetVoltage(GridFunction &E) const +{ + // Compute the average voltage across the port. + InitializeLinearForms(*E.ParFESpace()); + std::complex dot((*v) * E.Real(), 0.0); + if (E.HasImag()) + { + dot.imag((*v) * E.Imag()); + } + Mpi::GlobalSum(1, &dot, E.GetComm()); + return dot; +} + +LumpedPortOperator::LumpedPortOperator(const IoData &iodata, const MaterialOperator &mat_op, + const mfem::ParMesh &mesh) +{ + // Set up lumped port boundary conditions. 
+ SetUpBoundaryProperties(iodata, mat_op, mesh); + PrintBoundaryInfo(iodata, mesh); +} + +void LumpedPortOperator::SetUpBoundaryProperties(const IoData &iodata, + const MaterialOperator &mat_op, + const mfem::ParMesh &mesh) +{ + // Check that lumped port boundary attributes have been specified correctly. + if (!iodata.boundaries.lumpedport.empty()) + { + int bdr_attr_max = mesh.bdr_attributes.Size() ? mesh.bdr_attributes.Max() : 0; + mfem::Array bdr_attr_marker(bdr_attr_max), port_marker(bdr_attr_max); + bdr_attr_marker = 0; + port_marker = 0; + for (auto attr : mesh.bdr_attributes) + { + bdr_attr_marker[attr - 1] = 1; + } + for (const auto &[idx, data] : iodata.boundaries.lumpedport) + { + for (const auto &elem : data.elements) + { + for (auto attr : elem.attributes) + { + MFEM_VERIFY(attr > 0 && attr <= bdr_attr_max, + "Port boundary attribute tags must be non-negative and correspond to " + "boundaries in the mesh!"); + MFEM_VERIFY(bdr_attr_marker[attr - 1], + "Unknown port boundary attribute " << attr << "!"); + MFEM_VERIFY(!data.active || !port_marker[attr - 1], + "Boundary attribute is assigned to more than one lumped port!"); + port_marker[attr - 1] = 1; + } + } + } + } + + // Set up lumped port data structures. + for (const auto &[idx, data] : iodata.boundaries.lumpedport) + { + ports.try_emplace(idx, data, mat_op, mesh); + } +} + +void LumpedPortOperator::PrintBoundaryInfo(const IoData &iodata, const mfem::ParMesh &mesh) +{ + if (ports.empty()) + { + return; + } + + fmt::memory_buffer buf{}; // Output buffer & buffer append lambda for cleaner code + auto to = [](auto &buf, auto fmt, auto &&...args) + { fmt::format_to(std::back_inserter(buf), fmt, std::forward(args)...); }; + using VT = Units::ValueType; + + // Print out BC info for all port attributes, for both active and inactive ports. + to(buf, "\nConfiguring Robin impedance BC for lumped ports at attributes:\n"); + for (const auto &[idx, data] : ports) + { + for (const auto &elem : data.elems) + { + for (auto attr : elem->GetAttrList()) + { + to(buf, " {:d}:", attr); + if (std::abs(data.R) > 0.0) + { + double Rs = data.R * data.GetToSquare(*elem); + to(buf, " Rs = {:.3e} Ω/sq,", iodata.units.Dimensionalize(Rs)); + } + if (std::abs(data.L) > 0.0) + { + double Ls = data.L * data.GetToSquare(*elem); + to(buf, " Ls = {:.3e} H/sq,", iodata.units.Dimensionalize(Ls)); + } + if (std::abs(data.C) > 0.0) + { + double Cs = data.C / data.GetToSquare(*elem); + to(buf, " Cs = {:.3e} F/sq,", iodata.units.Dimensionalize(Cs)); + } + to(buf, " n = ({:+.1f})\n", fmt::join(mesh::GetSurfaceNormal(mesh, attr), ",")); + } + } + } + + // Print out port info for all active ports. + fmt::memory_buffer buf_a{}; + for (const auto &[idx, data] : ports) + { + if (!data.active) + { + continue; + } + to(buf_a, " Index = {:d}: ", idx); + if (std::abs(data.R) > 0.0) + { + to(buf_a, "R = {:.3e} Ω,", iodata.units.Dimensionalize(data.R)); + } + if (std::abs(data.L) > 0.0) + { + to(buf_a, "L = {:.3e} H,", iodata.units.Dimensionalize(data.L)); + } + if (std::abs(data.C) > 0.0) + { + to(buf_a, "C = {:.3e} F,", iodata.units.Dimensionalize(data.C)); + } + buf_a.resize(buf_a.size() - 1); // Remove last "," + to(buf_a, "\n"); + } + if (buf_a.size() > 0) + { + to(buf, "\nConfiguring lumped port circuit properties:\n"); + buf.append(buf_a); + buf_a.clear(); + } + + // Print some information for excited lumped ports. 
+ for (const auto &[idx, data] : ports) + { + if (!data.HasExcitation()) + { + continue; + } + + for (const auto &elem : data.elems) + { + for (auto attr : elem->GetAttrList()) + { + to(buf_a, " {:d}: Index = {:d}\n", attr, idx); + } + } + } + if (buf_a.size() > 0) + { + to(buf, "\nConfiguring lumped port excitation source term at attributes:\n"); + buf.append(buf_a); + } + + Mpi::Print("{}", fmt::to_string(buf)); +} + +const LumpedPortData &LumpedPortOperator::GetPort(int idx) const +{ + auto it = ports.find(idx); + MFEM_VERIFY(it != ports.end(), "Unknown lumped port index requested!"); + return it->second; +} + +mfem::Array LumpedPortOperator::GetAttrList() const +{ + mfem::Array attr_list; + for (const auto &[idx, data] : ports) + { + if (!data.active) + { + continue; + } + for (const auto &elem : data.elems) + { + attr_list.Append(elem->GetAttrList()); + } + } + return attr_list; +} + +mfem::Array LumpedPortOperator::GetRsAttrList() const +{ + mfem::Array attr_list; + for (const auto &[idx, data] : ports) + { + if (!data.active) + { + continue; + } + if (std::abs(data.R) > 0.0) + { + for (const auto &elem : data.elems) + { + attr_list.Append(elem->GetAttrList()); + } + } + } + return attr_list; +} + +mfem::Array LumpedPortOperator::GetLsAttrList() const +{ + mfem::Array attr_list; + for (const auto &[idx, data] : ports) + { + if (!data.active) + { + continue; + } + if (std::abs(data.L) > 0.0) + { + for (const auto &elem : data.elems) + { + attr_list.Append(elem->GetAttrList()); + } + } + } + return attr_list; +} + +mfem::Array LumpedPortOperator::GetCsAttrList() const +{ + mfem::Array attr_list; + for (const auto &[idx, data] : ports) + { + if (!data.active) + { + continue; + } + if (std::abs(data.C) > 0.0) + { + for (const auto &elem : data.elems) + { + attr_list.Append(elem->GetAttrList()); + } + } + } + return attr_list; +} + +void LumpedPortOperator::AddStiffnessBdrCoefficients(double coeff, + MaterialPropertyCoefficient &fb) +{ + // Add lumped inductor boundaries to the bilinear form. + for (const auto &[idx, data] : ports) + { + if (!data.active) + { + continue; + } + if (std::abs(data.L) > 0.0) + { + for (const auto &elem : data.elems) + { + const double Ls = data.L * data.GetToSquare(*elem); + fb.AddMaterialProperty(data.mat_op.GetCeedBdrAttributes(elem->GetAttrList()), + coeff / Ls); + } + } + } +} + +void LumpedPortOperator::AddDampingBdrCoefficients(double coeff, + MaterialPropertyCoefficient &fb) +{ + // Add lumped resistor boundaries to the bilinear form. + for (const auto &[idx, data] : ports) + { + if (!data.active) + { + continue; + } + if (std::abs(data.R) > 0.0) + { + for (const auto &elem : data.elems) + { + const double Rs = data.R * data.GetToSquare(*elem); + fb.AddMaterialProperty(data.mat_op.GetCeedBdrAttributes(elem->GetAttrList()), + coeff / Rs); + } + } + } +} + +void LumpedPortOperator::AddMassBdrCoefficients(double coeff, + MaterialPropertyCoefficient &fb) +{ + // Add lumped capacitance boundaries to the bilinear form. 
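  // Roughly speaking, the port surface admittance Y_s = 1/R_s + 1/(iωL_s) + iωC_s enters
  // the weak form through an iω Y_s boundary term, giving 1/L_s (stiffness), iω/R_s
  // (damping), and -ω²C_s (mass) contributions. That is why this method scales by
  // coeff * Cs while the stiffness and damping methods above scale by coeff / Ls and
  // coeff / Rs, respectively.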
+ for (const auto &[idx, data] : ports) + { + if (!data.active) + { + continue; + } + if (std::abs(data.C) > 0.0) + { + for (const auto &elem : data.elems) + { + const double Cs = data.C / data.GetToSquare(*elem); + fb.AddMaterialProperty(data.mat_op.GetCeedBdrAttributes(elem->GetAttrList()), + coeff * Cs); + } + } + } +} + +void LumpedPortOperator::AddExcitationBdrCoefficients(int excitation_idx, + SumVectorCoefficient &fb) +{ + // Construct the RHS source term for lumped port boundaries, which looks like -U_inc = + // +2 iω/Z_s E_inc for a port boundary with an incident field E_inc. The chosen incident + // field magnitude corresponds to a unit incident power over the full port boundary. See + // p. 49 and p. 82 of the COMSOL RF Module manual for more detail. + // Note: The real RHS returned here does not yet have the factor of (iω) included, so + // works for time domain simulations requiring RHS -U_inc(t). + for (const auto &[idx, data] : ports) + { + if (data.excitation != excitation_idx) + { + continue; + } + MFEM_VERIFY(std::abs(data.R) > 0.0, + "Unexpected zero resistance in excited lumped port!"); + for (const auto &elem : data.elems) + { + const double Rs = data.R * data.GetToSquare(*elem); + const double Hinc = 1.0 / std::sqrt(Rs * elem->GetGeometryWidth() * + elem->GetGeometryLength() * data.elems.size()); + fb.AddCoefficient(elem->GetModeCoefficient(2.0 * Hinc)); + } + } +} + +} // namespace palace diff --git a/palace/models/lumpedportoperator.hpp b/palace/models/lumpedportoperator.hpp index 8c5fa30d9f..ab0a384e7a 100644 --- a/palace/models/lumpedportoperator.hpp +++ b/palace/models/lumpedportoperator.hpp @@ -1,126 +1,130 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_MODELS_LUMPED_PORT_OPERATOR_HPP -#define PALACE_MODELS_LUMPED_PORT_OPERATOR_HPP - -#include -#include -#include -#include -#include -#include "fem/lumpedelement.hpp" - -namespace palace -{ - -class IoData; -class MaterialOperator; -class SumMatrixCoefficient; -class SumVectorCoefficient; - -namespace config -{ - -struct LumpedPortData; - -} // namespace config - -// -// Helper class for lumped ports in a model. -// -class LumpedPortData -{ -private: - bool excitation; - double R, L, C; - - // To accomodate multielement lumped ports, a port may be made up of elements with - // different attributes and directions which add in parallel. - std::vector> elems; - - // Linear forms for postprocessing integrated quantities on the port. 
- mutable std::unique_ptr s, v; - void InitializeLinearForms(mfem::ParFiniteElementSpace &nd_fespace) const; - -public: - LumpedPortData(const config::LumpedPortData &data, - mfem::ParFiniteElementSpace &h1_fespace); - - const std::vector> &GetElements() const - { - return elems; - } - - double GetToSquare(const LumpedElementData &elem) const - { - return elem.GetGeometryWidth() / elem.GetGeometryLength() * elems.size(); - } - - bool IsExcited() const { return excitation; } - double GetR() const { return R; } - double GetL() const { return L; } - double GetC() const { return C; } - - std::complex GetCharacteristicImpedance(double omega = 0.0) const; - - double GetExcitationPower() const; - double GetExcitationVoltage() const; - - std::complex GetSParameter(mfem::ParComplexGridFunction &E) const; - std::complex GetPower(mfem::ParComplexGridFunction &E, - mfem::ParComplexGridFunction &B, - const MaterialOperator &mat_op) const; - double GetPower(mfem::ParGridFunction &E, mfem::ParGridFunction &B, - const MaterialOperator &mat_op) const; - std::complex GetVoltage(mfem::ParComplexGridFunction &E) const; - double GetVoltage(mfem::ParGridFunction &E) const; -}; - -// -// A class handling lumped port boundaries and their postprocessing. -// -class LumpedPortOperator -{ -private: - // Mapping from port index to data structure containing port information and methods to - // calculate circuit properties like voltage and current on lumped or multielement lumped - // ports. - std::map ports; - mfem::Array port_marker, port_Rs_marker, port_Ls_marker, port_Cs_marker; - void SetUpBoundaryProperties(const IoData &iodata, - mfem::ParFiniteElementSpace &h1_fespace); - void PrintBoundaryInfo(const IoData &iodata, mfem::ParMesh &mesh); - -public: - LumpedPortOperator(const IoData &iodata, mfem::ParFiniteElementSpace &h1_fespace); - - // Access data structures for the lumped port with the given index. - const LumpedPortData &GetPort(int idx) const; - auto begin() const { return ports.begin(); } - auto end() const { return ports.end(); } - auto rbegin() const { return ports.rbegin(); } - auto rend() const { return ports.rend(); } - auto Size() const { return ports.size(); } - - // Returns array marking lumped port attributes. - const mfem::Array &GetMarker() const { return port_marker; } - const mfem::Array &GetRsMarker() const { return port_Rs_marker; } - const mfem::Array &GetLsMarker() const { return port_Ls_marker; } - const mfem::Array &GetCsMarker() const { return port_Cs_marker; } - - // Add contributions to system matrices from lumped elements with nonzero inductance, - // capacitance, and/or resistance. - void AddStiffnessBdrCoefficients(double coef, SumMatrixCoefficient &fb); - void AddMassBdrCoefficients(double coef, SumMatrixCoefficient &fb); - void AddDampingBdrCoefficients(double coef, SumMatrixCoefficient &fb); - - // Add contributions to the right-hand side source term vector for an incident field at - // excited port boundaries, -U_inc/(iω) for the real version (versus the full -U_inc for - // the complex one). - void AddExcitationBdrCoefficients(SumVectorCoefficient &fb); -}; - -} // namespace palace - -#endif // PALACE_MODELS_LUMPED_PORT_OPERATOR_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_MODELS_LUMPED_PORT_OPERATOR_HPP +#define PALACE_MODELS_LUMPED_PORT_OPERATOR_HPP + +#include +#include +#include +#include +#include +#include "fem/lumpedelement.hpp" + +namespace palace +{ + +class GridFunction; +class IoData; +class MaterialOperator; +class MaterialPropertyCoefficient; +class SumVectorCoefficient; + +namespace config +{ + +struct LumpedPortData; + +} // namespace config + +// +// Helper class for lumped ports in a model. +// +class LumpedPortData +{ +public: + // Reference to material property data (not owned). + const MaterialOperator &mat_op; + + // To accommodate multielement lumped ports, a port may be made up of elements with + // different attributes and directions which add in parallel. + std::vector> elems; + + // Lumped port properties. + double R, L, C; + int excitation; + bool active; + +private: + // Linear forms for postprocessing integrated quantities on the port. + mutable std::unique_ptr s, v; + + void InitializeLinearForms(mfem::ParFiniteElementSpace &nd_fespace) const; + +public: + LumpedPortData(const config::LumpedPortData &data, const MaterialOperator &mat_op, + const mfem::ParMesh &mesh); + + double GetToSquare(const LumpedElementData &elem) const + { + return elem.GetGeometryWidth() / elem.GetGeometryLength() * elems.size(); + } + + [[nodiscard]] constexpr bool HasExcitation() const { return excitation != 0; } + + enum class Branch + { + TOTAL, + R, + L, + C + }; + std::complex GetCharacteristicImpedance(double omega = 0.0, + Branch branch = Branch::TOTAL) const; + + double GetExcitationPower() const; + double GetExcitationVoltage() const; + + std::complex GetPower(GridFunction &E, GridFunction &B) const; + std::complex GetSParameter(GridFunction &E) const; + std::complex GetVoltage(GridFunction &E) const; +}; + +// +// A class handling lumped port boundaries and their postprocessing. +// +class LumpedPortOperator +{ +private: + // Mapping from port index to data structure containing port information and methods to + // calculate circuit properties like voltage and current on lumped or multielement lumped + // ports. + std::map ports; + + void SetUpBoundaryProperties(const IoData &iodata, const MaterialOperator &mat_op, + const mfem::ParMesh &mesh); + void PrintBoundaryInfo(const IoData &iodata, const mfem::ParMesh &mesh); + +public: + LumpedPortOperator(const IoData &iodata, const MaterialOperator &mat_op, + const mfem::ParMesh &mesh); + + // Access data structures for the lumped port with the given index. + const LumpedPortData &GetPort(int idx) const; + auto begin() const { return ports.begin(); } + auto end() const { return ports.end(); } + auto rbegin() const { return ports.rbegin(); } + auto rend() const { return ports.rend(); } + auto Size() const { return ports.size(); } + + // Returns array of lumped port attributes. + mfem::Array GetAttrList() const; + mfem::Array GetRsAttrList() const; + mfem::Array GetLsAttrList() const; + mfem::Array GetCsAttrList() const; + + // Add contributions to system matrices from lumped elements with nonzero inductance, + // resistance, and/or capacitance. 
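  // Hypothetical usage sketch (the names port_op and bdr_attr_max are assumptions, not
  // part of this interface):
  //   MaterialPropertyCoefficient fb(bdr_attr_max);
  //   port_op.AddDampingBdrCoefficients(1.0, fb);  // accumulates 1/Rs on resistive ports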
+ void AddStiffnessBdrCoefficients(double coeff, MaterialPropertyCoefficient &fb); + void AddDampingBdrCoefficients(double coeff, MaterialPropertyCoefficient &fb); + void AddMassBdrCoefficients(double coeff, MaterialPropertyCoefficient &fb); + + // Add contributions to the right-hand side source term vector for an incident field at + // excited port boundaries, -U_inc/(iω) for the real version (versus the full -U_inc for + // the complex one). + void AddExcitationBdrCoefficients(int excitation_idx, SumVectorCoefficient &fb); +}; + +} // namespace palace + +#endif // PALACE_MODELS_LUMPED_PORT_OPERATOR_HPP diff --git a/palace/models/materialoperator.cpp b/palace/models/materialoperator.cpp index 01aa9cf2f4..fee069a172 100644 --- a/palace/models/materialoperator.cpp +++ b/palace/models/materialoperator.cpp @@ -1,444 +1,684 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "materialoperator.hpp" - -#include -#include -#include -#include "utils/communication.hpp" -#include "utils/geodata.hpp" -#include "utils/iodata.hpp" - -namespace palace -{ - -namespace -{ - -// Compute matrix functions for symmetric real-valued 2x2 or 3x3 matrices. Returns the -// matrix U * f(Λ) * U' for input U * Λ * U' -// Reference: Deledalle et al., Closed-form expressions of the eigen decomposition of 2x2 -// and 3x3 Hermitian matrices, HAL hal-01501221 (2017). -mfem::DenseMatrix MatrixFunction(const mfem::DenseMatrix &M, - const std::function &functor) -{ - MFEM_ASSERT(M.Height() == M.Width(), - "MatrixFunction only available for square matrices!"); - const auto N = M.Height(); - constexpr auto tol = 10.0 * std::numeric_limits::epsilon(); - for (int i = 0; i < N; i++) - { - for (int j = i + 1; j < N; j++) - { - MFEM_VERIFY(std::abs(M(i, j) - M(j, i)) < tol, - "MatrixFunction only available for symmetric matrices!"); - } - } - mfem::DenseMatrix Mout(N, N); - Mout = 0.0; - if (N == 2) - { - MFEM_ABORT("2x2 MatrixFunction is not implemented yet!"); - } - else if (N == 3) - { - // Need to specialize based on the number of zeros and their locations. 
- const auto &a = M(0, 0), &b = M(1, 1), &c = M(2, 2); - const auto &d = M(0, 1), &e = M(1, 2), &f = M(0, 2); - const bool d_non_zero = std::abs(d) > tol; - const bool e_non_zero = std::abs(e) > tol; - const bool f_non_zero = std::abs(f) > tol; - if (!d_non_zero && !e_non_zero && !f_non_zero) - { - // a 0 0 - // 0 b 0 - // 0 0 c - for (int i = 0; i < 3; i++) - { - Mout(i, i) = functor(M(i, i)); - } - return Mout; - } - if (d_non_zero && !e_non_zero && !f_non_zero) - { - // a d 0 - // d b 0 - // 0 0 c - const double disc = std::sqrt(a * a - 2.0 * a * b + b * b + 4.0 * d * d); - const double lambda1 = c; - const double lambda2 = (a + b - disc) / 2.0; - const double lambda3 = (a + b + disc) / 2.0; - const mfem::Vector v1{{0.0, 0.0, 1.0}}; - const mfem::Vector v2{{-(-a + b + disc) / (2.0 * d), 1.0, 0.0}}; - const mfem::Vector v3{{-(-a + b - disc) / (2.0 * d), 1.0, 0.0}}; - AddMult_a_VVt(functor(lambda1), v1, Mout); - AddMult_a_VVt(functor(lambda2), v2, Mout); - AddMult_a_VVt(functor(lambda3), v3, Mout); - return Mout; - } - if (!d_non_zero && e_non_zero && !f_non_zero) - { - // a 0 0 - // 0 b e - // 0 e c - const double disc = std::sqrt(b * b - 2.0 * b * c + c * c + 4.0 * e * e); - const double lambda1 = a; - const double lambda2 = 0.5 * (b + c - disc); - const double lambda3 = 0.5 * (b + c + disc); - const mfem::Vector v1{{1.0, 0.0, 0.0}}; - const mfem::Vector v2{{0.0, -(-b + c + disc) / (2.0 * e), 1.0}}; - const mfem::Vector v3{{0.0, -(-b + c - disc) / (2.0 * e), 1.0}}; - AddMult_a_VVt(functor(lambda1), v1, Mout); - AddMult_a_VVt(functor(lambda2), v2, Mout); - AddMult_a_VVt(functor(lambda3), v3, Mout); - return Mout; - } - if (!d_non_zero && !e_non_zero && f_non_zero) - { - // a 0 f - // 0 b 0 - // f 0 c - const double disc = std::sqrt(a * a - 2.0 * a * c + c * c + 4.0 * f * f); - const double lambda1 = b; - const double lambda2 = 0.5 * (a + c - disc); - const double lambda3 = 0.5 * (a + c + disc); - const mfem::Vector v1{{0.0, 1.0, 0.0}}; - const mfem::Vector v2{{-(-a + c + disc) / (2.0 * f), 0.0, 1.0}}; - const mfem::Vector v3{{-(-a + c - disc) / (2.0 * f), 0.0, 1.0}}; - AddMult_a_VVt(functor(lambda1), v1, Mout); - AddMult_a_VVt(functor(lambda2), v2, Mout); - AddMult_a_VVt(functor(lambda3), v3, Mout); - return Mout; - } - if ((!d_non_zero && e_non_zero && f_non_zero) || - (d_non_zero && !e_non_zero && f_non_zero) || - (d_non_zero && e_non_zero && !f_non_zero)) - { - MFEM_ABORT("This nonzero pattern is not currently supported for MatrixFunction!"); - } - // General case for all nonzero: - // a d f - // d b e - // f e c - const double a2 = a * a, b2 = b * b, c2 = c * c, d2 = d * d, e2 = e * e, f2 = f * f; - const double a2mbmc = 2.0 * a - b - c; - const double b2mamc = 2.0 * b - a - c; - const double c2mamb = 2.0 * c - a - b; - const double x1 = a2 + b2 + c2 - a * b - b * c + 3.0 * (d2 + e2 + f2); - const double x2 = -(a2mbmc * b2mamc * c2mamb) + - 9.0 * (c2mamb * d2 + b2mamc * f2 + a2mbmc * e2) - 54.0 * d * e * f; - const double phi = std::atan2(std::sqrt(4.0 * x1 * x1 * x1 - x2 * x2), x2); - const double lambda1 = (a + b + c - 2.0 * std::sqrt(x1) * std::cos(phi / 3.0)) / 3.0; - const double lambda2 = - (a + b + c + 2.0 * std::sqrt(x1) * std::cos((phi - M_PI) / 3.0)) / 3.0; - const double lambda3 = - (a + b + c + 2.0 * std::sqrt(x1) * std::cos((phi + M_PI) / 3.0)) / 3.0; - - auto SafeDivide = [&](double x, double y) - { - if (std::abs(x) <= tol) - { - return 0.0; - } - if (std::abs(x) >= tol && std::abs(y) <= tol) - { - MFEM_ABORT("Logic error: Zero denominator with nonzero numerator!"); - 
return 0.0; - } - return x / y; - }; - const double m1 = SafeDivide(d * (c - lambda1) - e * f, f * (b - lambda1) - d * e); - const double m2 = SafeDivide(d * (c - lambda2) - e * f, f * (b - lambda2) - d * e); - const double m3 = SafeDivide(d * (c - lambda3) - e * f, f * (b - lambda3) - d * e); - const double l1mcmem1 = lambda1 - c - e * m1; - const double l2mcmem2 = lambda2 - c - e * m2; - const double l3mcmem3 = lambda3 - c - e * m3; - const double n1 = 1.0 + m1 * m1 + SafeDivide(std::pow(l1mcmem1, 2), f2); - const double n2 = 1.0 + m2 * m2 + SafeDivide(std::pow(l2mcmem2, 2), f2); - const double n3 = 1.0 + m3 * m3 + SafeDivide(std::pow(l3mcmem3, 2), f2); - const double tlambda1 = functor(lambda1) / n1; - const double tlambda2 = functor(lambda2) / n2; - const double tlambda3 = functor(lambda3) / n3; - - const double at = (tlambda1 * l1mcmem1 * l1mcmem1 + tlambda2 * l2mcmem2 * l2mcmem2 + - tlambda3 * l3mcmem3 * l3mcmem3) / - f2; - const double bt = tlambda1 * m1 * m1 + tlambda2 * m2 * m2 + tlambda3 * m3 * m3; - const double ct = tlambda1 + tlambda2 + tlambda3; - const double dt = - (tlambda1 * m1 * l1mcmem1 + tlambda2 * m2 * l2mcmem2 + tlambda3 * m3 * l3mcmem3) / - f; - const double et = tlambda1 * m1 + tlambda2 * m2 + tlambda3 * m3; - const double ft = (tlambda1 * l1mcmem1 + tlambda2 * l2mcmem2 + tlambda3 * l3mcmem3) / f; - Mout(0, 0) = at; - Mout(0, 1) = dt; - Mout(0, 2) = ft; - Mout(1, 0) = dt; - Mout(1, 1) = bt; - Mout(1, 2) = et; - Mout(2, 0) = ft; - Mout(2, 1) = et; - Mout(2, 2) = ct; - return Mout; - } - else - { - MFEM_ABORT("MatrixFunction only supports 2x2 or 3x3 matrices!"); - } - return Mout; -} - -mfem::DenseMatrix MatrixSqrt(const mfem::DenseMatrix &M) -{ - return MatrixFunction(M, [](auto s) { return std::sqrt(s); }); -} - -template -bool IsValid(const config::SymmetricMatrixData &data) -{ - // All the coefficients are nonzero. - bool valid = - std::all_of(data.s.begin(), data.s.end(), [](auto d) { return std::abs(d) > 0.0; }); - - // All the vectors are normalized. - constexpr auto tol = 1.0e-6; - auto UnitNorm = [&](const std::array &x) - { - double s = -1.0; - for (const auto &i : x) - { - s += std::pow(i, 2); - } - return std::abs(s) < tol; - }; - valid &= std::all_of(data.v.begin(), data.v.end(), UnitNorm); - - // All the vectors are orthogonal. 
- for (std::size_t i1 = 0; i1 < N; i1++) - { - const auto &v1 = data.v.at(i1); - for (std::size_t i2 = i1 + 1; i2 < N; i2++) - { - const auto &v2 = data.v.at(i2); - double s = 0.0; - for (std::size_t j = 0; j < N; j++) - { - s += v1[j] * v2[j]; - } - valid &= std::abs(s) < tol; - } - } - return valid; -} - -template -bool IsIsotropic(const config::SymmetricMatrixData &data) -{ - bool valid = true; - for (std::size_t i = 0; i < N; i++) - { - for (std::size_t j = 0; j < N; j++) - { - if (i == j) - { - valid &= data.v[i][j] == 1.0; - } - else - { - valid &= data.v[i][j] == 0.0; - } - } - } - return valid; -} - -template -bool IsIdentity(const config::SymmetricMatrixData &data) -{ - auto valid = std::all_of(data.s.begin(), data.s.end(), [](auto d) { return d == 1.0; }); - valid &= IsIsotropic(data); - return valid; -} - -template -mfem::DenseMatrix ToDenseMatrix(const config::SymmetricMatrixData &data) -{ - mfem::DenseMatrix M(N, N); - mfem::Vector V(N); - for (std::size_t i = 0; i < N; i++) - { - for (std::size_t j = 0; j < N; j++) - { - V(j) = data.v[i][j]; - } - AddMult_a_VVt(data.s[i], V, M); - } - return M; -} - -} // namespace - -MaterialOperator::MaterialOperator(const IoData &iodata, mfem::ParMesh &mesh) -{ - SetUpMaterialProperties(iodata, mesh); -} - -void MaterialOperator::SetUpMaterialProperties(const IoData &iodata, mfem::ParMesh &mesh) -{ - // Check that material attributes have been specified correctly. The mesh attributes may - // be non-contiguous and when no material attribute is specified the elements are deleted - // from the mesh so as to not cause problems. - MFEM_VERIFY(!iodata.domains.materials.empty(), "Materials must be non-empty!"); - int attr_max = mesh.attributes.Max(); - mfem::Array attr_marker(attr_max); - attr_marker = 0; - for (auto attr : mesh.attributes) - { - attr_marker[attr - 1] = 1; - } - for (const auto &data : iodata.domains.materials) - { - for (auto attr : data.attributes) - { - MFEM_VERIFY( - attr > 0 && attr <= attr_max, - "Material attribute tags must be non-negative and correspond to attributes " - "in the mesh!"); - MFEM_VERIFY(attr_marker[attr - 1], "Unknown material attribute " << attr << "!"); - } - } - - // Set up material properties of the different domain regions, represented with piece-wise - // constant matrix-valued coefficients for the relative permeability and permittivity, - // and other material properties. 
- const int sdim = mesh.SpaceDimension(); - mat_muinv.resize(attr_max, mfem::DenseMatrix(sdim)); - mat_epsilon.resize(attr_max, mfem::DenseMatrix(sdim)); - mat_epsilon_imag.resize(attr_max, mfem::DenseMatrix(sdim)); - mat_epsilon_abs.resize(attr_max, mfem::DenseMatrix(sdim)); - mat_invz0.resize(attr_max, mfem::DenseMatrix(sdim)); - mat_c0.resize(attr_max, mfem::DenseMatrix(sdim)); - mat_sigma.resize(attr_max, mfem::DenseMatrix(sdim)); - mat_invLondon.resize(attr_max, mfem::DenseMatrix(sdim)); - mat_c0_min.resize(attr_max, 0.0); - mat_c0_max.resize(attr_max, 0.0); - for (const auto &data : iodata.domains.materials) - { - if (iodata.problem.type == config::ProblemData::Type::ELECTROSTATIC) - { - MFEM_VERIFY(IsValid(data.epsilon_r), "Material has no valid permittivity defined!"); - if (!IsIdentity(data.mu_r) || IsValid(data.sigma) || std::abs(data.lambda_L) > 0.0) - { - Mpi::Warning("Electrostatic problem type does not account for material " - "permeability, electrical conductivity, or London depth!\n"); - } - } - else if (iodata.problem.type == config::ProblemData::Type::MAGNETOSTATIC) - { - MFEM_VERIFY(IsValid(data.mu_r), "Material has no valid permeability defined!"); - if (!IsIdentity(data.epsilon_r) || IsValid(data.tandelta) || IsValid(data.sigma) || - std::abs(data.lambda_L) > 0.0) - { - Mpi::Warning( - "Magnetostatic problem type does not account for material permittivity, loss " - "tangent, electrical conductivity, or London depth!\n"); - } - } - else - { - MFEM_VERIFY(IsValid(data.mu_r) && IsValid(data.epsilon_r), - "Material has no valid permeability or no valid permittivity defined!"); - if (iodata.problem.type == config::ProblemData::Type::TRANSIENT) - { - MFEM_VERIFY(!IsValid(data.tandelta), - "Transient problem type does not support material loss tangent, use " - "electrical conductivity instead!"); - } - else - { - MFEM_VERIFY(!(IsValid(data.tandelta) && IsValid(data.sigma)), - "Material loss model should probably use only one of loss tangent or " - "electrical conductivity!"); - } - } - for (auto attr : data.attributes) - { - MFEM_VERIFY( - mat_c0_min.at(attr - 1) == 0.0 && mat_c0_max.at(attr - 1) == 0.0, - "Detected multiple definitions of material properties for domain attribute " - << attr << "!"); - - // Compute the inverse of the input permeability matrix. 
- mfem::DenseMatrix mu_r = ToDenseMatrix(data.mu_r); - mfem::DenseMatrixInverse(mu_r, true).GetInverseMatrix(mat_muinv.at(attr - 1)); - - // Material permittivity: Im{ε} = - ε * tan(δ) - mfem::DenseMatrix T(sdim, sdim); - mat_epsilon.at(attr - 1) = ToDenseMatrix(data.epsilon_r); - Mult(mat_epsilon.at(attr - 1), ToDenseMatrix(data.tandelta), T); - T *= -1.0; - mat_epsilon_imag.at(attr - 1) = T; - - // ε * √(I + tan(δ) * tan(δ)ᵀ) - MultAAt(ToDenseMatrix(data.tandelta), T); - for (int i = 0; i < T.Height(); i++) - { - T(i, i) += 1.0; - } - Mult(mat_epsilon.at(attr - 1), MatrixSqrt(T), mat_epsilon_abs.at(attr - 1)); - - // √μ⁻¹ ε - Mult(mat_muinv.at(attr - 1), mat_epsilon.at(attr - 1), mat_invz0.at(attr - 1)); - mat_invz0.at(attr - 1) = MatrixSqrt(mat_invz0.at(attr - 1)); - - // (√μ ε)⁻¹ - mfem::DenseMatrixInverse(mat_epsilon.at(attr - 1), true).GetInverseMatrix(T); - Mult(mat_muinv.at(attr - 1), T, mat_c0.at(attr - 1)); - mat_c0.at(attr - 1) = MatrixSqrt(mat_c0.at(attr - 1)); - mat_c0_min.at(attr - 1) = mat_c0.at(attr - 1).CalcSingularvalue(sdim - 1); - mat_c0_max.at(attr - 1) = mat_c0.at(attr - 1).CalcSingularvalue(0); - - // Electrical conductivity, σ - mat_sigma.at(attr - 1) = ToDenseMatrix(data.sigma); - - // λ⁻² * μ⁻¹ - mat_invLondon.at(attr - 1) = mat_muinv.at(attr - 1); - mat_invLondon.at(attr - 1) *= - std::abs(data.lambda_L) > 0.0 ? std::pow(data.lambda_L, -2.0) : 0.0; - } - } - - // Construct shared face mapping for boundary coefficients. This is useful to have in one - // place alongside material properties so we construct and store it here. - for (int i = 0; i < mesh.GetNSharedFaces(); i++) - { - local_to_shared[mesh.GetSharedFace(i)] = i; - } - - // Mark selected material attributes from the mesh as having certain local properties. - mfem::Array losstan_mats, conductivity_mats, london_mats; - losstan_mats.Reserve(attr_max); - conductivity_mats.Reserve(attr_max); - london_mats.Reserve(attr_max); - for (int i = 0; i < attr_max; i++) - { - if (mat_epsilon_imag.at(i).MaxMaxNorm() > 0.0) - { - losstan_mats.Append(i + 1); // Markers are 1-based - } - if (mat_sigma.at(i).MaxMaxNorm() > 0.0) - { - conductivity_mats.Append(i + 1); - } - if (mat_invLondon.at(i).MaxMaxNorm() > 0.0) - { - london_mats.Append(i + 1); - } - } - mesh::AttrToMarker(attr_max, losstan_mats, losstan_marker); - mesh::AttrToMarker(attr_max, conductivity_mats, conductivity_marker); - mesh::AttrToMarker(attr_max, london_mats, london_marker); -} - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "materialoperator.hpp" + +#include +#include +#include +#include "linalg/densematrix.hpp" +#include "utils/communication.hpp" +#include "utils/geodata.hpp" +#include "utils/iodata.hpp" + +namespace palace +{ + +namespace internal::mat +{ + +template +bool IsOrthonormal(const config::SymmetricMatrixData &data) +{ + + // All the vectors are normalized. + constexpr auto tol = 1.0e-6; + auto UnitNorm = [&](const std::array &x) + { + double s = -1.0; + for (const auto &i : x) + { + s += std::pow(i, 2); + } + return std::abs(s) < tol; + }; + bool valid = std::all_of(data.v.begin(), data.v.end(), UnitNorm); + + // All the vectors are orthogonal. 
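  // For example, the Cartesian basis {(1,0,0), (0,1,0), (0,0,1)} passes both checks,
  // whereas {(1,0,0), (1,1,0)/√2, (0,0,1)} is unit-norm but fails the orthogonality test
  // below, since (1,0,0) ⋅ (1,1,0)/√2 = 1/√2 exceeds the tolerance.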
+ for (std::size_t i1 = 0; i1 < N; i1++) + { + const auto &v1 = data.v.at(i1); + for (std::size_t i2 = i1 + 1; i2 < N; i2++) + { + const auto &v2 = data.v.at(i2); + double s = 0.0; + for (std::size_t j = 0; j < N; j++) + { + s += v1[j] * v2[j]; + } + valid &= std::abs(s) < tol; + } + } + return valid; +} + +template +bool IsValid(const config::SymmetricMatrixData &data) +{ + return IsOrthonormal(data) && std::all_of(data.s.begin(), data.s.end(), + [](auto d) { return std::abs(d) > 0.0; }); +} + +template +bool IsIsotropic(const config::SymmetricMatrixData &data) +{ + return IsOrthonormal(data) && + std::all_of(data.s.begin(), data.s.end(), [&](auto d) { return d == data.s[0]; }); +} + +template +bool IsIdentity(const config::SymmetricMatrixData &data) +{ + return IsOrthonormal(data) && + std::all_of(data.s.begin(), data.s.end(), [](auto d) { return d == 1.0; }); +} + +template +mfem::DenseMatrix ToDenseMatrix(const config::SymmetricMatrixData &data) +{ + mfem::DenseMatrix M(N, N); + mfem::Vector V(N); + for (std::size_t i = 0; i < N; i++) + { + for (std::size_t j = 0; j < N; j++) + { + V(j) = data.v[i][j]; + } + AddMult_a_VVt(data.s[i], V, M); + } + return M; +} + +} // namespace internal::mat + +MaterialOperator::MaterialOperator(const IoData &iodata, const Mesh &mesh) : mesh(mesh) +{ + SetUpMaterialProperties(iodata, mesh); +} + +void MaterialOperator::SetUpMaterialProperties(const IoData &iodata, + const mfem::ParMesh &mesh) +{ + // Check that material attributes have been specified correctly. The mesh attributes may + // be non-contiguous and when no material attribute is specified the elements are deleted + // from the mesh so as to not cause problems. + MFEM_VERIFY(!iodata.domains.materials.empty(), "Materials must be non-empty!"); + { + int attr_max = mesh.attributes.Size() ? mesh.attributes.Max() : 0; + mfem::Array attr_marker(attr_max); + attr_marker = 0; + for (auto attr : mesh.attributes) + { + attr_marker[attr - 1] = 1; + } + for (const auto &data : iodata.domains.materials) + { + for (auto attr : data.attributes) + { + MFEM_VERIFY( + attr > 0 && attr <= attr_max, + "Material attribute tags must be non-negative and correspond to attributes " + "in the mesh!"); + MFEM_VERIFY(attr_marker[attr - 1], "Unknown material attribute " << attr << "!"); + } + } + } + + // Set up material properties of the different domain regions, represented with element- + // wise constant matrix-valued coefficients for the relative permeability, permittivity, + // and other material properties. 
+ const auto &loc_attr = this->mesh.GetCeedAttributes(); + mfem::Array mat_marker(iodata.domains.materials.size()); + mat_marker = 0; + int nmats = 0; + for (std::size_t i = 0; i < iodata.domains.materials.size(); i++) + { + const auto &data = iodata.domains.materials[i]; + for (auto attr : data.attributes) + { + if (loc_attr.find(attr) != loc_attr.end()) + { + mat_marker[i] = 1; + nmats++; + break; + } + } + } + attr_mat.SetSize(loc_attr.size()); + attr_mat = -1; + + attr_is_isotropic.SetSize(nmats); + + const int sdim = mesh.SpaceDimension(); + mat_muinv.SetSize(sdim, sdim, nmats); + mat_epsilon.SetSize(sdim, sdim, nmats); + mat_epsilon_imag.SetSize(sdim, sdim, nmats); + mat_epsilon_abs.SetSize(sdim, sdim, nmats); + mat_invz0.SetSize(sdim, sdim, nmats); + mat_c0.SetSize(sdim, sdim, nmats); + mat_sigma.SetSize(sdim, sdim, nmats); + mat_invLondon.SetSize(sdim, sdim, nmats); + mat_c0_min.SetSize(nmats); + mat_c0_max.SetSize(nmats); + mat_muinvkx.SetSize(sdim, sdim, nmats); + mat_kxTmuinvkx.SetSize(sdim, sdim, nmats); + mat_kx.SetSize(sdim, sdim, nmats); + has_losstan_attr = has_conductivity_attr = has_london_attr = has_wave_attr = false; + + // Set up Floquet wave vector for periodic meshes with phase-delay constraints. + SetUpFloquetWaveVector(iodata, mesh); + + int count = 0; + for (std::size_t i = 0; i < iodata.domains.materials.size(); i++) + { + if (!mat_marker[i]) + { + continue; + } + const auto &data = iodata.domains.materials[i]; + if (iodata.problem.type == ProblemType::ELECTROSTATIC) + { + MFEM_VERIFY(internal::mat::IsValid(data.epsilon_r), + "Material has no valid permittivity defined!"); + if (!internal::mat::IsIdentity(data.mu_r) || internal::mat::IsValid(data.sigma) || + std::abs(data.lambda_L) > 0.0) + { + Mpi::Warning( + "Electrostatic problem type does not account for material permeability,\n" + "electrical conductivity, or London depth!\n"); + } + } + else if (iodata.problem.type == ProblemType::MAGNETOSTATIC) + { + MFEM_VERIFY(internal::mat::IsValid(data.mu_r), + "Material has no valid permeability defined!"); + if (!internal::mat::IsIdentity(data.epsilon_r) || + internal::mat::IsValid(data.tandelta) || internal::mat::IsValid(data.sigma) || + std::abs(data.lambda_L) > 0.0) + { + Mpi::Warning( + "Magnetostatic problem type does not account for material permittivity,\n" + "loss tangent, electrical conductivity, or London depth!\n"); + } + } + else + { + MFEM_VERIFY(internal::mat::IsValid(data.mu_r) && + internal::mat::IsValid(data.epsilon_r), + "Material has no valid permeability or no valid permittivity defined!"); + if (iodata.problem.type == ProblemType::TRANSIENT) + { + MFEM_VERIFY(!internal::mat::IsValid(data.tandelta), + "Transient problem type does not support material loss tangent, use " + "electrical conductivity instead!"); + } + else + { + MFEM_VERIFY( + !(internal::mat::IsValid(data.tandelta) && internal::mat::IsValid(data.sigma)), + "Material loss model should probably use only one of loss tangent or " + "electrical conductivity!"); + } + } + + attr_is_isotropic[i] = internal::mat::IsIsotropic(data.mu_r) && + internal::mat::IsIsotropic(data.epsilon_r) && + internal::mat::IsIsotropic(data.tandelta) && + internal::mat::IsIsotropic(data.sigma); + + // Map all attributes to this material property index. 
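    // For example (hypothetical attribute numbers): if this is the second material
    // processed (count == 1) and it is assigned to mesh attributes {3, 4}, the attr_mat
    // entries at the local indices for 3 and 4 are set to 1 below; any entry left at -1
    // means no material was defined for that local attribute on this process.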
+ for (auto attr : data.attributes) + { + auto it = loc_attr.find(attr); + if (it != loc_attr.end()) + { + MFEM_VERIFY( + attr_mat[it->second - 1] < 0, + "Detected multiple definitions of material properties for domain attribute " + << attr << "!"); + attr_mat[it->second - 1] = count; + } + } + + // Compute the inverse of the input permeability matrix. + mfem::DenseMatrix mat_mu = internal::mat::ToDenseMatrix(data.mu_r); + mfem::DenseMatrixInverse(mat_mu, true).GetInverseMatrix(mat_muinv(count)); + + // Material permittivity: Re{ε} = ε, Im{ε} = -ε * tan(δ) + mfem::DenseMatrix T(sdim, sdim); + mat_epsilon(count) = internal::mat::ToDenseMatrix(data.epsilon_r); + Mult(mat_epsilon(count), internal::mat::ToDenseMatrix(data.tandelta), T); + T *= -1.0; + mat_epsilon_imag(count) = T; + if (mat_epsilon_imag(count).MaxMaxNorm() > 0.0) + { + has_losstan_attr = true; + } + + // ε * √(I + tan(δ) * tan(δ)ᵀ) + MultAAt(internal::mat::ToDenseMatrix(data.tandelta), T); + for (int d = 0; d < T.Height(); d++) + { + T(d, d) += 1.0; + } + Mult(mat_epsilon(count), linalg::MatrixSqrt(T), mat_epsilon_abs(count)); + + // √(μ⁻¹ ε) + Mult(mat_muinv(count), mat_epsilon(count), mat_invz0(count)); + mat_invz0(count) = linalg::MatrixSqrt(mat_invz0(count)); + + // √((μ ε)⁻¹) + Mult(mat_mu, mat_epsilon(count), T); + mat_c0(count) = linalg::MatrixPow(T, -0.5); + mat_c0_min[count] = linalg::SingularValueMin(mat_c0(count)); + mat_c0_max[count] = linalg::SingularValueMax(mat_c0(count)); + + // Electrical conductivity, σ + mat_sigma(count) = internal::mat::ToDenseMatrix(data.sigma); + if (mat_sigma(count).MaxMaxNorm() > 0.0) + { + has_conductivity_attr = true; + } + + // λ⁻² * μ⁻¹ + mat_invLondon(count) = mat_muinv(count); + mat_invLondon(count) *= + std::abs(data.lambda_L) > 0.0 ? std::pow(data.lambda_L, -2.0) : 0.0; + if (mat_invLondon(count).MaxMaxNorm() > 0.0) + { + has_london_attr = true; + } + + // μ⁻¹ [k x] + Mult(mat_muinv(count), wave_vector_cross, mat_muinvkx(count)); + + // [k x]^T μ⁻¹ [k x] + T.Transpose(wave_vector_cross); + Mult(T, mat_muinvkx(count), mat_kxTmuinvkx(count)); + + // [k x] + mat_kx(count) = wave_vector_cross; + + count++; + } + bool has_attr[4] = {has_losstan_attr, has_conductivity_attr, has_london_attr, + has_wave_attr}; + Mpi::GlobalOr(4, has_attr, mesh.GetComm()); + has_losstan_attr = has_attr[0]; + has_conductivity_attr = has_attr[1]; + has_london_attr = has_attr[2]; + has_wave_attr = has_attr[3]; +} + +void MaterialOperator::SetUpFloquetWaveVector(const IoData &iodata, + const mfem::ParMesh &mesh) +{ + const int sdim = mesh.SpaceDimension(); + const double tol = std::numeric_limits::epsilon(); + + // Get Floquet wave vector. + mfem::Vector wave_vector(sdim); + wave_vector = 0.0; + const auto &data = iodata.boundaries.periodic; + MFEM_VERIFY(static_cast(data.wave_vector.size()) == sdim, + "Floquet wave vector size must equal the spatial dimension."); + std::copy(data.wave_vector.begin(), data.wave_vector.end(), wave_vector.GetData()); + has_wave_attr = (wave_vector.Norml2() > tol); + + MFEM_VERIFY(!has_wave_attr || iodata.problem.type == ProblemType::DRIVEN || + iodata.problem.type == ProblemType::EIGENMODE, + "Quasi-periodic Floquet boundary conditions are only available for " + " frequency domain driven or eigenmode simulations!"); + MFEM_VERIFY(!has_wave_attr || sdim == 3, + "Quasi-periodic Floquet periodic boundary conditions are only available " + " in 3D!"); + + // Get mesh dimensions in x/y/z coordinates. 
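  // Worked example (hypothetical numbers): with a bounding-box extent of 2 in x, the
  // admissible range is [-π/2, π/2]; an input k_x = 2.0 is wrapped below to
  // -π/2 + fmod(2.0 + π/2, π) = 2.0 - π ≈ -1.14, i.e. the equivalent wave vector
  // component inside the first Brillouin zone.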
+ mfem::Vector bbmin, bbmax; + mesh::GetAxisAlignedBoundingBox(mesh, bbmin, bbmax); + bbmax -= bbmin; + + // Ensure Floquet wave vector components are in range [-π/L, π/L]. + for (int i = 0; i < sdim; i++) + { + if (wave_vector[i] > M_PI / bbmax[i]) + { + wave_vector[i] = + -M_PI / bbmax[i] + fmod(wave_vector[i] + M_PI / bbmax[i], 2 * M_PI / bbmax[i]); + } + else if (wave_vector[i] < M_PI / bbmax[i]) + { + wave_vector[i] = + M_PI / bbmax[i] + fmod(wave_vector[i] - M_PI / bbmax[i], 2 * M_PI / bbmax[i]); + } + } + + // Matrix representation of cross product with wave vector + // [k x] = | 0 -k3 k2| + // | k3 0 -k1| + // |-k2 k1 0 | + wave_vector_cross.SetSize(3); + wave_vector_cross = 0.0; + wave_vector_cross(0, 1) = -wave_vector[2]; + wave_vector_cross(0, 2) = wave_vector[1]; + wave_vector_cross(1, 0) = wave_vector[2]; + wave_vector_cross(1, 2) = -wave_vector[0]; + wave_vector_cross(2, 0) = -wave_vector[1]; + wave_vector_cross(2, 1) = wave_vector[0]; +} + +mfem::Array MaterialOperator::GetBdrAttributeToMaterial() const +{ + // Construct map from all (contiguous) local libCEED boundary attributes to the material + // index in the neighboring element. + mfem::Array bdr_attr_mat(mesh.MaxCeedBdrAttribute()); + bdr_attr_mat = -1; + for (const auto &[attr, bdr_attr_map] : mesh.GetCeedBdrAttributes()) + { + for (auto it = bdr_attr_map.begin(); it != bdr_attr_map.end(); ++it) + { + MFEM_ASSERT(it->second > 0 && it->second <= bdr_attr_mat.Size(), + "Invalid libCEED boundary attribute " << it->second << "!"); + bdr_attr_mat[it->second - 1] = AttrToMat(it->first); + } + } + return bdr_attr_mat; +} + +MaterialPropertyCoefficient::MaterialPropertyCoefficient(int attr_max) +{ + attr_mat.SetSize(attr_max); + attr_mat = -1; +} + +MaterialPropertyCoefficient::MaterialPropertyCoefficient( + const mfem::Array &attr_mat_, const mfem::DenseTensor &mat_coeff_, double a) + : attr_mat(attr_mat_), mat_coeff(mat_coeff_) +{ + *this *= a; +} + +namespace +{ + +void UpdateProperty(mfem::DenseTensor &mat_coeff, int k, double coeff, double a) +{ + // Constant diagonal coefficient. + if (mat_coeff.SizeI() == 0 && mat_coeff.SizeJ() == 0) + { + // Initialize the coefficient material properties. + MFEM_VERIFY(k == 0 && mat_coeff.SizeK() == 1, + "Unexpected initial size for MaterialPropertyCoefficient!"); + mat_coeff.SetSize(1, 1, mat_coeff.SizeK()); + mat_coeff(0, 0, k) = a * coeff; + } + else + { + MFEM_VERIFY(mat_coeff.SizeI() == mat_coeff.SizeJ(), + "Invalid dimensions for MaterialPropertyCoefficient update!"); + for (int i = 0; i < mat_coeff.SizeI(); i++) + { + mat_coeff(i, i, k) += a * coeff; + } + } +} + +void UpdateProperty(mfem::DenseTensor &mat_coeff, int k, const mfem::DenseMatrix &coeff, + double a) +{ + if (mat_coeff.SizeI() == 0 && mat_coeff.SizeJ() == 0) + { + // Initialize the coefficient material properties. + MFEM_VERIFY(k == 0 && mat_coeff.SizeK() == 1, + "Unexpected initial size for MaterialPropertyCoefficient!"); + mat_coeff.SetSize(coeff.Height(), coeff.Width(), mat_coeff.SizeK()); + mat_coeff(k).Set(a, coeff); + } + else if (coeff.Height() == mat_coeff.SizeI() && coeff.Width() == mat_coeff.SizeJ()) + { + // Add as full matrix. + mat_coeff(k).Add(a, coeff); + } + else if (coeff.Height() == 1 && coeff.Width() == 1) + { + // Add as diagonal. + UpdateProperty(mat_coeff, k, coeff(0, 0), a); + } + else if (mat_coeff.SizeI() == 1 && mat_coeff.SizeJ() == 1) + { + // Convert to matrix coefficient and previous data add as diagonal. 
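    // For example, if a full 3x3 coeff arrives while the existing slots store scalars,
    // the storage is resized to 3x3 per slot, each existing slot is rewritten as its old
    // scalar times the identity, and a * coeff is then added to slot k.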
+ mfem::DenseTensor mat_coeff_scalar(mat_coeff); + mat_coeff.SetSize(coeff.Height(), coeff.Width(), mat_coeff_scalar.SizeK()); + mat_coeff = 0.0; + for (int l = 0; l < mat_coeff.SizeK(); l++) + { + UpdateProperty(mat_coeff, l, mat_coeff_scalar(0, 0, l), 1.0); + } + mat_coeff(k).Add(a, coeff); + } + else + { + MFEM_ABORT("Invalid dimensions when updating material property at index " << k << "!"); + } +} + +bool Equals(const mfem::DenseMatrix &mat_coeff, double coeff, double a) +{ + MFEM_VERIFY(mat_coeff.Height() == mat_coeff.Width(), + "Invalid dimensions for MaterialPropertyCoefficient update!"); + constexpr double tol = 1.0e-9; + for (int i = 0; i < mat_coeff.Height(); i++) + { + if (std::abs(mat_coeff(i, i) - a * coeff) >= tol * std::abs(mat_coeff(i, i))) + { + return false; + } + for (int j = 0; j < mat_coeff.Width(); j++) + { + if (j != i && std::abs(mat_coeff(i, j)) > 0.0) + { + return false; + } + } + } + return true; +} + +bool Equals(const mfem::DenseMatrix &mat_coeff, const mfem::DenseMatrix &coeff, double a) +{ + if (coeff.Height() == 1 && coeff.Width() == 1) + { + return Equals(mat_coeff, coeff(0, 0), a); + } + else + { + constexpr double tol = 1.0e-9; + mfem::DenseMatrix T(mat_coeff); + T.Add(-a, coeff); + return (T.MaxMaxNorm() < tol * mat_coeff.MaxMaxNorm()); + } +} + +} // namespace + +void MaterialPropertyCoefficient::AddCoefficient(const mfem::Array &attr_mat_, + const mfem::DenseTensor &mat_coeff_, + double a) +{ + if (empty()) + { + MFEM_VERIFY(attr_mat_.Size() == attr_mat.Size(), + "Invalid resize of attribute to material property map in " + "MaterialPropertyCoefficient::AddCoefficient!"); + attr_mat = attr_mat_; + mat_coeff = mat_coeff_; + *this *= a; + } + else if (attr_mat_ == attr_mat) + { + MFEM_VERIFY(mat_coeff_.SizeK() == mat_coeff.SizeK(), + "Invalid dimensions for MaterialPropertyCoefficient::AddCoefficient!"); + for (int k = 0; k < mat_coeff.SizeK(); k++) + { + UpdateProperty(mat_coeff, k, mat_coeff_(k), a); + } + } + else + { + for (int k = 0; k < mat_coeff_.SizeK(); k++) + { + // Get list of all attributes which use this material property. + mfem::Array attr_list; + attr_list.Reserve(attr_mat_.Size()); + for (int i = 0; i < attr_mat_.Size(); i++) + { + if (attr_mat_[i] == k) + { + attr_list.Append(i + 1); + } + } + + // Add or update the material property. + AddMaterialProperty(attr_list, mat_coeff_(k), a); + } + } +} + +template +void MaterialPropertyCoefficient::AddMaterialProperty(const mfem::Array &attr_list, + const T &coeff, double a) +{ + // Preprocess the attribute list. If any of the given attributes already have material + // properties assigned, then they all need to point to the same material and it is + // updated in place. Otherwise a new material is added for these attributes. + if (attr_list.Size() == 0) + { + // No attributes, nothing to add. + return; + } + + int mat_idx = -1; + for (auto attr : attr_list) + { + MFEM_VERIFY(attr <= attr_mat.Size(), + "Out of bounds access for attribute " + << attr << " in MaterialPropertyCoefficient::AddMaterialProperty!"); + if (mat_idx < 0) + { + mat_idx = attr_mat[attr - 1]; + } + else + { + MFEM_VERIFY(mat_idx == attr_mat[attr - 1], + "All attributes for MaterialPropertyCoefficient::AddMaterialProperty " + "must correspond to the same " + "existing material if it exists!"); + } + } + + if (mat_idx < 0) + { + // Check if we can reuse an existing material. 
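    // For example, calling AddMaterialProperty({7}, 2.0) when some existing slot already
    // stores 2 * I simply maps attribute 7 to that slot; otherwise a new zero-initialized
    // slot is appended and then incremented by a * coeff via UpdateProperty below. The
    // attribute value 7 here is hypothetical.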
+ for (int k = 0; k < mat_coeff.SizeK(); k++) + { + if (Equals(mat_coeff(k), coeff, a)) + { + mat_idx = k; + break; + } + } + if (mat_idx < 0) + { + // Append a new material and assign the attributes to it. + const mfem::DenseTensor mat_coeff_backup(mat_coeff); + mat_coeff.SetSize(mat_coeff_backup.SizeI(), mat_coeff_backup.SizeJ(), + mat_coeff_backup.SizeK() + 1); + for (int k = 0; k < mat_coeff_backup.SizeK(); k++) + { + mat_coeff(k) = mat_coeff_backup(k); + } + mat_idx = mat_coeff.SizeK() - 1; + } + mat_coeff(mat_idx) = 0.0; // Zero out so we can add + + // Assign all attributes to this new material. + for (auto attr : attr_list) + { + attr_mat[attr - 1] = mat_idx; + } + } + UpdateProperty(mat_coeff, mat_idx, coeff, a); +} + +MaterialPropertyCoefficient &MaterialPropertyCoefficient::operator*=(double a) +{ + for (int k = 0; k < mat_coeff.SizeK(); k++) + { + mat_coeff(k) *= a; + } + return *this; +} + +void MaterialPropertyCoefficient::RestrictCoefficient(const mfem::Array &attr_list) +{ + // Create a new material property coefficient with materials corresponding to only the + // unique ones in the given attribute list. + const mfem::Array attr_mat_orig(attr_mat); + const mfem::DenseTensor mat_coeff_orig(mat_coeff); + attr_mat = -1; + mat_coeff.SetSize(mat_coeff_orig.SizeI(), mat_coeff_orig.SizeJ(), 0); + for (auto attr : attr_list) + { + if (attr_mat[attr - 1] >= 0) + { + // Attribute has already been processed. + continue; + } + + // Find all attributes in restricted list of attributes which map to this material index + // and process them together. + const int orig_mat_idx = attr_mat_orig[attr - 1]; + const int new_mat_idx = mat_coeff.SizeK(); + for (auto attr2 : attr_list) + { + if (attr_mat_orig[attr2 - 1] == orig_mat_idx) + { + attr_mat[attr2 - 1] = new_mat_idx; + } + } + + // Append the new material property. + const mfem::DenseTensor mat_coeff_backup(mat_coeff); + mat_coeff.SetSize(mat_coeff_backup.SizeI(), mat_coeff_backup.SizeJ(), + mat_coeff_backup.SizeK() + 1); + for (int k = 0; k < mat_coeff_backup.SizeK(); k++) + { + mat_coeff(k) = mat_coeff_backup(k); + } + mat_coeff(new_mat_idx) = mat_coeff_orig(orig_mat_idx); + } +} + +void MaterialPropertyCoefficient::NormalProjectedCoefficient(const mfem::Vector &normal) +{ + mfem::DenseTensor mat_coeff_backup(mat_coeff); + mat_coeff.SetSize(1, 1, mat_coeff_backup.SizeK()); + for (int k = 0; k < mat_coeff.SizeK(); k++) + { + mat_coeff(k) = mat_coeff_backup(k).InnerProduct(normal, normal); + } +} + +template void MaterialPropertyCoefficient::AddMaterialProperty(const mfem::Array &, + const mfem::DenseMatrix &, + double); +template void MaterialPropertyCoefficient::AddMaterialProperty(const mfem::Array &, + const double &, double); + +// Explicit template instantiations for internal::mat functions. +template bool internal::mat::IsOrthonormal(const config::SymmetricMatrixData<3> &); +template bool internal::mat::IsValid(const config::SymmetricMatrixData<3> &); +template bool internal::mat::IsIsotropic(const config::SymmetricMatrixData<3> &); +template bool internal::mat::IsIdentity(const config::SymmetricMatrixData<3> &); + +} // namespace palace diff --git a/palace/models/materialoperator.hpp b/palace/models/materialoperator.hpp index 459dd8729a..579933157c 100644 --- a/palace/models/materialoperator.hpp +++ b/palace/models/materialoperator.hpp @@ -1,64 +1,192 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_MODELS_MATERIAL_OPERATOR_HPP -#define PALACE_MODELS_MATERIAL_OPERATOR_HPP - -#include -#include -#include - -namespace palace -{ - -class IoData; - -// -// A class handling material attributes. -// -class MaterialOperator -{ -private: - // Material properties for domain attributes: relative permeability, relative - // permittivity, and others (like electrical conductivity and London penetration depth - // for superconductors. The i-1-th entry of each Vector is the property for mesh domain - // attribute i. Marker arrays contain a 1 for each domain attribute labeled, and 0 else. - std::vector mat_muinv, mat_epsilon, mat_epsilon_imag, mat_epsilon_abs, - mat_invz0, mat_c0, mat_sigma, mat_invLondon; - std::vector mat_c0_min, mat_c0_max; - mfem::Array losstan_marker, conductivity_marker, london_marker; - void SetUpMaterialProperties(const IoData &iodata, mfem::ParMesh &mesh); - - // Shared face mapping for boundary coefficients. - std::map local_to_shared; - -public: - MaterialOperator(const IoData &iodata, mfem::ParMesh &mesh); - - int SpaceDimension() const { return mat_muinv.front().Height(); } - - const auto &GetLocalToSharedFaceMap() const { return local_to_shared; } - - const auto &GetInvPermeability(int attr) const { return mat_muinv[attr - 1]; } - const auto &GetPermittivityReal(int attr) const { return mat_epsilon[attr - 1]; } - const auto &GetPermittivityImag(int attr) const { return mat_epsilon_imag[attr - 1]; } - const auto &GetPermittivityAbs(int attr) const { return mat_epsilon_abs[attr - 1]; } - const auto &GetInvImpedance(int attr) const { return mat_invz0[attr - 1]; } - const auto &GetLightSpeed(int attr) const { return mat_c0[attr - 1]; } - const auto &GetLightSpeedMin(int attr) const { return mat_c0_min[attr - 1]; } - const auto &GetLightSpeedMax(int attr) const { return mat_c0_max[attr - 1]; } - const auto &GetConductivity(int attr) const { return mat_sigma[attr - 1]; } - const auto &GetInvLondonDepth(int attr) const { return mat_invLondon[attr - 1]; } - - bool HasLossTangent() const { return (losstan_marker.Max() > 0); } - bool HasConductivity() const { return (conductivity_marker.Max() > 0); } - bool HasLondonDepth() const { return (london_marker.Max() > 0); } - - const auto &GetLossTangentMarker() const { return losstan_marker; } - const auto &GetConductivityMarker() const { return conductivity_marker; } - const auto &GetLondonDepthMarker() const { return london_marker; } -}; - -} // namespace palace - -#endif // PALACE_MODELS_MATERIAL_OPERATOR_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_MODELS_MATERIAL_OPERATOR_HPP +#define PALACE_MODELS_MATERIAL_OPERATOR_HPP + +#include +#include "fem/mesh.hpp" +#include "utils/configfile.hpp" + +namespace palace +{ + +class IoData; + +// +// A class handling material attributes. +// +class MaterialOperator +{ +private: + // Reference to underlying mesh object (not owned). + const Mesh &mesh; + + // Mapping from the local libCEED attribute to material index. + mfem::Array attr_mat; + + // Material properties: relative permeability, relative permittivity, and others (like + // electrical conductivity, London penetration depth for superconductors and Floquet wave + // vector). 
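+  // [Editor's note] Each property below is stored as an mfem::DenseTensor holding one
+  // sdim x sdim slice per material, addressed by the material index that AttrToMat()
+  // returns for a mesh attribute (see the Wrap() accessor further down). The mat_kx,
+  // mat_muinvkx, and mat_kxTmuinvkx entries appear to hold the Floquet terms [k x],
+  // mu^-1 [k x], and [k x]^T mu^-1 [k x], matching the GetFloquetCross/Curl/Mass
+  // accessors.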
+ mfem::DenseTensor mat_muinv, mat_epsilon, mat_epsilon_imag, mat_epsilon_abs, mat_invz0, + mat_c0, mat_sigma, mat_invLondon, mat_kxTmuinv, mat_muinvkx, mat_kxTmuinvkx, mat_kx; + mfem::DenseMatrix wave_vector_cross; + mfem::Array mat_c0_min, mat_c0_max; + + // Are materials isotropic? True when all the material properties are effectively + // scalar-valued (ie, true scalars or vectors with identical entries). Also true when a + // material is isotropic, the intersection is true when all are isotropic. + mfem::Array attr_is_isotropic; + + // Flag for global domain attributes with nonzero loss tangent, electrical conductivity, + // London penetration depth, or Floquet wave vector. + bool has_losstan_attr, has_conductivity_attr, has_london_attr, has_wave_attr; + + void SetUpMaterialProperties(const IoData &iodata, const mfem::ParMesh &mesh); + void SetUpFloquetWaveVector(const IoData &iodata, const mfem::ParMesh &mesh); + + // Map from an attribute (specified on a mesh) to a material index (location in the + // property vector). + auto AttrToMat(int attr) const + { + const auto &loc_attr = mesh.GetCeedAttributes(); + MFEM_ASSERT(loc_attr.find(attr) != loc_attr.end(), + "Missing libCEED domain attribute for attribute " << attr << "!"); + return attr_mat[loc_attr.at(attr) - 1]; + } + + auto Wrap(const mfem::DenseTensor &data, int attr) const + { + const int k = AttrToMat(attr); + return mfem::DenseMatrix(const_cast(data.GetData(k)), data.SizeI(), + data.SizeJ()); + } + +public: + MaterialOperator(const IoData &iodata, const Mesh &mesh); + + int SpaceDimension() const { return mat_muinv.SizeI(); } + + auto GetInvPermeability(int attr) const { return Wrap(mat_muinv, attr); } + auto GetPermittivityReal(int attr) const { return Wrap(mat_epsilon, attr); } + auto GetPermittivityImag(int attr) const { return Wrap(mat_epsilon_imag, attr); } + auto GetPermittivityAbs(int attr) const { return Wrap(mat_epsilon_abs, attr); } + auto GetInvImpedance(int attr) const { return Wrap(mat_invz0, attr); } + auto GetLightSpeed(int attr) const { return Wrap(mat_c0, attr); } + auto GetConductivity(int attr) const { return Wrap(mat_sigma, attr); } + auto GetInvLondonDepth(int attr) const { return Wrap(mat_invLondon, attr); } + auto GetFloquetCurl(int attr) const { return Wrap(mat_muinvkx, attr); } + auto GetFloquetMass(int attr) const { return Wrap(mat_kxTmuinvkx, attr); } + auto GetFloquetCross(int attr) const { return Wrap(mat_kx, attr); } + + auto GetLightSpeedMin(int attr) const { return mat_c0_min[AttrToMat(attr)]; } + auto GetLightSpeedMax(int attr) const { return mat_c0_max[AttrToMat(attr)]; } + + bool IsIsotropic(int attr) const { return attr_is_isotropic[AttrToMat(attr)]; } + + const auto &GetInvPermeability() const { return mat_muinv; } + const auto &GetPermittivityReal() const { return mat_epsilon; } + const auto &GetPermittivityImag() const { return mat_epsilon_imag; } + const auto &GetPermittivityAbs() const { return mat_epsilon_abs; } + const auto &GetInvImpedance() const { return mat_invz0; } + const auto &GetLightSpeed() const { return mat_c0; } + const auto &GetConductivity() const { return mat_sigma; } + const auto &GetInvLondonDepth() const { return mat_invLondon; } + const auto &GetFloquetCurl() const { return mat_muinvkx; } + const auto &GetFloquetMass() const { return mat_kxTmuinvkx; } + const auto &GetFloquetCross() const { return mat_kx; } + + const auto &GetLightSpeedMin() const { return mat_c0_min; } + const auto &GetLightSpeedMax() const { return mat_c0_max; } + + bool HasLossTangent() const 
{ return has_losstan_attr; } + bool HasConductivity() const { return has_conductivity_attr; } + bool HasLondonDepth() const { return has_london_attr; } + bool HasWaveVector() const { return has_wave_attr; } + + const auto &GetAttributeToMaterial() const { return attr_mat; } + mfem::Array GetBdrAttributeToMaterial() const; + + template + auto GetCeedAttributes(const T &attr_list) const + { + return mesh.GetCeedAttributes(attr_list); + } + template + auto GetCeedBdrAttributes(const T &attr_list) const + { + return mesh.GetCeedBdrAttributes(attr_list); + } + + auto MaxCeedAttribute() const { return mesh.MaxCeedAttribute(); } + auto MaxCeedBdrAttribute() const { return mesh.MaxCeedBdrAttribute(); } + + const auto &GetMesh() const { return mesh; } +}; + +// +// Material property represented as a piecewise constant coefficient over domain or boundary +// mesh elements. Can be scalar-valued or matrix-valued. This should probably always operate +// at the level of libCEED attribute numbers (contiguous, 1-based) for consistency. +// +class MaterialPropertyCoefficient +{ +private: + // Map attribute to material index (coeff = mat_coeff[attr_mat[attr - 1]], for 1-based + // attributes). + mfem::Array attr_mat; + + // Material property coefficients, ordered by material index. + mfem::DenseTensor mat_coeff; + +public: + MaterialPropertyCoefficient(int attr_max); + MaterialPropertyCoefficient(const mfem::Array &attr_mat_, + const mfem::DenseTensor &mat_coeff_, double a = 1.0); + + bool empty() const { return mat_coeff.TotalSize() == 0; } + + const auto &GetAttributeToMaterial() const { return attr_mat; } + const auto &GetMaterialProperties() const { return mat_coeff; } + + void AddCoefficient(const mfem::Array &attr_mat_, + const mfem::DenseTensor &mat_coeff_, double a = 1.0); + + template + void AddMaterialProperty(const mfem::Array &attr_list, const T &coeff, + double a = 1.0); + template + void AddMaterialProperty(int attr, const T &coeff, double a = 1.0) + { + mfem::Array attr_list(1); + attr_list[0] = attr; + AddMaterialProperty(attr_list, coeff, a); + } + + MaterialPropertyCoefficient &operator*=(double a); + + void RestrictCoefficient(const mfem::Array &attr_list); + + void NormalProjectedCoefficient(const mfem::Vector &normal); +}; + +} // namespace palace + +namespace palace::internal::mat +{ + +template +bool IsOrthonormal(const config::SymmetricMatrixData &data); + +template +bool IsValid(const config::SymmetricMatrixData &data); + +template +bool IsIsotropic(const config::SymmetricMatrixData &data); + +template +bool IsIdentity(const config::SymmetricMatrixData &data); + +} // namespace palace::internal::mat + +#endif // PALACE_MODELS_MATERIAL_OPERATOR_HPP diff --git a/palace/models/portexcitations.cpp b/palace/models/portexcitations.cpp new file mode 100644 index 0000000000..35f51474f6 --- /dev/null +++ b/palace/models/portexcitations.cpp @@ -0,0 +1,111 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#include "portexcitations.hpp" + +#include "lumpedportoperator.hpp" +#include "surfacecurrentoperator.hpp" +#include "waveportoperator.hpp" + +#include +#include +#include + +namespace palace +{ + +[[nodiscard]] std::string PortExcitations::FmtLog() const +{ + fmt::memory_buffer buf{}; + auto to = [&buf](auto f, auto &&...a) // mini-lambda for cleaner code + { fmt::format_to(std::back_inserter(buf), f, std::forward(a)...); }; + + int i = 1; + for (const auto &[idx, ex] : excitations) + { + to("Excitation{} with index {:d} has contributions from:\n", + (Size() > 1) ? fmt::format(" {:d}/{:d}", i, Size()) : "", idx); + if (!ex.lumped_port.empty()) + { + to(" Lumped port{} {:2d}\n", (ex.lumped_port.size() > 1) ? "s" : "", + fmt::join(ex.lumped_port, " ")); + } + if (!ex.wave_port.empty()) + { + to(" Wave port{} {:2d}\n", (ex.wave_port.size() > 1) ? "s" : "", + fmt::join(ex.wave_port, " ")); + } + if (!ex.current_port.empty()) + { + to(" Surface current port{} {:2d}\n", (ex.current_port.size() > 1) ? "s" : "", + fmt::join(ex.current_port, " ")); + } + i++; + } + return fmt::to_string(buf); +} + +void to_json(nlohmann::json &j, const PortExcitations::SingleExcitationSpec &p) +{ + j = nlohmann::json{{"LumpedPort", p.lumped_port}, + {"WavePort", p.wave_port}, + {"SurfaceCurrent", p.current_port}}; +} + +void from_json(const nlohmann::json &j, PortExcitations::SingleExcitationSpec &p) +{ + j.at("LumpedPort").get_to(p.lumped_port); + j.at("WavePort").get_to(p.wave_port); + j.at("SurfaceCurrent").get_to(p.current_port); +} + +void to_json(nlohmann::json &j, const PortExcitations &p) +{ + j = nlohmann::json{p.excitations}; +} + +void from_json(const nlohmann::json &j, PortExcitations &p) +{ + j.get_to(p.excitations); +} + +PortExcitations::PortExcitations(const LumpedPortOperator &lumped_port_op, + const WavePortOperator &wave_port_op, + const SurfaceCurrentOperator &surf_j_op) +{ + for (const auto &[idx, port] : lumped_port_op) + { + if (!port.HasExcitation()) + { + continue; + } + excitations.try_emplace(port.excitation, SingleExcitationSpec{}); // If not present + excitations.at(port.excitation).lumped_port.push_back(idx); + } + for (const auto &[idx, port] : wave_port_op) + { + if (!port.HasExcitation()) + { + continue; + } + excitations.try_emplace(port.excitation, SingleExcitationSpec{}); + excitations.at(port.excitation).wave_port.push_back(idx); + } + + // Surface currents are always excited. Add them to all single existing excitations. + // TODO: Add excitation 1 if not present already? + std::vector current_port_idx; + for (const auto &[idx, port] : surf_j_op) + { + current_port_idx.push_back(idx); + } + if (!current_port_idx.empty()) + { + for (auto &[ex_idx, ex_spec] : excitations) + { + ex_spec.current_port = current_port_idx; + } + } +}; + +} // namespace palace diff --git a/palace/models/portexcitations.hpp b/palace/models/portexcitations.hpp new file mode 100644 index 0000000000..0affb48f1e --- /dev/null +++ b/palace/models/portexcitations.hpp @@ -0,0 +1,129 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_MODELS_PORT_EXCITATION_MANAGER_HPP +#define PALACE_MODELS_PORT_EXCITATION_MANAGER_HPP + +#include +#include +#include +#include + +namespace palace +{ +class LumpedPortOperator; +class WavePortOperator; +class SurfaceCurrentOperator; + +// Small helper class to collect data of what (lumped / wave / surface) ports are +// excited in driven and transient simulation, as stored in space_op; +// Manages indices. + +enum class PortType : std::uint8_t +{ + LumpedPort = 0, + WavePort = 1, + CurrentPort = 2, + Undefined = 3 +}; + +class PortExcitations +{ +public: + struct SingleExcitationSpec + { + std::vector lumped_port = {}; + std::vector wave_port = {}; + std::vector current_port = {}; + + // TODO: C++20 to replace this with iterator over joined range. + auto FlattenPortIndices() const + { + std::vector out; + out.insert(out.end(), lumped_port.cbegin(), lumped_port.cend()); + out.insert(out.end(), wave_port.cbegin(), wave_port.cend()); + out.insert(out.end(), current_port.cbegin(), current_port.cend()); + return out; + } + + // Only a single port is excited. + std::tuple IsSimple() const + { + auto n_lumped = lumped_port.size(); + auto n_wave = wave_port.size(); + auto n_current = current_port.size(); + + if (n_lumped == 1 && n_wave == 0 && n_current == 0) + { + return std::make_tuple(true, PortType::LumpedPort, lumped_port.at(0)); + } + else if (n_lumped == 0 && n_wave == 1 && n_current == 0) + { + return std::make_tuple(true, PortType::WavePort, wave_port.at(0)); + } + else if (n_lumped == 0 && n_wave == 0 && n_current == 1) + { + return std::make_tuple(true, PortType::CurrentPort, current_port.at(0)); + } + else + { + return std::make_tuple(false, PortType::Undefined, 0); + } + } + }; + + std::map excitations = {}; + + auto begin() { return excitations.begin(); } + auto end() { return excitations.end(); } + auto begin() const { return excitations.begin(); } + auto end() const { return excitations.end(); } + + PortExcitations(const LumpedPortOperator &lumped_port_op, + const WavePortOperator &wave_port_op, + const SurfaceCurrentOperator &surf_j_op); + + [[nodiscard]] int MaxIdx() const + { + // Map is stored order by key so max key is last item. + return excitations.empty() ? 0 : std::next(std::rend(excitations))->first; + } + [[nodiscard]] auto Size() const { return excitations.size(); } + [[nodiscard]] auto Empty() const { return excitations.empty(); } + + [[nodiscard]] std::string FmtLog() const; + + // Single Simple (only 1 port per excitation) Excitation. + [[nodiscard]] std::tuple IsSingleSimple() const + { + if (Size() == 1) + { + const auto &[ex_idx, ex_spec] = *excitations.begin(); + const auto [is_simple, port_type, port_idx] = ex_spec.IsSimple(); + if (is_simple) + { + return std::make_tuple(true, ex_idx, port_type, port_idx); + } + } + return std::make_tuple(false, 0, PortType::Undefined, 0); + } + + // Multiple Simple (only 1 port per excitation) Excitation. 
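+  // [Editor's note] A rough usage sketch of these helpers (variable names here are
+  // illustrative only):
+  //
+  //   const auto &excitations = space_op.GetPortExcitations();
+  //   auto [is_simple, ex_idx, type, port_idx] = excitations.IsSingleSimple();
+  //   if (is_simple) { /* one excitation driving a single port */ }
+  //   else if (excitations.IsMultipleSimple()) { /* one port per excitation, e.g. a
+  //                                                full S-parameter sweep */ }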
+ [[nodiscard]] bool IsMultipleSimple() const + { + return std::all_of(excitations.begin(), excitations.end(), + [](const auto &ex) { return std::get<0>(ex.second.IsSimple()); }); + } +}; + +void to_json(nlohmann::json &j, const PortExcitations::SingleExcitationSpec &p); + +void from_json(const nlohmann::json &j, PortExcitations::SingleExcitationSpec &p); + +void to_json(nlohmann::json &j, const PortExcitations &p); + +void from_json(const nlohmann::json &j, PortExcitations &p); + +} // namespace palace + +#endif // PALACE_MODELS_PORT_EXCITATION_MANAGER_HPP diff --git a/palace/models/postoperator.cpp b/palace/models/postoperator.cpp index 32abb35eef..704af9df60 100644 --- a/palace/models/postoperator.cpp +++ b/palace/models/postoperator.cpp @@ -3,11 +3,12 @@ #include "postoperator.hpp" +#include +#include #include "fem/coefficient.hpp" #include "fem/errorindicator.hpp" #include "models/curlcurloperator.hpp" #include "models/laplaceoperator.hpp" -#include "models/lumpedportoperator.hpp" #include "models/materialoperator.hpp" #include "models/spaceoperator.hpp" #include "models/surfacecurrentoperator.hpp" @@ -15,140 +16,292 @@ #include "utils/communication.hpp" #include "utils/geodata.hpp" #include "utils/iodata.hpp" +#include "utils/timer.hpp" namespace palace { -using namespace std::complex_literals; - namespace { -auto CreateParaviewPath(const IoData &iodata, const std::string &name) +std::string OutputFolderName(const ProblemType solver_t) { - std::string path = iodata.problem.output; - if (path[path.length() - 1] != '/') + switch (solver_t) { - path += '/'; + case ProblemType::DRIVEN: + return "driven"; + case ProblemType::EIGENMODE: + return "eigenmode"; + case ProblemType::ELECTROSTATIC: + return "electrostatic"; + case ProblemType::MAGNETOSTATIC: + return "magnetostatic"; + case ProblemType::TRANSIENT: + return "transient"; + default: + return "unknown"; } - path += "paraview/" + name; - return path; } } // namespace -PostOperator::PostOperator(const IoData &iodata, SpaceOperator &spaceop, - const std::string &name) - : mat_op(spaceop.GetMaterialOp()), - surf_post_op(iodata, spaceop.GetMaterialOp(), spaceop.GetH1Space()), - dom_post_op(iodata, spaceop.GetMaterialOp(), &spaceop.GetNDSpace(), - &spaceop.GetRTSpace()), - has_imaginary(iodata.problem.type != config::ProblemData::Type::TRANSIENT), - E(&spaceop.GetNDSpace()), B(&spaceop.GetRTSpace()), V(std::nullopt), A(std::nullopt), - lumped_port_init(false), wave_port_init(false), - paraview(CreateParaviewPath(iodata, name), spaceop.GetNDSpace().GetParMesh()), - paraview_bdr(CreateParaviewPath(iodata, name) + "_boundary", - spaceop.GetNDSpace().GetParMesh()), - interp_op(iodata, *spaceop.GetNDSpace().GetParMesh()) +template +PostOperator::PostOperator(const IoData &iodata, fem_op_t &fem_op_) + : fem_op(&fem_op_), units(iodata.units), post_dir(iodata.problem.output), + post_op_csv(iodata, fem_op_), + // dom_post_op does not have a default ctor so specialize via immediate lambda. 
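+    // [Editor's note] This is the immediately-invoked lambda idiom: the lambda selects
+    // the appropriate DomainPostOperator constructor at compile time via if constexpr and
+    // its return value directly initializes the member, e.g. (schematically)
+    //   member([&]() { if constexpr (c) return T(a); else return T(a, b); }()).
+    // The surrounding std::move is not strictly needed since the call already yields a
+    // temporary, but it is harmless.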
+ dom_post_op(std::move( + [&iodata, &fem_op_]() + { + if constexpr (solver_t == ProblemType::ELECTROSTATIC) + { + return DomainPostOperator(iodata, fem_op_.GetMaterialOp(), + fem_op_.GetH1Space()); + } + else if constexpr (solver_t == ProblemType::MAGNETOSTATIC) + { + return DomainPostOperator(iodata, fem_op_.GetMaterialOp(), + fem_op_.GetNDSpace()); + } + else + { + return DomainPostOperator(iodata, fem_op_.GetMaterialOp(), fem_op_.GetNDSpace(), + fem_op_.GetRTSpace()); + } + }())), + surf_post_op(iodata, fem_op->GetMaterialOp(), fem_op->GetH1Space(), + fem_op->GetNDSpace()), + interp_op(iodata, fem_op->GetNDSpace()) { - Esr = std::make_unique(E->real(), mat_op); - Bsr = std::make_unique(B->real(), mat_op); - Jsr = std::make_unique(B->real(), mat_op); - Qsr = std::make_unique(E->real(), mat_op); - if (has_imaginary) - { - Esi = std::make_unique(E->imag(), mat_op); - Bsi = std::make_unique(B->imag(), mat_op); - Jsi = std::make_unique(B->imag(), mat_op); - Qsi = std::make_unique(E->imag(), mat_op); - Ue = std::make_unique>(*E, - mat_op); - Um = std::make_unique>(*B, - mat_op); + // Define primary grid-functions. + if constexpr (HasVGridFunction()) + { + V = std::make_unique(fem_op->GetH1Space()); } - else + if constexpr (HasAGridFunction()) + { + A = std::make_unique(fem_op->GetNDSpace()); + } + if constexpr (HasEGridFunction()) + { + E = std::make_unique(fem_op->GetNDSpace(), + HasComplexGridFunction()); + } + if constexpr (HasBGridFunction()) + { + B = std::make_unique(fem_op->GetRTSpace(), + HasComplexGridFunction()); + } + + // Add wave port boundary mode postprocessing, if available. + if constexpr (std::is_same_v, SpaceOperator>) { - Ue = std::make_unique< - EnergyDensityCoefficient>( - E->real(), mat_op); - Um = std::make_unique< - EnergyDensityCoefficient>( - B->real(), mat_op); + // Add scaling factor to output dimensional wave port electric fields. + const double scaling = units.Dimensionalize(1.0); + for (const auto &[idx, data] : fem_op->GetWavePortOp()) + { + auto ret = port_E0.emplace(idx, WavePortFieldData()); + ret.first->second.E0r = data.GetModeFieldCoefficientReal(scaling); + ret.first->second.E0i = data.GetModeFieldCoefficientImag(scaling); + } } - // Initialize data collection objects and register additional fields associated with wave - // ports (only constructed in SpaceOperator). 
- InitializeDataCollection(iodata); - for (const auto &[idx, data] : spaceop.GetWavePortOp()) + // Prepare for saving fields + enable_paraview_output = iodata.problem.output_formats.paraview; + enable_gridfunction_output = iodata.problem.output_formats.gridfunction; + if (solver_t == ProblemType::DRIVEN) { - paraview_bdr.RegisterVCoeffField( - "nxH^0_" + std::to_string(idx) + "_real", - const_cast(&data.GetModeCoefficientReal())); - paraview_bdr.RegisterVCoeffField( - "nxH^0_" + std::to_string(idx) + "_imag", - const_cast(&data.GetModeCoefficientImag())); + output_save_indices = iodata.solver.driven.save_indices; } + else if (solver_t == ProblemType::EIGENMODE) + { + output_n_post = iodata.solver.eigenmode.n_post; + } + else if (solver_t == ProblemType::ELECTROSTATIC) + { + output_n_post = iodata.solver.electrostatic.n_post; + } + else if (solver_t == ProblemType::MAGNETOSTATIC) + { + output_n_post = iodata.solver.magnetostatic.n_post; + } + else if (solver_t == ProblemType::TRANSIENT) + { + output_delta_post = iodata.solver.transient.delta_post; + } + + gridfunction_output_dir = + (post_dir / "gridfunction" / OutputFolderName(solver_t)).string(); + + SetupFieldCoefficients(); + InitializeParaviewDataCollection(); + + // Initialize CSV files for measurements. + post_op_csv.InitializeCSVDataCollection(*this); } -PostOperator::PostOperator(const IoData &iodata, LaplaceOperator &laplaceop, - const std::string &name) - : mat_op(laplaceop.GetMaterialOp()), - surf_post_op(iodata, laplaceop.GetMaterialOp(), laplaceop.GetH1Space()), - dom_post_op(iodata, laplaceop.GetMaterialOp(), &laplaceop.GetNDSpace(), nullptr), - has_imaginary(false), E(&laplaceop.GetNDSpace()), B(std::nullopt), - V(&laplaceop.GetH1Space()), A(std::nullopt), lumped_port_init(false), - wave_port_init(false), - paraview(CreateParaviewPath(iodata, name), laplaceop.GetNDSpace().GetParMesh()), - paraview_bdr(CreateParaviewPath(iodata, name) + "_boundary", - laplaceop.GetNDSpace().GetParMesh()), - interp_op(iodata, *laplaceop.GetNDSpace().GetParMesh()) +template +template +auto PostOperator::InitializeParaviewDataCollection(int ex_idx) + -> std::enable_if_t { - // Note: When using this constructor, you should not use any of the magnetic field related - // postprocessing functions (magnetic field energy, inductor energy, surface currents, - // etc.), since only V and E fields are supplied. - Esr = std::make_unique(E->real(), mat_op); - Vs = std::make_unique(*V, mat_op); - Ue = std::make_unique< - EnergyDensityCoefficient>( - E->real(), mat_op); - Qsr = std::make_unique(E->real(), mat_op); - - // Initialize data collection objects. 
- InitializeDataCollection(iodata); + fs::path sub_folder_name = ""; + auto nr_excitations = fem_op->GetPortExcitations().Size(); + if ((nr_excitations > 1) && (ex_idx > 0)) + { + int spacing = 1 + int(std::log10(nr_excitations)); + sub_folder_name = fmt::format(FMT_STRING("excitation_{:0>{}}"), ex_idx, spacing); + } + InitializeParaviewDataCollection(sub_folder_name); +} + +bool createDirectory(const std::string &path) +{ + try + { + // Create directory (including parent directories if needed) + // Use fs::create_directory for single-level, or fs::create_directories for nested + if (fs::create_directories(path)) + { + // std::cout << "Directory created: " << path << std::endl; + // return true; + } + // else + //{ + // std::cout << "Directory already exists: " << path << std::endl; + // return true; + // } + } + catch (const fs::filesystem_error &e) + { + std::cerr << "Error creating directory: " << e.what() << std::endl; + return false; + } + + return true; } -PostOperator::PostOperator(const IoData &iodata, CurlCurlOperator &curlcurlop, - const std::string &name) - : mat_op(curlcurlop.GetMaterialOp()), - surf_post_op(iodata, curlcurlop.GetMaterialOp(), curlcurlop.GetH1Space()), - dom_post_op(iodata, curlcurlop.GetMaterialOp(), nullptr, &curlcurlop.GetRTSpace()), - has_imaginary(false), E(std::nullopt), B(&curlcurlop.GetRTSpace()), V(std::nullopt), - A(&curlcurlop.GetNDSpace()), lumped_port_init(false), wave_port_init(false), - paraview(CreateParaviewPath(iodata, name), curlcurlop.GetNDSpace().GetParMesh()), - paraview_bdr(CreateParaviewPath(iodata, name) + "_boundary", - curlcurlop.GetNDSpace().GetParMesh()), - interp_op(iodata, *curlcurlop.GetNDSpace().GetParMesh()) +template +void PostOperator::SetupFieldCoefficients() { - // Note: When using this constructor, you should not use any of the electric field related - // postprocessing functions (electric field energy, capacitor energy, surface charge, - // etc.), since only the B field is supplied. - Bsr = std::make_unique(B->real(), mat_op); - As = std::make_unique(*A, mat_op); - Um = std::make_unique< - EnergyDensityCoefficient>( - B->real(), mat_op); - Jsr = std::make_unique(B->real(), mat_op); - - // Initialize data collection objects. - InitializeDataCollection(iodata); + // We currently don't use the dependent grid functions apart from saving fields, so only + // initialize if needed. + if (!ShouldWriteFields()) + { + return; + } + + // Set-up grid-functions for the paraview output / measurement. + if constexpr (HasVGridFunction()) + { + V_s = std::make_unique(V->Real()); + } + + if constexpr (HasAGridFunction()) + { + A_s = std::make_unique(A->Real()); + } + + if constexpr (HasEGridFunction()) + { + // If E is dimensionalized when the coefficients are evaluated, the scaling only needs + // to account for the remaining ε_0 = D / E. This assumes ProjectCoefficient(), + // ProjectBdrCoefficient(), or paraview->Save() for U_e, Q_sr, and Q_si are always + // called after the E GridFunction has been dimensionalized. To output nondimensional + // coefficients, omit the scaling argument and make sure E is nondimensional when the + // coefficients are evaluated. + const double scaling = units.Dimensionalize(1.0) / + units.Dimensionalize(1.0); + + // Electric Energy Density. + // U_e = 1/2 Dᴴ E = 1/2 ε_0 Eᴴ E. + U_e = std::make_unique>( + *E, fem_op->GetMaterialOp(), scaling); + + // Electric Boundary Field & Surface Charge. + E_sr = std::make_unique(E->Real()); + // Q_s = D ⋅ n = ε_0 E ⋅ n. 
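+    // [Editor's note] As a numerical sanity check of the scaling described above: with E
+    // dimensionalized to V/m, the leftover factor is eps_0 ~= 8.854e-12 F/m, so a normal
+    // field of E . n = 1e6 V/m in vacuum yields Q_s ~= 8.854e-6 C/m^2.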
+ Q_sr = std::make_unique>( + &E->Real(), nullptr, fem_op->GetMaterialOp(), true, mfem::Vector(), scaling); + + if constexpr (HasComplexGridFunction()) + { + E_si = std::make_unique(E->Imag()); + Q_si = std::make_unique>( + &E->Imag(), nullptr, fem_op->GetMaterialOp(), true, mfem::Vector(), scaling); + } + } + + if constexpr (HasBGridFunction()) + { + // If B is dimensionalized when the coefficients are evaluated, the scaling only needs + // to account for the remaining μ⁻¹ = H / B. This assumes ProjectCoefficient(), + // ProjectBdrCoefficient(), or paraview->Save() for U_m, J_sr, and J_si are always + // called after the B GridFunction has been dimensionalized. To output nondimensional + // coefficients, omit the scaling argument and make sure B is nondimensional when the + // coefficients are evaluated. + const double scaling = units.Dimensionalize(1.0) / + units.Dimensionalize(1.0); + + // Magnetic Energy Density. + // U_m = 1/2 Hᴴ B = 1/2 μ⁻¹ Bᴴ B. + U_m = std::make_unique>( + *B, fem_op->GetMaterialOp(), scaling); + + // Magnetic Boundary Field & Surface Current. + B_sr = std::make_unique(B->Real()); + // J_s = n x H = n x μ⁻¹ B. + J_sr = std::make_unique( + B->Real(), fem_op->GetMaterialOp(), scaling); + + if constexpr (HasComplexGridFunction()) + { + B_si = std::make_unique(B->Imag()); + J_si = std::make_unique( + B->Imag(), fem_op->GetMaterialOp(), scaling); + } + } + + if constexpr (HasEGridFunction() && HasBGridFunction()) + { + // Poynting Vector. + // S = Re{E x H⋆} = Re{E x μ⁻¹B⋆}. + // E and B will be dimensionalized when the coefficient is evaluated, so the scaling + // only needs to account for the remaining μ⁻¹ = H / B. As mentioned above, + // ProjectCoefficient(*S.get()) or paraview->Save() should always be called after + // E and B have been dimensionalized. + const double scaling = units.Dimensionalize(1.0) / + units.Dimensionalize(1.0); + S = std::make_unique(*E, *B, fem_op->GetMaterialOp(), + scaling); + } } -void PostOperator::InitializeDataCollection(const IoData &iodata) +template +void PostOperator::InitializeParaviewDataCollection( + const fs::path &sub_folder_name) { - // Set up postprocessing for output to disk. Results are stored in a directory at - // `iodata.problem.output/paraview`. + if (!ShouldWriteParaviewFields()) + { + return; + } + fs::path paraview_dir_v = post_dir / "paraview" / OutputFolderName(solver_t); + fs::path paraview_dir_b = + post_dir / "paraview" / fmt::format("{}_boundary", OutputFolderName(solver_t)); + if (!sub_folder_name.empty()) + { + paraview_dir_v /= sub_folder_name; + paraview_dir_b /= sub_folder_name; + } + // Set up postprocessing for output to disk. + paraview = {paraview_dir_v.string(), &fem_op->GetNDSpace().GetParMesh()}; + paraview_bdr = {paraview_dir_b.string(), &fem_op->GetNDSpace().GetParMesh()}; + bool bOk1 = createDirectory(paraview_dir_v.string()); + bool bOk2 = createDirectory(paraview_dir_b.string()); + if (!bOk1 || !bOk2) { assert(0); return; } + + + const mfem::VTKFormat format = mfem::VTKFormat::BINARY32; #if defined(MFEM_USE_ZLIB) const int compress = -1; // Default compression level @@ -156,23 +309,23 @@ void PostOperator::InitializeDataCollection(const IoData &iodata) const int compress = 0; #endif const bool use_ho = true; - const int refine_ho = - (E) ? E->ParFESpace()->GetMaxElementOrder() : B->ParFESpace()->GetMaxElementOrder(); - mesh_Lc0 = iodata.GetLengthScale(); + const int refine_ho = HasEGridFunction() + ? 
E->ParFESpace()->GetMaxElementOrder() + : B->ParFESpace()->GetMaxElementOrder(); // Output mesh coordinate units same as input. - paraview.SetCycle(-1); - paraview.SetDataFormat(format); - paraview.SetCompressionLevel(compress); - paraview.SetHighOrderOutput(use_ho); - paraview.SetLevelsOfDetail(refine_ho); - - paraview_bdr.SetBoundaryOutput(true); - paraview_bdr.SetCycle(-1); - paraview_bdr.SetDataFormat(format); - paraview_bdr.SetCompressionLevel(compress); - paraview_bdr.SetHighOrderOutput(use_ho); - paraview_bdr.SetLevelsOfDetail(refine_ho); + paraview->SetCycle(-1); + paraview->SetDataFormat(format); + paraview->SetCompressionLevel(compress); + paraview->SetHighOrderOutput(use_ho); + paraview->SetLevelsOfDetail(refine_ho); + + paraview_bdr->SetBoundaryOutput(true); + paraview_bdr->SetCycle(-1); + paraview_bdr->SetDataFormat(format); + paraview_bdr->SetCompressionLevel(compress); + paraview_bdr->SetHighOrderOutput(use_ho); + paraview_bdr->SetLevelsOfDetail(refine_ho); // Output fields @ phase = 0 and π/2 for frequency domain (rather than, for example, // peak phasors or magnitude = sqrt(2) * RMS). Also output fields evaluated on mesh @@ -181,515 +334,1122 @@ void PostOperator::InitializeDataCollection(const IoData &iodata) // permeability. if (E) { - if (has_imaginary) + if (HasComplexGridFunction()) { - paraview.RegisterField("E_real", &E->real()); - paraview.RegisterField("E_imag", &E->imag()); - paraview_bdr.RegisterVCoeffField("E_real", Esr.get()); - paraview_bdr.RegisterVCoeffField("E_imag", Esi.get()); + paraview->RegisterField("E_real", &E->Real()); + paraview->RegisterField("E_imag", &E->Imag()); + paraview_bdr->RegisterVCoeffField("E_real", E_sr.get()); + paraview_bdr->RegisterVCoeffField("E_imag", E_si.get()); } else { - paraview.RegisterField("E", &E->real()); - paraview_bdr.RegisterVCoeffField("E", Esr.get()); + paraview->RegisterField("E", &E->Real()); + paraview_bdr->RegisterVCoeffField("E", E_sr.get()); } } if (B) { - if (has_imaginary) + if (HasComplexGridFunction()) { - paraview.RegisterField("B_real", &B->real()); - paraview.RegisterField("B_imag", &B->imag()); - paraview_bdr.RegisterVCoeffField("B_real", Bsr.get()); - paraview_bdr.RegisterVCoeffField("B_imag", Bsi.get()); + paraview->RegisterField("B_real", &B->Real()); + paraview->RegisterField("B_imag", &B->Imag()); + paraview_bdr->RegisterVCoeffField("B_real", B_sr.get()); + paraview_bdr->RegisterVCoeffField("B_imag", B_si.get()); } else { - paraview.RegisterField("B", &B->real()); - paraview_bdr.RegisterVCoeffField("B", Bsr.get()); + paraview->RegisterField("B", &B->Real()); + paraview_bdr->RegisterVCoeffField("B", B_sr.get()); } } if (V) { - paraview.RegisterField("V", &*V); - paraview_bdr.RegisterCoeffField("V", Vs.get()); + paraview->RegisterField("V", &V->Real()); + paraview_bdr->RegisterCoeffField("V", V_s.get()); } if (A) { - paraview.RegisterField("A", &*A); - paraview_bdr.RegisterVCoeffField("A", As.get()); + paraview->RegisterField("A", &A->Real()); + paraview_bdr->RegisterVCoeffField("A", A_s.get()); + } + + // Extract energy density field for electric field energy 1/2 Dᴴ E or magnetic field + // energy 1/2 Hᴴ B. Also Poynting vector S = E x H⋆. 
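+  // [Editor's note] As an illustrative check of the registered energy densities: in
+  // vacuum, |E| = 1 V/m gives U_e = eps_0/2 ~= 4.43e-12 J/m^3, and |B| = 1 T gives
+  // U_m = 1/(2 mu_0) ~= 3.98e5 J/m^3.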
+ if (U_e) + { + paraview->RegisterCoeffField("U_e", U_e.get()); + paraview_bdr->RegisterCoeffField("U_e", U_e.get()); + } + if (U_m) + { + paraview->RegisterCoeffField("U_m", U_m.get()); + paraview_bdr->RegisterCoeffField("U_m", U_m.get()); + } + if (S) + { + paraview->RegisterVCoeffField("S", S.get()); + paraview_bdr->RegisterVCoeffField("S", S.get()); } // Extract surface charge from normally discontinuous ND E-field. Also extract surface // currents from tangentially discontinuous RT B-field The surface charge and surface // currents are single-valued at internal boundaries. - if (Qsr) + if (Q_sr) { - if (has_imaginary) + if (HasComplexGridFunction()) { - paraview_bdr.RegisterCoeffField("Qs_real", Qsr.get()); - paraview_bdr.RegisterCoeffField("Qs_imag", Qsi.get()); + paraview_bdr->RegisterCoeffField("Q_s_real", Q_sr.get()); + paraview_bdr->RegisterCoeffField("Q_s_imag", Q_si.get()); } else { - paraview_bdr.RegisterCoeffField("Qs", Qsr.get()); + paraview_bdr->RegisterCoeffField("Q_s", Q_sr.get()); } } - if (Jsr) + if (J_sr) { - if (has_imaginary) + if (HasComplexGridFunction()) { - paraview_bdr.RegisterVCoeffField("Js_real", Jsr.get()); - paraview_bdr.RegisterVCoeffField("Js_imag", Jsi.get()); + paraview_bdr->RegisterVCoeffField("J_s_real", J_sr.get()); + paraview_bdr->RegisterVCoeffField("J_s_imag", J_si.get()); } else { - paraview_bdr.RegisterVCoeffField("Js", Jsr.get()); + paraview_bdr->RegisterVCoeffField("J_s", J_sr.get()); } } - // Extract energy density field for electric field energy 1/2 Dᴴ E or magnetic field - // energy 1/2 Bᴴ H. - if (Ue) - { - paraview.RegisterCoeffField("Ue", Ue.get()); - paraview_bdr.RegisterCoeffField("Ue", Ue.get()); - } - if (Um) + // Add wave port boundary mode postprocessing when available. + for (const auto &[idx, data] : port_E0) { - paraview.RegisterCoeffField("Um", Um.get()); - paraview_bdr.RegisterCoeffField("Um", Um.get()); + paraview_bdr->RegisterVCoeffField(fmt::format("E0_{}_real", idx), data.E0r.get()); + paraview_bdr->RegisterVCoeffField(fmt::format("E0_{}_imag", idx), data.E0i.get()); } } -void PostOperator::SetEGridFunction(const ComplexVector &e) +void ScaleGridFunctions(double L, int dim, std::unique_ptr &E, + std::unique_ptr &B, std::unique_ptr &V, + std::unique_ptr &A) { - MFEM_VERIFY( - has_imaginary, - "SetEGridFunction for complex-valued output called when has_imaginary == false!"); - MFEM_VERIFY(E, "Incorrect usage of PostOperator::SetEGridFunction!"); - E->real().SetFromTrueDofs(e.Real()); // Parallel distribute - E->imag().SetFromTrueDofs(e.Imag()); - E->real().ExchangeFaceNbrData(); // Ready for parallel comm on shared faces - E->imag().ExchangeFaceNbrData(); - lumped_port_init = wave_port_init = false; + // For fields on H(curl) and H(div) spaces, we "undo" the effect of redimensionalizing + // the mesh which would carry into the fields during the mapping from reference to + // physical space through the element Jacobians. No transformation for V is needed (H1 + // interpolation). Because the coefficients are always evaluating E, B in neighboring + // elements, the Jacobian scaling is the same for the domain and boundary data + // collections (instead of being different for B due to the dim - 1 evaluation). Wave + // port fields also do not require rescaling since their submesh object where they are + // evaluated remains nondimensionalized. 
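+  // [Editor's note] Concretely, for a 3D mesh scaled by a length factor L, the H(curl)
+  // values below pick up a factor L (from J^-T) while the H(div) values pick up
+  // L^(dim-1) = L^2 (from J / |J|); e.g. L = 10 multiplies E by 10 and B by 100.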
+ if (E) + { + // Piola transform: J^-T + E->Real() *= L; + E->Real().FaceNbrData() *= L; + if (E->HasImag()) + { + E->Imag() *= L; + E->Imag().FaceNbrData() *= L; + } + } + if (B) + { + // Piola transform: J / |J| + const auto Ld = std::pow(L, dim - 1); + B->Real() *= Ld; + B->Real().FaceNbrData() *= Ld; + if (B->HasImag()) + { + B->Imag() *= Ld; + B->Imag().FaceNbrData() *= Ld; + } + } + if (A) + { + // Piola transform: J^-T + A->Real() *= L; + A->Real().FaceNbrData() *= L; + } } -void PostOperator::SetBGridFunction(const ComplexVector &b) +void DimensionalizeGridFunctions(Units &units, std::unique_ptr &E, + std::unique_ptr &B, + std::unique_ptr &V, + std::unique_ptr &A) { - MFEM_VERIFY( - has_imaginary, - "SetBGridFunction for complex-valued output called when has_imaginary == false!"); - MFEM_VERIFY(B, "Incorrect usage of PostOperator::SetBGridFunction!"); - B->real().SetFromTrueDofs(b.Real()); // Parallel distribute - B->imag().SetFromTrueDofs(b.Imag()); - B->real().ExchangeFaceNbrData(); // Ready for parallel comm on shared faces - B->imag().ExchangeFaceNbrData(); - lumped_port_init = wave_port_init = false; + if (E) + { + units.DimensionalizeInPlace(*E); + } + if (B) + { + units.DimensionalizeInPlace(*B); + } + if (A) + { + units.DimensionalizeInPlace(A->Real()); + } + if (V) + { + units.DimensionalizeInPlace(V->Real()); + } } -void PostOperator::SetEGridFunction(const Vector &e) +void NondimensionalizeGridFunctions(Units &units, std::unique_ptr &E, + std::unique_ptr &B, + std::unique_ptr &V, + std::unique_ptr &A) { - MFEM_VERIFY(!has_imaginary, - "SetEGridFunction for real-valued output called when has_imaginary == true!"); - MFEM_VERIFY(E, "Incorrect usage of PostOperator::SetEGridFunction!"); - E->real().SetFromTrueDofs(e); - E->real().ExchangeFaceNbrData(); - lumped_port_init = wave_port_init = false; + if (E) + { + units.NondimensionalizeInPlace(*E); + } + if (B) + { + units.NondimensionalizeInPlace(*B); + } + if (A) + { + units.NondimensionalizeInPlace(A->Real()); + } + if (V) + { + units.NondimensionalizeInPlace(V->Real()); + } } -void PostOperator::SetBGridFunction(const Vector &b) +template +void PostOperator::WriteParaviewFields(double time, int step) { - MFEM_VERIFY(!has_imaginary, - "SetBGridFunction for real-valued output called when has_imaginary == true!"); - MFEM_VERIFY(B, "Incorrect usage of PostOperator::SetBGridFunction!"); - B->real().SetFromTrueDofs(b); - B->real().ExchangeFaceNbrData(); - lumped_port_init = wave_port_init = false; -} + BlockTimer bt(Timer::POSTPRO_PARAVIEW); -void PostOperator::SetVGridFunction(const Vector &v) -{ - MFEM_VERIFY(!has_imaginary, - "SetVGridFunction for real-valued output called when has_imaginary == true!"); - MFEM_VERIFY(V, "Incorrect usage of PostOperator::SetVGridFunction!"); - V->SetFromTrueDofs(v); - V->ExchangeFaceNbrData(); + auto mesh_Lc0 = units.GetMeshLengthRelativeScale(); + + // Given the electric field and magnetic flux density, write the fields to disk for + // visualization. Write the mesh coordinates in the same units as originally input. + mfem::ParMesh &mesh = E ? 
*E->ParFESpace()->GetParMesh() : *B->ParFESpace()->GetParMesh(); + mesh::DimensionalizeMesh(mesh, mesh_Lc0); + ScaleGridFunctions(mesh_Lc0, mesh.Dimension(), E, B, V, A); + DimensionalizeGridFunctions(units, E, B, V, A); + paraview->SetCycle(step); + paraview->SetTime(time); + paraview_bdr->SetCycle(step); + paraview_bdr->SetTime(time); + paraview->Save(); + paraview_bdr->Save(); + mesh::NondimensionalizeMesh(mesh, mesh_Lc0); + ScaleGridFunctions(1.0 / mesh_Lc0, mesh.Dimension(), E, B, V, A); + NondimensionalizeGridFunctions(units, E, B, V, A); + Mpi::Barrier(fem_op->GetComm()); } -void PostOperator::SetAGridFunction(const Vector &a) +template +void PostOperator::WriteParaviewFieldsFinal(const ErrorIndicator *indicator) { - MFEM_VERIFY(!has_imaginary, - "SetAGridFunction for real-valued output called when has_imaginary == true!"); - MFEM_VERIFY(A, "Incorrect usage of PostOperator::SetAGridFunction!"); - A->SetFromTrueDofs(a); - A->ExchangeFaceNbrData(); + BlockTimer bt(Timer::POSTPRO_PARAVIEW); + + auto mesh_Lc0 = units.GetMeshLengthRelativeScale(); + + // Write the mesh partitioning and (optionally) error indicators at the final step. No + // need for these to be parallel objects, since the data is local to each process and + // there isn't a need to ever access the element neighbors. We set the time to some + // non-used value to make the step identifiable within the data collection. + mfem::ParMesh &mesh = E ? *E->ParFESpace()->GetParMesh() : *B->ParFESpace()->GetParMesh(); + mesh::DimensionalizeMesh(mesh, mesh_Lc0); + paraview->SetCycle(paraview->GetCycle() + 1); + if (paraview->GetTime() < 1.0) + { + paraview->SetTime(99.0); + } + else + { + // 1 -> 99, 10 -> 999, etc. + paraview->SetTime( + std::pow(10.0, 2.0 + static_cast(std::log10(paraview->GetTime()))) - 1.0); + } + auto field_map = paraview->GetFieldMap(); // Copy, so can reregister later + for (const auto &[name, gf] : field_map) + { + paraview->DeregisterField(name); + } + auto coeff_field_map = paraview->GetCoeffFieldMap(); + for (const auto &[name, gf] : coeff_field_map) + { + paraview->DeregisterCoeffField(name); + } + auto vcoeff_field_map = paraview->GetVCoeffFieldMap(); + for (const auto &[name, gf] : vcoeff_field_map) + { + paraview->DeregisterVCoeffField(name); + } + mfem::L2_FECollection pwconst_fec(0, mesh.Dimension()); + mfem::FiniteElementSpace pwconst_fespace(&mesh, &pwconst_fec); + std::unique_ptr rank, eta; + { + rank = std::make_unique(&pwconst_fespace); + *rank = mesh.GetMyRank() + 1; + paraview->RegisterField("Rank", rank.get()); + } + if (indicator) + { + eta = std::make_unique(&pwconst_fespace); + MFEM_VERIFY(eta->Size() == indicator->Local().Size(), + "Size mismatch for provided ErrorIndicator for postprocessing!"); + *eta = indicator->Local(); + paraview->RegisterField("Indicator", eta.get()); + } + paraview->Save(); + if (rank) + { + paraview->DeregisterField("Rank"); + } + if (eta) + { + paraview->DeregisterField("Indicator"); + } + for (const auto &[name, gf] : field_map) + { + paraview->RegisterField(name, gf); + } + for (const auto &[name, gf] : coeff_field_map) + { + paraview->RegisterCoeffField(name, gf); + } + for (const auto &[name, gf] : vcoeff_field_map) + { + paraview->RegisterVCoeffField(name, gf); + } + mesh::NondimensionalizeMesh(mesh, mesh_Lc0); + Mpi::Barrier(fem_op->GetComm()); } -void PostOperator::UpdatePorts(const LumpedPortOperator &lumped_port_op, double omega) +template +void PostOperator::WriteMFEMGridFunctions(double time, int step) { - MFEM_VERIFY(E && B, "Incorrect usage of 
PostOperator::UpdatePorts!"); - if (lumped_port_init) + BlockTimer bt(Timer::POSTPRO_GRIDFUNCTION); + + // Create output directory if it doesn't exist. + if (Mpi::Root(fem_op->GetComm())) { - return; + fs::create_directories(gridfunction_output_dir); + } + + auto mesh_Lc0 = units.GetMeshLengthRelativeScale(); + + // Given the electric field and magnetic flux density, write the fields to disk for + // visualization. Write the mesh coordinates in the same units as originally input. + mfem::ParMesh &mesh = E ? *E->ParFESpace()->GetParMesh() : *B->ParFESpace()->GetParMesh(); + mesh::DimensionalizeMesh(mesh, mesh_Lc0); + ScaleGridFunctions(mesh_Lc0, mesh.Dimension(), E, B, V, A); + DimensionalizeGridFunctions(units, E, B, V, A); + // Create grid function for vector coefficients. + mfem::ParFiniteElementSpace &fespace = E ? *E->ParFESpace() : *B->ParFESpace(); + mfem::ParGridFunction gridfunc_vector(&fespace); + + // Create grid function for scalar coefficients. + mfem::H1_FECollection h1_fec(fespace.GetMaxElementOrder(), mesh.Dimension()); + mfem::ParFiniteElementSpace h1_fespace(&mesh, &h1_fec); + mfem::ParGridFunction gridfunc_scalar(&h1_fespace); + + const int local_rank = mesh.GetMyRank(); + + auto write_grid_function = [&](const auto &gridfunc, const std::string &name) + { + auto path = fs::path(gridfunction_output_dir) / + fmt::format("{}_{:0{}d}.gf.{:0{}d}", name, step, pad_digits_default, + local_rank, pad_digits_default); + std::ofstream file(path); + gridfunc.Save(file); + }; + + // Write grid functions using MFEM's built-in Save method. + // Use 6-digit padding to match MFEM's pad_digits_default. + if constexpr (HasEGridFunction()) + { + if (E) + { + if constexpr (HasComplexGridFunction()) + { + // Write real and imaginary parts separately. + write_grid_function(E->Real(), "E_real"); + write_grid_function(E->Imag(), "E_imag"); + } + else + { + // Write real part only. + write_grid_function(E->Real(), "E"); + } + } } - for (const auto &[idx, data] : lumped_port_op) + + if constexpr (HasBGridFunction()) { - auto &vi = lumped_port_vi[idx]; - if (has_imaginary) + if (B) { - MFEM_VERIFY( - omega > 0.0, - "Frequency domain lumped port postprocessing requires nonzero frequency!"); - vi.S = data.GetSParameter(*E); - vi.P = data.GetPower(*E, *B, mat_op); - vi.V = data.GetVoltage(*E); - vi.Z = data.GetCharacteristicImpedance(omega); + if constexpr (HasComplexGridFunction()) + { + // Write real and imaginary parts separately. + write_grid_function(B->Real(), "B_real"); + write_grid_function(B->Imag(), "B_imag"); + } + else + { + // Write real part only. 
+ write_grid_function(B->Real(), "B"); + } } - else + } + + if constexpr (HasVGridFunction()) + { + if (V) { - vi.P = data.GetPower(E->real(), B->real(), mat_op); - vi.V = data.GetVoltage(E->real()); - vi.S = vi.Z = 0.0; + write_grid_function(V->Real(), "V"); } } - lumped_port_init = true; -} -void PostOperator::UpdatePorts(const WavePortOperator &wave_port_op, double omega) -{ - MFEM_VERIFY(has_imaginary && E && B, "Incorrect usage of PostOperator::UpdatePorts!"); - if (wave_port_init) + if constexpr (HasAGridFunction()) { - return; + if (A) + { + write_grid_function(A->Real(), "A"); + } + } + + if (U_e) + { + gridfunc_scalar = 0.0; + gridfunc_scalar.ProjectCoefficient(*U_e.get()); + write_grid_function(gridfunc_scalar, "U_e"); + } + + if (U_m) + { + gridfunc_scalar = 0.0; + gridfunc_scalar.ProjectCoefficient(*U_m.get()); + write_grid_function(gridfunc_scalar, "U_m"); } - for (const auto &[idx, data] : wave_port_op) + + if (S) { - MFEM_VERIFY(omega > 0.0, - "Frequency domain wave port postprocessing requires nonzero frequency!"); - auto &vi = wave_port_vi[idx]; - vi.S = data.GetSParameter(*E); - vi.P = data.GetPower(*E, *B, mat_op); - vi.V = vi.Z = 0.0; // Not yet implemented (Z = V² / P, I = V / Z) + gridfunc_vector = 0.0; + gridfunc_vector.ProjectCoefficient(*S.get()); + write_grid_function(gridfunc_vector, "S"); } - wave_port_init = true; + + mesh::NondimensionalizeMesh(mesh, mesh_Lc0); + ScaleGridFunctions(1.0 / mesh_Lc0, mesh.Dimension(), E, B, V, A); + NondimensionalizeGridFunctions(units, E, B, V, A); + Mpi::Barrier(fem_op->GetComm()); } -double PostOperator::GetEFieldEnergy() const +template +void PostOperator::WriteMFEMGridFunctionsFinal(const ErrorIndicator *indicator) { - // We use a leading factor of 1/2 instead of 1/4 even though the eigenmodes are peak - // phasors and not RMS normalized because the same peak phasors are used to compute the - // voltages/currents which are 2x the time-averaged values. This correctly yields an EPR - // of 1 in cases where expected. - MFEM_VERIFY(E, "PostOperator is not configured for electric field energy calculation!"); - return has_imaginary ? dom_post_op.GetElectricFieldEnergy(*E) - : dom_post_op.GetElectricFieldEnergy(E->real()); + BlockTimer bt(Timer::POSTPRO_GRIDFUNCTION); + + auto mesh_Lc0 = units.GetMeshLengthRelativeScale(); + + // Write the mesh partitioning and (optionally) error indicators at the final step. + // Write the mesh coordinates in the same units as originally input. + mfem::ParMesh &mesh = E ? *E->ParFESpace()->GetParMesh() : *B->ParFESpace()->GetParMesh(); + mesh::DimensionalizeMesh(mesh, mesh_Lc0); + + // Create output directory if it doesn't exist. + if (Mpi::Root(fem_op->GetComm())) + { + fs::create_directories(gridfunction_output_dir); + } + + // Create piecewise constant finite element space for rank and error indicator. + mfem::L2_FECollection pwconst_fec(0, mesh.Dimension()); + mfem::FiniteElementSpace pwconst_fespace(&mesh, &pwconst_fec); + + const int local_rank = mesh.GetMyRank(); + + auto write_grid_function = [&](const auto &gridfunc, const std::string &name) + { + auto path = fs::path(gridfunction_output_dir) / + fmt::format("{}.gf.{:0{}d}", name, local_rank, pad_digits_default); + std::ofstream file(path); + gridfunc.Save(file); + }; + + // Write mesh partitioning (rank information). + { + mfem::GridFunction rank(&pwconst_fespace); + rank = local_rank + 1; + write_grid_function(rank, "rank"); + } + + // Write error indicator if provided. 
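+  // [Editor's note] The indicator written below is piecewise constant (one value per
+  // element, from the order-0 L2 space above) holding the local error estimate. Together
+  // with the saved "mesh" files, these .gf files can be read back through the standard
+  // mfem::Mesh / mfem::GridFunction file constructors or visualized with GLVis.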
+ if (indicator) + { + mfem::GridFunction eta(&pwconst_fespace); + MFEM_VERIFY(eta.Size() == indicator->Local().Size(), + "Size mismatch for provided ErrorIndicator for postprocessing!"); + eta = indicator->Local(); + write_grid_function(eta, "indicator"); + } + + // Save ParMesh files; necessary to visualize grid functions. + fs::path mesh_filename = fs::path(gridfunction_output_dir) / "mesh"; + mesh.Save(mesh_filename.string()); + + mesh::NondimensionalizeMesh(mesh, mesh_Lc0); + Mpi::Barrier(fem_op->GetComm()); } -double PostOperator::GetHFieldEnergy() const +// Measurements. + +template +void PostOperator::MeasureDomainFieldEnergy() const { - // We use a leading factor of 1/2 instead of 1/4 even though the eigenmodes are peak - // phasors and not RMS normalized because the same peak phasors are used to compute the - // voltages/currents which are 2x the time-averaged values. This correctly yields an EPR - // of 1 in cases where expected. - MFEM_VERIFY(B, "PostOperator is not configured for magnetic field energy calculation!"); - return has_imaginary ? dom_post_op.GetMagneticFieldEnergy(*B) - : dom_post_op.GetMagneticFieldEnergy(B->real()); + measurement_cache.domain_E_field_energy_i.clear(); + measurement_cache.domain_H_field_energy_i.clear(); + + measurement_cache.domain_E_field_energy_i.reserve(dom_post_op.M_i.size()); + measurement_cache.domain_H_field_energy_i.reserve(dom_post_op.M_i.size()); + + if constexpr (HasEGridFunction()) + { + // Use V if it has it rather than E. + auto &field = V ? *V : *E; + auto energy = dom_post_op.GetElectricFieldEnergy(field); + measurement_cache.domain_E_field_energy_all = energy; + + for (const auto &[idx, data] : dom_post_op.M_i) + { + auto energy_i = dom_post_op.GetDomainElectricFieldEnergy(idx, field); + auto participation_ratio = std::abs(energy_i) > 0.0 ? energy_i / energy : 0.0; + measurement_cache.domain_E_field_energy_i.emplace_back( + Measurement::DomainData{idx, energy_i, participation_ratio}); + } + } + else + { + // Magnetic field only. + measurement_cache.domain_E_field_energy_all = 0.0; + for (const auto &[idx, data] : dom_post_op.M_i) + { + measurement_cache.domain_E_field_energy_i.emplace_back( + Measurement::DomainData{idx, 0.0, 0.0}); + } + } + + if (HasBGridFunction()) + { + auto &field = A ? *A : *B; + auto energy = dom_post_op.GetMagneticFieldEnergy(field); + measurement_cache.domain_H_field_energy_all = energy; + + for (const auto &[idx, data] : dom_post_op.M_i) + { + auto energy_i = dom_post_op.GetDomainMagneticFieldEnergy(idx, field); + auto participation_ratio = std::abs(energy) > 0.0 ? energy_i / energy : 0.0; + measurement_cache.domain_H_field_energy_i.emplace_back( + Measurement::DomainData{idx, energy_i, participation_ratio}); + } + } + else + { + // Electric field only. + measurement_cache.domain_H_field_energy_all = 0.0; + for (const auto &[idx, data] : dom_post_op.M_i) + { + measurement_cache.domain_H_field_energy_i.emplace_back( + Measurement::DomainData{idx, 0.0, 0.0}); + } + } + + // Log Domain Energy. 
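+  // [Editor's note] With the format strings below this produces log lines such as
+  //   Field energy E (1.234e-09 J) + H (5.678e-10 J) = 1.802e-09 J
+  // (values shown are illustrative only).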
+ const auto domain_E = units.Dimensionalize( + measurement_cache.domain_E_field_energy_all); + const auto domain_H = units.Dimensionalize( + measurement_cache.domain_H_field_energy_all); + if constexpr (HasEGridFunction() && !HasBGridFunction()) + { + Mpi::Print(" Field energy E = {:.3e} J\n", domain_E); + } + else if constexpr (!HasEGridFunction() && HasBGridFunction()) + { + Mpi::Print(" Field energy H = {:.3e} J\n", domain_H); + } + else if constexpr (solver_t != ProblemType::EIGENMODE) + { + Mpi::Print(" Field energy E ({:.3e} J) + H ({:.3e} J) = {:.3e} J\n", domain_E, domain_H, + domain_E + domain_H); + } } -double PostOperator::GetLumpedInductorEnergy(const LumpedPortOperator &lumped_port_op) const +template +void PostOperator::MeasureLumpedPorts() const { - // Add contribution due to all capacitive lumped boundaries in the model: - // E_ind = ∑_j 1/2 L_j I_mj². - double U = 0.0; - for (const auto &[idx, data] : lumped_port_op) + measurement_cache.lumped_port_vi.clear(); + measurement_cache.lumped_port_inductor_energy = 0.0; + measurement_cache.lumped_port_capacitor_energy = 0.0; + + if constexpr (solver_t == ProblemType::EIGENMODE || solver_t == ProblemType::DRIVEN || + solver_t == ProblemType::TRANSIENT) { - if (std::abs(data.GetL()) > 0.0) + for (const auto &[idx, data] : fem_op->GetLumpedPortOp()) { - std::complex Ij = GetPortCurrent(lumped_port_op, idx); - U += 0.5 * std::abs(data.GetL()) * std::real(Ij * std::conj(Ij)); + auto &vi = measurement_cache.lumped_port_vi[idx]; + vi.P = data.GetPower(*E, *B); + vi.V = data.GetVoltage(*E); + if constexpr (solver_t == ProblemType::EIGENMODE || solver_t == ProblemType::DRIVEN) + { + // Compute current from the port impedance, separate contributions for R, L, C + // branches. + // Get value and make real: Matches current behaviour (even for eigensolver!). + MFEM_VERIFY( + measurement_cache.freq.real() > 0.0, + "Frequency domain lumped port postprocessing requires nonzero frequency!"); + vi.I_RLC[0] = + (std::abs(data.R) > 0.0) + ? vi.V / data.GetCharacteristicImpedance(measurement_cache.freq.real(), + LumpedPortData::Branch::R) + : 0.0; + vi.I_RLC[1] = + (std::abs(data.L) > 0.0) + ? vi.V / data.GetCharacteristicImpedance(measurement_cache.freq.real(), + LumpedPortData::Branch::L) + : 0.0; + vi.I_RLC[2] = + (std::abs(data.C) > 0.0) + ? vi.V / data.GetCharacteristicImpedance(measurement_cache.freq.real(), + LumpedPortData::Branch::C) + : 0.0; + vi.I = std::accumulate(vi.I_RLC.begin(), vi.I_RLC.end(), + std::complex{0.0, 0.0}); + vi.S = data.GetSParameter(*E); + + // Add contribution due to all inductive lumped boundaries in the model: + // E_ind = ∑_j 1/2 L_j I_mj². + if (std::abs(data.L) > 0.0) + { + std::complex I_mj = vi.I_RLC[1]; + vi.inductor_energy = 0.5 * std::abs(data.L) * std::real(I_mj * std::conj(I_mj)); + measurement_cache.lumped_port_inductor_energy += vi.inductor_energy; + } + + // Add contribution due to all capacitive lumped boundaries in the model: + // E_cap = ∑_j 1/2 C_j V_mj². + if (std::abs(data.C) > 0.0) + { + std::complex V_mj = vi.V; + vi.capacitor_energy = 0.5 * std::abs(data.C) * std::real(V_mj * std::conj(V_mj)); + measurement_cache.lumped_port_capacitor_energy += vi.capacitor_energy; + } + } + else + { + // Compute current from P = V I^* since there is no frequency & characteristic + // impedance of the lumped element. + vi.I = (std::abs(vi.V) > 0.0) ? 
std::conj(vi.P / vi.V) : 0.0; + } } } - return U; } -double -PostOperator::GetLumpedCapacitorEnergy(const LumpedPortOperator &lumped_port_op) const +template +void PostOperator::MeasureLumpedPortsEig() const { - // Add contribution due to all capacitive lumped boundaries in the model: - // E_cap = ∑_j 1/2 C_j V_mj². - double U = 0.0; - for (const auto &[idx, data] : lumped_port_op) + // Depends on MeasureLumpedPorts. + if constexpr (solver_t == ProblemType::EIGENMODE) { - if (std::abs(data.GetC()) > 0.0) + auto freq_re = measurement_cache.freq.real(); + auto energy_electric_all = measurement_cache.domain_E_field_energy_all + + measurement_cache.lumped_port_capacitor_energy; + for (const auto &[idx, data] : fem_op->GetLumpedPortOp()) { - std::complex Vj = GetPortVoltage(lumped_port_op, idx); - U += 0.5 * std::abs(data.GetC()) * std::real(Vj * std::conj(Vj)); + // Get previously computed data: should never fail as defined by MeasureLumpedPorts. + auto &vi = measurement_cache.lumped_port_vi.at(idx); + + // Resistive Lumped Ports: + // Compute participation ratio of external ports (given as any port boundary with + // nonzero resistance). Currently no reactance of the ports is supported. The κ of + // the port follows from: + // κ_mj = 1/2 R_j I_mj² / E_m + // from which the mode coupling quality factor is computed as: + // Q_mj = ω_m / κ_mj. + if (std::abs(data.R) > 0.0) + { + std::complex I_mj = vi.I_RLC[0]; + // Power = 1/2 R_j I_mj². + // Note conventions: mean(I²) = (I_r² + I_i²) / 2; + auto resistor_power = 0.5 * std::abs(data.R) * std::real(I_mj * std::conj(I_mj)); + vi.mode_port_kappa = + std::copysign(resistor_power / energy_electric_all, I_mj.real()); + vi.quality_factor = (vi.mode_port_kappa == 0.0) + ? mfem::infinity() + : freq_re / std::abs(vi.mode_port_kappa); + } + + // Inductive Lumped Ports: + // Compute energy-participation ratio of junction given by index idx for the field + // mode. We first get the port line voltage, and use lumped port circuit impedance to + // get peak current through the inductor: I_mj = V_mj / Z_mj, Z_mj = i ω_m L_j. E_m + // is the total energy in mode m: E_m = E_elec + E_cap = E_mag + E_ind. The signed EPR + // for a lumped inductive element is computed as: + // p_mj = 1/2 L_j I_mj² / E_m. + // An element with no assigned inductance will be treated as having zero admittance + // and thus zero current. + if (std::abs(data.L) > 0.0) + { + std::complex I_mj = vi.I_RLC[1]; + vi.inductive_energy_participation = + std::copysign(vi.inductor_energy / energy_electric_all, I_mj.real()); + } } } - return U; } -std::complex PostOperator::GetSParameter(const LumpedPortOperator &lumped_port_op, - int idx, int source_idx) const +template +void PostOperator::MeasureWavePorts() const { - MFEM_VERIFY(lumped_port_init, - "Port S-parameters not defined until ports are initialized!"); - const LumpedPortData &data = lumped_port_op.GetPort(idx); - const LumpedPortData &src_data = lumped_port_op.GetPort(source_idx); - const auto it = lumped_port_vi.find(idx); - MFEM_VERIFY(src_data.IsExcited(), - "Lumped port index " << source_idx << " is not marked for excitation!"); - MFEM_VERIFY(it != lumped_port_vi.end(), - "Could not find lumped port when calculating port S-parameters!"); - std::complex Sij = it->second.S; - if (idx == source_idx) - { - Sij.real(Sij.real() - 1.0); - } - // Generalized S-parameters if the ports are resistive (avoids divide-by-zero). 
- if (std::abs(data.GetR()) > 0.0) - { - Sij *= std::sqrt(src_data.GetR() / data.GetR()); - } - return Sij; + measurement_cache.wave_port_vi.clear(); + + if constexpr (solver_t == ProblemType::DRIVEN) + { + for (const auto &[idx, data] : fem_op->GetWavePortOp()) + { + // Get value and make real: Matches current behaviour. + auto freq_re = measurement_cache.freq.real(); // TODO: Fix + MFEM_VERIFY(freq_re > 0.0, + "Frequency domain wave port postprocessing requires nonzero frequency!"); + auto &vi = measurement_cache.wave_port_vi[idx]; + vi.P = data.GetPower(*E, *B); + vi.S = data.GetSParameter(*E); + // vi.V = vi.I[0] = vi.I[1] = vi.I[2] = 0.0; // Not yet implemented + // // (Z = V² / P, I = V / Z) + } + } } -std::complex PostOperator::GetSParameter(const WavePortOperator &wave_port_op, - int idx, int source_idx) const +template +void PostOperator::MeasureSParameter() const { - // Wave port modes are not normalized to a characteristic impedance so no generalized - // S-parameters are available. - MFEM_VERIFY(wave_port_init, "Port S-parameters not defined until ports are initialized!"); - const WavePortData &data = wave_port_op.GetPort(idx); - const WavePortData &src_data = wave_port_op.GetPort(source_idx); - const auto it = wave_port_vi.find(idx); - MFEM_VERIFY(src_data.IsExcited(), - "Wave port index " << source_idx << " is not marked for excitation!"); - MFEM_VERIFY(it != wave_port_vi.end(), - "Could not find wave port when calculating port S-parameters!"); - std::complex Sij = it->second.S; - if (idx == source_idx) - { - Sij.real(Sij.real() - 1.0); - } - // Port de-embedding: S_demb = S exp(-ikₙᵢ dᵢ) exp(-ikₙⱼ dⱼ) (distance offset is default - // 0 unless specified). - Sij *= std::exp(1i * src_data.GetPropagationConstant() * src_data.GetOffsetDistance()); - Sij *= std::exp(1i * data.GetPropagationConstant() * data.GetOffsetDistance()); - return Sij; + // Depends on LumpedPorts, WavePorts. + if constexpr (solver_t == ProblemType::DRIVEN) + { + using fmt::format; + using std::complex_literals::operator""i; + + // Don't measure S-Matrix unless there is only one excitation per port. Also, we currently + // don't support mixing wave and lumped ports, because we need to fix consistent + // conventions / de-embedding. + if (!fem_op->GetPortExcitations().IsMultipleSimple() || + !((fem_op->GetLumpedPortOp().Size() > 0) xor (fem_op->GetWavePortOp().Size() > 0))) + { + return; + } + + // Assumes that for single driving port the excitation index is equal to the port index. + auto drive_port_idx = measurement_cache.ex_idx; + + // Currently S-Parameters are not calculated for mixed lumped & wave ports, so don't + // combine output iterators. + for (const auto &[idx, data] : fem_op->GetLumpedPortOp()) + { + // Get previously computed data: should never fail as defined by MeasureLumpedPorts. + auto &vi = measurement_cache.lumped_port_vi.at(idx); + + const LumpedPortData &src_data = fem_op->GetLumpedPortOp().GetPort(drive_port_idx); + if (idx == drive_port_idx) + { + vi.S.real(vi.S.real() - 1.0); + } + // Generalized S-parameters if the ports are resistive (avoids divide-by-zero). 
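// [Editorial sketch, not part of the patch] Worked instance of the generalized
// S-parameter scaling applied just below, with hypothetical port resistances:
//
//   double R_src = 50.0, R_rcv = 25.0;                           // source / receiving port
//   std::complex<double> S_gen = S * std::sqrt(R_src / R_rcv);   // factor ≈ 1.414
//
// The factor reduces to 1 when all resistive ports share the same impedance.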
+ if (std::abs(data.R) > 0.0) + { + vi.S *= std::sqrt(src_data.R / data.R); + } + + Mpi::Print(" {0} = {1:+.3e}{2:+.3e}i, |{0}| = {3:+.3e}, arg({0}) = {4:+.3e}\n", + format("S[{}][{}]", idx, drive_port_idx), vi.S.real(), vi.S.imag(), + Measurement::Magnitude(vi.S), Measurement::Phase(vi.S)); + } + for (const auto &[idx, data] : fem_op->GetWavePortOp()) + { + // Get previously computed data: should never fail as defined by MeasureWavePorts. + auto &vi = measurement_cache.wave_port_vi.at(idx); + + // Wave port modes are not normalized to a characteristic impedance so no generalized + // S-parameters are available. + const WavePortData &src_data = fem_op->GetWavePortOp().GetPort(drive_port_idx); + if (idx == drive_port_idx) + { + vi.S.real(vi.S.real() - 1.0); + } + // Port de-embedding: S_demb = S exp(ikₙᵢ dᵢ) exp(ikₙⱼ dⱼ) (distance offset is default + // 0 unless specified). + vi.S *= std::exp(1i * src_data.kn0 * src_data.d_offset); + vi.S *= std::exp(1i * data.kn0 * data.d_offset); + + Mpi::Print(" {0} = {1:+.3e}{2:+.3e}i, |{0}| = {3:+.3e}, arg({0}) = {4:+.3e}\n", + format("S[{}][{}]", idx, drive_port_idx), vi.S.real(), vi.S.imag(), + Measurement::Magnitude(vi.S), Measurement::Phase(vi.S)); + } + } } -std::complex PostOperator::GetPortPower(const LumpedPortOperator &lumped_port_op, - int idx) const +template +void PostOperator::MeasureSurfaceFlux() const { - MFEM_VERIFY(lumped_port_init, - "Lumped port quantities not defined until ports are initialized!"); - const auto it = lumped_port_vi.find(idx); - MFEM_VERIFY(it != lumped_port_vi.end(), - "Could not find lumped port when calculating lumped port power!"); - return it->second.P; + // Compute the flux through a surface as Φ_j = ∫ F ⋅ n_j dS, with F = B, F = ε D, or F = + // E x H. The special coefficient is used to avoid issues evaluating MFEM GridFunctions + // which are discontinuous at interior boundary elements. + measurement_cache.surface_flux_i.clear(); + measurement_cache.surface_flux_i.reserve(surf_post_op.flux_surfs.size()); + for (const auto &[idx, data] : surf_post_op.flux_surfs) + { + measurement_cache.surface_flux_i.emplace_back(Measurement::FluxData{ + idx, surf_post_op.GetSurfaceFlux(idx, E.get(), B.get()), data.type}); + } } -std::complex PostOperator::GetPortPower(const WavePortOperator &wave_port_op, - int idx) const +template +void PostOperator::MeasureFarField() const { - MFEM_VERIFY(wave_port_init, - "Wave port quantities not defined until ports are initialized!"); - const auto it = wave_port_vi.find(idx); - MFEM_VERIFY(it != wave_port_vi.end(), - "Could not find wave port when calculating wave port power!"); - return it->second.P; + if constexpr (solver_t == ProblemType::DRIVEN || solver_t == ProblemType::EIGENMODE) + { + measurement_cache.farfield.thetaphis = surf_post_op.farfield.thetaphis; + + // NOTE: measurement_cache.freq is omega (it has a factor of 2pi). 
+ measurement_cache.farfield.E_field = surf_post_op.GetFarFieldrE( + measurement_cache.farfield.thetaphis, *E, *B, measurement_cache.freq.real(), + measurement_cache.freq.imag()); + } } -std::complex PostOperator::GetPortVoltage(const LumpedPortOperator &lumped_port_op, - int idx) const +template +void PostOperator::MeasureInterfaceEFieldEnergy() const { - MFEM_VERIFY(lumped_port_init, - "Lumped port quantities not defined until ports are initialized!"); - const auto it = lumped_port_vi.find(idx); - MFEM_VERIFY(it != lumped_port_vi.end(), - "Could not find lumped port when calculating lumped port voltage!"); - return it->second.V; + // Depends on Lumped Port Energy since this is used in normalization of participation + // ratio. + + // Compute the surface dielectric participation ratio and associated quality factor for + // the material interface given by index idx. We have: + // 1/Q_mj = p_mj tan(δ)_j + // with: + // p_mj = 1/2 t_j Re{∫_{Γ_j} (ε_j E_m)ᴴ E_m dS} / (E_elec + E_cap). + measurement_cache.interface_eps_i.clear(); + if constexpr (HasEGridFunction()) + { + // Domain and port energies must have been measured first. E_cap returns zero if the + // solver does not support lumped ports. + // + // TODO: Should this not include other types of energy too (surface impedance case)? + auto energy_electric_all = measurement_cache.domain_E_field_energy_all + + measurement_cache.lumped_port_capacitor_energy; + + measurement_cache.interface_eps_i.reserve(surf_post_op.eps_surfs.size()); + for (const auto &[idx, data] : surf_post_op.eps_surfs) + { + auto energy = surf_post_op.GetInterfaceElectricFieldEnergy(idx, *E); + + auto energy_participation_p = energy / energy_electric_all; + auto loss_tangent_delta = surf_post_op.GetInterfaceLossTangent(idx); + auto quality_factor_Q = (energy_participation_p == 0.0 || loss_tangent_delta == 0.0) + ? mfem::infinity() + : 1.0 / (loss_tangent_delta * energy_participation_p); + + measurement_cache.interface_eps_i.emplace_back(Measurement::InterfaceData{ + idx, energy, loss_tangent_delta, energy_participation_p, quality_factor_Q}); + } + } } -std::complex PostOperator::GetPortCurrent(const LumpedPortOperator &lumped_port_op, - int idx) const +template +void PostOperator::MeasureProbes() const { - MFEM_VERIFY(lumped_port_init, - "Lumped port quantities not defined until ports are initialized!"); - const auto it = lumped_port_vi.find(idx); - MFEM_VERIFY(it != lumped_port_vi.end(), - "Could not find lumped port when calculating lumped port current!"); - if (std::abs(it->second.Z) > 0.0) + measurement_cache.probe_E_field.clear(); + measurement_cache.probe_B_field.clear(); + +#if defined(MFEM_USE_GSLIB) + if constexpr (HasEGridFunction()) { - // Compute from V = I Z when impedance is available. - return it->second.V / it->second.Z; + if (interp_op.GetProbes().size() > 0) + { + measurement_cache.probe_E_field = interp_op.ProbeField(*E); + } } - else if (std::abs(it->second.V) > 0.0) + if constexpr (HasBGridFunction()) { - // Compute from P = V I⋆. - return std::conj(it->second.P / it->second.V); + if (interp_op.GetProbes().size() > 0) + { + measurement_cache.probe_B_field = interp_op.ProbeField(*B); + } } - return 0.0; +#endif } -double PostOperator::GetInductorParticipation(const LumpedPortOperator &lumped_port_op, - int idx, double Em) const -{ - // Compute energy-participation ratio of junction given by index idx for the field mode. 
- // We first get the port line voltage, and use lumped port circuit impedance to get peak - // current through the inductor: I_mj = V_mj / Z_mj, Z_mj = i ω_m L_j. Em is the total - // energy in mode m: E_m = E_elec + E_cap = E_mag + E_ind. The signed EPR for a lumped - // inductive element is computed as: - // p_mj = 1/2 L_j I_mj² / E_m. - // An element with no assigned inductance will be treated as having zero admittance and - // thus zero current. - const LumpedPortData &data = lumped_port_op.GetPort(idx); - std::complex Imj = GetPortCurrent(lumped_port_op, idx); - return std::copysign(0.5 * std::abs(data.GetL()) * std::real(Imj * std::conj(Imj)) / Em, - Imj.real()); // mean(I²) = (I_r² + I_i²) / 2 -} +using fmt::format; -double PostOperator::GetExternalKappa(const LumpedPortOperator &lumped_port_op, int idx, - double Em) const +template +template +auto PostOperator::MeasureAndPrintAll(int ex_idx, int step, + const ComplexVector &e, + const ComplexVector &b, + std::complex omega) + -> std::enable_if_t { - // Compute participation ratio of external ports (given as any port boundary with nonzero - // resistance). Currently no reactance of the ports is supported. The κ of the port - // follows from: - // κ_mj = 1/2 R_j I_mj² / E_m - // from which the mode coupling quality factor is computed as: - // Q_mj = ω_m / κ_mj. - const LumpedPortData &data = lumped_port_op.GetPort(idx); - std::complex Imj = GetPortCurrent(lumped_port_op, idx); - return std::copysign(0.5 * std::abs(data.GetR()) * std::real(Imj * std::conj(Imj)) / Em, - Imj.real()); // mean(I²) = (I_r² + I_i²) / 2 -} + BlockTimer bt0(Timer::POSTPRO); + SetEGridFunction(e); + SetBGridFunction(b); -double PostOperator::GetBulkParticipation(int idx, double Em) const -{ - // Compute the bulk dielectric participation ratio material given by index idx. Here, we - // have: - // p_mj = E_elec,j / (E_elec + E_cap). - MFEM_VERIFY(E, "Bulk Q not defined, no electric field solution found!"); - double Ebulk = has_imaginary ? dom_post_op.GetDomainElectricFieldEnergy(idx, *E) - : dom_post_op.GetDomainElectricFieldEnergy(idx, E->real()); - return Ebulk / Em; -} + measurement_cache = {}; + measurement_cache.freq = omega; + measurement_cache.ex_idx = ex_idx; + MeasureAllImpl(); -double PostOperator::GetBulkQualityFactor(int idx, double Em) const -{ - // Compute the associated quality factor for the material given by index idx. Here, we - // have: - // 1/Q_mj = p_mj tan(δ)_j = tan(δ)_j E_elec,j / (E_elec + E_cap). - MFEM_VERIFY(E, "Bulk Q not defined, no electric field solution found!"); - double Ebulki = has_imaginary - ? dom_post_op.GetDomainElectricFieldEnergyLoss(idx, *E) - : dom_post_op.GetDomainElectricFieldEnergyLoss(idx, E->real()); - return (Ebulki == 0.0) ? 
mfem::infinity() : Em / Ebulki; + std::complex freq = + units.Dimensionalize(omega) / (2 * M_PI); + post_op_csv.PrintAllCSVData(*this, measurement_cache, freq.real(), step, ex_idx); + if (ShouldWriteParaviewFields(step)) + { + Mpi::Print("\n"); + auto ind = 1 + std::distance(output_save_indices.begin(), + std::lower_bound(output_save_indices.begin(), + output_save_indices.end(), step)); + WriteParaviewFields(omega.real(), ind); + Mpi::Print(" Wrote fields to disk (Paraview) at step {:d}\n", step + 1); + } + if (ShouldWriteGridFunctionFields(step)) + { + Mpi::Print("\n"); + auto ind = 1 + std::distance(output_save_indices.begin(), + std::lower_bound(output_save_indices.begin(), + output_save_indices.end(), step)); + WriteMFEMGridFunctions(freq.real(), ind); + Mpi::Print(" Wrote fields to disk (grid function) at step {:d}\n", step + 1); + } + return measurement_cache.domain_E_field_energy_all + + measurement_cache.domain_H_field_energy_all; } -double PostOperator::GetInterfaceParticipation(int idx, double Em) const +template +template +auto PostOperator::MeasureAndPrintAll(int step, const ComplexVector &e, + const ComplexVector &b, + std::complex omega, + double error_abs, double error_bkwd, + int num_conv) + -> std::enable_if_t { - // Compute the surface dielectric participation ratio and associated quality factor for - // the material interface given by index idx. We have: - // 1/Q_mj = p_mj tan(δ)_j - // with: - // p_mj = 1/2 t_j Re{∫_{Γ_j} (ε_j E_m)ᴴ E_m dS} /(E_elec + E_cap). - MFEM_VERIFY(E, "Surface Q not defined, no electric field solution found!"); - double Esurf = has_imaginary - ? surf_post_op.GetInterfaceElectricFieldEnergy(idx, *E) - : surf_post_op.GetInterfaceElectricFieldEnergy(idx, E->real()); - return Esurf / Em; + BlockTimer bt0(Timer::POSTPRO); + SetEGridFunction(e); + SetBGridFunction(b); + + measurement_cache = {}; + measurement_cache.freq = omega; + measurement_cache.eigenmode_Q = + (omega == 0.0) ? mfem::infinity() : 0.5 * std::abs(omega) / std::abs(omega.imag()); + measurement_cache.error_abs = error_abs; + measurement_cache.error_bkwd = error_bkwd; + + // Mini pretty-print table of eig summaries: always print with header since other + // measurements may log their results. + if (Mpi::Root(fem_op->GetComm())) + { + Table table; + int idx_pad = 1 + static_cast(std::log10(num_conv)); + table.col_options = {6, 6}; + table.insert(Column("idx", "m", idx_pad, {}, {}, "") << step + 1); + table.insert(Column("f_re", "Re{f} (GHz)") + << (units.Dimensionalize(omega.real())) / + (2 * M_PI)); + table.insert(Column("f_im", "Im{f} (GHz)") + << (units.Dimensionalize(omega.imag())) / + (2 * M_PI)); + table.insert(Column("q", "Q") << measurement_cache.eigenmode_Q); + table.insert(Column("err_back", "Error (Bkwd.)") << error_bkwd); + table.insert(Column("err_abs", "Error (Abs.)") << error_abs); + table[0].print_as_int = true; + Mpi::Print("{}", (step == 0) ? 
table.format_table() : table.format_row(0)); + } + MeasureAllImpl(); + + int print_idx = step + 1; + post_op_csv.PrintAllCSVData(*this, measurement_cache, print_idx, step); + if (ShouldWriteParaviewFields(step)) + { + WriteParaviewFields(step, print_idx); + Mpi::Print(" Wrote mode {:d} to disk (Paraview)\n", print_idx); + } + if (ShouldWriteGridFunctionFields(step)) + { + WriteMFEMGridFunctions(step, print_idx); + Mpi::Print(" Wrote mode {:d} to disk (grid function)\n", print_idx); + } + return measurement_cache.domain_E_field_energy_all + + measurement_cache.domain_H_field_energy_all; } -double PostOperator::GetSurfaceCharge(int idx) const +template +template +auto PostOperator::MeasureAndPrintAll(int step, const Vector &v, const Vector &e, + int idx) + -> std::enable_if_t { - // Compute the induced charge on a surface as Q_j = ∫ D ⋅ n_j dS, which correctly handles - // two-sided internal surfaces using a special GridFunction coefficient which accounts - // for both sides of the surface. This then yields the capacitive coupling to the - // excitation as C_jk = Q_j / V_k where V_k is the excitation voltage. - MFEM_VERIFY(E, "Surface capacitance not defined, no electric field solution found!"); - double Q = has_imaginary ? surf_post_op.GetSurfaceElectricCharge(idx, *E) - : surf_post_op.GetSurfaceElectricCharge(idx, E->real()); - return Q; -} + BlockTimer bt0(Timer::POSTPRO); + SetVGridFunction(v); + SetEGridFunction(e); + + measurement_cache = {}; + MeasureAllImpl(); -double PostOperator::GetSurfaceFlux(int idx) const + int print_idx = step + 1; + post_op_csv.PrintAllCSVData(*this, measurement_cache, print_idx, step); + if (ShouldWriteParaviewFields(step)) + { + Mpi::Print("\n"); + WriteParaviewFields(step, idx); + Mpi::Print(" Wrote fields to disk (Paraview) for source {:d}\n", idx); + } + if (ShouldWriteGridFunctionFields(step)) + { + Mpi::Print("\n"); + WriteMFEMGridFunctions(step, idx); + Mpi::Print(" Wrote fields to disk (grid function) for source {:d}\n", idx); + } + return measurement_cache.domain_E_field_energy_all + + measurement_cache.domain_H_field_energy_all; +} +template +template +auto PostOperator::MeasureAndPrintAll(int step, const Vector &a, const Vector &b, + int idx) + -> std::enable_if_t { - // Compute the magnetic flux through a surface as Φ_j = ∫ B ⋅ n_j dS. This then yields the - // inductive coupling to the excitation as M_jk = Φ_j / I_k where I_k is the excitation - // current. The special coefficient is used to avoid issues evaluating MFEM GridFunctions - // which are discontinuous at interior boundary elements. - MFEM_VERIFY(B, - "Surface inductance not defined, no magnetic flux density solution found!"); - double Phi = has_imaginary ? 
surf_post_op.GetSurfaceMagneticFlux(idx, *B) - : surf_post_op.GetSurfaceMagneticFlux(idx, B->real()); - return Phi; + BlockTimer bt0(Timer::POSTPRO); + SetAGridFunction(a); + SetBGridFunction(b); + + measurement_cache = {}; + MeasureAllImpl(); + + int print_idx = step + 1; + post_op_csv.PrintAllCSVData(*this, measurement_cache, print_idx, step); + if (ShouldWriteParaviewFields(step)) + { + Mpi::Print("\n"); + WriteParaviewFields(step, idx); + Mpi::Print(" Wrote fields to disk (Paraview) for source {:d}\n", idx); + } + if (ShouldWriteGridFunctionFields(step)) + { + Mpi::Print("\n"); + WriteMFEMGridFunctions(step, idx); + Mpi::Print(" Wrote fields to disk (grid function) for source {:d}\n", idx); + } + return measurement_cache.domain_E_field_energy_all + + measurement_cache.domain_H_field_energy_all; } -void PostOperator::WriteFields(int step, double time, const ErrorIndicator *indicator) const +template +template +auto PostOperator::MeasureAndPrintAll(int step, const Vector &e, const Vector &b, + double time, double J_coef) + -> std::enable_if_t { - // Given the electric field and magnetic flux density, write the fields to disk for - // visualization. Write the mesh coordinates in the same units as originally input. - bool first_save = (paraview.GetCycle() < 0); - mfem::ParMesh &mesh = - (E) ? *E->ParFESpace()->GetParMesh() : *B->ParFESpace()->GetParMesh(); - mesh::DimensionalizeMesh(mesh, mesh_Lc0); + BlockTimer bt0(Timer::POSTPRO); + SetEGridFunction(e); + SetBGridFunction(b); - paraview.SetCycle(step); - paraview.SetTime(time); - paraview_bdr.SetCycle(step); - paraview_bdr.SetTime(time); - if (first_save || indicator) + measurement_cache = {}; + measurement_cache.Jcoeff_excitation = J_coef; + MeasureAllImpl(); + + // Time must be converted before passing into csv due to the shared PrintAllCSVData + // method. + time = units.Dimensionalize(time); + post_op_csv.PrintAllCSVData(*this, measurement_cache, time, step); + if (ShouldWriteParaviewFields(step)) { - mfem::L2_FECollection pwconst_fec(0, mesh.Dimension()); - mfem::ParFiniteElementSpace pwconst_fespace(&mesh, &pwconst_fec); - std::unique_ptr rank, eta; - if (first_save) - { - rank = std::make_unique(&pwconst_fespace); - *rank = mesh.GetMyRank() + 1; - paraview.RegisterField("Rank", rank.get()); - } - if (indicator) - { - eta = std::make_unique(&pwconst_fespace); - MFEM_VERIFY(eta->Size() == indicator->Local().Size(), - "Size mismatch for provided ErrorIndicator for postprocessing!"); - *eta = indicator->Local(); - paraview.RegisterField("Indicator", eta.get()); - } - paraview.Save(); - if (rank) - { - paraview.DeregisterField("Rank"); - } - if (eta) - { - paraview.DeregisterField("Indicator"); - } + Mpi::Print("\n"); + WriteParaviewFields(time, double(step) / output_delta_post); + Mpi::Print(" Wrote fields to disk (Paraview) at step {:d}\n", step + 1); } - else + if (ShouldWriteGridFunctionFields(step)) { - paraview.Save(); + Mpi::Print("\n"); + WriteMFEMGridFunctions(time, double(step) / output_delta_post); + Mpi::Print(" Wrote fields to disk (grid function) at step {:d}\n", step + 1); } - paraview_bdr.Save(); - - // Restore the mesh nondimensionalization. 
- mesh::NondimensionalizeMesh(mesh, mesh_Lc0); + return measurement_cache.domain_E_field_energy_all + + measurement_cache.domain_H_field_energy_all; } -std::vector> PostOperator::ProbeEField() const +template +void PostOperator::MeasureFinalize(const ErrorIndicator &indicator) { - MFEM_VERIFY(E, "PostOperator is not configured for electric field probes!"); - return interp_op.ProbeField(*E, has_imaginary); + BlockTimer bt0(Timer::POSTPRO); + auto indicator_stats = indicator.GetSummaryStatistics(fem_op->GetComm()); + post_op_csv.PrintErrorIndicator(Mpi::Root(fem_op->GetComm()), indicator_stats); + if (ShouldWriteParaviewFields()) + { + WriteParaviewFieldsFinal(&indicator); + } + if (ShouldWriteGridFunctionFields()) + { + WriteMFEMGridFunctionsFinal(&indicator); + } } -std::vector> PostOperator::ProbeBField() const +template +template +auto PostOperator::MeasureDomainFieldEnergyOnly(const ComplexVector &e, + const ComplexVector &b) + -> std::enable_if_t { - MFEM_VERIFY(B, "PostOperator is not configured for magnetic flux density probes!"); - return interp_op.ProbeField(*B, has_imaginary); + SetEGridFunction(e); + SetBGridFunction(b); + MeasureDomainFieldEnergy(); + Mpi::Barrier(fem_op->GetComm()); + + // Return total domain energy for normalizing error indicator. + return measurement_cache.domain_E_field_energy_all + + measurement_cache.domain_H_field_energy_all; } +// Explicit template instantiation. + +template class PostOperator; +template class PostOperator; +template class PostOperator; +template class PostOperator; +template class PostOperator; + +// Function explicit instantiation. +// TODO(C++20): with requires, we won't need a second template. + +template auto PostOperator::MeasureAndPrintAll( + int ex_idx, int step, const ComplexVector &e, const ComplexVector &b, + std::complex omega) -> double; + +template auto +PostOperator::MeasureAndPrintAll( + int step, const ComplexVector &e, const ComplexVector &b, std::complex omega, + double error_abs, double error_bkwd, int num_conv) -> double; + +template auto +PostOperator::MeasureAndPrintAll( + int step, const Vector &v, const Vector &e, int idx) -> double; + +template auto +PostOperator::MeasureAndPrintAll( + int step, const Vector &a, const Vector &b, int idx) -> double; + +template auto +PostOperator::MeasureAndPrintAll( + int step, const Vector &e, const Vector &b, double t, double J_coef) -> double; + +template auto +PostOperator::MeasureDomainFieldEnergyOnly( + const ComplexVector &e, const ComplexVector &b) -> double; + +template auto +PostOperator::InitializeParaviewDataCollection( + int ex_idx) -> void; + } // namespace palace diff --git a/palace/models/postoperator.hpp b/palace/models/postoperator.hpp index 8eae756f34..33a6220902 100644 --- a/palace/models/postoperator.hpp +++ b/palace/models/postoperator.hpp @@ -8,14 +8,20 @@ #include #include #include -#include +#include #include #include +#include "fem/gridfunction.hpp" #include "fem/interpolator.hpp" #include "linalg/operator.hpp" #include "linalg/vector.hpp" #include "models/domainpostoperator.hpp" +#include "models/lumpedportoperator.hpp" +#include "models/postoperatorcsv.hpp" #include "models/surfacepostoperator.hpp" +#include "utils/configfile.hpp" +#include "utils/filesystem.hpp" +#include "utils/units.hpp" namespace palace { @@ -24,162 +30,425 @@ class CurlCurlOperator; class ErrorIndicator; class IoData; class LaplaceOperator; -class LumpedPortOperator; class MaterialOperator; class SpaceOperator; class SurfaceCurrentOperator; class WavePortOperator; +// Statically 
specify if solver uses real or complex fields. + +template +constexpr bool HasComplexGridFunction() +{ + return solver_t == ProblemType::DRIVEN || solver_t == ProblemType::EIGENMODE; +} + +// Statically specify what fields a solver uses +// TODO(C++20): Change these to inline consteval and use with requires. + +template +constexpr bool HasVGridFunction() +{ + return solver_t == ProblemType::ELECTROSTATIC; +} + +template +constexpr bool HasAGridFunction() +{ + return solver_t == ProblemType::MAGNETOSTATIC; +} + +template +constexpr bool HasEGridFunction() +{ + return solver_t != ProblemType::MAGNETOSTATIC; +} + +template +constexpr bool HasBGridFunction() +{ + return solver_t != ProblemType::ELECTROSTATIC; +} + +// Scale gridfunctions after redimensionalizing the mesh. +void ScaleGridFunctions(double L, int dim, std::unique_ptr &E, + std::unique_ptr &B, std::unique_ptr &V, + std::unique_ptr &A); + +// Scale gridfunctions from non-dimensional to SI units. +void DimensionalizeGridFunctions(Units &units, std::unique_ptr &E, + std::unique_ptr &B, + std::unique_ptr &V, + std::unique_ptr &A); + +// Scale gridfunctions from SI units to non-dimensional. +void NondimensionalizeGridFunctions(Units &units, std::unique_ptr &E, + std::unique_ptr &B, + std::unique_ptr &V, + std::unique_ptr &A); + // -// A class to handle solution postprocessing. +// A class to handle solution postprocessing for all solvers. // +template class PostOperator { -private: - // Reference to material property operator (not owned). - const MaterialOperator &mat_op; - - // Surface boundary and domain postprocessors. - const SurfacePostOperator surf_post_op; - const DomainPostOperator dom_post_op; - - // Objects for grid function postprocessing from the FE solution. - const bool has_imaginary; - std::optional E, B; - std::optional V, A; - std::unique_ptr Esr, Esi, Bsr, Bsi, As, Jsr, Jsi; - std::unique_ptr Vs, Ue, Um, Qsr, Qsi; - - // Lumped and wave port voltage and current (R, L, and C branches) caches updated when - // the grid functions are set. - struct PortPostData - { - std::complex S, P, V, Z; +protected: + // Pointer to operator handling discretization and FEM space appropriate to solver. It + // also contains the reference to all domains, boundary conditions, etc. needed for + // measurement and printing. + // TODO(C++20): Use std::reference_wrapper with incomplete types. + fem_op_t *fem_op; + + // Unit converter from IOData to scale mesh and measurements. Lightweight class so it is + // cheap to copy, rather than keep another reference to IOData. + Units units; + + // Base post-op output directory. + fs::path post_dir; + + // Fields: Electric, Magnetic, Scalar Potential, Vector Potential. + std::unique_ptr E, B, V, A; + + // Field output format control flags. + bool enable_paraview_output = false; + bool enable_gridfunction_output = false; + + // How many / which fields to output. + int output_delta_post = 0; // printing rate (TRANSIENT) + int output_n_post = 0; // max printing (OTHER SOLVERS) + std::vector output_save_indices = {}; // explicit saves + + // Whether any output formats were specified. + bool AnyOutputFormats() const + { + return enable_paraview_output || enable_gridfunction_output; + } + bool AnythingToSave() const + { + return (output_delta_post > 0) || (output_n_post > 0) || !output_save_indices.empty(); + } + + // Whether any fields should be written at all. + bool ShouldWriteFields() const { return AnyOutputFormats() && AnythingToSave(); } + + // Whether any fields should be written for this step. 
+ bool ShouldWriteFields(std::size_t step) const + { + return AnyOutputFormats() && + ((output_delta_post > 0 && step % output_delta_post == 0) || + (output_n_post > 0 && step < output_n_post) || + std::binary_search(output_save_indices.cbegin(), output_save_indices.cend(), + step)); + } + + // Whether fields should be written for a particular output format (at a given step). + bool ShouldWriteParaviewFields() const + { + return enable_paraview_output && AnythingToSave(); + } + bool ShouldWriteParaviewFields(std::size_t step) const + { + return enable_paraview_output && ShouldWriteFields(step); + } + bool ShouldWriteGridFunctionFields() const + { + return enable_gridfunction_output && AnythingToSave(); + } + bool ShouldWriteGridFunctionFields(std::size_t step) const + { + return enable_gridfunction_output && ShouldWriteFields(step); + } + + // ParaView data collection: writing fields to disk for visualization. + // These are std::optional, since ParaViewDataCollection has no default (empty) ctor, + // and we only want to initialize them if ShouldWriteParaviewFields() returns true. + std::optional paraview, paraview_bdr; + + // MFEM grid function output details. + std::string gridfunction_output_dir; + const std::size_t pad_digits_default = 6; + + // Measurements of field solution for ParaView files (full domain or surfaces). + + // Poynting Coefficient, Electric Boundary Field (re+im), Magnetic Boundary Field (re+im), + // Vector Potential Boundary Field, Surface Current (re+im). + std::unique_ptr S, E_sr, E_si, B_sr, B_si, A_s, J_sr, J_si; + + // Electric Energy Density, Magnetic Energy Density, Scalar Potential Boundary Field, + // Surface Charge (re+im). + std::unique_ptr U_e, U_m, V_s, Q_sr, Q_si; + + // Wave port boundary mode field postprocessing. + struct WavePortFieldData + { + std::unique_ptr E0r, E0i; + }; + std::map port_E0; + + // Setup coefficients for field postprocessing. + void SetupFieldCoefficients(); - // Data collection for writing fields to disk for visualization and sampling points. - mutable mfem::ParaViewDataCollection paraview, paraview_bdr; - mutable InterpolationOperator interp_op; - double mesh_Lc0; - void InitializeDataCollection(const IoData &iodata); + // Initialize Paraview, register all fields to write. + void InitializeParaviewDataCollection(const fs::path &sub_folder_name = ""); public: - PostOperator(const IoData &iodata, SpaceOperator &spaceop, const std::string &name); - PostOperator(const IoData &iodata, LaplaceOperator &laplaceop, const std::string &name); - PostOperator(const IoData &iodata, CurlCurlOperator &curlcurlop, const std::string &name); + // Public overload for the driven solver only, which takes an excitation index and + // sets the correct sub_folder_name path for the primary function above. + template + auto InitializeParaviewDataCollection(int ex_idx) + -> std::enable_if_t; - // Access to surface and domain postprocessing objects. - const auto &GetSurfacePostOp() const { return surf_post_op; } - const auto &GetDomainPostOp() const { return dom_post_op; } +protected: + // Write to disk the E- and B-fields extracted from the solution vectors. Note that + // fields are not redimensionalized; to do so, one needs to compute: B <= B * (μ₀ H₀), E + // <= E * (Z₀ H₀), V <= V * (Z₀ H₀ L₀), etc. 
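// [Editorial sketch, not part of the patch] The redimensionalization factors quoted in the
// comment above, written out for scalar amplitudes; mu0_H0, Z0_H0, and L0 are hypothetical
// placeholders for the values that Units would supply:
//
//   double B_si = B_nd * mu0_H0;        // B <= B * (μ₀ H₀)
//   double E_si = E_nd * Z0_H0;         // E <= E * (Z₀ H₀)
//   double V_si = V_nd * Z0_H0 * L0;    // V <= V * (Z₀ H₀ L₀)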
+ void WriteParaviewFields(double time, int step); + void WriteParaviewFieldsFinal(const ErrorIndicator *indicator = nullptr); + void WriteMFEMGridFunctions(double time, int step); + void WriteMFEMGridFunctionsFinal(const ErrorIndicator *indicator = nullptr); + + // CSV Measure & Print. + + // PostOperatorCSV is a class that contains csv tables and printers of + // measurements. Conceptually, its members could be a part of this class, like the + // ParaView fields and functions above. It has been separated out for code readability. To + // achieve this, it has a pointer back to its "parent" PostOperator class and is a + // friend class so it can access the private measurement_cache and references of the + // system from fem_op. + friend PostOperatorCSV; + + PostOperatorCSV post_op_csv; - // Return options for postprocessing configuration. - bool HasImaginary() const { return has_imaginary; } - bool HasE() const { return E.has_value(); } - bool HasB() const { return B.has_value(); } + // Helper classes that actually do some measurements that will be saved to csv files. + DomainPostOperator dom_post_op; // Energy in bulk + SurfacePostOperator surf_post_op; // Dielectric Interface Energy, Flux, and FarField + mutable InterpolationOperator interp_op; // E & B fields: mutates during measure + + mutable Measurement measurement_cache; + + // Individual measurements to fill the cache/workspace. Measurement functions are not + // constrained by solver type in the signature since they are private member functions. + // They dispatch on solver type within the function itself using `if constexpr`, and do + // nothing if the measurement is not appropriate for the solver. + void MeasureDomainFieldEnergy() const; + void MeasureLumpedPorts() const; + void MeasureWavePorts() const; + void MeasureLumpedPortsEig() const; // Depends: DomainFieldEnergy, LumpedPorts + void MeasureSParameter() const; // Depends: LumpedPorts, WavePorts + void MeasureSurfaceFlux() const; + void MeasureFarField() const; + void MeasureInterfaceEFieldEnergy() const; // Depends: LumpedPorts + void MeasureProbes() const; + + // Helper function called by all solvers. Has to ensure correct call order to deal with + // dependent measurements. + void MeasureAllImpl() const + { + MeasureDomainFieldEnergy(); + MeasureLumpedPorts(); + MeasureWavePorts(); + MeasureLumpedPortsEig(); + MeasureSParameter(); + MeasureSurfaceFlux(); + MeasureInterfaceEFieldEnergy(); + MeasureProbes(); + MeasureFarField(); + } + + // Setting grid functions. + // // Populate the grid function solutions for the E- and B-field using the solution vectors // on the true dofs. For the real-valued overload, the electric scalar potential can be // specified too for electrostatic simulations. The output mesh and fields are - // nondimensionalized consistently (B ~ E (L₀ ω₀ E₀⁻¹)). - void SetEGridFunction(const ComplexVector &e); - void SetBGridFunction(const ComplexVector &b); - void SetEGridFunction(const Vector &e); - void SetBGridFunction(const Vector &b); - void SetVGridFunction(const Vector &v); - void SetAGridFunction(const Vector &a); - - // Update cached port voltages and currents for lumped and wave port operators. 
- void UpdatePorts(const LumpedPortOperator &lumped_port_op, - const WavePortOperator &wave_port_op, double omega = 0.0) - { - UpdatePorts(lumped_port_op, omega); - UpdatePorts(wave_port_op, omega); - } - void UpdatePorts(const LumpedPortOperator &lumped_port_op, double omega = 0.0); - void UpdatePorts(const WavePortOperator &wave_port_op, double omega = 0.0); - - // Postprocess the total electric and magnetic field energies in the electric and magnetic - // fields. - double GetEFieldEnergy() const; - double GetHFieldEnergy() const; - - // Postprocess the energy in lumped capacitor or inductor port boundaries with index in - // the provided set. - double GetLumpedInductorEnergy(const LumpedPortOperator &lumped_port_op) const; - double GetLumpedCapacitorEnergy(const LumpedPortOperator &lumped_port_op) const; - - // Postprocess the S-parameter for recieving lumped or wave port index using the electric - // field solution. - std::complex GetSParameter(const LumpedPortOperator &lumped_port_op, int idx, - int source_idx) const; - std::complex GetSParameter(const WavePortOperator &wave_port_op, int idx, - int source_idx) const; - - // Postprocess the circuit voltage and current across lumped port index using the electric - // field solution. When has_imaginary is false, the returned voltage has only a nonzero - // real part. - std::complex GetPortPower(const LumpedPortOperator &lumped_port_op, - int idx) const; - std::complex GetPortPower(const WavePortOperator &wave_port_op, int idx) const; - std::complex GetPortVoltage(const LumpedPortOperator &lumped_port_op, - int idx) const; - std::complex GetPortVoltage(const WavePortOperator &wave_port_op, int idx) const - { - MFEM_ABORT("GetPortVoltage is not yet implemented for wave port boundaries!"); - return 0.0; - } - std::complex GetPortCurrent(const LumpedPortOperator &lumped_port_op, - int idx) const; - std::complex GetPortCurrent(const WavePortOperator &wave_port_op, int idx) const - { - MFEM_ABORT("GetPortCurrent is not yet implemented for wave port boundaries!"); - return 0.0; - } - - // Postprocess the EPR for the electric field solution and lumped port index. - double GetInductorParticipation(const LumpedPortOperator &lumped_port_op, int idx, - double Em) const; - - // Postprocess the coupling rate for radiative loss to the given I-O port index. - double GetExternalKappa(const LumpedPortOperator &lumped_port_op, int idx, - double Em) const; - - // Postprocess the participation ratio or quality factor for bulk lossy dielectric losses - // in the electric field mode. - double GetBulkParticipation(int idx, double Em) const; - double GetBulkQualityFactor(int idx, double Em) const; - - // Postprocess the partitipation ratio for interface lossy dielectric losses in the - // electric field mode. - double GetInterfaceParticipation(int idx, double Em) const; - - // Postprocess the charge or flux for a surface index using the electric field solution - // or the magnetic flux density field solution. - double GetSurfaceCharge(int idx) const; - double GetSurfaceFlux(int idx) const; - - // Write to disk the E- and B-fields extracted from the solution vectors. Note that fields - // are not redimensionalized, to do so one needs to compute: B <= B * (μ₀ H₀), E <= E * - // (Z₀ H₀), V <= V * (Z₀ H₀ L₀), etc. - void WriteFields(int step, double time, const ErrorIndicator *indicator = nullptr) const; - - // Probe the E- and B-fields for their vector-values at speceified locations in space. 
- // Locations of probes are set up in constructor from configuration file data. If - // has_imaginary is false, the returned fields have only nonzero real parts. Output - // vectors are ordered by vector dimension, that is [v1x, v1y, v1z, v2x, v2y, v2z, ...]. - const auto &GetProbes() const { return interp_op.GetProbes(); } - std::vector> ProbeEField() const; - std::vector> ProbeBField() const; - - // Get the associated MPI communicator. - MPI_Comm GetComm() const - { - return (E) ? E->ParFESpace()->GetComm() : B->ParFESpace()->GetComm(); + // non-dimensionalized consistently (B ~ E (L₀ ω₀ E₀⁻¹)). + // + // These functions are private helper functions. We want to enforce that a caller passes + // the appropriate ones as part of the MeasureAndPrintAll interface, rather than do a + // runtime check to see that they have been set. + // + // TODO(C++20): Switch SFINAE to requires. + + template + auto SetEGridFunction(const ComplexVector &e, bool exchange_face_nbr_data = true) + -> std::enable_if_t() && HasComplexGridFunction(), void> + { + E->Real().SetFromTrueDofs(e.Real()); // Parallel distribute + E->Imag().SetFromTrueDofs(e.Imag()); + if (exchange_face_nbr_data) + { + E->Real().ExchangeFaceNbrData(); // Ready for parallel comm on shared faces + E->Imag().ExchangeFaceNbrData(); + } + } + + template + auto SetEGridFunction(const Vector &e, bool exchange_face_nbr_data = true) + -> std::enable_if_t() && !HasComplexGridFunction(), void> + { + E->Real().SetFromTrueDofs(e); + if (exchange_face_nbr_data) + { + E->Real().ExchangeFaceNbrData(); + } + } + + template + auto SetBGridFunction(const ComplexVector &b, bool exchange_face_nbr_data = true) + -> std::enable_if_t() && HasComplexGridFunction(), void> + { + B->Real().SetFromTrueDofs(b.Real()); // Parallel distribute + B->Imag().SetFromTrueDofs(b.Imag()); + if (exchange_face_nbr_data) + { + B->Real().ExchangeFaceNbrData(); // Ready for parallel comm on shared faces + B->Imag().ExchangeFaceNbrData(); + } + } + + template + auto SetBGridFunction(const Vector &b, bool exchange_face_nbr_data = true) + -> std::enable_if_t() && !HasComplexGridFunction(), void> + { + B->Real().SetFromTrueDofs(b); + if (exchange_face_nbr_data) + { + B->Real().ExchangeFaceNbrData(); + } + } + + template + auto SetVGridFunction(const Vector &v, bool exchange_face_nbr_data = true) + -> std::enable_if_t() && !HasComplexGridFunction(), void> + { + V->Real().SetFromTrueDofs(v); + if (exchange_face_nbr_data) + { + V->Real().ExchangeFaceNbrData(); + } + } + + template + auto SetAGridFunction(const Vector &a, bool exchange_face_nbr_data = true) + -> std::enable_if_t() && !HasComplexGridFunction(), void> + { + A->Real().SetFromTrueDofs(a); + if (exchange_face_nbr_data) + { + A->Real().ExchangeFaceNbrData(); + } + } + +public: + explicit PostOperator(const IoData &iodata, fem_op_t &fem_op); + + // MeasureAndPrintAll is the primary public interface of this class. It is specialized by + // solver type, since each solver has different fields and extra data required. These + // functions all: + // 1) Set the GridFunctions which have to be passed as part of the call. + // 2) Perform all measurements and populate measurement_cache with temporary results. This + // cache structure exists since measurements have dependencies; we may use some + // measurement results in later measurements. + // 3) Pass the measurement cache to the csv printer which will add the appropriate + // rows/cols to the csv tables and print to file. + // 4) Trigger ParaView field computation and save. 
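// [Editorial sketch, not part of the patch] A hypothetical call from a driven-solver
// frequency loop, showing the intended shape of steps 1-4 above (all names here are
// illustrative, not taken from the solvers in this patch):
//
//   double total_energy =
//       post_op.MeasureAndPrintAll(excitation_idx, step, e_vec, b_vec, omega);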
+ // + // The functions return the total domain energy which is the only thing needed in the + // solver to normalize the error indicator. If more measurements were needed by the solver + // loop, we could imagine passing a small struct (like Measurement above or some subset + // thereof). + // + // The measure functions will also log (some) measurements to stdout. + // + // TODO(C++20): Upgrade SFINAE to C++20 concepts to simplify static selection since we can + // just write `MeasureAndPrintAll(...) requires (solver_t == Type::A)` without extra + // template. + + template + auto MeasureAndPrintAll(int ex_idx, int step, const ComplexVector &e, + const ComplexVector &b, std::complex omega) + -> std::enable_if_t; + + template + auto MeasureAndPrintAll(int step, const ComplexVector &e, const ComplexVector &b, + std::complex omega, double error_abs, double error_bkwd, + int num_conv) + -> std::enable_if_t; + + template + auto MeasureAndPrintAll(int step, const Vector &v, const Vector &e, int idx) + -> std::enable_if_t; + + template + auto MeasureAndPrintAll(int step, const Vector &a, const Vector &b, int idx) + -> std::enable_if_t; + + template + auto MeasureAndPrintAll(int step, const Vector &e, const Vector &b, double t, + double J_coef) + -> std::enable_if_t; + + // Write error indicator into ParaView file and print summary statistics to csv. Should be + // called once at the end of the solver loop. + void MeasureFinalize(const ErrorIndicator &indicator); + + // Measurement of the domain energy without printing. This is needed during the driven + // simulation with PROM. There, samples are taken and we need the total domain energy for + // the error indicator, but no other measurement / printing should be done. + // + // TODO(C++20): SFINAE to requires. + template + auto MeasureDomainFieldEnergyOnly(const ComplexVector &e, const ComplexVector &b) + -> std::enable_if_t; + + // Access grid functions for field solutions. Note that these are NOT const functions. The + // electrostatics / magnetostatics solvers do measurements of the capacitance / inductance + // matrix globally at the end of all solves. This is done in the solver class, but uses + // the GridFunctions in this (PostOp) class as already allocated scratch workspace. + // + // Future: Consider moving those cap/ind measurements into this class and MeasureFinalize? + // Would need to store a vector of V, A. + // + // TODO(C++20): Switch SFINAE to requires. + template + auto GetEGridFunction() -> std::enable_if_t(), decltype(*E) &> + { + return *E; } + + template + auto GetBGridFunction() -> std::enable_if_t(), decltype(*B) &> + { + return *B; + } + + template + auto GetVGridFunction() -> std::enable_if_t(), decltype(*V) &> + { + return *V; + } + + template + auto GetAGridFunction() -> std::enable_if_t(), decltype(*A) &> + { + return *A; + } + + // Access to number of padding digits. + constexpr auto GetPadDigitsDefault() const { return pad_digits_default; } + + // Access to domain postprocessing objects. Use in electrostatic & magnetostatic matrix + // measurement (see above). + const auto &GetDomainPostOp() const { return dom_post_op; } + + // Expose MPI communicator from fem_op for electrostatic & magnetostatic matrix processing + // (see above). 
+ auto GetComm() const { return fem_op->GetComm(); } }; + + bool createDirectory(const std::string &path); } // namespace palace #endif // PALACE_MODELS_POST_OPERATOR_HPP diff --git a/palace/models/postoperatorcsv.cpp b/palace/models/postoperatorcsv.cpp new file mode 100644 index 0000000000..06ec3a5980 --- /dev/null +++ b/palace/models/postoperatorcsv.cpp @@ -0,0 +1,1389 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "postoperatorcsv.hpp" + +#include + +#include "models/curlcurloperator.hpp" +#include "models/laplaceoperator.hpp" +#include "models/materialoperator.hpp" +#include "models/postoperator.hpp" +#include "models/spaceoperator.hpp" +#include "utils/iodata.hpp" +#include "utils/timer.hpp" + +namespace palace +{ + +// static +Measurement Measurement::Dimensionalize(const Units &units, + const Measurement &nondim_measurement_cache) +{ + Measurement measurement_cache; + measurement_cache.freq = + units.Dimensionalize(nondim_measurement_cache.freq) / + (2 * M_PI); + measurement_cache.ex_idx = nondim_measurement_cache.ex_idx; // NONE + measurement_cache.Jcoeff_excitation = nondim_measurement_cache.Jcoeff_excitation; // NONE + measurement_cache.eigenmode_Q = nondim_measurement_cache.eigenmode_Q; // NONE + measurement_cache.error_abs = nondim_measurement_cache.error_abs; // NONE + measurement_cache.error_bkwd = nondim_measurement_cache.error_bkwd; // NONE + + measurement_cache.domain_E_field_energy_all = + units.Dimensionalize( + nondim_measurement_cache.domain_E_field_energy_all); + measurement_cache.domain_H_field_energy_all = + units.Dimensionalize( + nondim_measurement_cache.domain_H_field_energy_all); + for (const auto &e : nondim_measurement_cache.domain_E_field_energy_i) + { + measurement_cache.domain_E_field_energy_i.emplace_back(Measurement::DomainData{ + e.idx, units.Dimensionalize(e.energy), + e.participation_ratio}); + } + for (const auto &e : nondim_measurement_cache.domain_H_field_energy_i) + { + measurement_cache.domain_H_field_energy_i.emplace_back(Measurement::DomainData{ + e.idx, units.Dimensionalize(e.energy), + e.participation_ratio}); + } + measurement_cache.lumped_port_capacitor_energy = + units.Dimensionalize( + nondim_measurement_cache.lumped_port_capacitor_energy); + measurement_cache.lumped_port_inductor_energy = + units.Dimensionalize( + nondim_measurement_cache.lumped_port_inductor_energy); + + auto dimensionalize_port_post_data = + [&units](const std::map &nondim) + { + std::map dim; + for (const auto &[k, data] : nondim) + { + dim[k] = Measurement::PortPostData(); + dim[k].P = units.Dimensionalize(data.P); + dim[k].V = units.Dimensionalize(data.V), + dim[k].I = units.Dimensionalize(data.I), + dim[k].I_RLC = {units.Dimensionalize(data.I_RLC[0]), + units.Dimensionalize(data.I_RLC[1]), + units.Dimensionalize(data.I_RLC[2])}; + dim[k].S = data.S; // NONE + + dim[k].inductor_energy = + units.Dimensionalize(data.inductor_energy); + dim[k].capacitor_energy = + units.Dimensionalize(data.capacitor_energy); + + dim[k].mode_port_kappa = + units.Dimensionalize(data.mode_port_kappa) / + (2 * M_PI); + dim[k].quality_factor = data.quality_factor; // NONE + dim[k].inductive_energy_participation = data.inductive_energy_participation; // NONE + } + return dim; + }; + measurement_cache.lumped_port_vi = + dimensionalize_port_post_data(nondim_measurement_cache.lumped_port_vi); + measurement_cache.wave_port_vi = + dimensionalize_port_post_data(nondim_measurement_cache.wave_port_vi); + + 
measurement_cache.probe_E_field = units.Dimensionalize( + nondim_measurement_cache.probe_E_field); + measurement_cache.probe_B_field = units.Dimensionalize( + nondim_measurement_cache.probe_B_field); + + for (const auto &data : nondim_measurement_cache.surface_flux_i) + { + auto &flux = measurement_cache.surface_flux_i.emplace_back(data); + if (data.type == SurfaceFlux::ELECTRIC) + { + flux.Phi *= units.GetScaleFactor(); + flux.Phi *= units.GetScaleFactor(); + } + else if (data.type == SurfaceFlux::MAGNETIC) + { + flux.Phi *= units.GetScaleFactor(); + flux.Phi *= units.GetScaleFactor(); + } + else if (data.type == SurfaceFlux::POWER) + { + flux.Phi *= units.GetScaleFactor(); + } + } + + for (const auto &data : nondim_measurement_cache.interface_eps_i) + { + auto &eps = measurement_cache.interface_eps_i.emplace_back(data); + eps.energy = units.Dimensionalize(data.energy); + } + + measurement_cache.farfield.thetaphis = + nondim_measurement_cache.farfield.thetaphis; // NONE + measurement_cache.farfield.E_field = units.Nondimensionalize( + nondim_measurement_cache.farfield.E_field); + + return measurement_cache; +} + +// static. +Measurement Measurement::Nondimensionalize(const Units &units, + const Measurement &dim_measurement_cache) +{ + Measurement measurement_cache; + measurement_cache.freq = + units.Nondimensionalize(dim_measurement_cache.freq) * + (2 * M_PI); + measurement_cache.ex_idx = dim_measurement_cache.ex_idx; // NONE + measurement_cache.Jcoeff_excitation = dim_measurement_cache.Jcoeff_excitation; // NONE + measurement_cache.eigenmode_Q = dim_measurement_cache.eigenmode_Q; // NONE + measurement_cache.error_abs = dim_measurement_cache.error_abs; // NONE + measurement_cache.error_bkwd = dim_measurement_cache.error_bkwd; // NONE + + measurement_cache.domain_E_field_energy_all = + units.Nondimensionalize( + dim_measurement_cache.domain_E_field_energy_all); + measurement_cache.domain_H_field_energy_all = + units.Nondimensionalize( + dim_measurement_cache.domain_H_field_energy_all); + for (const auto &e : dim_measurement_cache.domain_E_field_energy_i) + { + measurement_cache.domain_E_field_energy_i.emplace_back(Measurement::DomainData{ + e.idx, units.Nondimensionalize(e.energy), + e.participation_ratio}); + } + for (const auto &e : dim_measurement_cache.domain_H_field_energy_i) + { + measurement_cache.domain_H_field_energy_i.emplace_back(Measurement::DomainData{ + e.idx, units.Nondimensionalize(e.energy), + e.participation_ratio}); + } + measurement_cache.lumped_port_capacitor_energy = + units.Nondimensionalize( + dim_measurement_cache.lumped_port_capacitor_energy); + measurement_cache.lumped_port_inductor_energy = + units.Nondimensionalize( + dim_measurement_cache.lumped_port_inductor_energy); + + auto dimensionalize_port_post_data = + [&units](const std::map &nondim) + { + std::map dim; + for (const auto &[k, data] : nondim) + { + dim[k] = Measurement::PortPostData(); + dim[k].P = units.Nondimensionalize(data.P); + dim[k].V = units.Nondimensionalize(data.V), + dim[k].I = units.Nondimensionalize(data.I), + dim[k].I_RLC = {units.Nondimensionalize(data.I_RLC[0]), + units.Nondimensionalize(data.I_RLC[1]), + units.Nondimensionalize(data.I_RLC[2])}; + dim[k].S = data.S; // NONE + + dim[k].inductor_energy = + units.Nondimensionalize(data.inductor_energy); + dim[k].capacitor_energy = + units.Nondimensionalize(data.capacitor_energy); + + dim[k].mode_port_kappa = + units.Nondimensionalize(data.mode_port_kappa) * + (2 * M_PI); + dim[k].quality_factor = data.quality_factor; // NONE + 
dim[k].inductive_energy_participation = data.inductive_energy_participation; // NONE + } + return dim; + }; + measurement_cache.lumped_port_vi = + dimensionalize_port_post_data(dim_measurement_cache.lumped_port_vi); + measurement_cache.wave_port_vi = + dimensionalize_port_post_data(dim_measurement_cache.wave_port_vi); + + measurement_cache.probe_E_field = units.Nondimensionalize( + dim_measurement_cache.probe_E_field); + measurement_cache.probe_B_field = units.Nondimensionalize( + dim_measurement_cache.probe_B_field); + + for (const auto &data : dim_measurement_cache.surface_flux_i) + { + auto &flux = measurement_cache.surface_flux_i.emplace_back(data); + if (data.type == SurfaceFlux::ELECTRIC) + { + flux.Phi /= units.GetScaleFactor(); + flux.Phi /= units.GetScaleFactor(); + } + else if (data.type == SurfaceFlux::MAGNETIC) + { + flux.Phi /= units.GetScaleFactor(); + flux.Phi /= units.GetScaleFactor(); + } + else if (data.type == SurfaceFlux::POWER) + { + flux.Phi /= units.GetScaleFactor(); + } + } + + for (const auto &data : dim_measurement_cache.interface_eps_i) + { + auto &eps = measurement_cache.interface_eps_i.emplace_back(data); + eps.energy = units.Nondimensionalize(data.energy); + } + + measurement_cache.farfield.thetaphis = dim_measurement_cache.farfield.thetaphis; // NONE + measurement_cache.farfield.E_field = units.Nondimensionalize( + dim_measurement_cache.farfield.E_field); + + return measurement_cache; +} + +namespace +{ + +// TODO(C++20): Do constexpr with string. +std::string DimLabel(int i) +{ + switch (i) + { + // Note: Zero-based indexing here. + case 0: + return "x"; + case 1: + return "y"; + case 2: + return "z"; + default: + return fmt::format("d{}", i); + } +} + +// TODO(C++20): Do constexpr with string. +std::string LabelIndexCol(const ProblemType solver_t) +{ + switch (solver_t) + { + case ProblemType::DRIVEN: + return "f (GHz)"; + case ProblemType::EIGENMODE: + return "m"; + case ProblemType::ELECTROSTATIC: + case ProblemType::MAGNETOSTATIC: + return "i"; + case ProblemType::TRANSIENT: + return "t (ns)"; + default: + return "unknown"; + } +} +int PrecIndexCol(const ProblemType solver_t) +{ + switch (solver_t) + { + case ProblemType::DRIVEN: + case ProblemType::TRANSIENT: + return 8; + case ProblemType::EIGENMODE: + case ProblemType::ELECTROSTATIC: + case ProblemType::MAGNETOSTATIC: + return 2; + default: + return 8; + } +} + +// Index checking when adding to a new excitation block: When adding data to data_col with +// index idx, checks that idx matches what is already written in the corresponding row of +// idx_col. Adds a new idx row to idx_col if needed. +void CheckAppendIndex(Column &idx_col, double idx_value, size_t m_idx_row) +{ + if (m_idx_row == idx_col.n_rows()) + { + idx_col << idx_value; + } + else + { + auto current_idx = idx_col.data.at(m_idx_row); + MFEM_VERIFY(idx_value == current_idx, + fmt::format("Writing data table at incorrect index. Data has index {} " + "while table is at {}", + idx_value, current_idx)); + } +} + +} // namespace + +std::vector _impl::table_expected_filling(std::size_t m_idx_row, + std::size_t ex_idx_i, + std::size_t nr_rows, + std::size_t nr_col_blocks) +{ + // Expected column group filling pattern. Include leading index (freq, ...) + std::vector filling_pattern(nr_col_blocks + 1, 0); + filling_pattern.at(0) = (ex_idx_i == 0) ? 
m_idx_row : nr_rows; // index column + for (std::size_t i = 1; i < ex_idx_i + 1; i++) + { + filling_pattern.at(i) = nr_rows; + } + filling_pattern.at(ex_idx_i + 1) = m_idx_row; + return filling_pattern; +} + +template +void PostOperatorCSV::MoveTableValidateReload(TableWithCSVFile &t_csv_base, + Table &&t_ref) +{ + // For non-driven solvers or driven with default restart, no table was loaded. + if (!reload_table) + { + t_csv_base.table = std::move(t_ref); + return; + } + + // At this point we have a non-default restart. We need to verify that (a) the structure + // of the table is valid, (b) the cursor location matches the expected restart location. + + auto file = t_csv_base.get_csv_filepath(); + Table &t_base = t_csv_base.table; + MFEM_VERIFY(!t_base.empty(), + fmt::format("The data table loaded from path {} was empty, but the " + "simulation expected a restart with existing data!", + file)); + + auto err_msg = fmt::format("The results table loaded from path {} contains pre-existing " + "data, but it does not match the " + "expected table structure.", + file); + if (t_base.n_cols() != t_ref.n_cols()) + { + MFEM_ABORT(fmt::format("{} [Mismatched number of columns: expected {}, got {}.]", + err_msg, t_base.n_cols(), t_ref.n_cols())) + } + std::vector base_ex_idx_nrows; + long current_ex_idx_v = std::numeric_limits::max(); + for (std::size_t i = 0; i < t_base.n_cols(); i++) + { + auto &t_base_i = t_base[i]; + auto &t_ref_i = t_ref[i]; + + if (t_base_i.header_text != t_ref_i.header_text) + { + MFEM_ABORT(fmt::format("{} [Mismatched column header: expected {}, got {}.]", err_msg, + t_base_i.header_text, t_ref_i.header_text)) + } + // Since we cannot parse the column name (internal label) from the printed csv file or + // other options from csv file, we will over-write them from t_ref. + t_base_i.name = t_ref_i.name; + t_base_i.column_group_idx = t_ref_i.column_group_idx; + t_base_i.min_left_padding = t_ref_i.min_left_padding; + t_base_i.float_precision = t_ref_i.float_precision; + t_base_i.fmt_sign = t_ref_i.fmt_sign; + t_base_i.print_as_int = t_ref_i.print_as_int; + + // Check that columns in same group have the same row number. Assumes that column groups + // are contiguous. If no error, save row number to compare to expected pattern. + if (t_base_i.column_group_idx != current_ex_idx_v) + { + current_ex_idx_v = t_base_i.column_group_idx; + base_ex_idx_nrows.push_back(t_base_i.n_rows()); + } + else + { + if (t_base_i.n_rows() != base_ex_idx_nrows.back()) + { + MFEM_ABORT(fmt::format("{} [Mismatched rows in excitation {}.]", err_msg, + current_ex_idx_v)) + } + } + } + // Match expected column group pattern. + auto expected_ex_idx_nrows = _impl::table_expected_filling( + row_i, ex_idx_i, nr_expected_measurement_rows, ex_idx_v_all.size()); + + // Copy over other options from reference table, since we don't recover them on load. + t_csv_base.table.col_options = t_ref.col_options; + + MFEM_VERIFY(base_ex_idx_nrows == expected_ex_idx_nrows, + fmt::format("{} [Specified restart position is incompatible with reloaded " + "file. Row filling by excitation expected {}, got {}]", + err_msg, expected_ex_idx_nrows, base_ex_idx_nrows)) + + // Don't check index column (frequency) values or size. Size should match with sizing from + // cursor from printer below. Values will be checked as new frequencies are written. 
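+
+  // Illustrative example (editorial note, not part of the original change): with
+  // nr_col_blocks = 3 excitation column groups, nr_rows = 5 expected rows per group, and
+  // a restart resuming at row 2 of the second group (m_idx_row = 2, ex_idx_i = 1),
+  // _impl::table_expected_filling(2, 1, 5, 3) returns {5, 5, 2, 0}: the index column and
+  // the first group are full, the second group holds two rows, and the third is empty.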
+} + +template +void PostOperatorCSV::InitializeDomainE(const DomainPostOperator &dom_post_op) +{ + using fmt::format; + domain_E = TableWithCSVFile(fs::path(post_dir/"domain-E.csv").string(), reload_table); + + Table t; // Define table locally first due to potential reload. + auto nr_expected_measurement_cols = + 1 + ex_idx_v_all.size() * 4 * (1 + dom_post_op.M_i.size()); + t.reserve(nr_expected_measurement_rows, nr_expected_measurement_cols); + t.insert("idx", LabelIndexCol(solver_t), -1, 0, PrecIndexCol(solver_t), ""); + for (const auto ex_idx : ex_idx_v_all) + { + std::string ex_label = HasSingleExIdx() ? "" : format("[{}]", ex_idx); + + t.insert(format("Ee_{}", ex_idx), format("E_elec{} (J)", ex_label), ex_idx); + t.insert(format("Em_{}", ex_idx), format("E_mag{} (J)", ex_label), ex_idx); + t.insert(format("Ec_{}", ex_idx), format("E_cap{} (J)", ex_label), ex_idx); + t.insert(format("Ei_{}", ex_idx), format("E_ind{} (J)", ex_label), ex_idx); + + for (const auto &[idx, data] : dom_post_op.M_i) + { + t.insert(format("Ee_{}_{}", idx, ex_idx), format("E_elec[{}]{} (J)", idx, ex_label), + ex_idx); + t.insert(format("pe_{}_{}", idx, ex_idx), format("p_elec[{}]{}", idx, ex_label), + ex_idx); + t.insert(format("Em_{}_{}", idx, ex_idx), format("E_mag[{}]{} (J)", idx, ex_label), + ex_idx); + t.insert(format("pm_{}_{}", idx, ex_idx), format("p_mag[{}]{}", idx, ex_label), + ex_idx); + } + } + MoveTableValidateReload(*domain_E, std::move(t)); + // No longer WriteFullTableTrunc here. We want to potentially reload and check all + // measurement tables, which may have existing values, before overwriting anything. Just + // write on first measurement. +} + +template +void PostOperatorCSV::PrintDomainE() +{ + if (!domain_E) // trivial check: always written and we are always on root + { + return; + } + using fmt::format; + CheckAppendIndex(domain_E->table["idx"], row_idx_v, row_i); + domain_E->table[format("Ee_{}", m_ex_idx)] << measurement_cache.domain_E_field_energy_all; + domain_E->table[format("Em_{}", m_ex_idx)] << measurement_cache.domain_H_field_energy_all; + domain_E->table[format("Ec_{}", m_ex_idx)] + << measurement_cache.lumped_port_capacitor_energy; + domain_E->table[format("Ei_{}", m_ex_idx)] + << measurement_cache.lumped_port_inductor_energy; + for (const auto &data : measurement_cache.domain_E_field_energy_i) + { + domain_E->table[format("Ee_{}_{}", data.idx, m_ex_idx)] << data.energy; + domain_E->table[format("pe_{}_{}", data.idx, m_ex_idx)] << data.participation_ratio; + } + for (const auto &data : measurement_cache.domain_H_field_energy_i) + { + domain_E->table[format("Em_{}_{}", data.idx, m_ex_idx)] << data.energy; + domain_E->table[format("pm_{}_{}", data.idx, m_ex_idx)] << data.participation_ratio; + } + domain_E->WriteFullTableTrunc(); +} + +template +void PostOperatorCSV::InitializeSurfaceF(const SurfacePostOperator &surf_post_op) +{ + if (!(surf_post_op.flux_surfs.size() > 0)) + { + return; + } + using fmt::format; + surface_F = TableWithCSVFile(fs::path(post_dir / "surface-F.csv").string(), reload_table); + + Table t; // Define table locally first due to potential reload. + auto nr_expected_measurement_cols = 1 + ex_idx_v_all.size() * + (HasComplexGridFunction() ? 2 : 1) * + surf_post_op.flux_surfs.size(); + t.reserve(nr_expected_measurement_rows, nr_expected_measurement_cols); + t.insert("idx", LabelIndexCol(solver_t), -1, 0, PrecIndexCol(solver_t), ""); + for (const auto ex_idx : ex_idx_v_all) + { + std::string ex_label = HasSingleExIdx() ? 
"" : format("[{}]", ex_idx); + for (const auto &[idx, data] : surf_post_op.flux_surfs) + { + switch (data.type) + { + case SurfaceFlux::ELECTRIC: + if (HasComplexGridFunction()) + { + t.insert(format("F_{}_{}_re", idx, ex_idx), + format("Re{{Φ_elec[{}]{}}} (C)", idx, ex_label), ex_idx); + t.insert(format("F_{}_{}_im", idx, ex_idx), + format("Im{{Φ_elec[{}]{}}} (C)", idx, ex_label), ex_idx); + } + else + { + t.insert(format("F_{}_{}_re", idx, ex_idx), + format("Φ_elec[{}]{} (C)", idx, ex_label), ex_idx); + } + break; + case SurfaceFlux::MAGNETIC: + if (HasComplexGridFunction()) + { + t.insert(format("F_{}_{}_re", idx, ex_idx), + format("Re{{Φ_mag[{}]{}}} (Wb)", idx, ex_label), ex_idx); + t.insert(format("F_{}_{}_im", idx, ex_idx), + format("Im{{Φ_mag[{}]{}}} (Wb)", idx, ex_label), ex_idx); + } + else + { + t.insert(format("F_{}_{}_re", idx, ex_idx), + format("Φ_mag[{}]{} (Wb)", idx, ex_label), ex_idx); + } + break; + case SurfaceFlux::POWER: + t.insert(format("F_{}_{}_re", idx, ex_idx), + format("Φ_pow[{}]{} (W)", idx, ex_label), ex_idx); + break; + } + } + } + MoveTableValidateReload(*surface_F, std::move(t)); +} + +template +void PostOperatorCSV::PrintSurfaceF() +{ + if (!surface_F) + { + return; + } + using fmt::format; + CheckAppendIndex(surface_F->table["idx"], row_idx_v, row_i); + for (const auto &data : measurement_cache.surface_flux_i) + { + surface_F->table[format("F_{}_{}_re", data.idx, m_ex_idx)] << data.Phi.real(); + if (HasComplexGridFunction() && + (data.type == SurfaceFlux::ELECTRIC || data.type == SurfaceFlux::MAGNETIC)) + { + surface_F->table[format("F_{}_{}_im", data.idx, m_ex_idx)] << data.Phi.imag(); + } + } + surface_F->WriteFullTableTrunc(); +} + +template +void PostOperatorCSV::InitializeSurfaceQ(const SurfacePostOperator &surf_post_op) +{ + if (!(surf_post_op.eps_surfs.size() > 0)) + { + return; + } + using fmt::format; + surface_Q = TableWithCSVFile(fs::path(post_dir / "surface-Q.csv").string(), reload_table); + + Table t; // Define table locally first due to potential reload. + auto nr_expected_measurement_cols = + 1 + ex_idx_v_all.size() * (2 * surf_post_op.eps_surfs.size()); + t.reserve(nr_expected_measurement_rows, nr_expected_measurement_cols); + t.insert("idx", LabelIndexCol(solver_t), -1, 0, PrecIndexCol(solver_t), ""); + for (const auto ex_idx : ex_idx_v_all) + { + std::string ex_label = HasSingleExIdx() ? "" : format("[{}]", ex_idx); + for (const auto &[idx, data] : surf_post_op.eps_surfs) + { + t.insert(format("p_{}_{}", idx, ex_idx), format("p_surf[{}]{}", idx, ex_label), + ex_idx); + t.insert(format("Q_{}_{}", idx, ex_idx), format("Q_surf[{}]{}", idx, ex_label), + ex_idx); + } + } + MoveTableValidateReload(*surface_Q, std::move(t)); +} + +template +void PostOperatorCSV::PrintSurfaceQ() +{ + if (!surface_Q) + { + return; + } + using fmt::format; + CheckAppendIndex(surface_Q->table["idx"], row_idx_v, row_i); + + for (const auto &data : measurement_cache.interface_eps_i) + { + surface_Q->table[format("p_{}_{}", data.idx, m_ex_idx)] << data.energy_participation; + surface_Q->table[format("Q_{}_{}", data.idx, m_ex_idx)] << data.quality_factor; + } + surface_Q->WriteFullTableTrunc(); +} + +template +template +auto PostOperatorCSV::InitializeFarFieldE(const SurfacePostOperator &surf_post_op) + -> std::enable_if_t +{ + if (!(surf_post_op.farfield.size() > 0)) + { + return; + } + using fmt::format; + + farfield_E = TableWithCSVFile(fs::path(post_dir / "farfield-rE.csv").string()); + + Table t; // Define table locally first due to potential reload. 
+ + int v_dim = surf_post_op.GetVDim(); + int scale_col = 2 * v_dim; // Real + Imag components + int nr_expected_measurement_cols = 3 + scale_col; // freq, theta, phi + int nr_expected_measurement_rows = surf_post_op.farfield.size(); + t.reserve(nr_expected_measurement_rows, nr_expected_measurement_cols); + if constexpr (U == ProblemType::EIGENMODE) + { + t.insert("idx", "m", -1, 0, PrecIndexCol(solver_t), ""); + t.insert("f_re", "f_re (GHz)"); + t.insert("f_im", "f_im (GHz)"); + } + else + { + t.insert("idx", "f (GHz)", -1, 0, PrecIndexCol(solver_t), ""); + } + t.insert(Column("theta", "theta (deg.)", 0, PrecIndexCol(solver_t), {}, "")); + t.insert(Column("phi", "phi (deg.)", 0, PrecIndexCol(solver_t), {}, "")); + for (int i_dim = 0; i_dim < v_dim; i_dim++) + { + t.insert(format("rE{}_re", i_dim), format("r*Re{{E_{}}} (V)", DimLabel(i_dim))); + t.insert(format("rE{}_im", i_dim), format("r*Im{{E_{}}} (V)", DimLabel(i_dim))); + } + + MoveTableValidateReload(*farfield_E, std::move(t)); +} + +template +template +auto PostOperatorCSV::PrintFarFieldE(const SurfacePostOperator &surf_post_op) + -> std::enable_if_t +{ + if (!farfield_E) + { + return; + } + using fmt::format; + int v_dim = surf_post_op.GetVDim(); + for (size_t i = 0; i < measurement_cache.farfield.thetaphis.size(); i++) + { + farfield_E->table["idx"] << row_idx_v; + if constexpr (U == ProblemType::EIGENMODE) + { + farfield_E->table["f_re"] << measurement_cache.freq.real(); + farfield_E->table["f_im"] << measurement_cache.freq.imag(); + } + const auto &[theta, phi] = measurement_cache.farfield.thetaphis[i]; + const auto &E_field = measurement_cache.farfield.E_field[i]; + + // Print as degrees instead of radians. + farfield_E->table["theta"] << 180 / M_PI * theta; + farfield_E->table["phi"] << 180 / M_PI * phi; + for (int i_dim = 0; i_dim < v_dim; i_dim++) + { + farfield_E->table[format("rE{}_re", i_dim)] << E_field[i_dim].real(); + farfield_E->table[format("rE{}_im", i_dim)] << E_field[i_dim].imag(); + } + } + farfield_E->WriteFullTableTrunc(); +} + +template +void PostOperatorCSV::InitializeProbeE(const InterpolationOperator &interp_op) +{ + if (!(interp_op.GetProbes().size() > 0) || !HasEGridFunction()) + { + return; + } + using fmt::format; + probe_E = TableWithCSVFile(fs::path(post_dir / "probe-E.csv").string(), reload_table); + + Table t; // Define table locally first due to potential reload. + auto v_dim = interp_op.GetVDim(); + int scale_col = (HasComplexGridFunction() ? 2 : 1) * v_dim; + auto nr_expected_measurement_cols = + 1 + ex_idx_v_all.size() * scale_col * interp_op.GetProbes().size(); + t.reserve(nr_expected_measurement_rows, nr_expected_measurement_cols); + t.insert("idx", LabelIndexCol(solver_t), -1, 0, PrecIndexCol(solver_t), ""); + for (const auto ex_idx : ex_idx_v_all) + { + std::string ex_label = HasSingleExIdx() ? 
"" : format("[{}]", ex_idx); + for (const auto &idx : interp_op.GetProbes()) + { + for (int i_dim = 0; i_dim < v_dim; i_dim++) + { + if constexpr (HasComplexGridFunction()) + { + t.insert(format("E{}_{}_{}_re", i_dim, idx, ex_idx), + format("Re{{E_{}[{}]{}}} (V/m)", DimLabel(i_dim), idx, ex_label), + ex_idx); + t.insert(format("E{}_{}_{}_im", i_dim, idx, ex_idx), + format("Im{{E_{}[{}]{}}} (V/m)", DimLabel(i_dim), idx, ex_label), + ex_idx); + } + else + { + t.insert(format("E{}_{}_{}_re", i_dim, idx, ex_idx), + format("E_{}[{}]{} (V/m)", DimLabel(i_dim), idx, ex_label), ex_idx); + } + } + } + } + MoveTableValidateReload(*probe_E, std::move(t)); +} + +template +void PostOperatorCSV::PrintProbeE(const InterpolationOperator &interp_op) +{ + if (!probe_E) + { + return; + } + using fmt::format; + auto v_dim = interp_op.GetVDim(); + auto probe_field = measurement_cache.probe_E_field; + MFEM_VERIFY(probe_field.size() == v_dim * interp_op.GetProbes().size(), + format("Size mismatch: expect vector field to have size {} * {} = {}; got {}", + v_dim, interp_op.GetProbes().size(), + v_dim * interp_op.GetProbes().size(), probe_field.size())) + + CheckAppendIndex(probe_E->table["idx"], row_idx_v, row_i); + + size_t i = 0; + for (const auto &idx : interp_op.GetProbes()) + { + for (int i_dim = 0; i_dim < v_dim; i_dim++) + { + auto val = probe_field[i * v_dim + i_dim]; + probe_E->table[format("E{}_{}_{}_re", i_dim, idx, m_ex_idx)] << val.real(); + if (HasComplexGridFunction()) + { + probe_E->table[format("E{}_{}_{}_im", i_dim, idx, m_ex_idx)] << val.imag(); + } + } + i++; + } + probe_E->WriteFullTableTrunc(); +} + +template +void PostOperatorCSV::InitializeProbeB(const InterpolationOperator &interp_op) +{ + if (!(interp_op.GetProbes().size() > 0) || !HasBGridFunction()) + { + return; + } + using fmt::format; + probe_B = TableWithCSVFile(fs::path(post_dir / "probe-B.csv").string(), reload_table); + Table t; // Define table locally first due to potential reload. + auto v_dim = interp_op.GetVDim(); + int scale_col = (HasComplexGridFunction() ? 2 : 1) * v_dim; + auto nr_expected_measurement_cols = + 1 + ex_idx_v_all.size() * scale_col * interp_op.GetProbes().size(); + t.reserve(nr_expected_measurement_rows, nr_expected_measurement_cols); + t.insert("idx", LabelIndexCol(solver_t), -1, 0, PrecIndexCol(solver_t), ""); + for (const auto ex_idx : ex_idx_v_all) + { + std::string ex_label = HasSingleExIdx() ? 
"" : format("[{}]", ex_idx); + for (const auto &idx : interp_op.GetProbes()) + { + for (int i_dim = 0; i_dim < v_dim; i_dim++) + { + if (HasComplexGridFunction()) + { + t.insert(format("B{}_{}_{}_re", i_dim, idx, ex_idx), + format("Re{{B_{}[{}]{}}} (Wb/m²)", DimLabel(i_dim), idx, ex_label), + ex_idx); + t.insert(format("B{}_{}_{}_im", i_dim, idx, ex_idx), + format("Im{{B_{}[{}]{}}} (Wb/m²)", DimLabel(i_dim), idx, ex_label), + ex_idx); + } + else + { + t.insert(format("B{}_{}_{}_re", i_dim, idx, ex_idx), + format("B_{}[{}]{} (Wb/m²)", DimLabel(i_dim), idx, ex_label), ex_idx); + } + } + } + } + MoveTableValidateReload(*probe_B, std::move(t)); +} + +template +void PostOperatorCSV::PrintProbeB(const InterpolationOperator &interp_op) +{ + if (!probe_B) + { + return; + } + using fmt::format; + + auto v_dim = interp_op.GetVDim(); + auto probe_field = measurement_cache.probe_B_field; + MFEM_VERIFY(probe_field.size() == v_dim * interp_op.GetProbes().size(), + format("Size mismatch: expect vector field to have size {} * {} = {}; got {}", + v_dim, interp_op.GetProbes().size(), + v_dim * interp_op.GetProbes().size(), probe_field.size())) + + CheckAppendIndex(probe_B->table["idx"], row_idx_v, row_i); + + size_t i = 0; + for (const auto &idx : interp_op.GetProbes()) + { + for (int i_dim = 0; i_dim < v_dim; i_dim++) + { + auto val = probe_field[i * v_dim + i_dim]; + probe_B->table[format("B{}_{}_{}_re", i_dim, idx, m_ex_idx)] << val.real(); + if (HasComplexGridFunction()) + { + probe_B->table[format("B{}_{}_{}_im", i_dim, idx, m_ex_idx)] << val.imag(); + } + } + i++; + } + probe_B->WriteFullTableTrunc(); +} + +template +template +auto PostOperatorCSV::InitializeSurfaceI(const SurfaceCurrentOperator &surf_j_op) + -> std::enable_if_t +{ + if (!(surf_j_op.Size() > 0)) + { + return; + } + using fmt::format; + surface_I = TableWithCSVFile(fs::path(post_dir / "surface-I.csv").string(), reload_table); + + Table t; // Define table locally first due to potential reload. + auto nr_expected_measurement_cols = 1 + ex_idx_v_all.size() * surf_j_op.Size(); + t.reserve(nr_expected_measurement_rows, nr_expected_measurement_cols); + t.insert("idx", LabelIndexCol(solver_t), -1, 0, PrecIndexCol(solver_t), ""); + for (const auto ex_idx : ex_idx_v_all) + { + std::string ex_label = HasSingleExIdx() ? "" : format("[{}]", ex_idx); + for (const auto &[idx, data] : surf_j_op) + { + t.insert(format("I_{}_{}", idx, ex_idx), format("I_inc[{}]{} (A)", idx, ex_label), + ex_idx); + } + } + MoveTableValidateReload(*surface_I, std::move(t)); +} + +template +template +auto PostOperatorCSV::PrintSurfaceI(const SurfaceCurrentOperator &surf_j_op, + const Units &units) + -> std::enable_if_t +{ + if (!surface_I) + { + return; + } + using fmt::format; + CheckAppendIndex(surface_I->table["idx"], row_idx_v, row_i); + for (const auto &[idx, data] : surf_j_op) + { + auto I_inc_raw = data.GetExcitationCurrent() * measurement_cache.Jcoeff_excitation; + auto I_inc = units.Dimensionalize(I_inc_raw); + surface_I->table[format("I_{}_{}", idx, m_ex_idx)] << I_inc; + } + surface_I->WriteFullTableTrunc(); +} + +template +template +auto PostOperatorCSV::InitializePortVI(const SpaceOperator &fem_op) + -> std::enable_if_t +{ + if (!(fem_op.GetLumpedPortOp().Size() > 0)) + { + return; + } + using fmt::format; + // Currently only works for lumped ports. 
+ const auto &lumped_port_op = fem_op.GetLumpedPortOp(); + port_V = TableWithCSVFile(fs::path(post_dir / "port-V.csv").string(), reload_table); + port_I = TableWithCSVFile(fs::path(post_dir / "port-I.csv").string(), reload_table); + + Table tV; // Define table locally first due to potential reload. + Table tI; + + auto nr_expected_measurement_cols = 1 + ex_idx_v_all.size() * lumped_port_op.Size(); + tV.reserve(nr_expected_measurement_rows, nr_expected_measurement_cols); + tI.reserve(nr_expected_measurement_rows, nr_expected_measurement_cols); + + tV.insert("idx", LabelIndexCol(solver_t), -1, 0, PrecIndexCol(solver_t), ""); + tI.insert("idx", LabelIndexCol(solver_t), -1, 0, PrecIndexCol(solver_t), ""); + for (const auto ex_idx : ex_idx_v_all) + { + std::string ex_label = HasSingleExIdx() ? "" : format("[{}]", ex_idx); + + // Print incident signal, if solver supports excitation on ports. + if constexpr (solver_t == ProblemType::DRIVEN || solver_t == ProblemType::TRANSIENT) + { + auto ex_spec = fem_op.GetPortExcitations().excitations.at(ex_idx); + for (const auto &idx : ex_spec.lumped_port) + { + tV.insert(format("inc{}_{}", idx, ex_idx), format("V_inc[{}]{} (V)", idx, ex_label), + ex_idx); + tI.insert(format("inc{}_{}", idx, ex_idx), format("I_inc[{}]{} (A)", idx, ex_label), + ex_idx); + } + } + for (const auto &[idx, data] : lumped_port_op) + { + if constexpr (HasComplexGridFunction()) + { + tV.insert(format("re{}_{}", idx, ex_idx), + format("Re{{V[{}]{}}} (V)", idx, ex_label), ex_idx); + tV.insert(format("im{}_{}", idx, ex_idx), + format("Im{{V[{}]{}}} (V)", idx, ex_label), ex_idx); + tI.insert(format("re{}_{}", idx, ex_idx), + format("Re{{I[{}]{}}} (A)", idx, ex_label), ex_idx); + tI.insert(format("im{}_{}", idx, ex_idx), + format("Im{{I[{}]{}}} (A)", idx, ex_label), ex_idx); + } + else + { + tV.insert(format("re{}_{}", idx, ex_idx), format("V[{}]{} (V)", idx, ex_label), + ex_idx); + tI.insert(format("re{}_{}", idx, ex_idx), format("I[{}]{} (A)", idx, ex_label), + ex_idx); + } + } + } + MoveTableValidateReload(*port_V, std::move(tV)); + MoveTableValidateReload(*port_I, std::move(tI)); +} + +template +template +auto PostOperatorCSV::PrintPortVI(const LumpedPortOperator &lumped_port_op, + const Units &units) + -> std::enable_if_t +{ + if (!port_V) // no need to recheck port_I + { + return; + } + using fmt::format; + // Currently only works for lumped ports. + // Postprocess the frequency domain lumped port voltages and currents (complex magnitude + // = sqrt(2) * RMS). + + CheckAppendIndex(port_V->table["idx"], row_idx_v, row_i); + CheckAppendIndex(port_I->table["idx"], row_idx_v, row_i); + + if constexpr (solver_t == ProblemType::DRIVEN || solver_t == ProblemType::TRANSIENT) + { + for (const auto &[idx, data] : lumped_port_op) + { + if (data.excitation == m_ex_idx) + { + auto Jcoeff = measurement_cache.Jcoeff_excitation; + double V_inc = data.GetExcitationVoltage() * Jcoeff; + double I_inc = (std::abs(V_inc) > 0.0) + ? 
data.GetExcitationPower() * Jcoeff * Jcoeff / V_inc + : 0.0; + + port_V->table[format("inc{}_{}", idx, m_ex_idx)] + << units.Dimensionalize(V_inc); + port_I->table[format("inc{}_{}", idx, m_ex_idx)] + << units.Dimensionalize(I_inc); + } + } + } + + for (const auto &[idx, data] : measurement_cache.lumped_port_vi) + { + port_V->table[fmt::format("re{}_{}", idx, m_ex_idx)] << data.V.real(); + port_I->table[fmt::format("re{}_{}", idx, m_ex_idx)] << data.I.real(); + + if constexpr (HasComplexGridFunction()) + { + port_V->table[fmt::format("im{}_{}", idx, m_ex_idx)] << data.V.imag(); + port_I->table[fmt::format("im{}_{}", idx, m_ex_idx)] << data.I.imag(); + } + } + port_V->WriteFullTableTrunc(); + port_I->WriteFullTableTrunc(); +} + +template +template +auto PostOperatorCSV::InitializePortS(const SpaceOperator &fem_op) + -> std::enable_if_t +{ + if (!fem_op.GetPortExcitations().IsMultipleSimple() || + !((fem_op.GetLumpedPortOp().Size() > 0) xor (fem_op.GetWavePortOp().Size() > 0))) + { + return; + } + using fmt::format; + port_S = TableWithCSVFile(fs::path(post_dir / "port-S.csv").string(), reload_table); + + Table t; // Define table locally first due to potential reload. + + auto nr_ports = fem_op.GetLumpedPortOp().Size() + fem_op.GetWavePortOp().Size(); + + auto nr_expected_measurement_cols = 1 + ex_idx_v_all.size() * nr_ports; + t.reserve(nr_expected_measurement_rows, nr_expected_measurement_cols); + t.insert("idx", "f (GHz)", -1, 0, PrecIndexCol(solver_t), ""); + + for (const auto ex_idx : ex_idx_v_all) + { + // TODO(C++20): Combine identical loops with ranges + projection. + for (const auto &[o_idx, data] : fem_op.GetLumpedPortOp()) + { + t.insert(format("abs_{}_{}", o_idx, ex_idx), + format("|S[{}][{}]| (dB)", o_idx, ex_idx), ex_idx); + t.insert(format("arg_{}_{}", o_idx, ex_idx), + format("arg(S[{}][{}]) (deg.)", o_idx, ex_idx), ex_idx); + } + for (const auto &[o_idx, data] : fem_op.GetWavePortOp()) + { + t.insert(format("abs_{}_{}", o_idx, ex_idx), + format("|S[{}][{}]| (dB)", o_idx, ex_idx), ex_idx); + t.insert(format("arg_{}_{}", o_idx, ex_idx), + format("arg(S[{}][{}]) (deg.)", o_idx, ex_idx), ex_idx); + } + } + MoveTableValidateReload(*port_S, std::move(t)); +} + +template +template +auto PostOperatorCSV::PrintPortS() + -> std::enable_if_t +{ + if (!port_S) + { + return; + } + using fmt::format; + CheckAppendIndex(port_S->table["idx"], row_idx_v, row_i); + for (const auto &[idx, data] : measurement_cache.lumped_port_vi) + { + port_S->table[format("abs_{}_{}", idx, m_ex_idx)] << Measurement::Magnitude(data.S); + port_S->table[format("arg_{}_{}", idx, m_ex_idx)] << Measurement::Phase(data.S); + } + for (const auto &[idx, data] : measurement_cache.wave_port_vi) + { + port_S->table[format("abs_{}_{}", idx, m_ex_idx)] << Measurement::Magnitude(data.S); + port_S->table[format("arg_{}_{}", idx, m_ex_idx)] << Measurement::Phase(data.S); + } + port_S->WriteFullTableTrunc(); +} + +template +template +auto PostOperatorCSV::InitializeEig() + -> std::enable_if_t +{ + using fmt::format; + eig = TableWithCSVFile(fs::path(post_dir / "eig.csv").string()); + eig->table.reserve(nr_expected_measurement_rows, 6); + eig->table.insert("idx", "m", -1, 0, PrecIndexCol(solver_t), ""); + eig->table.insert("f_re", "Re{f} (GHz)"); + eig->table.insert("f_im", "Im{f} (GHz)"); + eig->table.insert("q", "Q"); + eig->table.insert("err_back", "Error (Bkwd.)"); + eig->table.insert("err_abs", "Error (Abs.)"); + eig->WriteFullTableTrunc(); +} + +template +template +auto PostOperatorCSV::PrintEig() + -> std::enable_if_t +{ 
+ if (!eig) // trivial check + { + return; + } + eig->table["idx"] << row_idx_v; + eig->table["f_re"] << measurement_cache.freq.real(); + eig->table["f_im"] << measurement_cache.freq.imag(); + eig->table["q"] << measurement_cache.eigenmode_Q; + eig->table["err_back"] << measurement_cache.error_bkwd; + eig->table["err_abs"] << measurement_cache.error_abs; + eig->WriteFullTableTrunc(); +} + +template +template +auto PostOperatorCSV::InitializeEigPortEPR( + const LumpedPortOperator &lumped_port_op) + -> std::enable_if_t +{ + // TODO(C++20): Make this a filtered iterator in LumpedPortOp. + for (const auto &[idx, data] : lumped_port_op) + { + if (std::abs(data.L) > 0.0) + { + ports_with_L.push_back(idx); + } + } + if (ports_with_L.empty()) + { + return; + } + using fmt::format; + port_EPR = TableWithCSVFile(fs::path(post_dir / "port-EPR.csv").string()); + port_EPR->table.reserve(nr_expected_measurement_rows, 1 + ports_with_L.size()); + port_EPR->table.insert("idx", "m", -1, 0, PrecIndexCol(solver_t), ""); + for (const auto idx : ports_with_L) + { + port_EPR->table.insert(format("p_{}", idx), format("p[{}]", idx)); + } + port_EPR->WriteFullTableTrunc(); +} + +template +template +auto PostOperatorCSV::PrintEigPortEPR() + -> std::enable_if_t +{ + if (!port_EPR) + { + return; + } + using fmt::format; + port_EPR->table["idx"] << row_idx_v; + for (const auto idx : ports_with_L) + { + auto vi = measurement_cache.lumped_port_vi.at(idx); + port_EPR->table[format("p_{}", idx)] << vi.inductive_energy_participation; + } + port_EPR->WriteFullTableTrunc(); +} + +template +template +auto PostOperatorCSV::InitializeEigPortQ(const LumpedPortOperator &lumped_port_op) + -> std::enable_if_t +{ + // TODO(C++20): Make this a filtered iterator in LumpedPortOp. + for (const auto &[idx, data] : lumped_port_op) + { + if (std::abs(data.R) > 0.0) + { + ports_with_R.push_back(idx); + } + } + if (ports_with_R.empty()) + { + return; + } + using fmt::format; + port_Q = TableWithCSVFile(fs::path(post_dir / "port-Q.csv").string()); + port_Q->table.reserve(nr_expected_measurement_rows, 1 + ports_with_R.size()); + port_Q->table.insert("idx", "m", -1, 0, PrecIndexCol(solver_t), ""); + for (const auto idx : ports_with_R) + { + port_Q->table.insert(format("Ql_{}", idx), format("Q_ext[{}]", idx)); + port_Q->table.insert(format("Kl_{}", idx), format("κ_ext[{}] (GHz)", idx)); + } + port_Q->WriteFullTableTrunc(); +} + +template +template +auto PostOperatorCSV::PrintEigPortQ() + -> std::enable_if_t +{ + if (!port_Q) + { + return; + } + using fmt::format; + port_Q->table["idx"] << row_idx_v; + for (const auto idx : ports_with_R) + { + auto vi = measurement_cache.lumped_port_vi.at(idx); + port_Q->table[format("Ql_{}", idx)] << vi.quality_factor; + port_Q->table[format("Kl_{}", idx)] << vi.mode_port_kappa; + } + port_Q->WriteFullTableTrunc(); +} + +template +void PostOperatorCSV::PrintErrorIndicator( + bool is_root, const ErrorIndicator::SummaryStatistics &indicator_stats) +{ + if (!is_root) + { + return; + } + + TableWithCSVFile error_indicator(fs::path(post_dir / "error-indicators.csv").string()); + error_indicator.table.reserve(1, 4); + + error_indicator.table.insert(Column("norm", "Norm") << indicator_stats.norm); + error_indicator.table.insert(Column("min", "Minimum") << indicator_stats.min); + error_indicator.table.insert(Column("max", "Maximum") << indicator_stats.max); + error_indicator.table.insert(Column("mean", "Mean") << indicator_stats.mean); + + error_indicator.WriteFullTableTrunc(); +} + +template +void 
PostOperatorCSV::InitializeCSVDataCollection( + const PostOperator &post_op) +{ + if (!Mpi::Root(post_op.fem_op->GetComm())) + { + return; + } + InitializeDomainE(post_op.dom_post_op); + InitializeSurfaceF(post_op.surf_post_op); + InitializeSurfaceQ(post_op.surf_post_op); + +#if defined(MFEM_USE_GSLIB) + InitializeProbeE(post_op.interp_op); + InitializeProbeB(post_op.interp_op); +#endif + if constexpr (solver_t == ProblemType::DRIVEN || solver_t == ProblemType::TRANSIENT) + { + InitializeSurfaceI(post_op.fem_op->GetSurfaceCurrentOp()); + } + if constexpr (solver_t == ProblemType::DRIVEN || solver_t == ProblemType::EIGENMODE || + solver_t == ProblemType::TRANSIENT) + { + InitializePortVI(*post_op.fem_op); + } + if constexpr (solver_t == ProblemType::DRIVEN) + { + InitializePortS(*post_op.fem_op); + } + if constexpr (solver_t == ProblemType::DRIVEN || solver_t == ProblemType::EIGENMODE) + { + InitializeFarFieldE(post_op.surf_post_op); + } + if constexpr (solver_t == ProblemType::EIGENMODE) + { + InitializeEig(); + InitializeEigPortEPR(post_op.fem_op->GetLumpedPortOp()); + InitializeEigPortQ(post_op.fem_op->GetLumpedPortOp()); + } +} + +template +void PostOperatorCSV::PrintAllCSVData( + const PostOperator &post_op, const Measurement &non_dim_measurement_cache, + double idx_value_dimensionful, int step) +{ + if (!Mpi::Root(post_op.fem_op->GetComm())) + { + return; + } + row_idx_v = idx_value_dimensionful; + row_i = step; + + // PostOperator acts on a nondimensional measurement cache, we write a dimensional cache. + measurement_cache = Measurement::Dimensionalize(post_op.units, non_dim_measurement_cache); + + PrintDomainE(); + PrintSurfaceF(); + PrintSurfaceQ(); + +#if defined(MFEM_USE_GSLIB) + PrintProbeE(post_op.interp_op); + PrintProbeB(post_op.interp_op); +#endif + if constexpr (solver_t == ProblemType::DRIVEN || solver_t == ProblemType::TRANSIENT) + { + PrintSurfaceI(post_op.fem_op->GetSurfaceCurrentOp(), post_op.units); + } + + if constexpr (solver_t == ProblemType::DRIVEN || solver_t == ProblemType::EIGENMODE || + solver_t == ProblemType::TRANSIENT) + { + PrintPortVI(post_op.fem_op->GetLumpedPortOp(), post_op.units); + } + if constexpr (solver_t == ProblemType::DRIVEN) + { + PrintPortS(); + } + if constexpr (solver_t == ProblemType::EIGENMODE || solver_t == ProblemType::DRIVEN) + { + PrintFarFieldE(post_op.surf_post_op); + } + if constexpr (solver_t == ProblemType::EIGENMODE) + { + PrintEig(); + PrintEigPortEPR(); + PrintEigPortQ(); + } +} + +template +PostOperatorCSV::PostOperatorCSV(const IoData &iodata, + const fem_op_t &fem_op) +{ + if (!Mpi::Root(fem_op.GetComm())) + { + return; + } + + post_dir = iodata.problem.output; + + // Initialize multi-excitation column group index. Only driven or transient support + // excitations; for other solvers this is default to a single idx=0. + if constexpr (solver_t == ProblemType::DRIVEN || solver_t == ProblemType::TRANSIENT) + { + auto excitation_helper = fem_op.GetPortExcitations(); + ex_idx_v_all.clear(); + ex_idx_v_all.reserve(excitation_helper.Size()); + std::transform(excitation_helper.begin(), excitation_helper.end(), + std::back_inserter(ex_idx_v_all), + [](const auto &pair) { return pair.first; }); + // Default to the first excitation. + ex_idx_i = 0; + m_ex_idx = ex_idx_v_all.front(); + } + + // Driven solver: can have non-trivial restart. 
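+  // Illustrative example (editorial note, assumed semantics of `restart`): the 1-based
+  // restart counter runs over the flattened (excitation, frequency) sample list. With 10
+  // frequency samples and two excitations, restart = 14 resumes at row_i = 13 % 10 = 3
+  // inside the second excitation column group (ex_idx_i = 13 / 10 = 1).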
+ if constexpr (solver_t == ProblemType::DRIVEN) + { + nr_expected_measurement_rows = iodata.solver.driven.sample_f.size(); + reload_table = (iodata.solver.driven.restart != 1); + + row_i = std::size_t(iodata.solver.driven.restart - 1) % nr_expected_measurement_rows; + ex_idx_i = std::size_t(iodata.solver.driven.restart - 1) / nr_expected_measurement_rows; + m_ex_idx = ex_idx_v_all.at(ex_idx_i); + } + + // Non-driven solver: get nr_expected_measurement_rows to reserve table space. + if (solver_t == ProblemType::EIGENMODE) + { + nr_expected_measurement_rows = iodata.solver.eigenmode.n; + } + else if (solver_t == ProblemType::ELECTROSTATIC) + { + nr_expected_measurement_rows = iodata.solver.electrostatic.n_post; + } + else if (solver_t == ProblemType::MAGNETOSTATIC) + { + nr_expected_measurement_rows = iodata.solver.magnetostatic.n_post; + } + else if (solver_t == ProblemType::TRANSIENT) + { + // Estimate number for fixed (linear) stepping. + nr_expected_measurement_rows = + std::size_t(iodata.solver.transient.max_t / iodata.solver.transient.delta_t) + 1; + } +} + +// Explicit template instantiation. +template class PostOperatorCSV; +template class PostOperatorCSV; +template class PostOperatorCSV; +template class PostOperatorCSV; +template class PostOperatorCSV; + +// Function explicit needed testing since everywhere it's through PostOperator. +// TODO(C++20): with requires, we won't need a second template. + +template auto PostOperatorCSV::InitializePortVI( + const SpaceOperator &fem_op) -> void; + +} // namespace palace diff --git a/palace/models/postoperatorcsv.hpp b/palace/models/postoperatorcsv.hpp new file mode 100644 index 0000000000..135ea8e938 --- /dev/null +++ b/palace/models/postoperatorcsv.hpp @@ -0,0 +1,334 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_MODELS_POST_OPERATOR_CSV_HPP +#define PALACE_MODELS_POST_OPERATOR_CSV_HPP + +#include +#include +#include "fem/errorindicator.hpp" +#include "models/curlcurloperator.hpp" +#include "models/laplaceoperator.hpp" +#include "models/spaceoperator.hpp" +#include "utils/configfile.hpp" +#include "utils/filesystem.hpp" +#include "utils/tablecsv.hpp" +#include "utils/units.hpp" + +namespace palace +{ +class IoData; +class DomainPostOperator; +class SurfacePostOperator; +class InterpolationOperator; +class SurfaceCurrentOperator; +class LumpedPortOperator; + +// Advance declaration. +template +class PostOperator; + +// Statically map solver (ProblemType) to finite element operator. + +template +struct fem_op_map_type +{ + using type = SpaceOperator; +}; +template <> +struct fem_op_map_type +{ + using type = LaplaceOperator; +}; +template <> +struct fem_op_map_type +{ + using type = CurlCurlOperator; +}; + +template +using fem_op_t = typename fem_op_map_type::type; + +// Results of measurements on fields. Not all measurements are sensible to define for all +// solvers. +struct Measurement +{ + // Mini storage structs for data measurements. 
+ struct DomainData + { + int idx; + double energy; + double participation_ratio; + }; + + struct FluxData + { + int idx; // Surface index + std::complex Phi; // Integrated flux + SurfaceFlux type; + }; + + struct InterfaceData + { + int idx; // Interface index + double energy; // Surface Electric Field Energy + double tandelta; // Dissipation tangent tan(δ) + double energy_participation; // ratio of interface energy / total_energy + double quality_factor; // 1 / (energy_participation * tan δ) + }; + + struct FarFieldData + { + // Theta: polar angle (0 to pi radians). + // Phi: azimuthal angle (0 to 2pi radians). + std::vector> thetaphis; + + // Components of the electric field. + std::vector, 3>> E_field; + }; + + // Data for both lumped and wave port. + struct PortPostData + { + std::complex P = 0.0; + std::complex V = 0.0; + std::complex I = 0.0; + // Separate R, L, and C branches for current via Z. + std::array, 3> I_RLC = {0.0, 0.0, 0.0}; + + // S-Parameter. + std::complex S = 0.0; + + // Energies (currently only for lumped port). + double inductor_energy = 0.0; // E_ind = ∑_j 1/2 L_j I_mj². + double capacitor_energy = 0.0; // E_cap = ∑_j 1/2 C_j V_mj². + + // Resistive lumped port (only eigenmode). + double mode_port_kappa = 0.0; + double quality_factor = mfem::infinity(); + + // Inductive lumped port (only eigenmode). + double inductive_energy_participation = 0.0; + }; + + // "Pseudo-measurements": input required during measurement or data which is stored here + // in order to pass it along to the printers. + + int ex_idx = 0; // driven + + std::complex freq = {0.0, 0.0}; // driven || eigenvalue. + + // Modulation factor for input excitation: + // - I_inc(t) = J(t) I_in, for transient + // - I_inc(omega) = I_in, for driven so that Jcoeff_excitation = 1.0 + double Jcoeff_excitation = 1.0; // transient || driven + + // Eigenmode data including error from solver. + double eigenmode_Q = 0.0; + double error_bkwd = 0.0; + double error_abs = 0.0; + + // "Actual measurements". + + double domain_E_field_energy_all = 0.0; + double domain_H_field_energy_all = 0.0; + + std::vector domain_E_field_energy_i; + std::vector domain_H_field_energy_i; + + double lumped_port_capacitor_energy = 0.0; + double lumped_port_inductor_energy = 0.0; + + std::map lumped_port_vi; + std::map wave_port_vi; + + // Probe data is ordered as [Fx1, Fy1, Fz1, Fx2, Fy2, Fz2, ...]. + // TODO: Replace with proper matrix: mdspan (C++23) / Eigen. + std::vector> probe_E_field; + std::vector> probe_B_field; + + std::vector surface_flux_i; + std::vector interface_eps_i; + FarFieldData farfield; + + // Dimensionalize and nondimensionalize a set of measurements + static Measurement Dimensionalize(const Units &units, + const Measurement &nondim_measurement_cache); + static Measurement Nondimensionalize(const Units &units, + const Measurement &dim_measurement_cache); + // Helpers for converting complex variable to magnitude in dB and phase. + static double Magnitude(std::complex x) { return 20.0 * std::log10(std::abs(x)); } + static double Phase(std::complex x) { return std::arg(x) * 180.0 / M_PI; } +}; + +namespace _impl +{ +// Filling pattern of rows of column groups — needed to validate reload position of +// previous data. Make it public in an _impl namespace for testing. 
+std::vector table_expected_filling(std::size_t m_idx_row, std::size_t ex_idx_i, + std::size_t nr_rows, + std::size_t nr_col_blocks); + +} // namespace _impl + +// Helper class to PostOperator to collect csv tables and printers for measurement that will +// be saved to file. This class contains a pointer to the corresponding PostOperator class +// and is a friend to a PostOperator class; this is equivalent to having these members +// and methods in PostOperator. It exists for code clarity. +template +class PostOperatorCSV +{ +protected: + // Copy savepath from PostOperator for simpler dependencies. + fs::path post_dir; + bool reload_table = false; // True only for driven simulation with non-default restart + + // Dimensionalized measurement cache. Converted from the PostOperator member variable. + Measurement measurement_cache; + + // Cursor location & cursor value. + + std::size_t row_i = 0; // Plain count of current row (measurement index) + std::size_t ex_idx_i = 0; // Plain count of current column group (excitation) + + double row_idx_v; // Value of row index (time, freq..); must be dimensionful + std::size_t m_ex_idx = 0; // ex_idx_v: Excitation index value (= ex_idx_v_all[ex_idx_i]) + + // Required in validation of re-loaded table (driven), otherwise just to reserve space. + // Transient (adaptive time-stepping) or eigenvalue (converged eigenvalues) solver output + // may differ from expectation. + std::size_t nr_expected_measurement_rows = 1; + + // Stored column groups (excitations). Default single "0" for solvers without excitations. + std::vector ex_idx_v_all = {std::size_t(0)}; + bool HasSingleExIdx() const { return ex_idx_v_all.size() == 1; } + + void MoveTableValidateReload(TableWithCSVFile &t_csv_base, Table &&t_ref); + + // Data tables. + // + // These are all std::optional since: (a) should only be instantiated on the root mpi + // process, (b) they should only be written if the data is non-empty. + + // Initialize and print methods for various output quantities: The initialize methods + // prepare the tables for data insertion, whilst the print methods insert data + // appropriately. Methods are only enabled when valid given the problem type. + + // Base (all solvers). + std::optional domain_E; + void InitializeDomainE(const DomainPostOperator &dom_post_op); + void PrintDomainE(); + + std::optional surface_F; + void InitializeSurfaceF(const SurfacePostOperator &surf_post_op); + void PrintSurfaceF(); + + std::optional surface_Q; + void InitializeSurfaceQ(const SurfacePostOperator &surf_post_op); + void PrintSurfaceQ(); + + std::optional probe_E; + void InitializeProbeE(const InterpolationOperator &interp_op); + void PrintProbeE(const InterpolationOperator &interp_op); + + std::optional probe_B; + void InitializeProbeB(const InterpolationOperator &interp_op); + void PrintProbeB(const InterpolationOperator &interp_op); + + // TODO(C++20): Upgrade SFINAE to C++20 concepts to simplify static selection since we can + // just use `void Function(...) requires (solver_t == Type::A);`. + + // Driven + Transient. + std::optional surface_I; + template + auto InitializeSurfaceI(const SurfaceCurrentOperator &surf_j_op) + -> std::enable_if_t; + template + auto PrintSurfaceI(const SurfaceCurrentOperator &surf_j_op, const Units &units) + -> std::enable_if_t; + + // Eigenmode + Driven + Transient. 
+ std::optional port_V; + std::optional port_I; + template + auto InitializePortVI(const SpaceOperator &fem_op) + -> std::enable_if_t; + template + auto PrintPortVI(const LumpedPortOperator &lumped_port_op, const Units &units) + -> std::enable_if_t; + + // Driven. + std::optional port_S; + template + auto InitializePortS(const SpaceOperator &fem_op) + -> std::enable_if_t; + template + auto PrintPortS() -> std::enable_if_t; + + // Driven + Eigenmode. + std::optional farfield_E; + template + auto InitializeFarFieldE(const SurfacePostOperator &surf_post_op) + -> std::enable_if_t; + template + auto PrintFarFieldE(const SurfacePostOperator &surf_post_op) + -> std::enable_if_t; + + // Eigenmode. + std::optional eig; + template + auto InitializeEig() -> std::enable_if_t; + template + auto PrintEig() -> std::enable_if_t; + + std::vector ports_with_L; + std::vector ports_with_R; + std::optional port_EPR; + template + auto InitializeEigPortEPR(const LumpedPortOperator &lumped_port_op) + -> std::enable_if_t; + template + auto PrintEigPortEPR() -> std::enable_if_t; + + std::optional port_Q; + template + auto InitializeEigPortQ(const LumpedPortOperator &lumped_port_op) + -> std::enable_if_t; + template + auto PrintEigPortQ() -> std::enable_if_t; + +public: + // Print all data from nondim_measurement_cache. + void PrintAllCSVData(const PostOperator &post_op, + const Measurement &nondim_measurement_cache, + double idx_value_dimensionful, int step); + + // Driven specific overload for specifying excitation index. + template + auto PrintAllCSVData(const PostOperator &post_op, + const Measurement &nondim_measurement_cache, + double idx_value_dimensionful, int step, int ex_idx) + -> std::enable_if_t + { + m_ex_idx = ex_idx; + PrintAllCSVData(post_op, nondim_measurement_cache, idx_value_dimensionful, step); + } + + // Special case of global indicator — init and print all at once. + void PrintErrorIndicator(bool is_root, + const ErrorIndicator::SummaryStatistics &indicator_stats); + + // "Delayed ctor" so that PostOperator can call it once it is fully constructed. + // Set-up all files to be called from post_op. + void InitializeCSVDataCollection(const PostOperator &post_op); + + explicit PostOperatorCSV(const IoData &iodata, const fem_op_t &fem_op); +}; + +} // namespace palace + +#endif // PALACE_MODELS_POST_OPERATOR_CSV_HPP diff --git a/palace/models/romoperator.cpp b/palace/models/romoperator.cpp index 78c9f5ea39..9e87dfb389 100644 --- a/palace/models/romoperator.cpp +++ b/palace/models/romoperator.cpp @@ -1,446 +1,538 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "romoperator.hpp" - -#include -#include -#include -#include "linalg/orthog.hpp" -#include "models/spaceoperator.hpp" -#include "utils/communication.hpp" -#include "utils/iodata.hpp" -#include "utils/timer.hpp" - -namespace palace -{ - -using namespace std::complex_literals; - -namespace -{ - -inline void ProjectMatInternal(MPI_Comm comm, const std::vector &V, - const ComplexOperator &A, Eigen::MatrixXcd &Ar, - ComplexVector &r, int n0) -{ - // Update Ar = Vᴴ A V for the new basis dimension n0 -> n. V is real and thus the result - // is complex symmetric if A is symmetric (which we assume is the case). Ar is replicated - // across all processes as a sequential n x n matrix. - const auto n = Ar.rows(); - MFEM_VERIFY(n0 < n, "Unexpected dimensions in PROM matrix projection!"); - for (int j = n0; j < n; j++) - { - // Fill block of Vᴴ A V = [ | Vᴴ A vj ] . 
We can optimize the matrix-vector product - // since the columns of V are real. - MFEM_VERIFY(A.HasReal() || A.HasImag(), - "Invalid zero ComplexOperator for PROM matrix projection!"); - if (A.HasReal()) - { - A.Real()->Mult(V[j], r.Real()); - } - if (A.HasImag()) - { - A.Imag()->Mult(V[j], r.Imag()); - } - for (int i = 0; i < n; i++) - { - Ar(i, j).real(A.HasReal() ? V[i] * r.Real() : 0.0); // Local inner product - Ar(i, j).imag(A.HasImag() ? V[i] * r.Imag() : 0.0); - } - } - Mpi::GlobalSum((n - n0) * n, Ar.data() + n0 * n, comm); - - // Fill lower block of Vᴴ A V = [ ____________ | ] - // [ vjᴴ A V[1:n0] | ] . - for (int j = 0; j < n0; j++) - { - for (int i = n0; i < n; i++) - { - Ar(i, j) = Ar(j, i); - } - } -} - -inline void ProjectVecInternal(MPI_Comm comm, const std::vector &V, - const ComplexVector &b, Eigen::VectorXcd &br, int n0) -{ - // Update br = Vᴴ b for the new basis dimension n0 -> n. br is replicated across all - // processes as a sequential n-dimensional vector. - const auto n = br.size(); - MFEM_VERIFY(n0 < n, "Unexpected dimensions in PROM vector projection!"); - for (int i = n0; i < n; i++) - { - br(i).real(V[i] * b.Real()); // Local inner product - br(i).imag(V[i] * b.Imag()); - } - Mpi::GlobalSum(n - n0, br.data() + n0, comm); -} - -} // namespace - -RomOperator::RomOperator(const IoData &iodata, SpaceOperator &spaceop) : spaceop(spaceop) -{ - // Construct the system matrices defining the linear operator. PEC boundaries are handled - // simply by setting diagonal entries of the system matrix for the corresponding dofs. - // Because the Dirichlet BC is always homogenous, no special elimination is required on - // the RHS. The damping matrix may be nullptr. - K = spaceop.GetStiffnessMatrix(Operator::DIAG_ONE); - C = spaceop.GetDampingMatrix(Operator::DIAG_ZERO); - M = spaceop.GetMassMatrix(Operator::DIAG_ZERO); - MFEM_VERIFY(K && M, "Invalid empty HDM matrices when constructing PROM!"); - - // Set up RHS vector (linear in frequency part) for the incident field at port boundaries, - // and the vector for the solution, which satisfies the Dirichlet (PEC) BC. - if (!spaceop.GetExcitationVector1(RHS1)) - { - RHS1.SetSize(0); - } - has_A2 = has_RHS2 = true; - - // Initialize temporary vector storage. - r.SetSize(K->Height()); - w.SetSize(K->Height()); - - // Set up the linear solver and set operators but don't set the operators yet (this will - // be done during an HDM solve at a given parameter point). The preconditioner for the - // complex linear system is constructed from a real approximation to the complex system - // matrix. - ksp = std::make_unique(iodata, spaceop.GetNDSpaces(), - &spaceop.GetH1Spaces()); - - // Initialize solver for inner product solves. The system matrix for the inner product is - // real and SPD. This uses the dual norm from https://ieeexplore.ieee.org/document/5313818 - // in the error estimate. - if (iodata.solver.driven.adaptive_metric_aposteriori) - { - constexpr int curlcurl_verbose = 0; - kspKM = std::make_unique( - spaceop.GetMaterialOp(), spaceop.GetNDSpaces(), spaceop.GetH1Spaces(), - spaceop.GetNDDbcTDofLists(), spaceop.GetH1DbcTDofLists(), iodata.solver.linear.tol, - iodata.solver.linear.max_it, curlcurl_verbose, iodata.solver.pa_order_threshold); - } - - // The initial PROM basis is empty. Orthogonalization uses MGS by default, else CGS2. - dim_V = 0; - orthog_mgs = - (iodata.solver.linear.gs_orthog_type == config::LinearSolverData::OrthogType::MGS); - - // Seed the random number generator for parameter space sampling. 
- engine.seed(std::chrono::system_clock::now().time_since_epoch().count()); -} - -void RomOperator::Initialize(double start, double delta, int num_steps, int max_dim) -{ - // Initialize P = {ω_L, ω_L+δ, ..., ω_R}. Always insert in ascending order. - MFEM_VERIFY(PS.empty() && P_m_PS.empty(), - "RomOperator::Initialize should only be called once!"); - MFEM_VERIFY( - num_steps > 2, - "RomOperator adaptive frequency sweep should have more than two frequency steps!"); - if (delta < 0.0) - { - start = start + (num_steps - 1) * delta; - delta = -delta; - } - auto it = P_m_PS.begin(); - for (int step = 0; step < num_steps; step++) - { - it = P_m_PS.emplace_hint(it, start + step * delta); - } - - // PROM operators Ar = Vᴴ A V when assembled is complex symmetric for real V. The provided - // max_dim is the number of sample points (2 basis vectors per point). - MFEM_VERIFY(max_dim > 0, "Reduced order basis storage must have > 0 columns!"); - V.resize(2 * max_dim, Vector()); -} - -void RomOperator::SolveHDM(double omega, ComplexVector &e) -{ - // Compute HDM solution at the given frequency. The system matrix, A = K + iω C - ω² M + - // A2(ω) is built by summing the underlying operator contributions. - BlockTimer bt0(Timer::CONSTRUCT); - A2 = spaceop.GetExtraSystemMatrix(omega, Operator::DIAG_ZERO); - has_A2 = (A2 != nullptr); - auto A = spaceop.GetSystemMatrix(std::complex(1.0, 0.0), 1i * omega, - std::complex(-omega * omega, 0.0), K.get(), - C.get(), M.get(), A2.get()); - auto P = - spaceop.GetPreconditionerMatrix(1.0, omega, -omega * omega, omega); - ksp->SetOperators(*A, *P); - - // The HDM excitation vector is computed as RHS = iω RHS1 + RHS2(ω). - Mpi::Print("\n"); - if (has_RHS2) - { - has_RHS2 = spaceop.GetExcitationVector2(omega, r); - } - else - { - r = 0.0; - } - if (RHS1.Size()) - { - r.Add(1i * omega, RHS1); - } - - // Solve the linear system. - BlockTimer bt1(Timer::SOLVE); - ksp->Mult(r, e); -} - -void RomOperator::AddHDMSample(double omega, ComplexVector &e) -{ - // Use the given HDM solution at the given frequency to update the reduced-order basis - // updating the PROM operators. - auto it = P_m_PS.lower_bound(omega); - MFEM_VERIFY(it != P_m_PS.end(), - "Sample frequency " << omega << " not found in parameter set!"); - P_m_PS.erase(it); - auto ret = PS.insert(omega); - MFEM_VERIFY(ret.second, "Sample frequency " - << omega << " already exists in the sampled parameter set!"); - - // Update V. The basis is always real (each complex solution adds two basis vectors if it - // has a nonzero real and imaginary parts). 
- const double normr = linalg::Norml2(spaceop.GetComm(), e.Real()); - const double normi = linalg::Norml2(spaceop.GetComm(), e.Imag()); - const bool has_real = (normr > 1.0e-12 * std::sqrt(normr * normr + normi * normi)); - const bool has_imag = (normi > 1.0e-12 * std::sqrt(normr * normr + normi * normi)); - MFEM_VERIFY(dim_V + has_real + has_imag <= static_cast(V.size()), - "Unable to increase basis storage size, increase maximum number of vectors!"); - const int dim_V0 = dim_V; - std::vector H(dim_V + 1); - if (has_real) - { - V[dim_V] = e.Real(); - if (orthog_mgs) - { - linalg::OrthogonalizeColumnMGS(spaceop.GetComm(), V, V[dim_V], H.data(), dim_V); - } - else - { - linalg::OrthogonalizeColumnCGS(spaceop.GetComm(), V, V[dim_V], H.data(), dim_V, true); - } - V[dim_V] *= 1.0 / linalg::Norml2(spaceop.GetComm(), V[dim_V]); - dim_V++; - } - if (has_imag) - { - V[dim_V] = e.Imag(); - if (orthog_mgs) - { - linalg::OrthogonalizeColumnMGS(spaceop.GetComm(), V, V[dim_V], H.data(), dim_V); - } - else - { - linalg::OrthogonalizeColumnCGS(spaceop.GetComm(), V, V[dim_V], H.data(), dim_V, true); - } - V[dim_V] *= 1.0 / linalg::Norml2(spaceop.GetComm(), V[dim_V]); - dim_V++; - } - - // Update reduced-order operators. Resize preserves the upper dim0 x dim0 block of each - // matrix and first dim0 entries of each vector and the projection uses the values - // computed for the unchanged basis vectors. - Kr.conservativeResize(dim_V, dim_V); - ProjectMatInternal(spaceop.GetComm(), V, *K, Kr, r, dim_V0); - if (C) - { - Cr.conservativeResize(dim_V, dim_V); - ProjectMatInternal(spaceop.GetComm(), V, *C, Cr, r, dim_V0); - } - Mr.conservativeResize(dim_V, dim_V); - ProjectMatInternal(spaceop.GetComm(), V, *M, Mr, r, dim_V0); - Ar.resize(dim_V, dim_V); - if (RHS1.Size()) - { - RHS1r.conservativeResize(dim_V); - ProjectVecInternal(spaceop.GetComm(), V, RHS1, RHS1r, dim_V0); - } - RHSr.resize(dim_V); -} - -void RomOperator::AssemblePROM(double omega) -{ - // Assemble the PROM linear system at the given frequency. The PROM system is defined by - // the matrix Aᵣ(ω) = Kᵣ + iω Cᵣ - ω² Mᵣ + Vᴴ A2 V(ω) and source vector RHSᵣ(ω) = - // iω RHS1ᵣ + Vᴴ RHS2(ω). A2(ω) and RHS2(ω) are constructed only if required and are - // only nonzero on boundaries, will be empty if not needed. - if (has_A2) - { - A2 = spaceop.GetExtraSystemMatrix(omega, Operator::DIAG_ZERO); - ProjectMatInternal(spaceop.GetComm(), V, *A2, Ar, r, 0); - } - else - { - Ar.setZero(); - } - Ar += Kr; - if (C) - { - Ar += (1i * omega) * Cr; - } - Ar += (-omega * omega) * Mr; - - if (has_RHS2) - { - spaceop.GetExcitationVector2(omega, RHS2); - ProjectVecInternal(spaceop.GetComm(), V, RHS2, RHSr, 0); - } - else - { - RHSr.setZero(); - } - if (RHS1.Size()) - { - RHSr += (1i * omega) * RHS1r; - } -} - -void RomOperator::SolvePROM(ComplexVector &e) -{ - // Compute PROM solution at the given frequency and expand into high-dimensional space. - // The PROM is solved on every process so the matrix-vector product for vector expansion - // does not require communication. - RHSr = Ar.partialPivLu().solve(RHSr); - // RHSr = Ar.ldlt().solve(RHSr); - // RHSr = Ar.selfadjointView().ldlt().solve(RHSr); - - e = 0.0; - for (int j = 0; j < dim_V; j++) - { - e.Real().Add(RHSr(j).real(), V[j]); - e.Imag().Add(RHSr(j).imag(), V[j]); - } -} - -double RomOperator::ComputeError(double omega) -{ - // Compute the error metric associated with the approximate PROM solution at the given - // frequency. 
The HDM residual -r = [K + iω C - ω² M + A2(ω)] x - [iω RHS1 + RHS2(ω)] is - // computed using the most recently computed A2(ω) and RHS2(ω). - AssemblePROM(omega); - SolvePROM(w); - - // Residual error. - r = 0.0; - if (RHS1.Size()) - { - r.Add(-1i * omega, RHS1); - } - if (has_RHS2) - { - r.Add(-1.0, RHS2); - } - double den = !kspKM ? linalg::Norml2(spaceop.GetComm(), r) : 0.0; - - K->AddMult(w, r, 1.0); - if (C) - { - C->AddMult(w, r, 1i * omega); - } - M->AddMult(w, r, -omega * omega); - if (has_A2) - { - A2->AddMult(w, r, 1.0); - } - - double num; - if (!kspKM) - { - num = linalg::Norml2(spaceop.GetComm(), r); - } - else - { - z.SetSize(r.Size()); - kspKM->Mult(r, z); - auto dot = linalg::Dot(spaceop.GetComm(), z, r); - MFEM_ASSERT(dot.real() > 0.0 && std::abs(dot.imag()) < 1.0e-9 * dot.real(), - "Non-positive vector norm in normalization (dot = " << dot << ")!"); - num = std::sqrt(dot.real()); - den = linalg::Norml2(spaceop.GetComm(), w, kspKM->GetOperator(), z); - } - MFEM_VERIFY(den > 0.0, "Unexpected zero denominator in HDM residual!"); - return num / den; -} - -double RomOperator::ComputeMaxError(int num_cand, double &omega_star) -{ - // Greedy iteration: Find argmax_{ω ∈ P_C} η(e; ω). We sample num_cand candidates from - // P \ P_S. - num_cand = std::min(num_cand, static_cast(P_m_PS.size())); - std::vector PC; - if (Mpi::Root(spaceop.GetComm())) - { - if constexpr (false) - { - // Sample with weighted probability by distance from the set of already sampled - // points. - std::vector weights(P_m_PS.size()); - PC.reserve(num_cand); - for (auto sample : PS) - { - int i = std::distance(P_m_PS.begin(), P_m_PS.lower_bound(sample)); - int il = i - 1; - while (il >= 0) - { - weights[il] = std::min(weights[il], static_cast(i - il)); - il--; - } - int iu = i; - while (iu < weights.size()) - { - weights[iu] = std::min(weights[iu], static_cast(1 + iu - i)); - iu++; - } - } - for (int i = 0; i < num_cand; i++) - { - std::discrete_distribution dist(weights.begin(), weights.end()); - auto res = dist(engine); - auto it = P_m_PS.begin(); - std::advance(it, res); - PC.push_back(*it); - weights[res] = 0.0; // No replacement - } - } - else - { - // Sample with uniform probability. - PC.reserve(num_cand); - std::sample(P_m_PS.begin(), P_m_PS.end(), std::back_inserter(PC), num_cand, engine); - } - } - else - { - PC.resize(num_cand); - } - Mpi::Broadcast(num_cand, PC.data(), 0, spaceop.GetComm()); - - // Debug - // Mpi::Print("Candidate sampling:\n"); - // Mpi::Print(" P_S: {}", PS); - // Mpi::Print(" P\\P_S: {}\n", P_m_PS); - // Mpi::Print(" P_C: {}\n", PC); - // Mpi::Print("\n"); - - // For each candidate, compute the PROM solution and associated error metric. - double err_max = 0.0; - for (auto omega : PC) - { - double err = ComputeError(omega); - - // Debug - // Mpi::Print("ω = {:.3e}, error = {:.3e}\n", omega, err); - - if (err > err_max) - { - err_max = err; - omega_star = omega; - } - } - return err_max; -} - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "romoperator.hpp" + +#include +#include +#include "linalg/orthog.hpp" +#include "models/spaceoperator.hpp" +#include "utils/communication.hpp" +#include "utils/iodata.hpp" +#include "utils/timer.hpp" + +// Eigen does not provide a complex-valued generalized eigenvalue solver, so we use LAPACK +// for this. 
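+// Editorial note (illustrative, not part of the original change): LAPACK's zggev solves
+// the generalized eigenvalue problem A x = lambda B x for a complex pencil (A, B),
+// returning each eigenvalue as the ratio alpha_i / beta_i. The ZGGEV wrapper below
+// requests right eigenvectors only (jobvl = 'N', jobvr = 'V'), maps beta_i = 0 to
+// infinity (or NaN when alpha_i is also zero), and rescales each returned eigenvector to
+// unit 2-norm.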
+extern "C" +{ + void zggev_(char *, char *, int *, std::complex *, int *, std::complex *, + int *, std::complex *, std::complex *, std::complex *, + int *, std::complex *, int *, std::complex *, int *, double *, + int *); +} + +namespace palace +{ + +using namespace std::complex_literals; + +namespace +{ + +constexpr auto ORTHOG_TOL = 1.0e-12; + +template +inline void OrthogonalizeColumn(Orthogonalization type, MPI_Comm comm, + const std::vector &V, VecType &w, ScalarType *Rj, + int j) +{ + // Orthogonalize w against the leading j columns of V. + switch (type) + { + case Orthogonalization::MGS: + linalg::OrthogonalizeColumnMGS(comm, V, w, Rj, j); + break; + case Orthogonalization::CGS: + linalg::OrthogonalizeColumnCGS(comm, V, w, Rj, j); + break; + case Orthogonalization::CGS2: + linalg::OrthogonalizeColumnCGS(comm, V, w, Rj, j, true); + break; + } +} + +inline void ProjectMatInternal(MPI_Comm comm, const std::vector &V, + const ComplexOperator &A, Eigen::MatrixXcd &Ar, + ComplexVector &r, int n0) +{ + // Update Ar = Vᴴ A V for the new basis dimension n0 -> n. V is real and thus the result + // is complex symmetric if A is symmetric (which we assume is the case). Ar is replicated + // across all processes as a sequential n x n matrix. + const auto n = Ar.rows(); + MFEM_VERIFY(n0 < n, "Invalid dimensions in PROM matrix projection!"); + for (int j = n0; j < n; j++) + { + // Fill block of Vᴴ A V = [ | Vᴴ A vj ] . We can optimize the matrix-vector product + // since the columns of V are real. + MFEM_VERIFY(A.Real() || A.Imag(), + "Invalid zero ComplexOperator for PROM matrix projection!"); + if (A.Real()) + { + A.Real()->Mult(V[j], r.Real()); + } + if (A.Imag()) + { + A.Imag()->Mult(V[j], r.Imag()); + } + for (int i = 0; i < n; i++) + { + Ar(i, j).real(A.Real() ? V[i] * r.Real() : 0.0); // Local inner product + Ar(i, j).imag(A.Imag() ? V[i] * r.Imag() : 0.0); + } + } + Mpi::GlobalSum((n - n0) * n, Ar.data() + n0 * n, comm); + + // Fill lower block of Vᴴ A V = [ ____________ | ] + // [ vjᴴ A V[1:n0] | ] . + for (int j = 0; j < n0; j++) + { + for (int i = n0; i < n; i++) + { + Ar(i, j) = Ar(j, i); + } + } +} + +inline void ProjectVecInternal(MPI_Comm comm, const std::vector &V, + const ComplexVector &b, Eigen::VectorXcd &br, int n0) +{ + // Update br = Vᴴ b for the new basis dimension n0 -> n. br is replicated across all + // processes as a sequential n-dimensional vector. + const auto n = br.size(); + MFEM_VERIFY(n0 < n, "Invalid dimensions in PROM vector projection!"); + for (int i = n0; i < n; i++) + { + br(i).real(V[i] * b.Real()); // Local inner product + br(i).imag(V[i] * b.Imag()); + } + Mpi::GlobalSum(n - n0, br.data() + n0, comm); +} + +inline void ComputeMRI(const Eigen::MatrixXcd &R, Eigen::VectorXcd &q) +{ + // Compute the coefficients of the minimal rational interpolation (MRI): + // u = [sum_i u_i q_i / (z - z_i)] / [sum_i q_i / (z - z_i)]. The coefficients are given + // by the right singular vector of R corresponding to the minimum singular value. 
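The choice of the smallest right singular vector follows from a short argument (a sketch of the standard reasoning, in the notation of the comment above): the weights minimize the snapshot residual over unit-norm candidates, and the QR factorization reduces that minimization to the small triangular factor,

\[
q^{\star} \;=\; \arg\min_{\|q\|_2 = 1} \|U q\|_2 \;=\; \arg\min_{\|q\|_2 = 1} \|R q\|_2
\quad \text{since } U = Q R \text{ with } Q^{H} Q = I ,
\]

so if R = W Σ Vᴴ is the SVD of R, the minimizer is the column of V belonging to the smallest singular value, which is what the function below extracts.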
+ const auto S = R.rows(); + MFEM_ASSERT(S > 0 && R.cols() == S, "Invalid dimension mismatch when computing MRI!"); + // For Eigen = v3.4.0 (latest tagged release as of 10/2023) + Eigen::JacobiSVD svd; + svd.compute(R, Eigen::ComputeFullV); + // For Eigen > v3.4.0 (GitLab repo is at v3.4.90 as of 10/2023) + // Eigen::JacobiSVD svd; + // svd.compute(R); + const auto &sigma = svd.singularValues(); + auto m = S - 1; + while (m > 0 && sigma[m] < ORTHOG_TOL * sigma[0]) + { + Mpi::Warning("Minimal rational interpolation encountered rank-deficient matrix: " + "σ[{:d}] = {:.3e} (σ[0] = {:.3e})!\n", + m, sigma[m], sigma[0]); + m--; + } + q = svd.matrixV().col(m); +} + +template +inline void ZGGEV(MatType &A, MatType &B, VecType &D, MatType &VR) +{ + // Wrapper for LAPACK's (z)ggev. A and B are overwritten by their Schur decompositions. + MFEM_VERIFY(A.rows() == A.cols() && B.rows() == B.cols() && A.rows() == B.rows(), + "Generalized eigenvalue problem expects A, B matrices to be square and have " + "same dimensions!"); + char jobvl = 'N', jobvr = 'V'; + int n = static_cast(A.rows()), lwork = 2 * n; + std::vector> alpha(n), beta(n), work(lwork); + std::vector rwork(8 * n); + MatType VL(0, 0); + VR.resize(n, n); + int info = 0; + + zggev_(&jobvl, &jobvr, &n, A.data(), &n, B.data(), &n, alpha.data(), beta.data(), + VL.data(), &n, VR.data(), &n, work.data(), &lwork, rwork.data(), &info); + MFEM_VERIFY(info == 0, "ZGGEV failed with info = " << info << "!"); + + // Postprocess the eigenvalues and eigenvectors (return unit 2-norm eigenvectors). + D.resize(n); + for (int i = 0; i < n; i++) + { + D(i) = (beta[i] == 0.0) + ? ((alpha[i] == 0.0) ? std::numeric_limits>::quiet_NaN() + : mfem::infinity()) + : alpha[i] / beta[i]; + VR.col(i) /= VR.col(i).norm(); + } +} + +template +inline void ProlongatePROMSolution(std::size_t n, const std::vector &V, + const VecType &y, ComplexVector &u) +{ + u = 0.0; + for (std::size_t j = 0; j < n; j += 2) + { + if (j + 1 < n) + { + linalg::AXPBYPCZ(y(j).real(), V[j], y(j + 1).real(), V[j + 1], 1.0, u.Real()); + linalg::AXPBYPCZ(y(j).imag(), V[j], y(j + 1).imag(), V[j + 1], 1.0, u.Imag()); + } + else + { + linalg::AXPY(y(j).real(), V[j], u.Real()); + linalg::AXPY(y(j).imag(), V[j], u.Imag()); + } + } +} + +} // namespace + +MinimalRationalInterpolation::MinimalRationalInterpolation(int max_size) +{ + Q.resize(max_size, ComplexVector()); +} + +void MinimalRationalInterpolation::AddSolutionSample(double omega, const ComplexVector &u, + const SpaceOperator &space_op, + Orthogonalization orthog_type) +{ + MPI_Comm comm = space_op.GetComm(); + + // Compute the coefficients for the minimal rational interpolation of the state u used + // as an error indicator. The complex-valued snapshot matrix U = [{u_i, (iω) u_i}] is + // stored by its QR decomposition. 
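As a standalone illustration of the column-wise QR update performed here (an Eigen-only sketch with local names, independent of the Palace vector types and of the configurable orthogonalization used below): appending a snapshot column amounts to one Gram–Schmidt pass, with the projection coefficients filling the new column of R and the residual norm landing on its diagonal.

    #include <Eigen/Dense>

    // Append column w to an orthonormal basis Q (n x k, k may be 0), growing R from
    // k x k to (k + 1) x (k + 1). Classical Gram-Schmidt for brevity; a production code
    // would typically use MGS or reorthogonalized CGS for robustness.
    void AppendColumnQR(Eigen::MatrixXcd &Q, Eigen::MatrixXcd &R, Eigen::VectorXcd w)
    {
      const Eigen::Index k = Q.cols();
      R.conservativeResize(k + 1, k + 1);
      R.row(k).setZero();
      R.col(k).head(k) = Q.adjoint() * w;  // Projection coefficients onto existing basis
      w -= Q * R.col(k).head(k);           // Remove those components from the new column
      R(k, k) = w.norm();                  // Residual norm goes on the diagonal
      Q.conservativeResize(Eigen::NoChange, k + 1);
      Q.col(k) = w / R(k, k);              // Normalized new basis vector
    }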
+ MFEM_VERIFY(dim_Q + 1 <= Q.size(), + "Unable to increase basis storage size, increase maximum number of vectors!"); + R.conservativeResizeLike(Eigen::MatrixXd::Zero(dim_Q + 1, dim_Q + 1)); + { + std::vector blocks = {&u, &u}; + std::vector> s = {1.0, 1i * omega}; + Q[dim_Q].SetSize(2 * u.Size()); + Q[dim_Q].UseDevice(true); + Q[dim_Q].SetBlocks(blocks, s); + } + OrthogonalizeColumn(orthog_type, comm, Q, Q[dim_Q], R.col(dim_Q).data(), dim_Q); + R(dim_Q, dim_Q) = linalg::Norml2(comm, Q[dim_Q]); + Q[dim_Q] *= 1.0 / R(dim_Q, dim_Q); + dim_Q++; + ComputeMRI(R, q); + if constexpr (false) + { + Mpi::Print("MRI (S = {}):\nR = {}\nq = {}", dim_Q, R, q); + } + z.push_back(omega); +} + +std::vector MinimalRationalInterpolation::FindMaxError(int N) const +{ + // Return an estimate for argmax_z ||u(z) - V y(z)|| as argmin_z |Q(z)| with Q(z) = + // sum_i q_z / (z - z_i) (denominator of the barycentric interpolation of u). The roots of + // Q are given analytically as the solution to an S + 1 dimensional eigenvalue problem. + BlockTimer bt(Timer::CONSTRUCT_PROM); + const auto S = dim_Q; + MFEM_VERIFY(S >= 2, "Maximum error can only be found once two sample points have been " + "added to the PROM to define the parameter domain!"); + double start = *std::min_element(z.begin(), z.end()); + double end = *std::max_element(z.begin(), z.end()); + Eigen::Map z_map(z.data(), S); + std::vector> z_star(N, 0.0); + + // XX TODO: For now, we explicitly minimize Q on the real line since we don't allow + // samples at complex-valued points (yet). + + // Eigen::MatrixXcd A = Eigen::MatrixXcd::Zero(S + 1, S + 1); + // A.diagonal().head(S) = z_map.array(); + // A.row(S).head(S) = q; + // A.col(S).head(S) = Eigen::VectorXcd::Ones(S); + + // Eigen::MatrixXcd B = Eigen::MatrixXcd::Identity(S + 1, S + 1); + // B(S, S) = 0.0; + + // Eigen::VectorXcd D; + // Eigen::MatrixXcd X; + // ZGGEV(A, B, D, X); + + // // If there are multiple roots in [start, end], pick the ones furthest from the + // // existing set of samples. + // { + // std::vector dist_star(N, 0.0); + // for (auto d : D) + // { + // if (std::real(d) < start || std::real(d) > end) + // { + // continue; + // } + // const double dist = (z_map.array() - std::real(d)).abs().maxCoeff(); + // for (int i = 0; i < N; i++) + // { + // if (dist > dist_star[i]) + // { + // for (int j = i + 1; j < N; j++) + // { + // z_star[j] = z_star[j - 1]; + // dist_star[j] = dist_star[j - 1]; + // } + // z_star[i] = start; + // dist_star[i] = dist; + // } + // } + // } + // } + + // Fall back to sampling Q on discrete points if no roots exist in [start, end]. 
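For reference, the commented-out eigenvalue formulation above locates these roots because of the following short calculation (the grid-search fallback follows immediately below):

\[
A = \begin{bmatrix} \operatorname{diag}(z) & \mathbf{1} \\ q^{T} & 0 \end{bmatrix},
\qquad
B = \begin{bmatrix} I & 0 \\ 0 & 0 \end{bmatrix},
\qquad
A \begin{bmatrix} x \\ y \end{bmatrix} = \lambda\, B \begin{bmatrix} x \\ y \end{bmatrix} .
\]

The first block row gives x_i = y / (λ − z_i) and the last row enforces qᵀ x = 0, i.e. Σ_i q_i / (λ − z_i) = Q(λ) = 0, so every finite generalized eigenvalue of the pencil is a root of the barycentric denominator Q.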
+ if (std::abs(z_star[0]) == 0.0) + { + const auto delta = (end - start) / 1.0e6; + std::vector Q_star(N, mfem::infinity()); + while (start <= end) + { + const double Q = std::abs((q.array() / (z_map.array() - start)).sum()); + for (int i = 0; i < N; i++) + { + if (Q < Q_star[i]) + { + for (int j = i + 1; j < N; j++) + { + z_star[j] = z_star[j - 1]; + Q_star[j] = Q_star[j - 1]; + } + z_star[i] = start; + Q_star[i] = Q; + } + } + start += delta; + } + MFEM_VERIFY( + N == 0 || std::abs(z_star[0]) > 0.0, + fmt::format("Could not locate a maximum error in the range [{}, {}]!", start, end)); + } + std::vector vals(z_star.size()); + std::transform(z_star.begin(), z_star.end(), vals.begin(), + [](std::complex z) { return std::real(z); }); + return vals; +} + +RomOperator::RomOperator(const IoData &iodata, SpaceOperator &space_op, + int max_size_per_excitation) + : space_op(space_op), orthog_type(iodata.solver.linear.gs_orthog) +{ + // Construct the system matrices defining the linear operator. PEC boundaries are handled + // simply by setting diagonal entries of the system matrix for the corresponding dofs. + // Because the Dirichlet BC is always homogeneous, no special elimination is required on + // the RHS. The damping matrix may be nullptr. + K = space_op.GetStiffnessMatrix(Operator::DIAG_ONE); + C = space_op.GetDampingMatrix(Operator::DIAG_ZERO); + M = space_op.GetMassMatrix(Operator::DIAG_ZERO); + MFEM_VERIFY(K && M, "Invalid empty HDM matrices when constructing PROM!"); + + // Initialize working vector storage. + r.SetSize(K->Height()); + r.UseDevice(true); + + // Set up the linear solver and set operators but don't set the operators yet (this will + // be done during an HDM solve at a given parameter point). The preconditioner for the + // complex linear system is constructed from a real approximation to the complex system + // matrix. + ksp = std::make_unique(iodata, space_op.GetNDSpaces(), + &space_op.GetH1Spaces()); + + auto excitation_helper = space_op.GetPortExcitations(); + + // The initial PROM basis is empty. The provided maximum dimension is the number of sample + // points (2 basis vectors per point). Basis orthogonalization method is configured using + // GMRES/FGMRES settings. + MFEM_VERIFY(max_size_per_excitation * excitation_helper.Size() > 0, + "Reduced order basis storage must have > 0 columns!"); + V.resize(2 * max_size_per_excitation * excitation_helper.Size(), Vector()); + + // Set up MRI. + for (const auto &[excitation_idx, data] : excitation_helper) + { + mri.emplace(excitation_idx, MinimalRationalInterpolation(max_size_per_excitation)); + } +} + +void RomOperator::SetExcitationIndex(int excitation_idx) +{ + // Set up RHS vector (linear in frequency part) for the incident field at port boundaries, + // and the vector for the solution, which satisfies the Dirichlet (PEC) BC. + excitation_idx_cache = excitation_idx; + has_RHS1 = space_op.GetExcitationVector1(excitation_idx_cache, RHS1); + if (!has_RHS1) + { + RHS1.SetSize(0); + } + else + { + // Project RHS1 to RHS1r with current PROM. + if (dim_V > 0) + { + auto comm = space_op.GetComm(); + RHS1r.conservativeResize(dim_V); + ProjectVecInternal(comm, V, RHS1, RHS1r, 0); + } + } +} + +void RomOperator::SolveHDM(int excitation_idx, double omega, ComplexVector &u) +{ + if (excitation_idx_cache != excitation_idx) + { + SetExcitationIndex(excitation_idx); + } + // Compute HDM solution at the given frequency. The system matrix, A = K + iω C - ω² M + + // A2(ω) is built by summing the underlying operator contributions. 
+ A2 = space_op.GetExtraSystemMatrix(omega, Operator::DIAG_ZERO); + has_A2 = (A2 != nullptr); + auto A = space_op.GetSystemMatrix(std::complex(1.0, 0.0), 1i * omega, + std::complex(-omega * omega, 0.0), K.get(), + C.get(), M.get(), A2.get()); + auto P = space_op.GetPreconditionerMatrix(1.0 + 0.0i, 1i * omega, + -omega * omega + 0.0i, omega); + ksp->SetOperators(*A, *P); + + // The HDM excitation vector is computed as RHS = iω RHS1 + RHS2(ω). + Mpi::Print("\n"); + if (has_RHS2) + { + has_RHS2 = space_op.GetExcitationVector2(excitation_idx, omega, r); + } + else + { + r = 0.0; + } + if (has_RHS1) + { + r.Add(1i * omega, RHS1); + } + + // Solve the linear system. + ksp->Mult(r, u); +} + +void RomOperator::UpdatePROM(const ComplexVector &u) +{ + + // Update V. The basis is always real (each complex solution adds two basis vectors if it + // has a nonzero real and imaginary parts). + BlockTimer bt(Timer::CONSTRUCT_PROM); + MPI_Comm comm = space_op.GetComm(); + const double normr = linalg::Norml2(comm, u.Real()); + const double normi = linalg::Norml2(comm, u.Imag()); + const bool has_real = (normr > ORTHOG_TOL * std::sqrt(normr * normr + normi * normi)); + const bool has_imag = (normi > ORTHOG_TOL * std::sqrt(normr * normr + normi * normi)); + MFEM_VERIFY(dim_V + has_real + has_imag <= V.size(), + "Unable to increase basis storage size, increase maximum number of vectors!"); + const std::size_t dim_V0 = dim_V; + std::vector H(dim_V + static_cast(has_real) + + static_cast(has_imag)); + if (has_real) + { + V[dim_V] = u.Real(); + OrthogonalizeColumn(orthog_type, comm, V, V[dim_V], H.data(), dim_V); + H[dim_V] = linalg::Norml2(comm, V[dim_V]); + V[dim_V] *= 1.0 / H[dim_V]; + dim_V++; + } + if (has_imag) + { + V[dim_V] = u.Imag(); + OrthogonalizeColumn(orthog_type, comm, V, V[dim_V], H.data(), dim_V); + H[dim_V] = linalg::Norml2(comm, V[dim_V]); + V[dim_V] *= 1.0 / H[dim_V]; + dim_V++; + } + + // Update reduced-order operators. Resize preserves the upper dim0 x dim0 block of each + // matrix and first dim0 entries of each vector and the projection uses the values + // computed for the unchanged basis vectors. + Kr.conservativeResize(dim_V, dim_V); + ProjectMatInternal(comm, V, *K, Kr, r, dim_V0); + if (C) + { + Cr.conservativeResize(dim_V, dim_V); + ProjectMatInternal(comm, V, *C, Cr, r, dim_V0); + } + Mr.conservativeResize(dim_V, dim_V); + ProjectMatInternal(comm, V, *M, Mr, r, dim_V0); + Ar.resize(dim_V, dim_V); + if (RHS1.Size()) + { + RHS1r.conservativeResize(dim_V); + ProjectVecInternal(comm, V, RHS1, RHS1r, dim_V0); + } + RHSr.resize(dim_V); +} + +void RomOperator::UpdateMRI(int excitation_idx, double omega, const ComplexVector &u) +{ + BlockTimer bt(Timer::CONSTRUCT_PROM); + mri.at(excitation_idx).AddSolutionSample(omega, u, space_op, orthog_type); +} + +void RomOperator::SolvePROM(int excitation_idx, double omega, ComplexVector &u) +{ + if (excitation_idx_cache != excitation_idx) + { + SetExcitationIndex(excitation_idx); + } + + // Assemble the PROM linear system at the given frequency. The PROM system is defined by + // the matrix Aᵣ(ω) = Kᵣ + iω Cᵣ - ω² Mᵣ + Vᴴ A2 V(ω) and source vector RHSᵣ(ω) = + // iω RHS1ᵣ + Vᴴ RHS2(ω). A2(ω) and RHS2(ω) are constructed only if required and are + // only nonzero on boundaries, will be empty if not needed. 
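As a self-contained Eigen illustration of the reduced assembly and solve described in the comment above (local names only; the boundary terms A2 and RHS2 are omitted for brevity), before the member implementation continues below:

    #include <complex>
    #include <Eigen/Dense>

    using namespace std::complex_literals;

    // Assemble Ar(w) = Kr + i w Cr - w^2 Mr and solve against i w RHS1r with a dense LU.
    // The reduced system is small and replicated, so a direct dense solve is appropriate.
    Eigen::VectorXcd SolveReduced(const Eigen::MatrixXcd &Kr, const Eigen::MatrixXcd &Cr,
                                  const Eigen::MatrixXcd &Mr,
                                  const Eigen::VectorXcd &RHS1r, double omega)
    {
      Eigen::MatrixXcd Ar = Kr + (1i * omega) * Cr - (omega * omega) * Mr;
      Eigen::VectorXcd RHSr = (1i * omega) * RHS1r;
      return Ar.partialPivLu().solve(RHSr);
    }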
+ if (has_A2 && Ar.rows() > 0) + { + A2 = space_op.GetExtraSystemMatrix(omega, Operator::DIAG_ZERO); + ProjectMatInternal(space_op.GetComm(), V, *A2, Ar, r, 0); + } + else + { + Ar.setZero(); + } + Ar += Kr; + if (C) + { + Ar += (1i * omega) * Cr; + } + Ar += (-omega * omega) * Mr; + + if (has_RHS2 && RHSr.size() > 0) + { + space_op.GetExcitationVector2(excitation_idx, omega, RHS2); + ProjectVecInternal(space_op.GetComm(), V, RHS2, RHSr, 0); + } + else + { + RHSr.setZero(); + } + if (has_RHS1) + { + RHSr += (1i * omega) * RHS1r; + } + + // Compute PROM solution at the given frequency and expand into high-dimensional space. + // The PROM is solved on every process so the matrix-vector product for vector expansion + // does not require communication. + BlockTimer bt(Timer::SOLVE_PROM); + if constexpr (false) + { + // LDLT solve. + RHSr = Ar.ldlt().solve(RHSr); + RHSr = Ar.selfadjointView().ldlt().solve(RHSr); + } + else + { + // LU solve. + RHSr = Ar.partialPivLu().solve(RHSr); + } + ProlongatePROMSolution(dim_V, V, RHSr, u); +} + +std::vector> RomOperator::ComputeEigenvalueEstimates() const +{ + // XX TODO: Not yet implemented + MFEM_ABORT("Eigenvalue estimates for PROM operators are not yet implemented!"); + return {}; +} + +} // namespace palace diff --git a/palace/models/romoperator.hpp b/palace/models/romoperator.hpp index e10ca12aaf..c0ad8d2a30 100644 --- a/palace/models/romoperator.hpp +++ b/palace/models/romoperator.hpp @@ -1,98 +1,127 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_MODELS_ROM_OPERATOR_HPP -#define PALACE_MODELS_ROM_OPERATOR_HPP - -#include -#include -#include -#include -#include -#include "linalg/hcurl.hpp" -#include "linalg/ksp.hpp" -#include "linalg/operator.hpp" -#include "linalg/vector.hpp" - -namespace palace -{ - -class IoData; -class SpaceOperator; - -// -// A class handling projection-based reduced order model (PROM) construction and use for -// adaptive fast frequency sweeps. -// -class RomOperator -{ -private: - // Reference to HDM discretization (not owned). - SpaceOperator &spaceop; - - // HDM system matrices and excitation RHS. - std::unique_ptr K, M, C, A2; - ComplexVector RHS1, RHS2; - bool has_A2, has_RHS2; - - // Working storage for HDM vectors. - ComplexVector r, w, z; - - // HDM linear system solver and preconditioner. - std::unique_ptr ksp; - - // Linear solver for inner product solves for error metric. - std::unique_ptr kspKM; - - // PROM matrices and vectors. - Eigen::MatrixXcd Kr, Mr, Cr, Ar; - Eigen::VectorXcd RHS1r, RHSr; - - // PROM reduced-order basis (real-valued) and active dimension. - std::vector V; - int dim_V; - bool orthog_mgs; - - // Data structures for parameter domain sampling. - std::set PS, P_m_PS; - std::default_random_engine engine; - -public: - RomOperator(const IoData &iodata, SpaceOperator &sp); - - // Return the HDM linear solver. - const ComplexKspSolver &GetLinearSolver() const { return *ksp; } - - // Return PROM dimension. - int GetReducedDimension() const { return dim_V; } - - // Return set of sampled parameter points for basis construction. - const std::set &GetSampleFrequencies() const { return PS; } - - // Initialize the parameter domain P = {ω_L, ω_L + δ, ..., ω_R}. Also sets the maximum - // number of sample points for the PROM construction. - void Initialize(double start, double delta, int num_steps, int max_dim); - - // Assemble and solve the HDM at the specified frequency. 
- void SolveHDM(double omega, ComplexVector &e); - - // Add the solution vector to the reduced-order basis and update the PROM. - void AddHDMSample(double omega, ComplexVector &e); - - // Assemble and solve the PROM at the specified frequency, expanding the solution back - // into the high-dimensional solution space. - void AssemblePROM(double omega); - void SolvePROM(ComplexVector &e); - - // Compute the error metric for the PROM at the specified frequency. - double ComputeError(double omega); - - // Compute the maximum error over a randomly sampled set of candidate points. Returns the - // maximum error and its correcponding frequency, as well as the number of candidate - // points used (if fewer than those availble in the unsampled parameter domain). - double ComputeMaxError(int num_cand, double &omega_star); -}; - -} // namespace palace - -#endif // PALACE_MODELS_ROM_OPERATOR_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_MODELS_ROM_OPERATOR_HPP +#define PALACE_MODELS_ROM_OPERATOR_HPP + +#include +#include +#include +#include +#include "linalg/ksp.hpp" +#include "linalg/operator.hpp" +#include "linalg/vector.hpp" + +namespace palace +{ + +class IoData; +class SpaceOperator; + +// Class for handling minimal-rational interpolation of solutions in frequency space. Used +// as an error indicator and for selecting the next frequency sample points in PROM +// construction. Each excitation gets a separate MRI, so sample frequencies are not shared. +class MinimalRationalInterpolation +{ +private: + // (Complex-valued) upper-trianglar matrix R from orthogonalization of the HDM samples. + // Minimal rational interpolant (MRI) defined by the vector q of interpolation weights and + // support points z is used as an error indicator. + std::vector Q; + std::size_t dim_Q = 0; + Eigen::MatrixXcd R; + Eigen::VectorXcd q; + std::vector z; + +public: + MinimalRationalInterpolation(int max_size); + void AddSolutionSample(double omega, const ComplexVector &u, + const SpaceOperator &space_op, Orthogonalization orthog_type); + std::vector FindMaxError(int N) const; + + const auto &GetSamplePoints() const { return z; } +}; + +// +// A class handling projection-based reduced order model (PROM) construction and use for +// adaptive fast frequency sweeps. +// +class RomOperator +{ +private: + // Reference to HDM discretization (not owned). + SpaceOperator &space_op; + + // Used for constructing & reuse of RHS1. + int excitation_idx_cache = 0; + + // HDM system matrices and excitation RHS. + std::unique_ptr K, M, C, A2; + ComplexVector RHS1, RHS2, r; + // Defaults: will be toggled by SetExcitationIndex & SolveHDM. + bool has_A2 = true; + bool has_RHS1 = true; + bool has_RHS2 = true; + + // HDM linear system solver and preconditioner. + std::unique_ptr ksp; + + // PROM matrices and vectors. + Eigen::MatrixXcd Kr, Mr, Cr, Ar; + Eigen::VectorXcd RHS1r; + Eigen::VectorXcd RHSr; + + // PROM reduced-order basis (real-valued) and active dimension. + std::vector V; + std::size_t dim_V = 0; + Orthogonalization orthog_type; + + // MRIs: one for each excitation index. + std::map mri; + +public: + RomOperator(const IoData &iodata, SpaceOperator &space_op, int max_size_per_excitation); + + // Return the HDM linear solver. + const ComplexKspSolver &GetLinearSolver() const { return *ksp; } + + // Return PROM dimension. + auto GetReducedDimension() const { return dim_V; } + + // Return set of sampled parameter points for basis construction. 
+ const auto &GetSamplePoints(int excitation_idx) const + { + return mri.at(excitation_idx).GetSamplePoints(); + } + + // Set excitation index to build corresponding RHS vector (linear in frequency part). + void SetExcitationIndex(int excitation_idx); + + // Assemble and solve the HDM at the specified frequency. + void SolveHDM(int excitation_idx, double omega, ComplexVector &u); + + // Add field configuration to the reduced-order basis and update the PROM. + void UpdatePROM(const ComplexVector &u); + + // Add solution u to the minimal-rational interpolation for error estimation. MRI are + // separated by excitation index. + void UpdateMRI(int excitation_idx, double omega, const ComplexVector &u); + + // Assemble and solve the PROM at the specified frequency, expanding the solution back + // into the high-dimensional space. + void SolvePROM(int excitation_idx, double omega, ComplexVector &u); + + // Compute the location(s) of the maximum error in the range of the previously sampled + // parameter points. + std::vector FindMaxError(int excitation_idx, int N = 1) const + { + return mri.at(excitation_idx).FindMaxError(N); + } + + // Compute eigenvalue estimates for the current PROM system. + std::vector> ComputeEigenvalueEstimates() const; +}; + +} // namespace palace + +#endif // PALACE_MODELS_ROM_OPERATOR_HPP diff --git a/palace/models/spaceoperator.cpp b/palace/models/spaceoperator.cpp index cc2a174e14..7ea83f7b49 100644 --- a/palace/models/spaceoperator.cpp +++ b/palace/models/spaceoperator.cpp @@ -1,948 +1,980 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "spaceoperator.hpp" - -#include -#include "fem/bilinearform.hpp" -#include "fem/coefficient.hpp" -#include "fem/integrator.hpp" -#include "fem/multigrid.hpp" -#include "linalg/rap.hpp" -#include "utils/communication.hpp" -#include "utils/geodata.hpp" -#include "utils/iodata.hpp" -#include "utils/prettyprint.hpp" - -namespace palace -{ - -using namespace std::complex_literals; - -namespace -{ - -mfem::Array SetUpBoundaryProperties(const IoData &iodata, const mfem::ParMesh &mesh) -{ - int bdr_attr_max = mesh.bdr_attributes.Size() ? mesh.bdr_attributes.Max() : 0; - if (!iodata.boundaries.pec.empty()) - { - // Check that boundary attributes have been specified correctly. - mfem::Array bdr_attr_marker(bdr_attr_max); - bdr_attr_marker = 0; - for (auto attr : mesh.bdr_attributes) - { - bdr_attr_marker[attr - 1] = 1; - } - bool first = true; - for (auto attr : iodata.boundaries.pec.attributes) - { - // MFEM_VERIFY(attr > 0 && attr <= bdr_attr_max, - // "PEC boundary attribute tags must be non-negative and correspond to " - // "attributes in the mesh!"); - // MFEM_VERIFY(bdr_attr_marker[attr-1], - // "Unknown PEC boundary attribute " << attr << "!"); - if (attr <= 0 || attr > bdr_attr_marker.Size() || !bdr_attr_marker[attr - 1]) - { - if (first) - { - Mpi::Print("\n"); - first = false; - } - Mpi::Warning("Unknown PEC boundary attribute {:d}!\nSolver will just ignore it!\n", - attr); - } - } - } - - // Mark selected boundary attributes from the mesh as essential (Dirichlet). 
- mfem::Array dbc_bcs, dbc_marker; - dbc_bcs.Reserve(static_cast(iodata.boundaries.pec.attributes.size())); - for (auto attr : iodata.boundaries.pec.attributes) - { - if (attr <= 0 || attr > bdr_attr_max) - { - continue; // Can just ignore if wrong - } - dbc_bcs.Append(attr); - } - mesh::AttrToMarker(bdr_attr_max, dbc_bcs, dbc_marker); - return dbc_marker; -} - -} // namespace - -SpaceOperator::SpaceOperator(const IoData &iodata, - const std::vector> &mesh) - : pa_order_threshold(iodata.solver.pa_order_threshold), skip_zeros(false), - pc_mat_real(iodata.solver.linear.pc_mat_real), - pc_mat_shifted(iodata.solver.linear.pc_mat_shifted), print_hdr(true), - print_prec_hdr(true), dbc_marker(SetUpBoundaryProperties(iodata, *mesh.back())), - nd_fecs(fem::ConstructFECollections( - iodata.solver.order, mesh.back()->Dimension(), iodata.solver.linear.mg_max_levels, - iodata.solver.linear.mg_coarsen_type, false)), - h1_fecs(fem::ConstructFECollections( - iodata.solver.order, mesh.back()->Dimension(), iodata.solver.linear.mg_max_levels, - iodata.solver.linear.mg_coarsen_type, false)), - rt_fec(std::make_unique(iodata.solver.order - 1, - mesh.back()->Dimension())), - nd_fespaces(fem::ConstructFiniteElementSpaceHierarchy( - iodata.solver.linear.mg_max_levels, mesh, nd_fecs, &dbc_marker, - &nd_dbc_tdof_lists)), - h1_fespaces(fem::ConstructAuxiliaryFiniteElementSpaceHierarchy( - nd_fespaces, h1_fecs, &dbc_marker, &h1_dbc_tdof_lists)), - rt_fespace(nd_fespaces.GetFinestFESpace(), mesh.back().get(), rt_fec.get()), - mat_op(iodata, *mesh.back()), farfield_op(iodata, mat_op, *mesh.back()), - surf_sigma_op(iodata, *mesh.back()), surf_z_op(iodata, *mesh.back()), - lumped_port_op(iodata, GetH1Space()), - wave_port_op(iodata, mat_op, GetNDSpace(), GetH1Space()), - surf_j_op(iodata, GetH1Space()) -{ - // Finalize setup. - CheckBoundaryProperties(); - - // Print essential BC information. - if (dbc_marker.Size() && dbc_marker.Max() > 0) - { - Mpi::Print("\nConfiguring Dirichlet PEC BC at attributes:\n"); - utils::PrettyPrintMarker(dbc_marker); - } -} - -void SpaceOperator::CheckBoundaryProperties() -{ - // Mark selected boundary attributes from the mesh as having some Dirichlet, Neumann, or - // mixed BC applied. - const auto &farfield_marker = farfield_op.GetMarker(); - const auto &surf_sigma_marker = surf_sigma_op.GetMarker(); - const auto &surf_z_Rs_marker = surf_z_op.GetRsMarker(); - const auto &surf_z_Ls_marker = surf_z_op.GetLsMarker(); - const auto &lumped_port_Rs_marker = lumped_port_op.GetRsMarker(); - const auto &lumped_port_Ls_marker = lumped_port_op.GetLsMarker(); - const auto &wave_port_marker = wave_port_op.GetMarker(); - aux_bdr_marker.SetSize(dbc_marker.Size()); - for (int i = 0; i < dbc_marker.Size(); i++) - { - aux_bdr_marker[i] = - (dbc_marker[i] || farfield_marker[i] || surf_sigma_marker[i] || - surf_z_Rs_marker[i] || surf_z_Ls_marker[i] || lumped_port_Rs_marker[i] || - lumped_port_Ls_marker[i] || wave_port_marker[i]); - } - // aux_bdr_marker = 1; // Mark all boundaries (including material interfaces - // // added during mesh preprocessing) - // // As tested, this does not eliminate all DC modes! - for (std::size_t l = 0; l < GetH1Spaces().GetNumLevels(); l++) - { - GetH1Spaces().GetFESpaceAtLevel(l).GetEssentialTrueDofs( - aux_bdr_marker, aux_bdr_tdof_lists.emplace_back()); - } - - // A final check that no boundary attribute is assigned multiple boundary conditions. 
The - // one exception is that a lumped port boundary attribute can be also be assigned some - // other condition, in which case the fact that it is a port is just used for - // postprocessing. - const auto &surf_z_marker = surf_z_op.GetMarker(); - const auto &lumped_port_marker = lumped_port_op.GetMarker(); - const auto &surf_j_marker = surf_j_op.GetMarker(); - bool first = true; - for (int i = 0; i < dbc_marker.Size(); i++) - { - if (lumped_port_marker[i]) - { - if (dbc_marker[i]) - { - if (first) - { - Mpi::Print("\n"); - first = false; - } - Mpi::Warning("Lumped port boundary {:d} also marked as PEC!\nBoundary " - "condition/excitation will be ignored!\n", - i + 1); - } - } - else - { - MFEM_VERIFY(dbc_marker[i] + farfield_marker[i] + surf_sigma_marker[i] + - surf_z_marker[i] + wave_port_marker[i] + surf_j_marker[i] <= - 1, - "Boundary attributes should not be specified with multiple BC!"); - } - } -} - -namespace -{ - -void PrintHeader(const FiniteElementSpace &h1_fespace, const FiniteElementSpace &nd_fespace, - const FiniteElementSpace &rt_fespace, int pa_order_threshold, - bool &print_hdr) -{ - if (print_hdr) - { - Mpi::Print("\nAssembling system matrices, number of global unknowns:\n" - " H1: {:d}, ND: {:d}, RT: {:d}\n Operator assembly level: {}\n", - h1_fespace.GlobalTrueVSize(), nd_fespace.GlobalTrueVSize(), - rt_fespace.GlobalTrueVSize(), - nd_fespace.GetMaxElementOrder() > pa_order_threshold ? "Partial" : "Full"); - } - print_hdr = false; -} - -template -std::unique_ptr BuildOperator(const FiniteElementSpace &fespace, T1 *df, T2 *f, - T3 *dfb, T4 *fb, int pa_order_threshold, - bool skip_zeros) -{ - BilinearForm a(fespace); - if (df && !df->empty() && f && !f->empty()) - { - a.AddDomainIntegrator(*df, *f); - } - else - { - if (df && !df->empty()) - { - a.AddDomainIntegrator(*df); - } - if (f && !f->empty()) - { - a.AddDomainIntegrator(*f); - } - } - if (dfb && !dfb->empty() && fb && !fb->empty()) - { - a.AddBoundaryIntegrator(*dfb, *fb); - } - else - { - if (dfb && !dfb->empty()) - { - a.AddBoundaryIntegrator(*dfb); - } - if (fb && !fb->empty()) - { - a.AddBoundaryIntegrator(*fb); - } - } - return a.Assemble(pa_order_threshold, skip_zeros); -} - -template -std::unique_ptr BuildAuxOperator(const FiniteElementSpace &fespace, T1 *f, T2 *fb, - int pa_order_threshold, bool skip_zeros) -{ - BilinearForm a(fespace); - if (f && !f->empty()) - { - a.AddDomainIntegrator(*f); - } - if (fb && !fb->empty()) - { - a.AddBoundaryIntegrator(*fb); - } - return a.Assemble(pa_order_threshold, skip_zeros); -} - -} // namespace - -template -std::unique_ptr -SpaceOperator::GetStiffnessMatrix(Operator::DiagonalPolicy diag_policy) -{ - PrintHeader(GetH1Space(), GetNDSpace(), GetRTSpace(), pa_order_threshold, print_hdr); - const int sdim = GetNDSpace().GetParMesh()->SpaceDimension(); - SumMatrixCoefficient df(sdim), f(sdim), fb(sdim); - AddStiffnessCoefficients(1.0, df, f); - AddStiffnessBdrCoefficients(1.0, fb); - if (df.empty() && f.empty() && fb.empty()) - { - return {}; - } - - auto k = BuildOperator(GetNDSpace(), &df, &f, (SumCoefficient *)nullptr, &fb, - pa_order_threshold, skip_zeros); - if constexpr (std::is_same::value) - { - auto K = std::make_unique(std::move(k), nullptr, GetNDSpace()); - K->SetEssentialTrueDofs(nd_dbc_tdof_lists.back(), diag_policy); - return K; - } - else - { - auto K = std::make_unique(std::move(k), GetNDSpace()); - K->SetEssentialTrueDofs(nd_dbc_tdof_lists.back(), diag_policy); - return K; - } -} - -template -std::unique_ptr 
-SpaceOperator::GetDampingMatrix(Operator::DiagonalPolicy diag_policy) -{ - PrintHeader(GetH1Space(), GetNDSpace(), GetRTSpace(), pa_order_threshold, print_hdr); - const int sdim = GetNDSpace().GetParMesh()->SpaceDimension(); - SumMatrixCoefficient f(sdim), fb(sdim); - AddDampingCoefficients(1.0, f); - AddDampingBdrCoefficients(1.0, fb); - if (f.empty() && fb.empty()) - { - return {}; - } - - auto c = BuildOperator(GetNDSpace(), (SumCoefficient *)nullptr, &f, - (SumCoefficient *)nullptr, &fb, pa_order_threshold, skip_zeros); - if constexpr (std::is_same::value) - { - auto C = std::make_unique(std::move(c), nullptr, GetNDSpace()); - C->SetEssentialTrueDofs(nd_dbc_tdof_lists.back(), diag_policy); - return C; - } - else - { - auto C = std::make_unique(std::move(c), GetNDSpace()); - C->SetEssentialTrueDofs(nd_dbc_tdof_lists.back(), diag_policy); - return C; - } -} - -template -std::unique_ptr SpaceOperator::GetMassMatrix(Operator::DiagonalPolicy diag_policy) -{ - PrintHeader(GetH1Space(), GetNDSpace(), GetRTSpace(), pa_order_threshold, print_hdr); - const int sdim = GetNDSpace().GetParMesh()->SpaceDimension(); - SumMatrixCoefficient fr(sdim), fi(sdim), fbr(sdim); - AddRealMassCoefficients(1.0, fr); - AddRealMassBdrCoefficients(1.0, fbr); - if constexpr (std::is_same::value) - { - AddImagMassCoefficients(1.0, fi); - } - if (fr.empty() && fbr.empty() && fi.empty()) - { - return {}; - } - - std::unique_ptr mr, mi; - if (!fr.empty() || !fbr.empty()) - { - mr = BuildOperator(GetNDSpace(), (SumCoefficient *)nullptr, &fr, - (SumCoefficient *)nullptr, &fbr, pa_order_threshold, skip_zeros); - } - if (!fi.empty()) - { - mi = BuildOperator(GetNDSpace(), (SumCoefficient *)nullptr, &fi, - (SumCoefficient *)nullptr, (SumCoefficient *)nullptr, - pa_order_threshold, skip_zeros); - } - if constexpr (std::is_same::value) - { - auto M = - std::make_unique(std::move(mr), std::move(mi), GetNDSpace()); - M->SetEssentialTrueDofs(nd_dbc_tdof_lists.back(), diag_policy); - return M; - } - else - { - auto M = std::make_unique(std::move(mr), GetNDSpace()); - M->SetEssentialTrueDofs(nd_dbc_tdof_lists.back(), diag_policy); - return M; - } -} - -template -std::unique_ptr -SpaceOperator::GetExtraSystemMatrix(double omega, Operator::DiagonalPolicy diag_policy) -{ - PrintHeader(GetH1Space(), GetNDSpace(), GetRTSpace(), pa_order_threshold, print_hdr); - const int sdim = GetNDSpace().GetParMesh()->SpaceDimension(); - SumMatrixCoefficient fbr(sdim), fbi(sdim); - SumCoefficient dfbr, dfbi; - AddExtraSystemBdrCoefficients(omega, dfbr, dfbi, fbr, fbi); - if (dfbr.empty() && fbr.empty() && dfbi.empty() && fbi.empty()) - { - return {}; - } - - std::unique_ptr ar, ai; - if (!dfbr.empty() || !fbr.empty()) - { - ar = BuildOperator(GetNDSpace(), (SumCoefficient *)nullptr, (SumCoefficient *)nullptr, - &dfbr, &fbr, pa_order_threshold, skip_zeros); - } - if (!dfbi.empty() || !fbi.empty()) - { - ai = BuildOperator(GetNDSpace(), (SumCoefficient *)nullptr, (SumCoefficient *)nullptr, - &dfbi, &fbi, pa_order_threshold, skip_zeros); - } - if constexpr (std::is_same::value) - { - auto A = - std::make_unique(std::move(ar), std::move(ai), GetNDSpace()); - A->SetEssentialTrueDofs(nd_dbc_tdof_lists.back(), diag_policy); - return A; - } - else - { - MFEM_VERIFY(!ai, "Unexpected imaginary part in GetExtraSystemMatrix!"); - auto A = std::make_unique(std::move(ar), GetNDSpace()); - A->SetEssentialTrueDofs(nd_dbc_tdof_lists.back(), diag_policy); - return A; - } -} - -namespace -{ - -auto BuildParSumOperator(int h, int w, double a0, double a1, double a2, - 
const ParOperator *K, const ParOperator *C, const ParOperator *M, - const ParOperator *A2, const FiniteElementSpace &fespace) -{ - auto sum = std::make_unique(h, w); - if (K && a0 != 0.0) - { - sum->AddOperator(K->LocalOperator(), a0); - } - if (C && a1 != 0.0) - { - sum->AddOperator(C->LocalOperator(), a1); - } - if (M && a2 != 0.0) - { - sum->AddOperator(M->LocalOperator(), a2); - } - if (A2) - { - sum->AddOperator(A2->LocalOperator(), 1.0); - } - return std::make_unique(std::move(sum), fespace); -} - -auto BuildParSumOperator(int h, int w, std::complex a0, std::complex a1, - std::complex a2, const ComplexParOperator *K, - const ComplexParOperator *C, const ComplexParOperator *M, - const ComplexParOperator *A2, const FiniteElementSpace &fespace) -{ - // Block 2 x 2 equivalent-real formulation for each term in the sum: - // [ sumr ] += [ ar -ai ] [ Ar ] - // [ sumi ] [ ai ar ] [ Ai ] . - auto sumr = std::make_unique(h, w); - auto sumi = std::make_unique(h, w); - if (K) - { - if (a0.real() != 0.0) - { - if (K->LocalOperator().HasReal()) - { - sumr->AddOperator(*K->LocalOperator().Real(), a0.real()); - } - if (K->LocalOperator().HasImag()) - { - sumi->AddOperator(*K->LocalOperator().Imag(), a0.real()); - } - } - if (a0.imag() != 0.0) - { - if (K->LocalOperator().HasImag()) - { - sumr->AddOperator(*K->LocalOperator().Imag(), -a0.imag()); - } - if (K->LocalOperator().HasReal()) - { - sumi->AddOperator(*K->LocalOperator().Real(), a0.imag()); - } - } - } - if (C && a1 != 0.0) - { - if (a1.real() != 0.0) - { - if (C->LocalOperator().HasReal()) - { - sumr->AddOperator(*C->LocalOperator().Real(), a1.real()); - } - if (C->LocalOperator().HasImag()) - { - sumi->AddOperator(*C->LocalOperator().Imag(), a1.real()); - } - } - if (a1.imag() != 0.0) - { - if (C->LocalOperator().HasImag()) - { - sumr->AddOperator(*C->LocalOperator().Imag(), -a1.imag()); - } - if (C->LocalOperator().HasReal()) - { - sumi->AddOperator(*C->LocalOperator().Real(), a1.imag()); - } - } - } - if (M && a2 != 0.0) - { - if (a2.real() != 0.0) - { - if (M->LocalOperator().HasReal()) - { - sumr->AddOperator(*M->LocalOperator().Real(), a2.real()); - } - if (M->LocalOperator().HasImag()) - { - sumi->AddOperator(*M->LocalOperator().Imag(), a2.real()); - } - } - if (a2.imag() != 0.0) - { - if (M->LocalOperator().HasImag()) - { - sumr->AddOperator(*M->LocalOperator().Imag(), -a2.imag()); - } - if (M->LocalOperator().HasReal()) - { - sumi->AddOperator(*M->LocalOperator().Real(), a2.imag()); - } - } - } - if (A2) - { - if (A2->LocalOperator().HasReal()) - { - sumr->AddOperator(*A2->LocalOperator().Real(), 1.0); - } - if (A2->LocalOperator().HasImag()) - { - sumi->AddOperator(*A2->LocalOperator().Imag(), 1.0); - } - } - return std::make_unique(std::move(sumr), std::move(sumi), fespace); -} - -} // namespace - -template -std::unique_ptr -SpaceOperator::GetSystemMatrix(ScalarType a0, ScalarType a1, ScalarType a2, - const OperType *K, const OperType *C, const OperType *M, - const OperType *A2) -{ - using ParOperType = - typename std::conditional::value, - ComplexParOperator, ParOperator>::type; - - const auto *PtAP_K = (K) ? dynamic_cast(K) : nullptr; - const auto *PtAP_C = (C) ? dynamic_cast(C) : nullptr; - const auto *PtAP_M = (M) ? dynamic_cast(M) : nullptr; - const auto *PtAP_A2 = (A2) ? 
dynamic_cast(A2) : nullptr; - MFEM_VERIFY((!K || PtAP_K) && (!C || PtAP_C) && (!M || PtAP_M) && (!A2 || PtAP_A2), - "SpaceOperator requires ParOperator or ComplexParOperator for system matrix " - "construction!"); - - int height = -1, width = -1; - if (PtAP_K) - { - height = PtAP_K->LocalOperator().Height(); - width = PtAP_K->LocalOperator().Width(); - } - else if (PtAP_C) - { - height = PtAP_C->LocalOperator().Height(); - width = PtAP_C->LocalOperator().Width(); - } - else if (PtAP_M) - { - height = PtAP_M->LocalOperator().Height(); - width = PtAP_M->LocalOperator().Width(); - } - else if (PtAP_A2) - { - height = PtAP_A2->LocalOperator().Height(); - width = PtAP_A2->LocalOperator().Width(); - } - MFEM_VERIFY(height >= 0 && width >= 0, - "At least one argument to GetSystemMatrix must not be empty!"); - - auto A = BuildParSumOperator(height, width, a0, a1, a2, PtAP_K, PtAP_C, PtAP_M, PtAP_A2, - GetNDSpace()); - A->SetEssentialTrueDofs(nd_dbc_tdof_lists.back(), Operator::DiagonalPolicy::DIAG_ONE); - return A; -} - -std::unique_ptr SpaceOperator::GetInnerProductMatrix(double a0, double a2, - const ComplexOperator *K, - const ComplexOperator *M) -{ - const auto *PtAP_K = (K) ? dynamic_cast(K) : nullptr; - const auto *PtAP_M = (M) ? dynamic_cast(M) : nullptr; - MFEM_VERIFY( - (!K || PtAP_K) && (!M || PtAP_M), - "SpaceOperator requires ComplexParOperator for inner product matrix construction!"); - - int height = -1, width = -1; - if (PtAP_K) - { - height = PtAP_K->LocalOperator().Height(); - width = PtAP_K->LocalOperator().Width(); - } - else if (PtAP_M) - { - height = PtAP_M->LocalOperator().Height(); - width = PtAP_M->LocalOperator().Width(); - } - MFEM_VERIFY(height >= 0 && width >= 0, - "At least one argument to GetInnerProductMatrix must not be empty!"); - - auto sum = std::make_unique(height, width); - if (PtAP_K && a0 != 0.0) - { - MFEM_VERIFY( - PtAP_K->LocalOperator().HasReal(), - "Missing real part of stiffness matrix for inner product matrix construction!"); - sum->AddOperator(*PtAP_K->LocalOperator().Real(), a0); - } - if (PtAP_M && a2 != 0.0) - { - MFEM_VERIFY(PtAP_M->LocalOperator().HasReal(), - "Missing real part of mass matrix for inner product matrix construction!"); - sum->AddOperator(*PtAP_M->LocalOperator().Real(), a2); - } - return std::make_unique(std::move(sum), GetNDSpace()); -} - -namespace -{ - -auto BuildLevelOperator(const MultigridOperator &B, std::unique_ptr &&br, - std::unique_ptr &&bi, const FiniteElementSpace &fespace) -{ - return std::make_unique(std::move(br), fespace); -} - -auto BuildLevelOperator(const ComplexMultigridOperator &B, std::unique_ptr &&br, - std::unique_ptr &&bi, const FiniteElementSpace &fespace) -{ - return std::make_unique(std::move(br), std::move(bi), fespace); -} - -} // namespace - -template -std::unique_ptr SpaceOperator::GetPreconditionerMatrix(double a0, double a1, - double a2, double a3) -{ - // XX TODO: Test complex PC matrix assembly for l == 0 if coarse solve supports it - // XX TODO: Handle complex coeff a0/a1/a2/a3 (like GetSystemMatrix) - if (print_prec_hdr) - { - Mpi::Print("\nAssembling multigrid hierarchy:\n"); - } - MFEM_VERIFY(GetH1Spaces().GetNumLevels() == GetNDSpaces().GetNumLevels(), - "Multigrid hierarchy mismatch for auxiliary space preconditioning!"); - const auto n_levels = GetNDSpaces().GetNumLevels(); - auto B = std::make_unique>(n_levels); - for (bool aux : {false, true}) - { - for (std::size_t l = 0; l < n_levels; l++) - { - // Force coarse level operator to be fully assembled always. 
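The ternary used just below deserves a note (a small sketch; it relies on the assumption, consistent with the assembly-level report in PrintHeader, that partial assembly is used only when the element order exceeds the threshold): passing an artificially large threshold on the coarsest level guarantees a fully assembled sparse matrix there, which the coarse solver needs.

    #include <cstddef>

    // Sketch of the per-level assembly threshold selection mirrored by the code below.
    inline int AssemblyThresholdForLevel(std::size_t l, int pa_order_threshold)
    {
      // Level 0 (coarsest): use a threshold larger than any realistic element order so
      // the operator is always fully assembled; finer levels may use partial assembly.
      return (l > 0) ? pa_order_threshold : 99;
    }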
- const auto &fespace_l = - aux ? GetH1Spaces().GetFESpaceAtLevel(l) : GetNDSpaces().GetFESpaceAtLevel(l); - const auto &dbc_tdof_lists_l = aux ? h1_dbc_tdof_lists[l] : nd_dbc_tdof_lists[l]; - if (print_prec_hdr) - { - Mpi::Print(" Level {:d}{} (p = {:d}): {:d} unknowns", l, aux ? " (auxiliary)" : "", - fespace_l.GetMaxElementOrder(), fespace_l.GlobalTrueVSize()); - } - const int sdim = GetNDSpace().GetParMesh()->SpaceDimension(); - SumMatrixCoefficient dfr(sdim), fr(sdim), fi(sdim), fbr(sdim), fbi(sdim); - SumCoefficient dfbr, dfbi; - if (!std::is_same::value || pc_mat_real || l == 0) - { - // Real-valued system matrix (approximation) for preconditioning. - AddStiffnessCoefficients(a0, dfr, fr); - AddStiffnessBdrCoefficients(a0, fbr); - AddDampingCoefficients(a1, fr); - AddDampingBdrCoefficients(a1, fbr); - AddAbsMassCoefficients(pc_mat_shifted ? std::abs(a2) : a2, fr); - AddRealMassBdrCoefficients(pc_mat_shifted ? std::abs(a2) : a2, fbr); - AddExtraSystemBdrCoefficients(a3, dfbr, dfbr, fbr, fbr); - } - else - { - // Build preconditioner based on the actual complex-valued system matrix. - AddStiffnessCoefficients(a0, dfr, fr); - AddStiffnessBdrCoefficients(a0, fbr); - AddDampingCoefficients(a1, fi); - AddDampingBdrCoefficients(a1, fbi); - AddRealMassCoefficients(pc_mat_shifted ? std::abs(a2) : a2, fr); - AddRealMassBdrCoefficients(pc_mat_shifted ? std::abs(a2) : a2, fbr); - AddImagMassCoefficients(a2, fi); - AddExtraSystemBdrCoefficients(a3, dfbr, dfbi, fbr, fbi); - } - - std::unique_ptr br, bi; - if (!dfr.empty() || !fr.empty() || !dfbr.empty() || !fbr.empty()) - { - br = aux ? BuildAuxOperator(fespace_l, &fr, &fbr, (l > 0) ? pa_order_threshold : 99, - skip_zeros) - : BuildOperator(fespace_l, &dfr, &fr, &dfbr, &fbr, - (l > 0) ? pa_order_threshold : 99, skip_zeros); - } - if (!fi.empty() || !dfbi.empty() || !fbi.empty()) - { - bi = aux ? BuildAuxOperator(fespace_l, &fi, &fbi, (l > 0) ? pa_order_threshold : 99, - skip_zeros) - : BuildOperator(fespace_l, (SumCoefficient *)nullptr, &fi, &dfbi, &fbi, - (l > 0) ? pa_order_threshold : 99, skip_zeros); - } - if (print_prec_hdr) - { - if (const auto *br_spm = dynamic_cast(br.get())) - { - HYPRE_BigInt nnz = br_spm->NumNonZeroElems(); - Mpi::GlobalSum(1, &nnz, fespace_l.GetComm()); - Mpi::Print(", {:d} NNZ\n", nnz); - } - else - { - Mpi::Print("\n"); - } - } - auto B_l = BuildLevelOperator(*B, std::move(br), std::move(bi), fespace_l); - B_l->SetEssentialTrueDofs(dbc_tdof_lists_l, Operator::DiagonalPolicy::DIAG_ONE); - if (aux) - { - B->AddAuxiliaryOperator(std::move(B_l)); - } - else - { - B->AddOperator(std::move(B_l)); - } - } - } - print_prec_hdr = false; - return B; -} - -void SpaceOperator::AddStiffnessCoefficients(double coef, SumMatrixCoefficient &df, - SumMatrixCoefficient &f) -{ - constexpr auto MatType = MaterialPropertyType::INV_PERMEABILITY; - df.AddCoefficient(std::make_unique>(mat_op, coef)); - - // Contribution for London superconductors. - if (mat_op.HasLondonDepth()) - { - constexpr auto MatTypeL = MaterialPropertyType::INV_LONDON_DEPTH; - f.AddCoefficient(std::make_unique>(mat_op, coef), - mat_op.GetLondonDepthMarker()); - } -} - -void SpaceOperator::AddStiffnessBdrCoefficients(double coef, SumMatrixCoefficient &fb) -{ - // Robin BC contributions due to surface impedance and lumped ports (inductance). 
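As physical background for these boundary coefficient helpers (a sketch of the usual parallel-RLC surface impedance condition; the precise nondimensionalization used by the code is not restated here): a lumped RLC surface on Γ adds the boundary term

\[
\int_{\Gamma} \left( \frac{1}{L_s}\, \mathbf{E}_t
  + \frac{1}{R_s}\, \partial_t \mathbf{E}_t
  + C_s\, \partial_t^2 \mathbf{E}_t \right) \cdot \mathbf{v}_t \, dS
\]

to the weak form, which is why the inductive part enters through the stiffness boundary coefficients, the resistive part through the damping boundary coefficients, and the capacitive part through the mass boundary coefficients.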
- surf_z_op.AddStiffnessBdrCoefficients(coef, fb); - lumped_port_op.AddStiffnessBdrCoefficients(coef, fb); -} - -void SpaceOperator::AddDampingCoefficients(double coef, SumMatrixCoefficient &f) -{ - // Contribution for domain conductivity. - if (mat_op.HasConductivity()) - { - constexpr auto MatType = MaterialPropertyType::CONDUCTIVITY; - f.AddCoefficient(std::make_unique>(mat_op, coef), - mat_op.GetConductivityMarker()); - } -} - -void SpaceOperator::AddDampingBdrCoefficients(double coef, SumMatrixCoefficient &fb) -{ - // Robin BC contributions due to surface impedance, lumped ports, and absorbing - // boundaries (resistance). - farfield_op.AddDampingBdrCoefficients(coef, fb); - surf_z_op.AddDampingBdrCoefficients(coef, fb); - lumped_port_op.AddDampingBdrCoefficients(coef, fb); -} - -void SpaceOperator::AddRealMassCoefficients(double coef, SumMatrixCoefficient &f) -{ - constexpr auto MatType = MaterialPropertyType::PERMITTIVITY_REAL; - f.AddCoefficient(std::make_unique>(mat_op, coef)); -} - -void SpaceOperator::AddRealMassBdrCoefficients(double coef, SumMatrixCoefficient &fb) -{ - // Robin BC contributions due to surface impedance and lumped ports (capacitance). - surf_z_op.AddMassBdrCoefficients(coef, fb); - lumped_port_op.AddMassBdrCoefficients(coef, fb); -} - -void SpaceOperator::AddImagMassCoefficients(double coef, SumMatrixCoefficient &f) -{ - // Contribution for loss tangent: ε -> ε * (1 - i tan(δ)). - if (mat_op.HasLossTangent()) - { - constexpr auto MatType = MaterialPropertyType::PERMITTIVITY_IMAG; - f.AddCoefficient(std::make_unique>(mat_op, coef), - mat_op.GetLossTangentMarker()); - } -} - -void SpaceOperator::AddAbsMassCoefficients(double coef, SumMatrixCoefficient &f) -{ - constexpr auto MatType = MaterialPropertyType::PERMITTIVITY_ABS; - f.AddCoefficient(std::make_unique>(mat_op, coef)); -} - -void SpaceOperator::AddExtraSystemBdrCoefficients(double omega, SumCoefficient &dfbr, - SumCoefficient &dfbi, - SumMatrixCoefficient &fbr, - SumMatrixCoefficient &fbi) -{ - // Contribution for second-order farfield boundaries and finite conductivity boundaries. - farfield_op.AddExtraSystemBdrCoefficients(omega, dfbr, dfbi); - surf_sigma_op.AddExtraSystemBdrCoefficients(omega, fbr, fbi); - - // Contribution for numeric wave ports. - wave_port_op.AddExtraSystemBdrCoefficients(omega, fbr, fbi); -} - -bool SpaceOperator::GetExcitationVector(Vector &RHS) -{ - // Time domain excitation vector. - RHS.SetSize(GetNDSpace().GetTrueVSize()); - RHS = 0.0; - bool nnz = AddExcitationVector1Internal(RHS); - linalg::SetSubVector(RHS, nd_dbc_tdof_lists.back(), 0.0); - return nnz; -} - -bool SpaceOperator::GetExcitationVector(double omega, ComplexVector &RHS) -{ - // Frequency domain excitation vector: RHS = iω RHS1 + RHS2(ω). - RHS.SetSize(GetNDSpace().GetTrueVSize()); - RHS = 0.0; - bool nnz1 = AddExcitationVector1Internal(RHS.Real()); - RHS *= 1i * omega; - bool nnz2 = AddExcitationVector2Internal(omega, RHS); - linalg::SetSubVector(RHS, nd_dbc_tdof_lists.back(), 0.0); - return nnz1 || nnz2; -} - -bool SpaceOperator::GetExcitationVector1(ComplexVector &RHS1) -{ - // Assemble the frequency domain excitation term with linear frequency dependence - // (coefficient iω, see GetExcitationVector above, is accounted for later). 
- RHS1.SetSize(GetNDSpace().GetTrueVSize()); - RHS1 = 0.0; - bool nnz1 = AddExcitationVector1Internal(RHS1.Real()); - linalg::SetSubVector(RHS1.Real(), nd_dbc_tdof_lists.back(), 0.0); - return nnz1; -} - -bool SpaceOperator::GetExcitationVector2(double omega, ComplexVector &RHS2) -{ - RHS2.SetSize(GetNDSpace().GetTrueVSize()); - RHS2 = 0.0; - bool nnz2 = AddExcitationVector2Internal(omega, RHS2); - linalg::SetSubVector(RHS2, nd_dbc_tdof_lists.back(), 0.0); - return nnz2; -} - -bool SpaceOperator::AddExcitationVector1Internal(Vector &RHS1) -{ - // Assemble the time domain excitation -g'(t) J or frequency domain excitation -iω J. - // The g'(t) or iω factors are not accounted for here, they is accounted for in the time - // integration or frequency sweep later. - MFEM_VERIFY(RHS1.Size() == GetNDSpace().GetTrueVSize(), - "Invalid T-vector size for AddExcitationVector1Internal!"); - SumVectorCoefficient fb(GetNDSpace().GetParMesh()->SpaceDimension()); - lumped_port_op.AddExcitationBdrCoefficients(fb); - surf_j_op.AddExcitationBdrCoefficients(fb); - if (fb.empty()) - { - return false; - } - mfem::LinearForm rhs1(&GetNDSpace()); - rhs1.AddBoundaryIntegrator(new VectorFEBoundaryLFIntegrator(fb)); - rhs1.UseFastAssembly(false); - rhs1.Assemble(); - GetNDSpace().GetProlongationMatrix()->AddMultTranspose(rhs1, RHS1); - return true; -} - -bool SpaceOperator::AddExcitationVector2Internal(double omega, ComplexVector &RHS2) -{ - // Assemble the contribution of wave ports to the frequency domain excitation term at the - // specified frequency. - MFEM_VERIFY(RHS2.Size() == GetNDSpace().GetTrueVSize(), - "Invalid T-vector size for AddExcitationVector2Internal!"); - SumVectorCoefficient fbr(GetNDSpace().GetParMesh()->SpaceDimension()), - fbi(GetNDSpace().GetParMesh()->SpaceDimension()); - wave_port_op.AddExcitationBdrCoefficients(omega, fbr, fbi); - if (fbr.empty() && fbi.empty()) - { - return false; - } - mfem::LinearForm rhs2r(&GetNDSpace()), rhs2i(&GetNDSpace()); - rhs2r.AddBoundaryIntegrator(new VectorFEBoundaryLFIntegrator(fbr)); - rhs2i.AddBoundaryIntegrator(new VectorFEBoundaryLFIntegrator(fbi)); - rhs2r.UseFastAssembly(false); - rhs2i.UseFastAssembly(false); - rhs2r.Assemble(); - rhs2i.Assemble(); - GetNDSpace().GetProlongationMatrix()->AddMultTranspose(rhs2r, RHS2.Real()); - GetNDSpace().GetProlongationMatrix()->AddMultTranspose(rhs2i, RHS2.Imag()); - return true; -} - -void SpaceOperator::GetConstantInitialVector(ComplexVector &v) -{ - v.SetSize(GetNDSpace().GetTrueVSize()); - v = 1.0; - linalg::SetSubVector(v.Real(), nd_dbc_tdof_lists.back(), 0.0); -} - -void SpaceOperator::GetRandomInitialVector(ComplexVector &v) -{ - v.SetSize(GetNDSpace().GetTrueVSize()); - linalg::SetRandom(GetNDSpace().GetComm(), v); - linalg::SetSubVector(v, nd_dbc_tdof_lists.back(), 0.0); -} - -template std::unique_ptr - SpaceOperator::GetStiffnessMatrix(Operator::DiagonalPolicy); -template std::unique_ptr - SpaceOperator::GetStiffnessMatrix(Operator::DiagonalPolicy); - -template std::unique_ptr - SpaceOperator::GetDampingMatrix(Operator::DiagonalPolicy); -template std::unique_ptr - SpaceOperator::GetDampingMatrix(Operator::DiagonalPolicy); - -template std::unique_ptr SpaceOperator::GetMassMatrix(Operator::DiagonalPolicy); -template std::unique_ptr - SpaceOperator::GetMassMatrix(Operator::DiagonalPolicy); - -template std::unique_ptr -SpaceOperator::GetExtraSystemMatrix(double, Operator::DiagonalPolicy); -template std::unique_ptr -SpaceOperator::GetExtraSystemMatrix(double, Operator::DiagonalPolicy); - -template 
std::unique_ptr -SpaceOperator::GetSystemMatrix(double, double, double, const Operator *, - const Operator *, const Operator *, - const Operator *); -template std::unique_ptr -SpaceOperator::GetSystemMatrix>( - std::complex, std::complex, std::complex, - const ComplexOperator *, const ComplexOperator *, const ComplexOperator *, - const ComplexOperator *); - -template std::unique_ptr -SpaceOperator::GetPreconditionerMatrix(double, double, double, double); -template std::unique_ptr -SpaceOperator::GetPreconditionerMatrix(double, double, double, double); - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "spaceoperator.hpp" + +#include +#include +#include "fem/bilinearform.hpp" +#include "fem/coefficient.hpp" +#include "fem/integrator.hpp" +#include "fem/mesh.hpp" +#include "fem/multigrid.hpp" +#include "linalg/hypre.hpp" +#include "linalg/rap.hpp" +#include "utils/communication.hpp" +#include "utils/geodata.hpp" +#include "utils/iodata.hpp" +#include "utils/prettyprint.hpp" + +namespace palace +{ + +using namespace std::complex_literals; + +SpaceOperator::SpaceOperator(const IoData &iodata, + const std::vector> &mesh) + : pc_mat_real(iodata.solver.linear.pc_mat_real), + pc_mat_shifted(iodata.solver.linear.pc_mat_shifted), print_hdr(true), + print_prec_hdr(true), dbc_attr(SetUpBoundaryProperties(iodata, *mesh.back())), + nd_fecs(fem::ConstructFECollections( + iodata.solver.order, mesh.back()->Dimension(), iodata.solver.linear.mg_max_levels, + iodata.solver.linear.mg_coarsening, false)), + h1_fecs(fem::ConstructFECollections( + iodata.solver.order, mesh.back()->Dimension(), iodata.solver.linear.mg_max_levels, + iodata.solver.linear.mg_coarsening, false)), + rt_fecs(fem::ConstructFECollections( + iodata.solver.order - 1, mesh.back()->Dimension(), + iodata.solver.linear.estimator_mg ? iodata.solver.linear.mg_max_levels : 1, + iodata.solver.linear.mg_coarsening, false)), + nd_fespaces(fem::ConstructFiniteElementSpaceHierarchy( + iodata.solver.linear.mg_max_levels, mesh, nd_fecs, &dbc_attr, &nd_dbc_tdof_lists)), + h1_fespaces(fem::ConstructFiniteElementSpaceHierarchy( + iodata.solver.linear.mg_max_levels, mesh, h1_fecs, &dbc_attr, &h1_dbc_tdof_lists)), + rt_fespaces(fem::ConstructFiniteElementSpaceHierarchy( + iodata.solver.linear.estimator_mg ? iodata.solver.linear.mg_max_levels : 1, mesh, + rt_fecs)), + mat_op(iodata, *mesh.back()), farfield_op(iodata, mat_op, *mesh.back()), + surf_sigma_op(iodata, mat_op, *mesh.back()), surf_z_op(iodata, mat_op, *mesh.back()), + lumped_port_op(iodata, mat_op, *mesh.back()), + wave_port_op(iodata, mat_op, GetNDSpace(), GetH1Space()), + surf_j_op(iodata, *mesh.back()), + port_excitation_helper(lumped_port_op, wave_port_op, surf_j_op) +{ + // Check Excitations. + if (iodata.problem.type == ProblemType::DRIVEN) + { + MFEM_VERIFY(!port_excitation_helper.Empty(), + "Driven problems must specify at least one excitation!"); + } + else if (iodata.problem.type == ProblemType::EIGENMODE) + { + MFEM_VERIFY(port_excitation_helper.Empty(), + "Eigenmode problems must not specify any excitation!"); + } + else if (iodata.problem.type == ProblemType::TRANSIENT) + { + MFEM_VERIFY( + port_excitation_helper.Size() == 1, + "Transient problems currently only support a single excitation per simulation!"); + } + else + { + MFEM_ABORT("Internal Error: Solver type incompatible with SpaceOperator."); + } + + // Finalize setup. + CheckBoundaryProperties(); + + // Print essential BC information. 
+ if (dbc_attr.Size()) + { + Mpi::Print("\nConfiguring Dirichlet PEC BC at attributes:\n"); + utils::PrettyPrint(dbc_attr); + } +} + +mfem::Array SpaceOperator::SetUpBoundaryProperties(const IoData &iodata, + const mfem::ParMesh &mesh) +{ + // Check that boundary attributes have been specified correctly. + int bdr_attr_max = mesh.bdr_attributes.Size() ? mesh.bdr_attributes.Max() : 0; + mfem::Array bdr_attr_marker; + if (!iodata.boundaries.pec.empty()) + { + bdr_attr_marker.SetSize(bdr_attr_max); + bdr_attr_marker = 0; + for (auto attr : mesh.bdr_attributes) + { + bdr_attr_marker[attr - 1] = 1; + } + std::set bdr_warn_list; + for (auto attr : iodata.boundaries.pec.attributes) + { + // MFEM_VERIFY(attr > 0 && attr <= bdr_attr_max, + // "PEC boundary attribute tags must be non-negative and correspond to " + // "attributes in the mesh!"); + // MFEM_VERIFY(bdr_attr_marker[attr - 1], + // "Unknown PEC boundary attribute " << attr << "!"); + if (attr <= 0 || attr > bdr_attr_max || !bdr_attr_marker[attr - 1]) + { + bdr_warn_list.insert(attr); + } + } + if (!bdr_warn_list.empty()) + { + Mpi::Print("\n"); + Mpi::Warning("Unknown PEC boundary attributes!\nSolver will just ignore them!"); + utils::PrettyPrint(bdr_warn_list, "Boundary attribute list:"); + Mpi::Print("\n"); + } + } + + // Mark selected boundary attributes from the mesh as essential (Dirichlet). + mfem::Array dbc_bcs; + dbc_bcs.Reserve(static_cast(iodata.boundaries.pec.attributes.size())); + for (auto attr : iodata.boundaries.pec.attributes) + { + if (attr <= 0 || attr > bdr_attr_max || !bdr_attr_marker[attr - 1]) + { + continue; // Can just ignore if wrong + } + dbc_bcs.Append(attr); + } + return dbc_bcs; +} + +void SpaceOperator::CheckBoundaryProperties() +{ + // Mark selected boundary attributes from the mesh as having some Dirichlet, Neumann, or + // mixed BC applied. + const mfem::ParMesh &mesh = GetMesh(); + int bdr_attr_max = mesh.bdr_attributes.Size() ? mesh.bdr_attributes.Max() : 0; + const auto dbc_marker = mesh::AttrToMarker(bdr_attr_max, dbc_attr); + const auto farfield_marker = mesh::AttrToMarker(bdr_attr_max, farfield_op.GetAttrList()); + const auto surf_sigma_marker = + mesh::AttrToMarker(bdr_attr_max, surf_sigma_op.GetAttrList()); + const auto surf_z_Rs_marker = mesh::AttrToMarker(bdr_attr_max, surf_z_op.GetRsAttrList()); + const auto surf_z_Ls_marker = mesh::AttrToMarker(bdr_attr_max, surf_z_op.GetLsAttrList()); + const auto lumped_port_Rs_marker = + mesh::AttrToMarker(bdr_attr_max, lumped_port_op.GetRsAttrList()); + const auto lumped_port_Ls_marker = + mesh::AttrToMarker(bdr_attr_max, lumped_port_op.GetLsAttrList()); + const auto wave_port_marker = + mesh::AttrToMarker(bdr_attr_max, wave_port_op.GetAttrList()); + mfem::Array aux_bdr_marker(dbc_marker.Size()); + for (int i = 0; i < dbc_marker.Size(); i++) + { + aux_bdr_marker[i] = + (dbc_marker[i] || farfield_marker[i] || surf_sigma_marker[i] || + surf_z_Rs_marker[i] || surf_z_Ls_marker[i] || lumped_port_Rs_marker[i] || + lumped_port_Ls_marker[i] || wave_port_marker[i]); + if (aux_bdr_marker[i]) + { + aux_bdr_attr.Append(i + 1); + } + } + // aux_bdr_marker = 1; // Mark all boundaries (including material interfaces + // // added during mesh preprocessing) + // // As tested, this does not eliminate all DC modes! 
+ for (std::size_t l = 0; l < GetH1Spaces().GetNumLevels(); l++) + { + GetH1Spaces().GetFESpaceAtLevel(l).Get().GetEssentialTrueDofs( + aux_bdr_marker, aux_bdr_tdof_lists.emplace_back()); + } + + // A final check that no boundary attribute is assigned multiple boundary conditions. + const auto surf_z_marker = mesh::AttrToMarker(bdr_attr_max, surf_z_op.GetAttrList()); + const auto lumped_port_marker = + mesh::AttrToMarker(bdr_attr_max, lumped_port_op.GetAttrList()); + const auto surf_j_marker = mesh::AttrToMarker(bdr_attr_max, surf_j_op.GetAttrList()); + for (int i = 0; i < dbc_marker.Size(); i++) + { + MFEM_VERIFY(dbc_marker[i] + farfield_marker[i] + surf_sigma_marker[i] + + surf_z_marker[i] + lumped_port_marker[i] + wave_port_marker[i] + + surf_j_marker[i] <= + 1, + "Boundary attributes should not be specified with multiple BC!"); + } +} + +namespace +{ + +void PrintHeader(const mfem::ParFiniteElementSpace &h1_fespace, + const mfem::ParFiniteElementSpace &nd_fespace, + const mfem::ParFiniteElementSpace &rt_fespace, bool &print_hdr) +{ + if (print_hdr) + { + Mpi::Print("\nAssembling system matrices, number of global unknowns:\n" + " H1 (p = {:d}): {:d}, ND (p = {:d}): {:d}, RT (p = {:d}): {:d}\n Operator " + "assembly level: {}\n", + h1_fespace.GetMaxElementOrder(), h1_fespace.GlobalTrueVSize(), + nd_fespace.GetMaxElementOrder(), nd_fespace.GlobalTrueVSize(), + rt_fespace.GetMaxElementOrder(), rt_fespace.GlobalTrueVSize(), + (nd_fespace.GetMaxElementOrder() >= BilinearForm::pa_order_threshold) + ? "Partial" + : "Full"); + + const auto &mesh = *nd_fespace.GetParMesh(); + const auto geom_types = mesh::CheckElements(mesh).GetGeomTypes(); + Mpi::Print(" Mesh geometries:\n"); + for (auto geom : geom_types) + { + const auto *fe = nd_fespace.FEColl()->FiniteElementForGeometry(geom); + MFEM_VERIFY(fe, "MFEM does not support ND spaces on geometry = " + << mfem::Geometry::Name[geom] << "!"); + const int q_order = fem::DefaultIntegrationOrder::Get(mesh, geom); + Mpi::Print(" {}: P = {:d}, Q = {:d} (quadrature order = {:d}){}\n", + mfem::Geometry::Name[geom], fe->GetDof(), + mfem::IntRules.Get(geom, q_order).GetNPoints(), q_order, + (geom == geom_types.back()) ? 
"" : ","); + } + } + print_hdr = false; +} + +void AddIntegrators(BilinearForm &a, const MaterialPropertyCoefficient *df, + const MaterialPropertyCoefficient *f, + const MaterialPropertyCoefficient *dfb, + const MaterialPropertyCoefficient *fb, + const MaterialPropertyCoefficient *fp, bool assemble_q_data = false) +{ + if (df && !df->empty() && f && !f->empty()) + { + a.AddDomainIntegrator(*df, *f); + } + else + { + if (df && !df->empty()) + { + a.AddDomainIntegrator(*df); + } + if (f && !f->empty()) + { + a.AddDomainIntegrator(*f); + } + } + if (dfb && !dfb->empty() && fb && !fb->empty()) + { + a.AddBoundaryIntegrator(*dfb, *fb); + } + else + { + if (dfb && !dfb->empty()) + { + a.AddBoundaryIntegrator(*dfb); + } + if (fb && !fb->empty()) + { + a.AddBoundaryIntegrator(*fb); + } + } + if (fp && !fp->empty()) + { + a.AddDomainIntegrator(*fp); + a.AddDomainIntegrator(*fp, true); + } + if (assemble_q_data) + { + a.AssembleQuadratureData(); + } +} + +void AddAuxIntegrators(BilinearForm &a, const MaterialPropertyCoefficient *f, + const MaterialPropertyCoefficient *fb, bool assemble_q_data = false) +{ + if (f && !f->empty()) + { + a.AddDomainIntegrator(*f); + } + if (fb && !fb->empty()) + { + a.AddBoundaryIntegrator(*fb); + } + if (assemble_q_data) + { + a.AssembleQuadratureData(); + } +} + +auto AssembleOperator(const FiniteElementSpace &fespace, + const MaterialPropertyCoefficient *df, + const MaterialPropertyCoefficient *f, + const MaterialPropertyCoefficient *dfb, + const MaterialPropertyCoefficient *fb, + const MaterialPropertyCoefficient *fp, bool skip_zeros = false, + bool assemble_q_data = false) +{ + BilinearForm a(fespace); + AddIntegrators(a, df, f, dfb, fb, fp, assemble_q_data); + return a.Assemble(skip_zeros); +} + +auto AssembleOperators(const FiniteElementSpaceHierarchy &fespaces, + const MaterialPropertyCoefficient *df, + const MaterialPropertyCoefficient *f, + const MaterialPropertyCoefficient *dfb, + const MaterialPropertyCoefficient *fb, + const MaterialPropertyCoefficient *fp, bool skip_zeros = false, + bool assemble_q_data = false, std::size_t l0 = 0) +{ + BilinearForm a(fespaces.GetFinestFESpace()); + AddIntegrators(a, df, f, dfb, fb, fp, assemble_q_data); + return a.Assemble(fespaces, skip_zeros, l0); +} + +auto AssembleAuxOperators(const FiniteElementSpaceHierarchy &fespaces, + const MaterialPropertyCoefficient *f, + const MaterialPropertyCoefficient *fb, bool skip_zeros = false, + bool assemble_q_data = false, std::size_t l0 = 0) +{ + BilinearForm a(fespaces.GetFinestFESpace()); + AddAuxIntegrators(a, f, fb, assemble_q_data); + return a.Assemble(fespaces, skip_zeros, l0); +} + +} // namespace + +template +std::unique_ptr +SpaceOperator::GetStiffnessMatrix(Operator::DiagonalPolicy diag_policy) +{ + PrintHeader(GetH1Space(), GetNDSpace(), GetRTSpace(), print_hdr); + MaterialPropertyCoefficient df(mat_op.MaxCeedAttribute()), f(mat_op.MaxCeedAttribute()), + fb(mat_op.MaxCeedBdrAttribute()), fc(mat_op.MaxCeedAttribute()); + AddStiffnessCoefficients(1.0, df, f); + AddStiffnessBdrCoefficients(1.0, fb); + AddRealPeriodicCoefficients(1.0, f); + AddImagPeriodicCoefficients(1.0, fc); + int empty[2] = {(df.empty() && f.empty() && fb.empty()), (fc.empty())}; + Mpi::GlobalMin(2, empty, GetComm()); + if (empty[0] && empty[1]) + { + return {}; + } + constexpr bool skip_zeros = false; + std::unique_ptr kr, ki; + if (!empty[0]) + { + kr = AssembleOperator(GetNDSpace(), &df, &f, nullptr, &fb, nullptr, skip_zeros); + } + if (!empty[1]) + { + ki = + AssembleOperator(GetNDSpace(), nullptr, 
nullptr, nullptr, nullptr, &fc, skip_zeros); + } + if constexpr (std::is_same::value) + { + auto K = + std::make_unique(std::move(kr), std::move(ki), GetNDSpace()); + K->SetEssentialTrueDofs(nd_dbc_tdof_lists.back(), diag_policy); + return K; + } + else + { + MFEM_VERIFY(!ki, "Unexpected imaginary part in GetStiffnessMatrix!"); + auto K = std::make_unique(std::move(kr), GetNDSpace()); + K->SetEssentialTrueDofs(nd_dbc_tdof_lists.back(), diag_policy); + return K; + } +} + +template +std::unique_ptr +SpaceOperator::GetDampingMatrix(Operator::DiagonalPolicy diag_policy) +{ + PrintHeader(GetH1Space(), GetNDSpace(), GetRTSpace(), print_hdr); + MaterialPropertyCoefficient f(mat_op.MaxCeedAttribute()), + fb(mat_op.MaxCeedBdrAttribute()); + AddDampingCoefficients(1.0, f); + AddDampingBdrCoefficients(1.0, fb); + int empty = (f.empty() && fb.empty()); + Mpi::GlobalMin(1, &empty, GetComm()); + if (empty) + { + return {}; + } + constexpr bool skip_zeros = false; + auto c = AssembleOperator(GetNDSpace(), nullptr, &f, nullptr, &fb, nullptr, skip_zeros); + if constexpr (std::is_same::value) + { + auto C = std::make_unique(std::move(c), nullptr, GetNDSpace()); + C->SetEssentialTrueDofs(nd_dbc_tdof_lists.back(), diag_policy); + return C; + } + else + { + auto C = std::make_unique(std::move(c), GetNDSpace()); + C->SetEssentialTrueDofs(nd_dbc_tdof_lists.back(), diag_policy); + return C; + } +} + +template +std::unique_ptr SpaceOperator::GetMassMatrix(Operator::DiagonalPolicy diag_policy) +{ + PrintHeader(GetH1Space(), GetNDSpace(), GetRTSpace(), print_hdr); + MaterialPropertyCoefficient fr(mat_op.MaxCeedAttribute()), fi(mat_op.MaxCeedAttribute()), + fbr(mat_op.MaxCeedBdrAttribute()), fbi(mat_op.MaxCeedBdrAttribute()); + AddRealMassCoefficients(1.0, fr); + AddRealMassBdrCoefficients(1.0, fbr); + if constexpr (std::is_same::value) + { + AddImagMassCoefficients(1.0, fi); + } + int empty[2] = {(fr.empty() && fbr.empty()), (fi.empty() && fbi.empty())}; + Mpi::GlobalMin(2, empty, GetComm()); + if (empty[0] && empty[1]) + { + return {}; + } + constexpr bool skip_zeros = false; + std::unique_ptr mr, mi; + if (!empty[0]) + { + mr = AssembleOperator(GetNDSpace(), nullptr, &fr, nullptr, &fbr, nullptr, skip_zeros); + } + if (!empty[1]) + { + mi = AssembleOperator(GetNDSpace(), nullptr, &fi, nullptr, &fbi, nullptr, skip_zeros); + } + if constexpr (std::is_same::value) + { + auto M = + std::make_unique(std::move(mr), std::move(mi), GetNDSpace()); + M->SetEssentialTrueDofs(nd_dbc_tdof_lists.back(), diag_policy); + return M; + } + else + { + auto M = std::make_unique(std::move(mr), GetNDSpace()); + M->SetEssentialTrueDofs(nd_dbc_tdof_lists.back(), diag_policy); + return M; + } +} + +template +std::unique_ptr +SpaceOperator::GetExtraSystemMatrix(double omega, Operator::DiagonalPolicy diag_policy) +{ + PrintHeader(GetH1Space(), GetNDSpace(), GetRTSpace(), print_hdr); + MaterialPropertyCoefficient dfbr(mat_op.MaxCeedBdrAttribute()), + dfbi(mat_op.MaxCeedBdrAttribute()), fbr(mat_op.MaxCeedBdrAttribute()), + fbi(mat_op.MaxCeedBdrAttribute()); + AddExtraSystemBdrCoefficients(omega, dfbr, dfbi, fbr, fbi); + int empty[2] = {(dfbr.empty() && fbr.empty()), (dfbi.empty() && fbi.empty())}; + Mpi::GlobalMin(2, empty, GetComm()); + if (empty[0] && empty[1]) + { + return {}; + } + constexpr bool skip_zeros = false; + std::unique_ptr ar, ai; + if (!empty[0]) + { + ar = AssembleOperator(GetNDSpace(), nullptr, nullptr, &dfbr, &fbr, nullptr, skip_zeros); + } + if (!empty[1]) + { + ai = AssembleOperator(GetNDSpace(), nullptr, nullptr, &dfbi, 
&fbi, nullptr, skip_zeros); + } + if constexpr (std::is_same::value) + { + auto A = + std::make_unique(std::move(ar), std::move(ai), GetNDSpace()); + A->SetEssentialTrueDofs(nd_dbc_tdof_lists.back(), diag_policy); + return A; + } + else + { + MFEM_VERIFY(!ai, "Unexpected imaginary part in GetExtraSystemMatrix!"); + auto A = std::make_unique(std::move(ar), GetNDSpace()); + A->SetEssentialTrueDofs(nd_dbc_tdof_lists.back(), diag_policy); + return A; + } +} + +template +std::unique_ptr +SpaceOperator::GetSystemMatrix(ScalarType a0, ScalarType a1, ScalarType a2, + const OperType *K, const OperType *C, const OperType *M, + const OperType *A2) +{ + return BuildParSumOperator({a0, a1, a2, ScalarType{1}}, {K, C, M, A2}); +} + +std::unique_ptr SpaceOperator::GetInnerProductMatrix(double a0, double a2, + const ComplexOperator *K, + const ComplexOperator *M) +{ + const auto *PtAP_K = (K) ? dynamic_cast(K) : nullptr; + const auto *PtAP_M = (M) ? dynamic_cast(M) : nullptr; + return BuildParSumOperator( + {a0, a2}, {PtAP_K ? PtAP_K->Real() : nullptr, PtAP_M ? PtAP_M->Real() : nullptr}); +} + +namespace +{ + +template +auto BuildLevelParOperator(std::unique_ptr &&br, std::unique_ptr &&bi, + const FiniteElementSpace &fespace); + +template <> +auto BuildLevelParOperator(std::unique_ptr &&br, + std::unique_ptr &&bi, + const FiniteElementSpace &fespace) +{ + MFEM_VERIFY( + !bi, + "Should not be constructing a real-valued ParOperator with non-zero imaginary part!"); + return std::make_unique(std::move(br), fespace); +} + +template <> +auto BuildLevelParOperator(std::unique_ptr &&br, + std::unique_ptr &&bi, + const FiniteElementSpace &fespace) +{ + return std::make_unique(std::move(br), std::move(bi), fespace); +} + +} // namespace + +void SpaceOperator::AssemblePreconditioner( + std::complex a0, std::complex a1, std::complex a2, double a3, + std::vector> &br_vec, + std::vector> &br_aux_vec, + std::vector> &bi_vec, + std::vector> &bi_aux_vec) +{ + constexpr bool skip_zeros = false, assemble_q_data = false; + MaterialPropertyCoefficient dfr(mat_op.MaxCeedAttribute()), + dfi(mat_op.MaxCeedAttribute()), fr(mat_op.MaxCeedAttribute()), + fi(mat_op.MaxCeedAttribute()), dfbr(mat_op.MaxCeedBdrAttribute()), + dfbi(mat_op.MaxCeedBdrAttribute()), fbr(mat_op.MaxCeedBdrAttribute()), + fbi(mat_op.MaxCeedBdrAttribute()), fpi(mat_op.MaxCeedAttribute()), + fpr(mat_op.MaxCeedAttribute()); + AddStiffnessCoefficients(a0.real(), dfr, fr); + AddStiffnessCoefficients(a0.imag(), dfi, fi); + AddStiffnessBdrCoefficients(a0.real(), fbr); + AddStiffnessBdrCoefficients(a0.imag(), fbi); + AddDampingCoefficients(a1.real(), fr); + AddDampingCoefficients(a1.imag(), fi); + AddDampingBdrCoefficients(a1.real(), fbr); + AddDampingBdrCoefficients(a1.imag(), fbi); + AddRealMassCoefficients(pc_mat_shifted ? std::abs(a2.real()) : a2.real(), fr); + AddRealMassCoefficients(a2.imag(), fi); + AddRealMassBdrCoefficients(pc_mat_shifted ? 
std::abs(a2.real()) : a2.real(), fbr); + AddRealMassBdrCoefficients(a2.imag(), fbi); + AddImagMassCoefficients(a2.real(), fi); + AddImagMassCoefficients(-a2.imag(), fr); + AddExtraSystemBdrCoefficients(a3, dfbr, dfbi, fbr, fbi); + AddRealPeriodicCoefficients(a0.real(), fr); + AddRealPeriodicCoefficients(a0.imag(), fi); + AddImagPeriodicCoefficients(a0.real(), fpi); + AddImagPeriodicCoefficients(-a0.imag(), fpr); + int empty[2] = { + (dfr.empty() && fr.empty() && dfbr.empty() && fbr.empty() && fpr.empty()), + (dfi.empty() && fi.empty() && dfbi.empty() && fbi.empty() && fpi.empty())}; + Mpi::GlobalMin(2, empty, GetComm()); + if (!empty[0]) + { + br_vec = AssembleOperators(GetNDSpaces(), &dfr, &fr, &dfbr, &fbr, &fpr, skip_zeros, + assemble_q_data); + br_aux_vec = + AssembleAuxOperators(GetH1Spaces(), &fr, &fbr, skip_zeros, assemble_q_data); + } + if (!empty[1]) + { + bi_vec = AssembleOperators(GetNDSpaces(), &dfi, &fi, &dfbi, &fbi, &fpi, skip_zeros, + assemble_q_data); + bi_aux_vec = + AssembleAuxOperators(GetH1Spaces(), &fi, &fbi, skip_zeros, assemble_q_data); + } +} + +void SpaceOperator::AssemblePreconditioner( + std::complex a0, std::complex a1, std::complex a2, double a3, + std::vector> &br_vec, + std::vector> &br_aux_vec) +{ + constexpr bool skip_zeros = false, assemble_q_data = false; + MaterialPropertyCoefficient dfr(mat_op.MaxCeedAttribute()), fr(mat_op.MaxCeedAttribute()), + dfbr(mat_op.MaxCeedBdrAttribute()), fbr(mat_op.MaxCeedBdrAttribute()); + AddStiffnessCoefficients(a0.real(), dfr, fr); + AddStiffnessBdrCoefficients(a0.real(), fbr); + AddDampingCoefficients(a1.imag(), fr); + AddDampingBdrCoefficients(a1.imag(), fbr); + AddAbsMassCoefficients(pc_mat_shifted ? std::abs(a2.real()) : a2.real(), fr); + AddRealMassBdrCoefficients(pc_mat_shifted ? std::abs(a2.real()) : a2.real(), fbr); + AddExtraSystemBdrCoefficients(a3, dfbr, dfbr, fbr, fbr); + AddRealPeriodicCoefficients(a0.real(), fr); + int empty = (dfr.empty() && fr.empty() && dfbr.empty() && fbr.empty()); + Mpi::GlobalMin(1, &empty, GetComm()); + if (!empty) + { + br_vec = AssembleOperators(GetNDSpaces(), &dfr, &fr, &dfbr, &fbr, nullptr, skip_zeros, + assemble_q_data); + br_aux_vec = + AssembleAuxOperators(GetH1Spaces(), &fr, &fbr, skip_zeros, assemble_q_data); + } +} + +void SpaceOperator::AssemblePreconditioner( + double a0, double a1, double a2, double a3, + std::vector> &br_vec, + std::vector> &br_aux_vec) +{ + constexpr bool skip_zeros = false, assemble_q_data = false; + MaterialPropertyCoefficient dfr(mat_op.MaxCeedAttribute()), fr(mat_op.MaxCeedAttribute()), + dfbr(mat_op.MaxCeedBdrAttribute()), fbr(mat_op.MaxCeedBdrAttribute()); + AddStiffnessCoefficients(a0, dfr, fr); + AddStiffnessBdrCoefficients(a0, fbr); + AddDampingCoefficients(a1, fr); + AddDampingBdrCoefficients(a1, fbr); + AddAbsMassCoefficients(pc_mat_shifted ? std::abs(a2) : a2, fr); + AddRealMassBdrCoefficients(pc_mat_shifted ? 
std::abs(a2) : a2, fbr); + AddExtraSystemBdrCoefficients(a3, dfbr, dfbr, fbr, fbr); + AddRealPeriodicCoefficients(a0, fr); + int empty = (dfr.empty() && fr.empty() && dfbr.empty() && fbr.empty()); + Mpi::GlobalMin(1, &empty, GetComm()); + if (!empty) + { + br_vec = AssembleOperators(GetNDSpaces(), &dfr, &fr, &dfbr, &fbr, nullptr, skip_zeros, + assemble_q_data); + br_aux_vec = + AssembleAuxOperators(GetH1Spaces(), &fr, &fbr, skip_zeros, assemble_q_data); + } +} + +template +std::unique_ptr SpaceOperator::GetPreconditionerMatrix(ScalarType a0, + ScalarType a1, + ScalarType a2, double a3) +{ + // When partially assembled, the coarse operators can reuse the fine operator quadrature + // data if the spaces correspond to the same mesh. When appropriate, we build the + // preconditioner on all levels based on the actual complex-valued system matrix. The + // coarse operator is always fully assembled. + if (print_prec_hdr) + { + Mpi::Print("\nAssembling multigrid hierarchy:\n"); + } + MFEM_VERIFY(GetH1Spaces().GetNumLevels() == GetNDSpaces().GetNumLevels(), + "Multigrid hierarchy mismatch for auxiliary space preconditioning!"); + + const auto n_levels = GetNDSpaces().GetNumLevels(); + std::vector> br_vec(n_levels), bi_vec(n_levels), + br_aux_vec(n_levels), bi_aux_vec(n_levels); + if (std::is_same::value && !pc_mat_real) + { + AssemblePreconditioner(a0, a1, a2, a3, br_vec, br_aux_vec, bi_vec, bi_aux_vec); + } + else + { + AssemblePreconditioner(a0, a1, a2, a3, br_vec, br_aux_vec); + } + + auto B = std::make_unique>(n_levels); + for (bool aux : {false, true}) + { + for (std::size_t l = 0; l < n_levels; l++) + { + const auto &fespace_l = + aux ? GetH1Spaces().GetFESpaceAtLevel(l) : GetNDSpaces().GetFESpaceAtLevel(l); + const auto &dbc_tdof_lists_l = aux ? h1_dbc_tdof_lists[l] : nd_dbc_tdof_lists[l]; + auto &br_l = aux ? br_aux_vec[l] : br_vec[l]; + auto &bi_l = aux ? bi_aux_vec[l] : bi_vec[l]; + if (print_prec_hdr) + { + Mpi::Print(" Level {:d}{} (p = {:d}): {:d} unknowns", l, aux ? " (auxiliary)" : "", + fespace_l.GetMaxElementOrder(), fespace_l.GlobalTrueVSize()); + const auto *b_spm = dynamic_cast(br_l.get()); + if (!b_spm) + { + b_spm = dynamic_cast(bi_l.get()); + } + if (b_spm) + { + HYPRE_BigInt nnz = b_spm->NNZ(); + Mpi::GlobalSum(1, &nnz, fespace_l.GetComm()); + Mpi::Print(", {:d} NNZ\n", nnz); + } + else + { + Mpi::Print("\n"); + } + } + auto B_l = + BuildLevelParOperator(std::move(br_l), std::move(bi_l), fespace_l); + B_l->SetEssentialTrueDofs(dbc_tdof_lists_l, Operator::DiagonalPolicy::DIAG_ONE); + if (aux) + { + B->AddAuxiliaryOperator(std::move(B_l)); + } + else + { + B->AddOperator(std::move(B_l)); + } + } + } + + print_prec_hdr = false; + return B; +} + +void SpaceOperator::AddStiffnessCoefficients(double coeff, MaterialPropertyCoefficient &df, + MaterialPropertyCoefficient &f) +{ + // Contribution from material permeability. + df.AddCoefficient(mat_op.GetAttributeToMaterial(), mat_op.GetInvPermeability(), coeff); + + // Contribution for London superconductors. + if (mat_op.HasLondonDepth()) + { + f.AddCoefficient(mat_op.GetAttributeToMaterial(), mat_op.GetInvLondonDepth(), coeff); + } +} + +void SpaceOperator::AddStiffnessBdrCoefficients(double coeff, + MaterialPropertyCoefficient &fb) +{ + // Robin BC contributions due to surface impedance and lumped ports (inductance). 
+ surf_z_op.AddStiffnessBdrCoefficients(coeff, fb); + lumped_port_op.AddStiffnessBdrCoefficients(coeff, fb); +} + +void SpaceOperator::AddDampingCoefficients(double coeff, MaterialPropertyCoefficient &f) +{ + // Contribution for domain conductivity. + if (mat_op.HasConductivity()) + { + f.AddCoefficient(mat_op.GetAttributeToMaterial(), mat_op.GetConductivity(), coeff); + } +} + +void SpaceOperator::AddDampingBdrCoefficients(double coeff, MaterialPropertyCoefficient &fb) +{ + // Robin BC contributions due to surface impedance, lumped ports, and absorbing + // boundaries (resistance). + farfield_op.AddDampingBdrCoefficients(coeff, fb); + surf_z_op.AddDampingBdrCoefficients(coeff, fb); + lumped_port_op.AddDampingBdrCoefficients(coeff, fb); +} + +void SpaceOperator::AddRealMassCoefficients(double coeff, MaterialPropertyCoefficient &f) +{ + f.AddCoefficient(mat_op.GetAttributeToMaterial(), mat_op.GetPermittivityReal(), coeff); +} + +void SpaceOperator::AddRealMassBdrCoefficients(double coeff, + MaterialPropertyCoefficient &fb) +{ + // Robin BC contributions due to surface impedance and lumped ports (capacitance). + surf_z_op.AddMassBdrCoefficients(coeff, fb); + lumped_port_op.AddMassBdrCoefficients(coeff, fb); +} + +void SpaceOperator::AddImagMassCoefficients(double coeff, MaterialPropertyCoefficient &f) +{ + // Contribution for loss tangent: ε -> ε * (1 - i tan(δ)). + if (mat_op.HasLossTangent()) + { + f.AddCoefficient(mat_op.GetAttributeToMaterial(), mat_op.GetPermittivityImag(), coeff); + } +} + +void SpaceOperator::AddAbsMassCoefficients(double coeff, MaterialPropertyCoefficient &f) +{ + f.AddCoefficient(mat_op.GetAttributeToMaterial(), mat_op.GetPermittivityAbs(), coeff); +} + +void SpaceOperator::AddExtraSystemBdrCoefficients(double omega, + MaterialPropertyCoefficient &dfbr, + MaterialPropertyCoefficient &dfbi, + MaterialPropertyCoefficient &fbr, + MaterialPropertyCoefficient &fbi) +{ + // Contribution for second-order farfield boundaries and finite conductivity boundaries. + farfield_op.AddExtraSystemBdrCoefficients(omega, dfbr, dfbi); + surf_sigma_op.AddExtraSystemBdrCoefficients(omega, fbr, fbi); + + // Contribution for numeric wave ports. + wave_port_op.AddExtraSystemBdrCoefficients(omega, fbr, fbi); +} + +void SpaceOperator::AddRealPeriodicCoefficients(double coeff, + MaterialPropertyCoefficient &f) +{ + // Floquet periodicity contributions. + if (mat_op.HasWaveVector()) + { + f.AddCoefficient(mat_op.GetAttributeToMaterial(), mat_op.GetFloquetMass(), coeff); + } +} + +void SpaceOperator::AddImagPeriodicCoefficients(double coeff, + MaterialPropertyCoefficient &f) +{ + // Floquet periodicity contributions. + if (mat_op.HasWaveVector()) + { + f.AddCoefficient(mat_op.GetAttributeToMaterial(), mat_op.GetFloquetCurl(), coeff); + } +} + +bool SpaceOperator::GetExcitationVector(int excitation_idx, Vector &RHS) +{ + // Time domain excitation vector. + RHS.SetSize(GetNDSpace().GetTrueVSize()); + RHS.UseDevice(true); + RHS = 0.0; + bool nnz = AddExcitationVector1Internal(excitation_idx, RHS); + linalg::SetSubVector(RHS, nd_dbc_tdof_lists.back(), 0.0); + return nnz; +} + +bool SpaceOperator::GetExcitationVector(int excitation_idx, double omega, + ComplexVector &RHS) +{ + // Frequency domain excitation vector: RHS = iω RHS1 + RHS2(ω). 
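// RHS1 is the frequency-independent lumped port / surface current term (assembled in
// AddExcitationVector1Internal and scaled by iω here), while RHS2 is the
// frequency-dependent wave port term evaluated at ω (AddExcitationVector2Internal). A
// frequency sweep can therefore assemble RHS1 once and rebuild only RHS2(ω) per
// frequency; a schematic caller (variable names illustrative, error handling omitted)
// might look like:
//
//   ComplexVector RHS1, RHS2;
//   space_op.GetExcitationVector1(excitation_idx, RHS1);
//   for (double omega : omegas)
//   {
//     space_op.GetExcitationVector2(excitation_idx, omega, RHS2);
//     // Solve with right-hand side iω RHS1 + RHS2(ω).
//   }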
+ RHS.SetSize(GetNDSpace().GetTrueVSize()); + RHS.UseDevice(true); + RHS = 0.0; + bool nnz1 = AddExcitationVector1Internal(excitation_idx, RHS.Real()); + RHS *= 1i * omega; + bool nnz2 = AddExcitationVector2Internal(excitation_idx, omega, RHS); + linalg::SetSubVector(RHS, nd_dbc_tdof_lists.back(), 0.0); + return nnz1 || nnz2; +} + +bool SpaceOperator::GetExcitationVector1(int excitation_idx, ComplexVector &RHS1) +{ + // Assemble the frequency domain excitation term with linear frequency dependence + // (the iω coefficient, see GetExcitationVector above, is accounted for later). + RHS1.SetSize(GetNDSpace().GetTrueVSize()); + RHS1.UseDevice(true); + RHS1 = 0.0; + bool nnz1 = AddExcitationVector1Internal(excitation_idx, RHS1.Real()); + linalg::SetSubVector(RHS1.Real(), nd_dbc_tdof_lists.back(), 0.0); + return nnz1; +} + +bool SpaceOperator::GetExcitationVector2(int excitation_idx, double omega, + ComplexVector &RHS2) +{ + RHS2.SetSize(GetNDSpace().GetTrueVSize()); + RHS2.UseDevice(true); + RHS2 = 0.0; + bool nnz2 = AddExcitationVector2Internal(excitation_idx, omega, RHS2); + linalg::SetSubVector(RHS2, nd_dbc_tdof_lists.back(), 0.0); + return nnz2; +} + +bool SpaceOperator::AddExcitationVector1Internal(int excitation_idx, Vector &RHS1) +{ + // Assemble the time domain excitation -g'(t) J or frequency domain excitation -iω J. + // The g'(t) or iω factors are not accounted for here, they are accounted for in the time + // integration or frequency sweep later. + MFEM_VERIFY(RHS1.Size() == GetNDSpace().GetTrueVSize(), + "Invalid T-vector size for AddExcitationVector1Internal!"); + SumVectorCoefficient fb(GetMesh().SpaceDimension()); + lumped_port_op.AddExcitationBdrCoefficients(excitation_idx, fb); + surf_j_op.AddExcitationBdrCoefficients(fb); // No excitation_idx: currently included in all excitations + int empty = (fb.empty()); + Mpi::GlobalMin(1, &empty, GetComm()); + if (empty) + { + return false; + } + mfem::LinearForm rhs1(&GetNDSpace().Get()); + rhs1.AddBoundaryIntegrator(new VectorFEBoundaryLFIntegrator(fb)); + rhs1.UseFastAssembly(false); + rhs1.UseDevice(false); + rhs1.Assemble(); + rhs1.UseDevice(true); + GetNDSpace().GetProlongationMatrix()->AddMultTranspose(rhs1, RHS1); + return true; +} + +bool SpaceOperator::AddExcitationVector2Internal(int excitation_idx, double omega, + ComplexVector &RHS2) +{ + // Assemble the contribution of wave ports to the frequency domain excitation term at the + // specified frequency.
+ MFEM_VERIFY(RHS2.Size() == GetNDSpace().GetTrueVSize(), + "Invalid T-vector size for AddExcitationVector2Internal!"); + SumVectorCoefficient fbr(GetMesh().SpaceDimension()), fbi(GetMesh().SpaceDimension()); + wave_port_op.AddExcitationBdrCoefficients(excitation_idx, omega, fbr, fbi); + int empty = (fbr.empty() && fbi.empty()); + Mpi::GlobalMin(1, &empty, GetComm()); + if (empty) + { + return false; + } + { + mfem::LinearForm rhs2(&GetNDSpace().Get()); + rhs2.AddBoundaryIntegrator(new VectorFEBoundaryLFIntegrator(fbr)); + rhs2.UseFastAssembly(false); + rhs2.UseDevice(false); + rhs2.Assemble(); + rhs2.UseDevice(true); + GetNDSpace().GetProlongationMatrix()->AddMultTranspose(rhs2, RHS2.Real()); + } + { + mfem::LinearForm rhs2(&GetNDSpace().Get()); + rhs2.AddBoundaryIntegrator(new VectorFEBoundaryLFIntegrator(fbi)); + rhs2.UseFastAssembly(false); + rhs2.UseDevice(false); + rhs2.Assemble(); + rhs2.UseDevice(true); + GetNDSpace().GetProlongationMatrix()->AddMultTranspose(rhs2, RHS2.Imag()); + } + return true; +} + +void SpaceOperator::GetConstantInitialVector(ComplexVector &v) +{ + v.SetSize(GetNDSpace().GetTrueVSize()); + v.UseDevice(true); + v = 1.0; + linalg::SetSubVector(v.Real(), nd_dbc_tdof_lists.back(), 0.0); +} + +void SpaceOperator::GetRandomInitialVector(ComplexVector &v) +{ + v.SetSize(GetNDSpace().GetTrueVSize()); + v.UseDevice(true); + linalg::SetRandom(GetNDSpace().GetComm(), v); + linalg::SetSubVector(v, nd_dbc_tdof_lists.back(), 0.0); +} + +template std::unique_ptr + SpaceOperator::GetStiffnessMatrix(Operator::DiagonalPolicy); +template std::unique_ptr + SpaceOperator::GetStiffnessMatrix(Operator::DiagonalPolicy); + +template std::unique_ptr + SpaceOperator::GetDampingMatrix(Operator::DiagonalPolicy); +template std::unique_ptr + SpaceOperator::GetDampingMatrix(Operator::DiagonalPolicy); + +template std::unique_ptr SpaceOperator::GetMassMatrix(Operator::DiagonalPolicy); +template std::unique_ptr + SpaceOperator::GetMassMatrix(Operator::DiagonalPolicy); + +template std::unique_ptr +SpaceOperator::GetExtraSystemMatrix(double, Operator::DiagonalPolicy); +template std::unique_ptr +SpaceOperator::GetExtraSystemMatrix(double, Operator::DiagonalPolicy); + +template std::unique_ptr +SpaceOperator::GetSystemMatrix(double, double, double, const Operator *, + const Operator *, const Operator *, + const Operator *); +template std::unique_ptr +SpaceOperator::GetSystemMatrix>( + std::complex, std::complex, std::complex, + const ComplexOperator *, const ComplexOperator *, const ComplexOperator *, + const ComplexOperator *); + +template std::unique_ptr +SpaceOperator::GetPreconditionerMatrix(double, double, double, double); +template std::unique_ptr +SpaceOperator::GetPreconditionerMatrix>( + std::complex, std::complex, std::complex, double); + +} // namespace palace diff --git a/palace/models/spaceoperator.hpp b/palace/models/spaceoperator.hpp index 3cce968d58..ed83322046 100644 --- a/palace/models/spaceoperator.hpp +++ b/palace/models/spaceoperator.hpp @@ -1,207 +1,234 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_MODELS_SPACE_OPERATOR_HPP -#define PALACE_MODELS_SPACE_OPERATOR_HPP - -#include -#include -#include -#include -#include "fem/fespace.hpp" -#include "linalg/operator.hpp" -#include "linalg/vector.hpp" -#include "models/farfieldboundaryoperator.hpp" -#include "models/lumpedportoperator.hpp" -#include "models/materialoperator.hpp" -#include "models/surfaceconductivityoperator.hpp" -#include "models/surfacecurrentoperator.hpp" -#include "models/surfaceimpedanceoperator.hpp" -#include "models/waveportoperator.hpp" - -namespace palace -{ - -class IoData; -class SumCoefficient; -class SumMatrixCoefficient; - -// -// A class handling spatial discretization of the governing equations. -// -class SpaceOperator -{ -private: - const int pa_order_threshold; // Order above which to use partial assembly vs. full - const bool skip_zeros; // Skip zeros during full assembly of matrices - const bool pc_mat_real; // Use real-valued matrix for preconditioner - const bool pc_mat_shifted; // Use shifted mass matrix for preconditioner - - // Helper variables for log file printing. - bool print_hdr, print_prec_hdr; - - // Perfect electrical conductor essential boundary condition markers. - mfem::Array dbc_marker, aux_bdr_marker; - std::vector> nd_dbc_tdof_lists, h1_dbc_tdof_lists, aux_bdr_tdof_lists; - void CheckBoundaryProperties(); - - // Objects defining the finite element spaces for the electric field (Nedelec) and - // magnetic flux density (Raviart-Thomas) on the given mesh. The H1 spaces are used for - // various purposes throughout the code including postprocessing. - std::vector> nd_fecs; - std::vector> h1_fecs; - std::unique_ptr rt_fec; - FiniteElementSpaceHierarchy nd_fespaces; - AuxiliaryFiniteElementSpaceHierarchy h1_fespaces; - AuxiliaryFiniteElementSpace rt_fespace; - - // Operator for domain material properties. - MaterialOperator mat_op; - - // Operators for boundary conditions and source excitations. - FarfieldBoundaryOperator farfield_op; - SurfaceConductivityOperator surf_sigma_op; - SurfaceImpedanceOperator surf_z_op; - LumpedPortOperator lumped_port_op; - WavePortOperator wave_port_op; - SurfaceCurrentOperator surf_j_op; - - // Helper functions for building the bilinear forms corresponding to the discretized - // operators in Maxwell's equations. - void AddStiffnessCoefficients(double coef, SumMatrixCoefficient &df, - SumMatrixCoefficient &f); - void AddStiffnessBdrCoefficients(double coef, SumMatrixCoefficient &fb); - void AddDampingCoefficients(double coef, SumMatrixCoefficient &f); - void AddDampingBdrCoefficients(double coef, SumMatrixCoefficient &fb); - void AddRealMassCoefficients(double coef, SumMatrixCoefficient &f); - void AddRealMassBdrCoefficients(double coef, SumMatrixCoefficient &fb); - void AddImagMassCoefficients(double coef, SumMatrixCoefficient &f); - void AddAbsMassCoefficients(double coef, SumMatrixCoefficient &f); - void AddExtraSystemBdrCoefficients(double omega, SumCoefficient &dfbr, - SumCoefficient &dfbi, SumMatrixCoefficient &fbr, - SumMatrixCoefficient &fbi); - - // Helper functions for excitation vector assembly. - bool AddExcitationVector1Internal(Vector &RHS); - bool AddExcitationVector2Internal(double omega, ComplexVector &RHS); - -public: - SpaceOperator(const IoData &iodata, - const std::vector> &mesh); - - // Return list of all PEC boundary true dofs for all finite element space levels. 
- const std::vector> &GetNDDbcTDofLists() const - { - return nd_dbc_tdof_lists; - } - const std::vector> &GetH1DbcTDofLists() const - { - return h1_dbc_tdof_lists; - } - - // Returns lists of all boundary condition true dofs, PEC included, for the auxiliary - // H1 space hierarchy. These are all boundaries which affect the stiffness and damping - // (K and C) matrices, used for nullspace corrections. - const std::vector> &GetAuxBdrTDofLists() const - { - return aux_bdr_tdof_lists; - } - - // Return material operator for postprocessing. - const MaterialOperator &GetMaterialOp() const { return mat_op; } - - // Access to underlying BC operator objects for postprocessing. - auto &GetLumpedPortOp() { return lumped_port_op; } - auto &GetWavePortOp() { return wave_port_op; } - auto &GetSurfaceCurrentOp() { return surf_j_op; } - const auto &GetLumpedPortOp() const { return lumped_port_op; } - const auto &GetWavePortOp() const { return wave_port_op; } - const auto &GetSurfaceCurrentOp() const { return surf_j_op; } - - // Return the parallel finite element space objects. - auto &GetNDSpaces() { return nd_fespaces; } - const auto &GetNDSpaces() const { return nd_fespaces; } - auto &GetNDSpace() { return nd_fespaces.GetFinestFESpace(); } - const auto &GetNDSpace() const { return nd_fespaces.GetFinestFESpace(); } - auto &GetH1Spaces() { return h1_fespaces; } - const auto &GetH1Spaces() const { return h1_fespaces; } - auto &GetH1Space() { return h1_fespaces.GetFinestFESpace(); } - const auto &GetH1Space() const { return h1_fespaces.GetFinestFESpace(); } - auto &GetRTSpace() { return rt_fespace; } - const auto &GetRTSpace() const { return rt_fespace; } - - // Return the number of true (conforming) dofs on the finest ND space. - auto GlobalTrueVSize() { return GetNDSpace().GlobalTrueVSize(); } - - // Construct any part of the frequency-dependent complex linear system matrix: - // A = K + iω C - ω² (Mr + i Mi) + A2(ω) . - // For time domain problems, any one of K, C, or M = Mr can be constructed. The argument - // ω is required only for the constructing the "extra" matrix A2(ω). - template - std::unique_ptr GetStiffnessMatrix(Operator::DiagonalPolicy diag_policy); - template - std::unique_ptr GetDampingMatrix(Operator::DiagonalPolicy diag_policy); - template - std::unique_ptr GetMassMatrix(Operator::DiagonalPolicy diag_policy); - template - std::unique_ptr GetExtraSystemMatrix(double omega, - Operator::DiagonalPolicy diag_policy); - - // Construct the complete frequency or time domain system matrix using the provided - // stiffness, damping, mass, and extra matrices: - // A = a0 K + a1 C + a2 (Mr + i Mi) + A2 . - // It is assumed that the inputs have been constructed using previous calls to - // GetSystemMatrix() and the returned operator does not inherit ownership of any of them. - template - std::unique_ptr - GetSystemMatrix(ScalarType a0, ScalarType a1, ScalarType a2, const OperType *K, - const OperType *C, const OperType *M, const OperType *A2 = nullptr); - - // Construct the real, SPD matrix for weighted L2 or H(curl) inner products: - // B = a0 Kr + a2 Mr . - // It is assumed that the inputs have been constructed using previous calls to - // GetSystemMatrix() and the returned operator does not inherit ownership of any of them. - // If K or M have eliminated boundary conditions, they are not eliminated from the - // returned operator. 
- std::unique_ptr GetInnerProductMatrix(double a0, double a2, - const ComplexOperator *K, - const ComplexOperator *M); - - // Construct the real, optionally SPD matrix for frequency or time domain linear system - // preconditioning (Mr > 0, Mi < 0, |Mr + i Mi| is done on the material property - // coefficient, not the matrix entries themselves): - // B = a0 K + a1 C -/+ a2 |Mr + i Mi| + A2r(a3) + A2i(a3) . - template - std::unique_ptr GetPreconditionerMatrix(double a0, double a1, double a2, - double a3); - - // Construct and return the discrete curl or gradient matrices. - const Operator &GetGradMatrix() const - { - return GetH1Spaces().GetFinestFESpace().GetDiscreteInterpolator(); - } - const Operator &GetCurlMatrix() const { return GetRTSpace().GetDiscreteInterpolator(); } - - // Assemble the right-hand side source term vector for an incident field or current source - // applied on specified excited boundaries. The return value indicates whether or not the - // excitation is nonzero (and thus is true most of the time). - bool GetExcitationVector(Vector &RHS); - bool GetExcitationVector(double omega, ComplexVector &RHS); - - // Separate out RHS vector as RHS = iω RHS1 + RHS2(ω). The return value indicates whether - // or not the excitation is nonzero (and thus is true most of the time). - bool GetExcitationVector1(ComplexVector &RHS1); - bool GetExcitationVector2(double omega, ComplexVector &RHS2); - - // Construct a constant or randomly initialized vector which satisfies the PEC essential - // boundary conditions. - void GetRandomInitialVector(ComplexVector &v); - void GetConstantInitialVector(ComplexVector &v); - - // Get the associated MPI communicator. - MPI_Comm GetComm() const { return GetNDSpace().GetComm(); } -}; - -} // namespace palace - -#endif // PALACE_MODELS_SPACE_OPERATOR_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_MODELS_SPACE_OPERATOR_HPP +#define PALACE_MODELS_SPACE_OPERATOR_HPP + +#include +#include +#include +#include +#include "fem/fespace.hpp" +#include "linalg/operator.hpp" +#include "linalg/vector.hpp" +#include "models/farfieldboundaryoperator.hpp" +#include "models/lumpedportoperator.hpp" +#include "models/materialoperator.hpp" +#include "models/portexcitations.hpp" +#include "models/surfaceconductivityoperator.hpp" +#include "models/surfacecurrentoperator.hpp" +#include "models/surfaceimpedanceoperator.hpp" +#include "models/waveportoperator.hpp" + +namespace palace +{ + +class IoData; +class Mesh; + +// +// A class handling spatial discretization of the governing equations. +// +class SpaceOperator +{ +private: + const bool pc_mat_real; // Use real-valued matrix for preconditioner + const bool pc_mat_shifted; // Use shifted mass matrix for preconditioner + + // Helper variables for log file printing. + bool print_hdr, print_prec_hdr; + + // Perfect electrical conductor essential boundary condition attributes. + mfem::Array dbc_attr, aux_bdr_attr; + std::vector> nd_dbc_tdof_lists, h1_dbc_tdof_lists, aux_bdr_tdof_lists; + + // Objects defining the finite element spaces for the electric field (Nedelec) and + // magnetic flux density (Raviart-Thomas) on the given mesh. The H1 spaces are used for + // various purposes throughout the code including postprocessing. + std::vector> nd_fecs; + std::vector> h1_fecs; + std::vector> rt_fecs; + FiniteElementSpaceHierarchy nd_fespaces, h1_fespaces, rt_fespaces; + + // Operator for domain material properties. 
+ MaterialOperator mat_op; + + // Operators for boundary conditions and source excitations. + FarfieldBoundaryOperator farfield_op; + SurfaceConductivityOperator surf_sigma_op; + SurfaceImpedanceOperator surf_z_op; + LumpedPortOperator lumped_port_op; + WavePortOperator wave_port_op; + SurfaceCurrentOperator surf_j_op; + + PortExcitations port_excitation_helper; + + mfem::Array SetUpBoundaryProperties(const IoData &iodata, const mfem::ParMesh &mesh); + void CheckBoundaryProperties(); + + // Helper functions for building the bilinear forms corresponding to the discretized + // operators in Maxwell's equations. + void AddStiffnessCoefficients(double coeff, MaterialPropertyCoefficient &df, + MaterialPropertyCoefficient &f); + void AddStiffnessBdrCoefficients(double coeff, MaterialPropertyCoefficient &fb); + void AddDampingCoefficients(double coeff, MaterialPropertyCoefficient &f); + void AddDampingBdrCoefficients(double coeff, MaterialPropertyCoefficient &fb); + void AddRealMassCoefficients(double coeff, MaterialPropertyCoefficient &f); + void AddRealMassBdrCoefficients(double coeff, MaterialPropertyCoefficient &fb); + void AddImagMassCoefficients(double coeff, MaterialPropertyCoefficient &f); + void AddAbsMassCoefficients(double coeff, MaterialPropertyCoefficient &f); + void AddExtraSystemBdrCoefficients(double omega, MaterialPropertyCoefficient &dfbr, + MaterialPropertyCoefficient &dfbi, + MaterialPropertyCoefficient &fbr, + MaterialPropertyCoefficient &fbi); + void AddRealPeriodicCoefficients(double coeff, MaterialPropertyCoefficient &f); + void AddImagPeriodicCoefficients(double coeff, MaterialPropertyCoefficient &f); + + // Helper functions for excitation vector assembly. + bool AddExcitationVector1Internal(int excitation_idx, Vector &RHS); + bool AddExcitationVector2Internal(int excitation_idx, double omega, ComplexVector &RHS); + + // Helper functions to build the preconditioner matrix. + void AssemblePreconditioner(std::complex a0, std::complex a1, + std::complex a2, double a3, + std::vector> &br_vec, + std::vector> &br_aux_vec, + std::vector> &bi_vec, + std::vector> &bi_aux_vec); + void AssemblePreconditioner(std::complex a0, std::complex a1, + std::complex a2, double a3, + std::vector> &br_vec, + std::vector> &br_aux_vec); + void AssemblePreconditioner(double a0, double a1, double a2, double a3, + std::vector> &br_vec, + std::vector> &br_aux_vec); + +public: + SpaceOperator(const IoData &iodata, const std::vector> &mesh); + + // Return list of all PEC boundary true dofs for all finite element space levels. + const std::vector> &GetNDDbcTDofLists() const + { + return nd_dbc_tdof_lists; + } + const std::vector> &GetH1DbcTDofLists() const + { + return h1_dbc_tdof_lists; + } + + // Returns lists of all boundary condition true dofs, PEC included, for the auxiliary + // H1 space hierarchy. These are all boundaries which affect the stiffness and damping + // (K and C) matrices, used for nullspace corrections. + const std::vector> &GetAuxBdrTDofLists() const + { + return aux_bdr_tdof_lists; + } + + // Return material operator for postprocessing. + const MaterialOperator &GetMaterialOp() const { return mat_op; } + + // Access to underlying BC operator objects for postprocessing. 
+ auto &GetLumpedPortOp() { return lumped_port_op; } + auto &GetWavePortOp() { return wave_port_op; } + auto &GetSurfaceCurrentOp() { return surf_j_op; } + const auto &GetLumpedPortOp() const { return lumped_port_op; } + const auto &GetWavePortOp() const { return wave_port_op; } + const auto &GetSurfaceCurrentOp() const { return surf_j_op; } + + const auto &GetPortExcitations() const { return port_excitation_helper; } + + // Return the parallel finite element space objects. + auto &GetNDSpaces() { return nd_fespaces; } + const auto &GetNDSpaces() const { return nd_fespaces; } + auto &GetNDSpace() { return nd_fespaces.GetFinestFESpace(); } + const auto &GetNDSpace() const { return nd_fespaces.GetFinestFESpace(); } + auto &GetH1Spaces() { return h1_fespaces; } + const auto &GetH1Spaces() const { return h1_fespaces; } + auto &GetH1Space() { return h1_fespaces.GetFinestFESpace(); } + const auto &GetH1Space() const { return h1_fespaces.GetFinestFESpace(); } + auto &GetRTSpaces() { return rt_fespaces; } + const auto &GetRTSpaces() const { return rt_fespaces; } + auto &GetRTSpace() { return rt_fespaces.GetFinestFESpace(); } + const auto &GetRTSpace() const { return rt_fespaces.GetFinestFESpace(); } + + // Access the underlying mesh object. + const auto &GetMesh() const { return GetNDSpace().GetMesh(); } + + // Return the number of true (conforming) dofs on the finest ND space. + auto GlobalTrueVSize() const { return GetNDSpace().GlobalTrueVSize(); } + + // Construct any part of the frequency-dependent complex linear system matrix: + // A = K + iω C - ω² (Mr + i Mi) + A2(ω). + // For time domain problems, any one of K, C, or M = Mr can be constructed. The argument + // ω is required only for the constructing the "extra" matrix A2(ω). + template + std::unique_ptr GetStiffnessMatrix(Operator::DiagonalPolicy diag_policy); + template + std::unique_ptr GetDampingMatrix(Operator::DiagonalPolicy diag_policy); + template + std::unique_ptr GetMassMatrix(Operator::DiagonalPolicy diag_policy); + template + std::unique_ptr GetExtraSystemMatrix(double omega, + Operator::DiagonalPolicy diag_policy); + + // Construct the complete frequency or time domain system matrix using the provided + // stiffness, damping, mass, and extra matrices: + // A = a0 K + a1 C + a2 (Mr + i Mi) + A2. + // It is assumed that the inputs have been constructed using previous calls to + // GetSystemMatrix() and the returned operator does not inherit ownership of any of them. + template + std::unique_ptr + GetSystemMatrix(ScalarType a0, ScalarType a1, ScalarType a2, const OperType *K, + const OperType *C, const OperType *M, const OperType *A2 = nullptr); + + // Construct the real, SPD matrix for weighted L2 or H(curl) inner products: + // B = a0 Kr + a2 Mr . + // It is assumed that the inputs have been constructed using previous calls to + // GetSystemMatrix() and the returned operator does not inherit ownership of any of them. + // If K or M have eliminated boundary conditions, they are not eliminated from the + // returned operator. + std::unique_ptr GetInnerProductMatrix(double a0, double a2, + const ComplexOperator *K, + const ComplexOperator *M); + + // Construct the matrix for frequency or time domain linear system preconditioning. If it + // is real-valued (Mr > 0, Mi < 0, |Mr + Mi| is done on the material property coefficient, + // not the matrix entries themselves): + // B = a0 K + a1 C -/+ a2 |Mr + Mi| + A2r(a3) + A2i(a3). 
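  //
  // For orientation (illustrative only, the actual coefficients come from the calling
  // driver or time integrator): a frequency domain solve of
  //   A = K + iω C - ω² (Mr + i Mi) + A2(ω)
  // corresponds roughly to
  //   GetSystemMatrix<ComplexOperator>(1.0, 1i * omega, -omega * omega, K, C, M, A2);
  //   GetPreconditionerMatrix<ComplexOperator>(1.0, 1i * omega, -omega * omega, omega);
  // while time domain schemes pass real-valued coefficients in place of 1, iω, and -ω².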
+ template + std::unique_ptr GetPreconditionerMatrix(ScalarType a0, ScalarType a1, + ScalarType a2, double a3); + + // Construct and return the discrete curl or gradient matrices. + const Operator &GetGradMatrix() const + { + return GetNDSpace().GetDiscreteInterpolator(GetH1Space()); + } + const Operator &GetCurlMatrix() const + { + return GetRTSpace().GetDiscreteInterpolator(GetNDSpace()); + } + + // Assemble the right-hand side source term vector for an incident field or current source + // applied on specified excited boundaries. The return value indicates whether or not the + // excitation is nonzero (and thus is true most of the time). + bool GetExcitationVector(int excitation_idx, Vector &RHS); + bool GetExcitationVector(int excitation_idx, double omega, ComplexVector &RHS); + + // Separate out RHS vector as RHS = iω RHS1 + RHS2(ω). The return value indicates whether + // or not the excitation is nonzero (and thus is true most of the time). + bool GetExcitationVector1(int excitation_idx, ComplexVector &RHS1); + bool GetExcitationVector2(int excitation_idx, double omega, ComplexVector &RHS2); + + // Construct a constant or randomly initialized vector which satisfies the PEC essential + // boundary conditions. + void GetRandomInitialVector(ComplexVector &v); + void GetConstantInitialVector(ComplexVector &v); + + // Get the associated MPI communicator. + MPI_Comm GetComm() const { return GetNDSpace().GetComm(); } +}; + +} // namespace palace + +#endif // PALACE_MODELS_SPACE_OPERATOR_HPP diff --git a/palace/models/strattonchu.cpp b/palace/models/strattonchu.cpp new file mode 100644 index 0000000000..31996c4e23 --- /dev/null +++ b/palace/models/strattonchu.cpp @@ -0,0 +1,167 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "strattonchu.hpp" + +#include "fem/coefficient.hpp" +#include "utils/omp.hpp" + +namespace palace +{ + +// Computes contribution to Stratton-Chu far-field integrands for multiple observation +// directions for the given element T. +// +// The Stratton-Chu transformations to compute far-field electric field rE_∞(r̂) +// at multiple observation directions r₀ (specified by a vector of (θ, ϕ)s) are: +// +// r E_∞(r₀) = (ik/4π) r₀ × ∫_S [n̂ × E - Z r₀ × (n̂ × H)] e^{ikr₀·r'} dS ≈ r₀ × Σ (Iᵣ + +// iIᵢ). +// +// where: +// - S is the boundary surface with outward normal n̂. +// - E, H are the electric and magnetic fields on the surface. +// - k = ω/c is the wavenumber. +// - Z is the impedance. +// - r₀ = (sin θ cos φ, sin θ sin φ, cos θ) are observation directions. +// - r' are source points on the surface. +// +// This function computes Iᵣ and Iᵢ for all the input observation points. +// +// This equation was obtained starting from Stratton-Chu's transformations, assuming an +// analytic expression for Green's function (G(r, r0) = exp(-ik|r - r₀|) / (4π|r - r₀|)), +// and expanding for r that goes to infinity. +// +// Note: +// - This equation is only valid in three dimensional space. +// - This equation is only valid when all the materials have scalar μ and ε. +// - The implementation assumes S is an external boundary. +// +// Note also: +// +// This function uses std::vector> instead of Vectors. The +// reason for this is so that we can ensure that memory layout is simple (and +// contiguous), ensuring that we can perform one single MPI reduction for all +// the points in integrand_*. 
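//
// Schematic use (caller-side, names illustrative): accumulate the per-element
// contributions over all boundary elements of S, reduce once across MPI ranks using the
// flat integrand storage (assumed here to be a std::vector of 3-component double arrays),
// and finish with the leading cross product:
//
//   std::vector<std::array<double, 3>> Ir(dirs.size(), {0.0, 0.0, 0.0});
//   std::vector<std::array<double, 3>> Ii(dirs.size(), {0.0, 0.0, 0.0});
//   for ( /* each local boundary element on S, with transformation T and rule ir */ )
//   {
//     AddStrattonChuIntegrandAtElement(E, B, mat_op, omega_re, omega_im, dirs, T, ir,
//                                      Ir, Ii);
//   }
//   Mpi::GlobalSum(3 * Ir.size(), Ir.data()->data(), comm);  // One reduction per array.
//   Mpi::GlobalSum(3 * Ii.size(), Ii.data()->data(), comm);
//   // rE_∞(r₀) = r₀ × (Ir[i] + i Ii[i]) for each observation direction r₀ = dirs[i].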
+void AddStrattonChuIntegrandAtElement(const GridFunction &E, const GridFunction &B, + const MaterialOperator &mat_op, double omega_re, + double omega_im, + std::vector> &r_naughts, + mfem::ElementTransformation &T, + const mfem::IntegrationRule &ir, + std::vector> &integrand_r, + std::vector> &integrand_i) +{ + MFEM_ASSERT(E.VectorDim() == 3, "Stratton-Chu requires 3D vector fields!"); + MFEM_ASSERT(B.VectorDim() == 3, "Stratton-Chu requires 3D vector fields!"); + + MFEM_VERIFY(integrand_r.size() == r_naughts.size() && + integrand_i.size() == r_naughts.size(), + "Mismatch between input points and result vector.") + + MFEM_ASSERT(T.ElementType == mfem::ElementTransformation::BDR_ELEMENT, + "Unexpected element type in BdrSurfaceFluxCoefficient!"); + + StaticVector<3> r_phys; + StaticVector<3> E_real, E_imag, B_real, B_imag; + StaticVector<3> ZH_real, ZH_imag; + StaticVector<3> normal; + StaticVector<3> n_cross_Er, n_cross_ZHr; + StaticVector<3> n_cross_Ei, n_cross_ZHi; + + const mfem::ParMesh *mesh = E.Real().ParFESpace()->GetParMesh(); + mfem::FaceElementTransformations FET; + mfem::IsoparametricTransformation T1, T2; + + for (int j = 0; j < ir.GetNPoints(); j++) + { + const mfem::IntegrationPoint &ip = ir.IntPoint(j); + T.SetIntPoint(&ip); + + MFEM_ASSERT(T.ElementType == mfem::ElementTransformation::BDR_ELEMENT, + "Unexpected element type in BdrSurfaceFluxCoefficient!"); + + bool invert = BdrGridFunctionCoefficient::GetBdrElementNeighborTransformations( + T.ElementNo, *mesh, FET, T1, T2, &ip); // NOTE: this updates FET. + MFEM_VERIFY(!FET.Elem2, + "FarField computations are only supported on external boundaries.") + + T.Transform(ip, r_phys); + + // Evaluate E and B fields on this element. + E.Real().GetVectorValue(*FET.Elem1, FET.Elem1->GetIntPoint(), E_real); + E.Imag().GetVectorValue(*FET.Elem1, FET.Elem1->GetIntPoint(), E_imag); + B.Real().GetVectorValue(*FET.Elem1, FET.Elem1->GetIntPoint(), B_real); + B.Imag().GetVectorValue(*FET.Elem1, FET.Elem1->GetIntPoint(), B_imag); + + // We assume that the material is isotropic, so the wave speed is a scalar. + double wave_speed = mat_op.GetLightSpeedMax(FET.Elem1->Attribute); + double k_re = omega_re / wave_speed; + double k_im = omega_im / wave_speed; + double quadrature_weight = ip.weight * T.Weight(); + + // Complex prefactor: (ik/4π) = (i(k_re + ik_im)/4π) = (ik_re - k_im)/4π. + double prefactor_re = -quadrature_weight * k_im / (4 * M_PI); + double prefactor_im = quadrature_weight * k_re / (4 * M_PI); + + // Z * H = c0 * B. + mat_op.GetLightSpeed(FET.Elem1->Attribute).Mult(B_real, ZH_real); + mat_op.GetLightSpeed(FET.Elem1->Attribute).Mult(B_imag, ZH_imag); + BdrGridFunctionCoefficient::GetNormal(T, normal, invert); + + // n̂ × E. + linalg::Cross3(normal, E_real, n_cross_Er); + linalg::Cross3(normal, E_imag, n_cross_Ei); + + // n̂ × ZH. + linalg::Cross3(normal, ZH_real, n_cross_ZHr); + linalg::Cross3(normal, ZH_imag, n_cross_ZHi); + + // This is a hot loop. Manually unrolling and avoiding Vectors significantly + // increases performance. + PalacePragmaOmp(parallel for schedule(static)) + for (size_t i = 0; i < r_naughts.size(); i++) + { + const auto &r = r_naughts[i]; + + double r0 = r[0], r1 = r[1], r2 = r[2]; + double dot_product = r0 * r_phys(0) + r1 * r_phys(1) + r2 * r_phys(2); + + // Complex phase: exp(i*k*r₀·r') = exp(i*(k_re + ik_im)*dot_product) + // = exp(i*k_re*dot_product - k_im*dot_product) + // = exp(-k_im*dot_product) * exp(i*k_re*dot_product). 
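      // Expanding w = amplitude * (prefactor_re + i prefactor_im) * (cos_phase + i sin_phase)
      // gives the real and imaginary weights computed below; the expansion is written out
      // explicitly, consistent with the manual unrolling noted above for this hot loop.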
+ double amplitude = std::exp(-k_im * dot_product); + double phase_re = k_re * dot_product; + double cos_phase = std::cos(phase_re); + double sin_phase = std::sin(phase_re); + + // Complex weight: prefactor * exp(i*k*r₀·r'). + double w_real = amplitude * (prefactor_re * cos_phase - prefactor_im * sin_phase); + double w_imag = amplitude * (prefactor_re * sin_phase + prefactor_im * cos_phase); + + // r₀ × (n̂ × ZH). + double cr0 = r1 * n_cross_ZHr(2) - r2 * n_cross_ZHr(1); + double cr1 = r2 * n_cross_ZHr(0) - r0 * n_cross_ZHr(2); + double cr2 = r0 * n_cross_ZHr(1) - r1 * n_cross_ZHr(0); + + double ci0 = r1 * n_cross_ZHi(2) - r2 * n_cross_ZHi(1); + double ci1 = r2 * n_cross_ZHi(0) - r0 * n_cross_ZHi(2); + double ci2 = r0 * n_cross_ZHi(1) - r1 * n_cross_ZHi(0); + + // Complex multiplication: (A + iB) * (w_real + i*w_imag). + double A0 = n_cross_Er(0) - cr0, B0 = n_cross_Ei(0) - ci0; + double A1 = n_cross_Er(1) - cr1, B1 = n_cross_Ei(1) - ci1; + double A2 = n_cross_Er(2) - cr2, B2 = n_cross_Ei(2) - ci2; + + integrand_r[i][0] += A0 * w_real - B0 * w_imag; + integrand_r[i][1] += A1 * w_real - B1 * w_imag; + integrand_r[i][2] += A2 * w_real - B2 * w_imag; + + integrand_i[i][0] += A0 * w_imag + B0 * w_real; + integrand_i[i][1] += A1 * w_imag + B1 * w_real; + integrand_i[i][2] += A2 * w_imag + B2 * w_real; + } + } +} + +}; // namespace palace diff --git a/palace/models/strattonchu.hpp b/palace/models/strattonchu.hpp new file mode 100644 index 0000000000..1e34651e46 --- /dev/null +++ b/palace/models/strattonchu.hpp @@ -0,0 +1,26 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_MODELS_STRATTONCHU_HPP +#define PALACE_MODELS_STRATTONCHU_HPP + +#include +#include +#include +#include "fem/gridfunction.hpp" +#include "models/materialoperator.hpp" + +namespace palace +{ +void AddStrattonChuIntegrandAtElement(const GridFunction &E, const GridFunction &B, + const MaterialOperator &mat_op, double omega_re, + double omega_im, + std::vector> &r_naughts, + mfem::ElementTransformation &T, + const mfem::IntegrationRule &ir, + std::vector> &integrand_r, + std::vector> &integrand_i); + +} // namespace palace + +#endif // PALACE_MODELS_STRATTONCHU_HPP diff --git a/palace/models/surfaceconductivityoperator.cpp b/palace/models/surfaceconductivityoperator.cpp index 705315f677..9868cbe51a 100644 --- a/palace/models/surfaceconductivityoperator.cpp +++ b/palace/models/surfaceconductivityoperator.cpp @@ -1,174 +1,174 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "surfaceconductivityoperator.hpp" - -#include "fem/coefficient.hpp" -#include "utils/communication.hpp" -#include "utils/geodata.hpp" -#include "utils/iodata.hpp" - -namespace palace -{ - -using namespace std::complex_literals; - -SurfaceConductivityOperator::SurfaceConductivityOperator(const IoData &iodata, - mfem::ParMesh &mesh) -{ - // Set up finite conductivity boundary conditions. - SetUpBoundaryProperties(iodata, mesh); - PrintBoundaryInfo(iodata, mesh); -} - -void SurfaceConductivityOperator::SetUpBoundaryProperties(const IoData &iodata, - const mfem::ParMesh &mesh) -{ - // Check that conductivity boundary attributes have been specified correctly. - int bdr_attr_max = mesh.bdr_attributes.Size() ? 
mesh.bdr_attributes.Max() : 0; - if (!iodata.boundaries.conductivity.empty()) - { - mfem::Array bdr_attr_marker(bdr_attr_max); - bdr_attr_marker = 0; - for (auto attr : mesh.bdr_attributes) - { - bdr_attr_marker[attr - 1] = 1; - } - for (const auto &data : iodata.boundaries.conductivity) - { - for (auto attr : data.attributes) - { - MFEM_VERIFY(attr > 0 && attr <= bdr_attr_max, - "Conductivity boundary attribute tags must be non-negative and " - "correspond to attributes in the mesh!"); - MFEM_VERIFY(bdr_attr_marker[attr - 1], - "Unknown conductivity boundary attribute " << attr << "!"); - } - } - } - - // Finite conductivity boundaries are defined using the user provided surface conductivity - // and optionally conductor thickness. - bdr_sigma.SetSize(bdr_attr_max); - bdr_mu.SetSize(bdr_attr_max); - bdr_h.SetSize(bdr_attr_max); - bdr_sigma = 0.0; - bdr_mu = 0.0; - bdr_h = 0.0; - for (const auto &data : iodata.boundaries.conductivity) - { - MFEM_VERIFY(data.sigma > 0.0 && data.mu_r > 0.0, - "Conductivity boundary has no conductivity or no " - "permeability defined!"); - MFEM_VERIFY(data.h >= 0.0, "Conductivity boundary should have non-negative thickness!"); - for (auto attr : data.attributes) - { - MFEM_VERIFY( - bdr_sigma(attr - 1) == 0.0 && bdr_mu(attr - 1) == 0.0 && bdr_h(attr - 1) == 0.0, - "Multiple definitions of conductivity boundary properties for boundary attribute " - << attr << "!"); - bdr_sigma(attr - 1) = data.sigma; - bdr_mu(attr - 1) = data.mu_r; - bdr_h(attr - 1) = data.h; - if (data.external) - { - // External surfaces have twice the effective thickness since the BC is applied at - // one side. - bdr_h(attr - 1) *= 2.0; - } - } - } - - // Mark selected boundary attributes from the mesh as finite conductivity. - mfem::Array conductivity_bcs; - for (const auto &data : iodata.boundaries.conductivity) - { - for (auto attr : data.attributes) - { - conductivity_bcs.Append(attr); - } - } - MFEM_VERIFY(conductivity_bcs.Size() == 0 || - iodata.problem.type == config::ProblemData::Type::DRIVEN, - "Finite conductivity boundaries are only available for frequency " - "domain driven simulations!"); - mesh::AttrToMarker(bdr_attr_max, conductivity_bcs, conductivity_marker); -} - -void SurfaceConductivityOperator::PrintBoundaryInfo(const IoData &iodata, - mfem::ParMesh &mesh) -{ - if (conductivity_marker.Size() && conductivity_marker.Max() == 0) - { - return; - } - Mpi::Print("\nConfiguring Robin finite conductivity BC at attributes:\n"); - for (int i = 0; i < conductivity_marker.Size(); i++) - { - if (conductivity_marker[i]) - { - const int attr = i + 1; - mfem::Vector nor; - mesh::GetSurfaceNormal(mesh, attr, nor); - Mpi::Print(" {:d}: σ = {:.3e} S/m", attr, - iodata.DimensionalizeValue(IoData::ValueType::CONDUCTIVITY, bdr_sigma(i))); - if (bdr_h(i) > 0.0) - { - Mpi::Print(", h = {:.3e} m", - iodata.DimensionalizeValue(IoData::ValueType::LENGTH, bdr_h(i))); - } - if (mesh.SpaceDimension() == 3) - { - Mpi::Print(", n = ({:+.1f}, {:+.1f}, {:+.1f})", nor(0), nor(1), nor(2)); - } - else - { - Mpi::Print(", n = ({:+.1f}, {:+.1f})", nor(0), nor(1)); - } - Mpi::Print("\n"); - } - } -} - -void SurfaceConductivityOperator::AddExtraSystemBdrCoefficients(double omega, - SumMatrixCoefficient &fbr, - SumMatrixCoefficient &fbi) -{ - if (conductivity_marker.Size() && conductivity_marker.Max() > 0) - { - // If the provided conductor thickness is empty (zero), prescribe a surface impedance - // (1+i)/σδ, where δ is the skin depth. 
If it is nonzero, use a finite thickness - // modification which correctly produces the DC limit when h << δ. See the Ansys HFSS - // user manual section titled "Surface Impedance Boundary Condition for Metal Traces of - // Finite Thickness." - mfem::Vector vr(bdr_sigma.Size()), vi(bdr_sigma.Size()); - for (int i = 0; i < bdr_sigma.Size(); i++) - { - if (bdr_sigma(i) > 0.0) - { - double delta = std::sqrt(2.0 / (bdr_mu(i) * bdr_sigma(i) * omega)); - std::complex Z = 1.0 / (bdr_sigma(i) * delta); - Z.imag(Z.real()); - if (bdr_h(i) > 0.0) - { - double nu = bdr_h(i) / delta; - double den = std::cosh(nu) - std::cos(nu); - Z.real(Z.real() * (std::sinh(nu) + std::sin(nu)) / den); - Z.imag(Z.imag() * (std::sinh(nu) - std::sin(nu)) / den); - } - // The BC term has coefficient iω/Z (like for standard lumped surface impedance). - std::complex s(1i * omega / Z); - vr(i) = s.real(); - vi(i) = s.imag(); - } - else - { - vr(i) = vi(i) = 0.0; // Not a conductivity boundary - } - } - fbr.AddCoefficient(std::make_unique(vr), conductivity_marker); - fbi.AddCoefficient(std::make_unique(vi), conductivity_marker); - } -} - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "surfaceconductivityoperator.hpp" + +#include +#include "models/materialoperator.hpp" +#include "utils/communication.hpp" +#include "utils/geodata.hpp" +#include "utils/iodata.hpp" +#include "utils/prettyprint.hpp" + +namespace palace +{ + +using namespace std::complex_literals; + +SurfaceConductivityOperator::SurfaceConductivityOperator(const IoData &iodata, + const MaterialOperator &mat_op, + const mfem::ParMesh &mesh) + : mat_op(mat_op) +{ + // Print out BC info for all finite conductivity boundary attributes. + SetUpBoundaryProperties(iodata, mesh); + PrintBoundaryInfo(iodata, mesh); +} + +void SurfaceConductivityOperator::SetUpBoundaryProperties(const IoData &iodata, + const mfem::ParMesh &mesh) +{ + // Check that conductivity boundary attributes have been specified correctly. + int bdr_attr_max = mesh.bdr_attributes.Size() ? mesh.bdr_attributes.Max() : 0; + mfem::Array bdr_attr_marker; + if (!iodata.boundaries.conductivity.empty()) + { + mfem::Array conductivity_marker(bdr_attr_max); + bdr_attr_marker.SetSize(bdr_attr_max); + bdr_attr_marker = 0; + conductivity_marker = 0; + for (auto attr : mesh.bdr_attributes) + { + bdr_attr_marker[attr - 1] = 1; + } + std::set bdr_warn_list; + for (const auto &data : iodata.boundaries.conductivity) + { + for (auto attr : data.attributes) + { + MFEM_VERIFY(!conductivity_marker[attr - 1], + "Multiple definitions of conductivity boundary properties for boundary " + "attribute " + << attr << "!"); + conductivity_marker[attr - 1] = 1; + // MFEM_VERIFY(attr > 0 && attr <= bdr_attr_max, + // "Conductivity boundary attribute tags must be non-negative and " + // "correspond to attributes in the mesh!"); + // MFEM_VERIFY(bdr_attr_marker[attr - 1], + // "Unknown conductivity boundary attribute " << attr << "!"); + if (attr <= 0 || attr > bdr_attr_max || !bdr_attr_marker[attr - 1]) + { + bdr_warn_list.insert(attr); + } + } + } + if (!bdr_warn_list.empty()) + { + Mpi::Print("\n"); + Mpi::Warning( + "Unknown conductivity boundary attributes!\nSolver will just ignore them!"); + utils::PrettyPrint(bdr_warn_list, "Boundary attribute list:"); + Mpi::Print("\n"); + } + } + + // Finite conductivity boundaries are defined using the user provided surface conductivity + // and optionally conductor thickness. 
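For reference, the surface impedance implemented by AddExtraSystemBdrCoefficients (in both the removed code above and the rewritten version of this file) can be restated in equation form directly from the code:

\[
\delta = \sqrt{\frac{2}{\mu\,\sigma\,\omega}}\,, \qquad
Z_s = \frac{1 + i}{\sigma\,\delta}\,,
\]

and, when a finite conductor thickness h is specified (with \nu = h/\delta),

\[
Z_s = \frac{1}{\sigma\,\delta}
      \left[\frac{\sinh\nu + \sin\nu}{\cosh\nu - \cos\nu}
      \;+\; i\,\frac{\sinh\nu - \sin\nu}{\cosh\nu - \cos\nu}\right],
\]

which, per the code comments, reproduces the correct DC limit when h \ll \delta. The boundary contribution assembled into the system then carries the coefficient i\omega / Z_s.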
+ boundaries.reserve(iodata.boundaries.conductivity.size()); + for (const auto &data : iodata.boundaries.conductivity) + { + MFEM_VERIFY(data.sigma > 0.0 && data.mu_r > 0.0, + "Conductivity boundary has no conductivity or no " + "permeability defined!"); + MFEM_VERIFY(data.h >= 0.0, "Conductivity boundary should have non-negative thickness!"); + auto &bdr = boundaries.emplace_back(); + bdr.sigma = data.sigma; + bdr.mu = data.mu_r; + bdr.h = data.h; + if (data.external) + { + // External surfaces have twice the effective thickness since the BC is applied at one + // side. + bdr.h *= 2.0; + } + bdr.attr_list.Reserve(static_cast(data.attributes.size())); + for (auto attr : data.attributes) + { + if (attr <= 0 || attr > bdr_attr_max || !bdr_attr_marker[attr - 1]) + { + continue; // Can just ignore if wrong + } + bdr.attr_list.Append(attr); + } + } + MFEM_VERIFY(boundaries.empty() || iodata.problem.type == ProblemType::DRIVEN || + iodata.problem.type == ProblemType::EIGENMODE, + "Finite conductivity boundaries are only available for frequency " + "domain simulations!"); +} + +void SurfaceConductivityOperator::PrintBoundaryInfo(const IoData &iodata, + const mfem::ParMesh &mesh) +{ + if (boundaries.empty()) + { + return; + } + Mpi::Print("\nConfiguring Robin finite conductivity BC at attributes:\n"); + for (const auto &bdr : boundaries) + { + for (auto attr : bdr.attr_list) + { + Mpi::Print(" {:d}: σ = {:.3e} S/m", attr, + iodata.units.Dimensionalize(bdr.sigma)); + if (bdr.h > 0.0) + { + Mpi::Print(", h = {:.3e} m", + iodata.units.Dimensionalize(bdr.h)); + } + Mpi::Print(", n = ({:+.1f})\n", fmt::join(mesh::GetSurfaceNormal(mesh, attr), ",")); + } + } +} + +mfem::Array SurfaceConductivityOperator::GetAttrList() const +{ + mfem::Array attr_list; + for (const auto &bdr : boundaries) + { + attr_list.Append(bdr.attr_list); + } + return attr_list; +} + +void SurfaceConductivityOperator::AddExtraSystemBdrCoefficients( + double omega, MaterialPropertyCoefficient &fbr, MaterialPropertyCoefficient &fbi) +{ + // If the provided conductor thickness is empty (zero), prescribe a surface impedance + // (1+i)/σδ, where δ is the skin depth. If it is nonzero, use a finite thickness + // modification which correctly produces the DC limit when h << δ. See the Ansys HFSS + // user manual section titled "Surface Impedance Boundary Condition for Metal Traces of + // Finite Thickness." + for (const auto &bdr : boundaries) + { + if (std::abs(bdr.sigma) > 0.0) + { + double delta = std::sqrt(2.0 / (bdr.mu * bdr.sigma * omega)); + std::complex Z = 1.0 / (bdr.sigma * delta); + Z.imag(Z.real()); + if (bdr.h > 0.0) + { + double nu = bdr.h / delta; + double den = std::cosh(nu) - std::cos(nu); + Z.real(Z.real() * (std::sinh(nu) + std::sin(nu)) / den); + Z.imag(Z.imag() * (std::sinh(nu) - std::sin(nu)) / den); + } + // The BC term has coefficient iω/Z (like for standard lumped surface impedance). + std::complex s(1i * omega / Z); + fbr.AddMaterialProperty(mat_op.GetCeedBdrAttributes(bdr.attr_list), s.real()); + fbi.AddMaterialProperty(mat_op.GetCeedBdrAttributes(bdr.attr_list), s.imag()); + } + } +} + +} // namespace palace diff --git a/palace/models/surfaceconductivityoperator.hpp b/palace/models/surfaceconductivityoperator.hpp index cfe40e717d..7e6d31a74b 100644 --- a/palace/models/surfaceconductivityoperator.hpp +++ b/palace/models/surfaceconductivityoperator.hpp @@ -1,41 +1,52 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_MODELS_SURFACE_CONDUCTIVITY_OPERATOR_HPP -#define PALACE_MODELS_SURFACE_CONDUCTIVITY_OPERATOR_HPP - -#include - -namespace palace -{ - -class IoData; -class SumMatrixCoefficient; - -// -// A class handling finite conductivity boundaries. -// -class SurfaceConductivityOperator -{ -private: - // Surface properties for finite conductivity boundary attributes: conductor conductivity - // and permeability, and (optionally) thickness. - mfem::Vector bdr_sigma, bdr_mu, bdr_h; - mfem::Array conductivity_marker; - void SetUpBoundaryProperties(const IoData &iodata, const mfem::ParMesh &mesh); - void PrintBoundaryInfo(const IoData &iodata, mfem::ParMesh &mesh); - -public: - SurfaceConductivityOperator(const IoData &iodata, mfem::ParMesh &mesh); - - // Returns array marking finite conductivity boundary attributes. - const mfem::Array &GetMarker() const { return conductivity_marker; } - - // Add contributions to system matrix for a finite conductivity boundary condition. - void AddExtraSystemBdrCoefficients(double omega, SumMatrixCoefficient &fbr, - SumMatrixCoefficient &fbi); -}; - -} // namespace palace - -#endif // PALACE_MODELS_SURFACE_CONDUCTIVITY_OPERATOR_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_MODELS_SURFACE_CONDUCTIVITY_OPERATOR_HPP +#define PALACE_MODELS_SURFACE_CONDUCTIVITY_OPERATOR_HPP + +#include +#include + +namespace palace +{ + +class IoData; +class MaterialOperator; +class MaterialPropertyCoefficient; + +// +// A class handling finite conductivity boundaries. +// +class SurfaceConductivityOperator +{ +private: + // Reference to material property data (not owned). + const MaterialOperator &mat_op; + + // Surface properties for finite conductivity boundary attributes: conductor conductivity + // and permeability, and (optionally) thickness. + struct ConductivityData + { + double sigma, mu, h; + mfem::Array attr_list; + }; + std::vector boundaries; + + void SetUpBoundaryProperties(const IoData &iodata, const mfem::ParMesh &mesh); + void PrintBoundaryInfo(const IoData &iodata, const mfem::ParMesh &mesh); + +public: + SurfaceConductivityOperator(const IoData &iodata, const MaterialOperator &mat_op, + const mfem::ParMesh &mesh); + + // Returns array of finite conductivity boundary attributes. + mfem::Array GetAttrList() const; + + // Add contributions to system matrix for a finite conductivity boundary condition. + void AddExtraSystemBdrCoefficients(double omega, MaterialPropertyCoefficient &fbr, + MaterialPropertyCoefficient &fbi); +}; + +} // namespace palace + +#endif // PALACE_MODELS_SURFACE_CONDUCTIVITY_OPERATOR_HPP diff --git a/palace/models/surfacecurrentoperator.cpp b/palace/models/surfacecurrentoperator.cpp index bbcd1d17b7..a3a603e29c 100644 --- a/palace/models/surfacecurrentoperator.cpp +++ b/palace/models/surfacecurrentoperator.cpp @@ -1,183 +1,164 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "surfacecurrentoperator.hpp" - -#include -#include "fem/coefficient.hpp" -#include "utils/communication.hpp" -#include "utils/geodata.hpp" -#include "utils/iodata.hpp" - -namespace palace -{ - -SurfaceCurrentData::SurfaceCurrentData(const config::SurfaceCurrentData &data, - mfem::ParFiniteElementSpace &h1_fespace) -{ - // Construct the source elements allowing for a possible multielement surface current - // sources. 
- for (const auto &elem : data.elements) - { - mfem::Array attr_marker; - mesh::AttrToMarker(h1_fespace.GetParMesh()->bdr_attributes.Size() - ? h1_fespace.GetParMesh()->bdr_attributes.Max() - : 0, - elem.attributes, attr_marker); - switch (elem.coordinate_system) - { - case config::internal::ElementData::CoordinateSystem::CYLINDRICAL: - elems.push_back( - std::make_unique(elem.direction, attr_marker, h1_fespace)); - break; - case config::internal::ElementData::CoordinateSystem::CARTESIAN: - elems.push_back( - std::make_unique(elem.direction, attr_marker, h1_fespace)); - break; - } - } -} - -double SurfaceCurrentData::GetExcitationCurrent() const -{ - // Ideal unit current source for each index. - return 1.0; -} - -SurfaceCurrentOperator::SurfaceCurrentOperator(const IoData &iodata, - mfem::ParFiniteElementSpace &h1_fespace) -{ - // Set up surface current source boundaries. - SetUpBoundaryProperties(iodata, h1_fespace); - PrintBoundaryInfo(iodata, *h1_fespace.GetParMesh()); -} - -void SurfaceCurrentOperator::SetUpBoundaryProperties( - const IoData &iodata, mfem::ParFiniteElementSpace &h1_fespace) -{ - // Check that surface current boundary attributes have been specified correctly. - int bdr_attr_max = h1_fespace.GetParMesh()->bdr_attributes.Size() - ? h1_fespace.GetParMesh()->bdr_attributes.Max() - : 0; - if (!iodata.boundaries.current.empty()) - { - mfem::Array bdr_attr_marker(bdr_attr_max); - bdr_attr_marker = 0; - for (auto attr : h1_fespace.GetParMesh()->bdr_attributes) - { - bdr_attr_marker[attr - 1] = 1; - } - for (const auto &[idx, data] : iodata.boundaries.current) - { - for (const auto &elem : data.elements) - { - for (auto attr : elem.attributes) - { - MFEM_VERIFY(attr > 0 && attr <= bdr_attr_max, - "Surface current boundary attribute tags must be non-negative and " - "correspond to boundaries in the mesh!"); - MFEM_VERIFY(bdr_attr_marker[attr - 1], - "Unknown surface current boundary attribute " << attr << "!"); - } - } - } - } - - // Set up surface current data structures. - for (const auto &[idx, data] : iodata.boundaries.current) - { - sources.try_emplace(idx, data, h1_fespace); - } - - // Mark selected boundary attributes from the mesh for current sources. 
- source_marker.SetSize(bdr_attr_max); - source_marker = 0; - for (const auto &[idx, data] : sources) - { - for (const auto &elem : data.GetElements()) - { - for (int i = 0; i < elem->GetMarker().Size(); i++) - { - MFEM_VERIFY( - !(source_marker[i] && elem->GetMarker()[i]), - "Boundary attribute is assigned to more than one surface current source!"); - source_marker[i] = source_marker[i] || elem->GetMarker()[i]; - } - } - } -} - -void SurfaceCurrentOperator::PrintBoundaryInfo(const IoData &iodata, mfem::ParMesh &mesh) -{ - if (sources.empty()) - { - return; - } - Mpi::Print("\nConfiguring surface current excitation source term at attributes:\n"); - for (const auto &[idx, data] : sources) - { - for (const auto &elem : data.GetElements()) - { - for (int i = 0; i < elem->GetMarker().Size(); i++) - { - if (!elem->GetMarker()[i]) - { - continue; - } - const int attr = i + 1; - mfem::Vector nor; - mesh::GetSurfaceNormal(mesh, attr, nor); - Mpi::Print(" {:d}: Index = {:d}", attr, idx); - if (mesh.SpaceDimension() == 3) - { - Mpi::Print(", n = ({:+.1f}, {:+.1f}, {:+.1f})", nor(0), nor(1), nor(2)); - } - else - { - Mpi::Print(", n = ({:+.1f}, {:+.1f})", nor(0), nor(1)); - } - Mpi::Print("\n"); - } - } - } -} - -const SurfaceCurrentData &SurfaceCurrentOperator::GetSource(int idx) const -{ - auto it = sources.find(idx); - MFEM_VERIFY(it != sources.end(), "Unknown current source index requested!"); - return it->second; -} - -void SurfaceCurrentOperator::AddExcitationBdrCoefficients(SumVectorCoefficient &fb) -{ - // Construct the RHS source term for surface current boundaries, which looks like - // -iω J_inc for a surface current boundary. The chosen surface current J_inc corresponds - // to a unit current excitation. Note: The real RHS returned here does not yet have the - // factor of (iω) included, so works for time domain simulations requiring RHS -J_inc - // (t). - for (const auto &[idx, data] : sources) - { - AddExcitationBdrCoefficients(data, fb); - } -} - -void SurfaceCurrentOperator::AddExcitationBdrCoefficients(int idx, SumVectorCoefficient &fb) -{ - // Construct the RHS source term for a single surface current boundary index. - AddExcitationBdrCoefficients(GetSource(idx), fb); -} - -void SurfaceCurrentOperator::AddExcitationBdrCoefficients(const SurfaceCurrentData &data, - SumVectorCoefficient &fb) -{ - // Add excited boundaries to the linear form, with a unit current distributed across - // all elements of the current source in parallel. - for (const auto &elem : data.GetElements()) - { - const double Jinc = 1.0 / (elem->GetGeometryWidth() * data.GetElements().size()); - fb.AddCoefficient(elem->GetModeCoefficient(-Jinc), elem->GetMarker()); - } -} - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "surfacecurrentoperator.hpp" + +#include "fem/coefficient.hpp" +#include "utils/communication.hpp" +#include "utils/geodata.hpp" +#include "utils/iodata.hpp" + +namespace palace +{ + +SurfaceCurrentData::SurfaceCurrentData(const config::SurfaceCurrentData &data, + const mfem::ParMesh &mesh) +{ + // Construct the source elements allowing for a possible multielement surface current + // sources. 
+ for (const auto &elem : data.elements) + { + mfem::Array attr_list; + attr_list.Append(elem.attributes.data(), elem.attributes.size()); + switch (elem.coordinate_system) + { + case CoordinateSystem::CYLINDRICAL: + elems.push_back( + std::make_unique(elem.direction, attr_list, mesh)); + break; + case CoordinateSystem::CARTESIAN: + elems.push_back( + std::make_unique(elem.direction, attr_list, mesh)); + break; + } + } +} + +double SurfaceCurrentData::GetExcitationCurrent() const +{ + // Ideal unit current source for each index. + return 1.0; +} + +SurfaceCurrentOperator::SurfaceCurrentOperator(const IoData &iodata, + const mfem::ParMesh &mesh) +{ + // Set up surface current source boundaries. + SetUpBoundaryProperties(iodata, mesh); + PrintBoundaryInfo(iodata, mesh); +} + +void SurfaceCurrentOperator::SetUpBoundaryProperties(const IoData &iodata, + const mfem::ParMesh &mesh) +{ + // Check that surface current boundary attributes have been specified correctly. + if (!iodata.boundaries.current.empty()) + { + int bdr_attr_max = mesh.bdr_attributes.Size() ? mesh.bdr_attributes.Max() : 0; + mfem::Array bdr_attr_marker(bdr_attr_max), source_marker(bdr_attr_max); + bdr_attr_marker = 0; + source_marker = 0; + for (auto attr : mesh.bdr_attributes) + { + bdr_attr_marker[attr - 1] = 1; + } + for (const auto &[idx, data] : iodata.boundaries.current) + { + for (const auto &elem : data.elements) + { + for (auto attr : elem.attributes) + { + MFEM_VERIFY(attr > 0 && attr <= bdr_attr_max, + "Surface current boundary attribute tags must be non-negative and " + "correspond to boundaries in the mesh!"); + MFEM_VERIFY(bdr_attr_marker[attr - 1], + "Unknown surface current boundary attribute " << attr << "!"); + MFEM_VERIFY( + !source_marker[attr - 1], + "Boundary attribute is assigned to more than one surface current source!"); + source_marker[attr - 1] = 1; + } + } + } + } + + // Set up surface current data structures. + for (const auto &[idx, data] : iodata.boundaries.current) + { + sources.try_emplace(idx, data, mesh); + } +} + +void SurfaceCurrentOperator::PrintBoundaryInfo(const IoData &iodata, + const mfem::ParMesh &mesh) +{ + if (sources.empty()) + { + return; + } + Mpi::Print("\nConfiguring surface current excitation source term at attributes:\n"); + for (const auto &[idx, data] : sources) + { + for (const auto &elem : data.elems) + { + for (auto attr : elem->GetAttrList()) + { + Mpi::Print(" {:d}: Index = {:d}, n = ({:+.1f})\n", attr, idx, + fmt::join(mesh::GetSurfaceNormal(mesh, attr), ",")); + } + } + } +} + +const SurfaceCurrentData &SurfaceCurrentOperator::GetSource(int idx) const +{ + auto it = sources.find(idx); + MFEM_VERIFY(it != sources.end(), "Unknown current source index requested!"); + return it->second; +} + +mfem::Array SurfaceCurrentOperator::GetAttrList() const +{ + mfem::Array attr_list; + for (const auto &[idx, data] : sources) + { + for (const auto &elem : data.elems) + { + attr_list.Append(elem->GetAttrList()); + } + } + return attr_list; +} + +void SurfaceCurrentOperator::AddExcitationBdrCoefficients(SumVectorCoefficient &fb) +{ + // Construct the RHS source term for surface current boundaries, which looks like + // -iω J_inc for a surface current boundary. The chosen surface current J_inc corresponds + // to a unit current excitation. Note: The real RHS returned here does not yet have the + // factor of (iω) included, so works for time domain simulations requiring RHS -J_inc + // (t). 
+ for (const auto &[idx, data] : sources) + { + AddExcitationBdrCoefficients(data, fb); + } +} + +void SurfaceCurrentOperator::AddExcitationBdrCoefficients(int idx, SumVectorCoefficient &fb) +{ + // Construct the RHS source term for a single surface current boundary index. + AddExcitationBdrCoefficients(GetSource(idx), fb); +} + +void SurfaceCurrentOperator::AddExcitationBdrCoefficients(const SurfaceCurrentData &data, + SumVectorCoefficient &fb) +{ + // Add excited boundaries to the linear form, with a unit current distributed across + // all elements of the current source in parallel. + for (const auto &elem : data.elems) + { + const double Jinc = 1.0 / (elem->GetGeometryWidth() * data.elems.size()); + fb.AddCoefficient(elem->GetModeCoefficient(-Jinc)); + } +} + +} // namespace palace diff --git a/palace/models/surfacecurrentoperator.hpp b/palace/models/surfacecurrentoperator.hpp index 949cec765f..b47a953e83 100644 --- a/palace/models/surfacecurrentoperator.hpp +++ b/palace/models/surfacecurrentoperator.hpp @@ -1,88 +1,81 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_MODELS_SURFACE_CURRENT_OPERATOR_HPP -#define PALACE_MODELS_SURFACE_CURRENT_OPERATOR_HPP - -#include -#include -#include -#include -#include "fem/lumpedelement.hpp" - -namespace palace -{ - -class IoData; -class SumVectorCoefficient; - -namespace config -{ - -struct SurfaceCurrentData; - -} // namespace config - -// -// Helper class for surface current boundaries in a model. -// -class SurfaceCurrentData -{ -private: - // To accomodate multielement surface current sources, a current source may be made up - // of elements with different attributes and directions which add to deliver the same - // total source current. - std::vector> elems; - -public: - SurfaceCurrentData(const config::SurfaceCurrentData &data, - mfem::ParFiniteElementSpace &h1_fespace); - - const std::vector> &GetElements() const - { - return elems; - } - - double GetExcitationCurrent() const; -}; - -// -// A class handling surface current boundaries. -// -class SurfaceCurrentOperator -{ -private: - // Mapping from source index to data structure containing source surface current - // information. - std::map sources; - mfem::Array source_marker; - void SetUpBoundaryProperties(const IoData &iodata, - mfem::ParFiniteElementSpace &h1_fespace); - void PrintBoundaryInfo(const IoData &iodata, mfem::ParMesh &mesh); - -public: - SurfaceCurrentOperator(const IoData &iodata, mfem::ParFiniteElementSpace &h1_fespace); - - // Access data structures for the surface current source with the given index. - const SurfaceCurrentData &GetSource(int idx) const; - auto begin() const { return sources.begin(); } - auto end() const { return sources.end(); } - auto rbegin() const { return sources.rbegin(); } - auto rend() const { return sources.rend(); } - auto Size() const { return sources.size(); } - - // Returns array marking surface current source attributes. - const mfem::Array &GetMarker() const { return source_marker; } - - // Add contributions to the right-hand side source term vector for a surface current - // excitation at the specified boundaries, -J_inc for the real version (versus the - // full -iω J_inc for the complex one). 
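As a quick check of the unit-current normalization used in AddExcitationBdrCoefficients above: for a multielement source with N elements, element k of geometry width w_k is driven with surface current density J_k = 1/(w_k N), so each element carries current 1/N and the parallel elements sum to the unit excitation reported by GetExcitationCurrent():

\[
I_k = J_k\, w_k = \frac{1}{N}\,, \qquad
I_{\mathrm{total}} = \sum_{k=1}^{N} I_k = 1\,.
\]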
- void AddExcitationBdrCoefficients(SumVectorCoefficient &fb); - void AddExcitationBdrCoefficients(int idx, SumVectorCoefficient &fb); - void AddExcitationBdrCoefficients(const SurfaceCurrentData &data, - SumVectorCoefficient &fb); -}; - -} // namespace palace - -#endif // PALACE_MODELS_SURFACE_CURRENT_OPERATOR_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_MODELS_SURFACE_CURRENT_OPERATOR_HPP +#define PALACE_MODELS_SURFACE_CURRENT_OPERATOR_HPP + +#include +#include +#include +#include +#include "fem/lumpedelement.hpp" + +namespace palace +{ + +class IoData; +class SumVectorCoefficient; + +namespace config +{ + +struct SurfaceCurrentData; + +} // namespace config + +// +// Helper class for surface current boundaries in a model. +// +class SurfaceCurrentData +{ +public: + // To accommodate multielement surface current sources, a current source may be made up + // of elements with different attributes and directions which add to deliver the same + // total source current. + std::vector> elems; + +public: + SurfaceCurrentData(const config::SurfaceCurrentData &data, const mfem::ParMesh &mesh); + + double GetExcitationCurrent() const; +}; + +// +// A class handling surface current boundaries. +// +class SurfaceCurrentOperator +{ +private: + // Mapping from source index to data structure containing source surface current + // information. + std::map sources; + + void SetUpBoundaryProperties(const IoData &iodata, const mfem::ParMesh &mesh); + void PrintBoundaryInfo(const IoData &iodata, const mfem::ParMesh &mesh); + +public: + SurfaceCurrentOperator(const IoData &iodata, const mfem::ParMesh &mesh); + + // Access data structures for the surface current source with the given index. + const SurfaceCurrentData &GetSource(int idx) const; + auto begin() const { return sources.begin(); } + auto end() const { return sources.end(); } + auto rbegin() const { return sources.rbegin(); } + auto rend() const { return sources.rend(); } + auto Size() const { return sources.size(); } + + // Returns array of surface current source attributes. + mfem::Array GetAttrList() const; + + // Add contributions to the right-hand side source term vector for a surface current + // excitation at the specified boundaries, -J_inc for the real version (versus the + // full -iω J_inc for the complex one). + void AddExcitationBdrCoefficients(SumVectorCoefficient &fb); + void AddExcitationBdrCoefficients(int idx, SumVectorCoefficient &fb); + void AddExcitationBdrCoefficients(const SurfaceCurrentData &data, + SumVectorCoefficient &fb); +}; + +} // namespace palace + +#endif // PALACE_MODELS_SURFACE_CURRENT_OPERATOR_HPP diff --git a/palace/models/surfaceimpedanceoperator.cpp b/palace/models/surfaceimpedanceoperator.cpp index 4671cc6d01..3811edb7f2 100644 --- a/palace/models/surfaceimpedanceoperator.cpp +++ b/palace/models/surfaceimpedanceoperator.cpp @@ -1,197 +1,236 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "surfaceimpedanceoperator.hpp" - -#include "fem/coefficient.hpp" -#include "utils/communication.hpp" -#include "utils/geodata.hpp" -#include "utils/iodata.hpp" - -namespace palace -{ - -SurfaceImpedanceOperator::SurfaceImpedanceOperator(const IoData &iodata, - mfem::ParMesh &mesh) -{ - // Set up impedance boundary conditions. 
- SetUpBoundaryProperties(iodata, mesh); - PrintBoundaryInfo(iodata, mesh); -} - -void SurfaceImpedanceOperator::SetUpBoundaryProperties(const IoData &iodata, - const mfem::ParMesh &mesh) -{ - // Check that impedance boundary attributes have been specified correctly. - int bdr_attr_max = mesh.bdr_attributes.Size() ? mesh.bdr_attributes.Max() : 0; - if (!iodata.boundaries.impedance.empty()) - { - mfem::Array bdr_attr_marker(bdr_attr_max); - bdr_attr_marker = 0; - for (auto attr : mesh.bdr_attributes) - { - bdr_attr_marker[attr - 1] = 1; - } - for (const auto &data : iodata.boundaries.impedance) - { - for (auto attr : data.attributes) - { - MFEM_VERIFY(attr > 0 && attr <= bdr_attr_max, - "Impedance boundary attribute tags must be non-negative and correspond " - "to attributes in the mesh!"); - MFEM_VERIFY(bdr_attr_marker[attr - 1], - "Unknown impedance boundary attribute " << attr << "!"); - } - } - } - - // Impedance boundaries are defined using the user provided impedance per square. - Z_Rsinv.SetSize(bdr_attr_max); - Z_Lsinv.SetSize(bdr_attr_max); - Z_Cs.SetSize(bdr_attr_max); - Z_Rsinv = 0.0; - Z_Lsinv = 0.0; - Z_Cs = 0.0; - for (const auto &data : iodata.boundaries.impedance) - { - for (auto attr : data.attributes) - { - MFEM_VERIFY( - Z_Rsinv(attr - 1) == 0.0 && Z_Lsinv(attr - 1) == 0.0 && Z_Cs(attr - 1) == 0.0, - "Multiple definitions of impedance boundary properties for boundary attribute " - << attr << "!"); - Z_Rsinv(attr - 1) = (std::abs(data.Rs) > 0.0) ? 1.0 / data.Rs : 0.0; - Z_Lsinv(attr - 1) = (std::abs(data.Ls) > 0.0) ? 1.0 / data.Ls : 0.0; - Z_Cs(attr - 1) = (std::abs(data.Cs) > 0.0) ? data.Cs : 0.0; - MFEM_VERIFY(std::abs(Z_Rsinv(attr - 1)) + std::abs(Z_Lsinv(attr - 1)) + - std::abs(Z_Cs(attr - 1)) > - 0.0, - "Impedance boundary has no Rs, Ls, or Cs defined!"); - } - } - - // Mark selected boundary attributes from the mesh as impedance. 
- mfem::Array impedance_bcs, impedance_Rs_bcs, impedance_Ls_bcs, impedance_Cs_bcs; - for (const auto &data : iodata.boundaries.impedance) - { - for (auto attr : data.attributes) - { - impedance_bcs.Append(attr); - if (std::abs(Z_Rsinv(attr - 1)) > 0.0) - { - impedance_Rs_bcs.Append(attr); - } - if (std::abs(Z_Lsinv(attr - 1)) > 0.0) - { - impedance_Ls_bcs.Append(attr); - } - if (std::abs(Z_Cs(attr - 1)) > 0.0) - { - impedance_Cs_bcs.Append(attr); - } - } - } - mesh::AttrToMarker(bdr_attr_max, impedance_bcs, impedance_marker); - mesh::AttrToMarker(bdr_attr_max, impedance_Rs_bcs, impedance_Rs_marker); - mesh::AttrToMarker(bdr_attr_max, impedance_Ls_bcs, impedance_Ls_marker); - mesh::AttrToMarker(bdr_attr_max, impedance_Cs_bcs, impedance_Cs_marker); -} - -void SurfaceImpedanceOperator::PrintBoundaryInfo(const IoData &iodata, mfem::ParMesh &mesh) -{ - if (impedance_marker.Size() && impedance_marker.Max() == 0) - { - return; - } - Mpi::Print("\nConfiguring Robin impedance BC at attributes:\n"); - for (int i = 0; i < impedance_marker.Size(); i++) - { - if (impedance_marker[i]) - { - const int attr = i + 1; - mfem::Vector nor; - mesh::GetSurfaceNormal(mesh, attr, nor); - bool comma = false; - Mpi::Print(" {:d}:", attr); - if (std::abs(Z_Rsinv(i)) > 0.0) - { - Mpi::Print( - " Rs = {:.3e} Ω/sq", - iodata.DimensionalizeValue(IoData::ValueType::IMPEDANCE, 1.0 / Z_Rsinv(i))); - comma = true; - } - if (std::abs(Z_Lsinv(i)) > 0.0) - { - if (comma) - { - Mpi::Print(","); - } - Mpi::Print( - " Ls = {:.3e} H/sq", - iodata.DimensionalizeValue(IoData::ValueType::INDUCTANCE, 1.0 / Z_Lsinv(i))); - comma = true; - } - if (std::abs(Z_Cs(i)) > 0.0) - { - if (comma) - { - Mpi::Print(","); - } - Mpi::Print(" Cs = {:.3e} F/sq", - iodata.DimensionalizeValue(IoData::ValueType::CAPACITANCE, Z_Cs(i))); - comma = true; - } - if (comma) - { - Mpi::Print(","); - } - if (mesh.SpaceDimension() == 3) - { - Mpi::Print(" n = ({:+.1f}, {:+.1f}, {:+.1f})", nor(0), nor(1), nor(2)); - } - else - { - Mpi::Print(" n = ({:+.1f}, {:+.1f})", nor(0), nor(1)); - } - Mpi::Print("\n"); - } - } -} - -void SurfaceImpedanceOperator::AddStiffnessBdrCoefficients(double coef, - SumMatrixCoefficient &fb) -{ - // Lumped inductor boundaries. - if (impedance_Ls_marker.Size() && impedance_Ls_marker.Max() > 0) - { - mfem::Vector v(Z_Lsinv); - v *= coef; - auto f = std::make_unique(v); - fb.AddCoefficient(std::make_unique(v), impedance_Ls_marker); - } -} - -void SurfaceImpedanceOperator::AddMassBdrCoefficients(double coef, SumMatrixCoefficient &fb) -{ - // Lumped capacitor boundaries. - if (impedance_Cs_marker.Size() && impedance_Cs_marker.Max() > 0) - { - mfem::Vector v(Z_Cs); - v *= coef; - fb.AddCoefficient(std::make_unique(v), impedance_Cs_marker); - } -} - -void SurfaceImpedanceOperator::AddDampingBdrCoefficients(double coef, - SumMatrixCoefficient &fb) -{ - // Lumped resistor boundaries. - if (impedance_Rs_marker.Size() && impedance_Rs_marker.Max() > 0) - { - mfem::Vector v(Z_Rsinv); - v *= coef; - fb.AddCoefficient(std::make_unique(v), impedance_Rs_marker); - } -} - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#include "surfaceimpedanceoperator.hpp" + +#include +#include "models/materialoperator.hpp" +#include "utils/communication.hpp" +#include "utils/geodata.hpp" +#include "utils/iodata.hpp" +#include "utils/prettyprint.hpp" + +namespace palace +{ + +SurfaceImpedanceOperator::SurfaceImpedanceOperator(const IoData &iodata, + const MaterialOperator &mat_op, + const mfem::ParMesh &mesh) + : mat_op(mat_op) +{ + // Print out BC info for all impedance boundary attributes. + SetUpBoundaryProperties(iodata, mesh); + PrintBoundaryInfo(iodata, mesh); +} + +void SurfaceImpedanceOperator::SetUpBoundaryProperties(const IoData &iodata, + const mfem::ParMesh &mesh) +{ + // Check that impedance boundary attributes have been specified correctly. + int bdr_attr_max = mesh.bdr_attributes.Size() ? mesh.bdr_attributes.Max() : 0; + mfem::Array bdr_attr_marker; + if (!iodata.boundaries.impedance.empty()) + { + mfem::Array impedance_marker(bdr_attr_max); + bdr_attr_marker.SetSize(bdr_attr_max); + bdr_attr_marker = 0; + impedance_marker = 0; + for (auto attr : mesh.bdr_attributes) + { + bdr_attr_marker[attr - 1] = 1; + } + std::set bdr_warn_list; + for (const auto &data : iodata.boundaries.impedance) + { + for (auto attr : data.attributes) + { + MFEM_VERIFY( + !impedance_marker[attr - 1], + "Multiple definitions of impedance boundary properties for boundary attribute " + << attr << "!"); + impedance_marker[attr - 1] = 1; + // MFEM_VERIFY(attr > 0 && attr <= bdr_attr_max, + // "Impedance boundary attribute tags must be non-negative and + // correspond " "to attributes in the mesh!"); + // MFEM_VERIFY(bdr_attr_marker[attr - 1], + // "Unknown impedance boundary attribute " << attr << "!"); + if (attr <= 0 || attr > bdr_attr_max || !bdr_attr_marker[attr - 1]) + { + bdr_warn_list.insert(attr); + } + } + } + if (!bdr_warn_list.empty()) + { + Mpi::Print("\n"); + Mpi::Warning("Unknown impedance boundary attributes!\nSolver will just ignore them!"); + utils::PrettyPrint(bdr_warn_list, "Boundary attribute list:"); + Mpi::Print("\n"); + } + } + + // Impedance boundaries are defined using the user provided impedance per square. + boundaries.reserve(iodata.boundaries.impedance.size()); + for (const auto &data : iodata.boundaries.impedance) + { + MFEM_VERIFY(std::abs(data.Rs) + std::abs(data.Ls) + std::abs(data.Cs) > 0.0, + "Impedance boundary has no Rs, Ls, or Cs defined!"); + auto &bdr = boundaries.emplace_back(); + bdr.Rs = data.Rs; + bdr.Ls = data.Ls; + bdr.Cs = data.Cs; + bdr.attr_list.Reserve(static_cast(data.attributes.size())); + for (auto attr : data.attributes) + { + if (attr <= 0 || attr > bdr_attr_max || !bdr_attr_marker[attr - 1]) + { + continue; // Can just ignore if wrong + } + bdr.attr_list.Append(attr); + // Compute a scaling factor to account for increased area when using mesh cracking. 
+ if (iodata.boundaries.cracked_attributes.find(attr) != + iodata.boundaries.cracked_attributes.end()) + { + bdr.scaling = 2.0; + } + MFEM_VERIFY((iodata.boundaries.cracked_attributes.find(attr) != + iodata.boundaries.cracked_attributes.end()) || + (bdr.scaling == 1.0), + "Impedance boundary has both cracked and uncracked attributes!"); + } + bdr.Ls *= bdr.scaling; + bdr.Rs *= bdr.scaling; + bdr.Cs /= bdr.scaling; + } +} + +void SurfaceImpedanceOperator::PrintBoundaryInfo(const IoData &iodata, + const mfem::ParMesh &mesh) +{ + if (boundaries.empty()) + { + return; + } + + fmt::memory_buffer buf{}; // Output buffer & buffer append lambda for cleaner code + auto to = [&buf](auto fmt, auto &&...args) + { fmt::format_to(std::back_inserter(buf), fmt, std::forward(args)...); }; + + using VT = Units::ValueType; + + to("\nConfiguring Robin impedance BC at attributes:\n"); + for (const auto &bdr : boundaries) + { + for (auto attr : bdr.attr_list) + { + to(" {:d}:", attr); + if (std::abs(bdr.Rs) > 0.0) + { + to(" Rs = {:.3e} Ω/sq,", + iodata.units.Dimensionalize(bdr.Rs / bdr.scaling)); + } + if (std::abs(bdr.Ls) > 0.0) + { + to(" Ls = {:.3e} H/sq,", + iodata.units.Dimensionalize(bdr.Ls / bdr.scaling)); + } + if (std::abs(bdr.Cs) > 0.0) + { + to(" Cs = {:.3e} F/sq,", + iodata.units.Dimensionalize(bdr.Cs * bdr.scaling)); + } + to(" n = ({:+.1f})\n", fmt::join(mesh::GetSurfaceNormal(mesh, attr), ",")); + } + } + Mpi::Print("{}", fmt::to_string(buf)); +} + +mfem::Array SurfaceImpedanceOperator::GetAttrList() const +{ + mfem::Array attr_list; + for (const auto &bdr : boundaries) + { + attr_list.Append(bdr.attr_list); + } + return attr_list; +} + +mfem::Array SurfaceImpedanceOperator::GetRsAttrList() const +{ + mfem::Array attr_list; + for (const auto &bdr : boundaries) + { + if (std::abs(bdr.Rs) > 0.0) + { + attr_list.Append(bdr.attr_list); + } + } + return attr_list; +} + +mfem::Array SurfaceImpedanceOperator::GetLsAttrList() const +{ + mfem::Array attr_list; + for (const auto &bdr : boundaries) + { + if (std::abs(bdr.Ls) > 0.0) + { + attr_list.Append(bdr.attr_list); + } + } + return attr_list; +} + +mfem::Array SurfaceImpedanceOperator::GetCsAttrList() const +{ + mfem::Array attr_list; + for (const auto &bdr : boundaries) + { + if (std::abs(bdr.Cs) > 0.0) + { + attr_list.Append(bdr.attr_list); + } + } + return attr_list; +} + +void SurfaceImpedanceOperator::AddStiffnessBdrCoefficients(double coeff, + MaterialPropertyCoefficient &fb) +{ + // Lumped inductor boundaries. + for (const auto &bdr : boundaries) + { + if (std::abs(bdr.Ls) > 0.0) + { + fb.AddMaterialProperty(mat_op.GetCeedBdrAttributes(bdr.attr_list), coeff / bdr.Ls); + } + } +} + +void SurfaceImpedanceOperator::AddDampingBdrCoefficients(double coeff, + MaterialPropertyCoefficient &fb) +{ + // Lumped resistor boundaries. + for (const auto &bdr : boundaries) + { + if (std::abs(bdr.Rs) > 0.0) + { + fb.AddMaterialProperty(mat_op.GetCeedBdrAttributes(bdr.attr_list), coeff / bdr.Rs); + } + } +} + +void SurfaceImpedanceOperator::AddMassBdrCoefficients(double coeff, + MaterialPropertyCoefficient &fb) +{ + // Lumped capacitor boundaries. 
+ for (const auto &bdr : boundaries) + { + if (std::abs(bdr.Cs) > 0.0) + { + fb.AddMaterialProperty(mat_op.GetCeedBdrAttributes(bdr.attr_list), coeff * bdr.Cs); + } + } +} + +} // namespace palace diff --git a/palace/models/surfaceimpedanceoperator.hpp b/palace/models/surfaceimpedanceoperator.hpp index 6ad90ab422..a7bde4f75a 100644 --- a/palace/models/surfaceimpedanceoperator.hpp +++ b/palace/models/surfaceimpedanceoperator.hpp @@ -1,48 +1,59 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_MODELS_SURFACE_IMPEDANCE_OPERATOR_HPP -#define PALACE_MODELS_SURFACE_IMPEDANCE_OPERATOR_HPP - -#include - -namespace palace -{ - -class IoData; -class SumMatrixCoefficient; - -// -// A class handling impedance boundaries. -// -class SurfaceImpedanceOperator -{ -private: - // Surface properties for impedance boundary attributes: surface resistance, capacitance, - // and inductance. - mfem::Vector Z_Rsinv, Z_Lsinv, Z_Cs; - mfem::Array impedance_marker, impedance_Rs_marker, impedance_Ls_marker, - impedance_Cs_marker; - void SetUpBoundaryProperties(const IoData &iodata, const mfem::ParMesh &mesh); - void PrintBoundaryInfo(const IoData &iodata, mfem::ParMesh &mesh); - -public: - SurfaceImpedanceOperator(const IoData &iodata, mfem::ParMesh &mesh); - - // Returns array marking surface impedance attributes. - const mfem::Array &GetMarker() const { return impedance_marker; } - const mfem::Array &GetRsMarker() const { return impedance_Rs_marker; } - const mfem::Array &GetLsMarker() const { return impedance_Ls_marker; } - const mfem::Array &GetCsMarker() const { return impedance_Cs_marker; } - - // Add contributions to system matrices from impedance boundaries with nonzero inductance, - // capacitance, and/or resistance. For boundaries with more than R/L/C, impedances add in - // parallel. - void AddStiffnessBdrCoefficients(double coef, SumMatrixCoefficient &fb); - void AddMassBdrCoefficients(double coef, SumMatrixCoefficient &fb); - void AddDampingBdrCoefficients(double coef, SumMatrixCoefficient &fb); -}; - -} // namespace palace - -#endif // PALACE_MODELS_SURFACE_IMPEDANCE_OPERATOR_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_MODELS_SURFACE_IMPEDANCE_OPERATOR_HPP +#define PALACE_MODELS_SURFACE_IMPEDANCE_OPERATOR_HPP + +#include +#include + +namespace palace +{ + +class IoData; +class MaterialOperator; +class MaterialPropertyCoefficient; + +// +// A class handling impedance boundaries. +// +class SurfaceImpedanceOperator +{ +private: + // Reference to material property data (not owned). + const MaterialOperator &mat_op; + + // Surface properties for impedance boundary attributes: surface resistance, capacitance, + // and inductance. + struct ImpedanceData + { + double Rs, Ls, Cs; + mfem::Array attr_list; + double scaling = 1.0; + }; + std::vector boundaries; + + void SetUpBoundaryProperties(const IoData &iodata, const mfem::ParMesh &mesh); + void PrintBoundaryInfo(const IoData &iodata, const mfem::ParMesh &mesh); + +public: + SurfaceImpedanceOperator(const IoData &iodata, const MaterialOperator &mat_op, + const mfem::ParMesh &mesh); + + // Returns array of surface impedance attributes. 
+ mfem::Array GetAttrList() const; + mfem::Array GetRsAttrList() const; + mfem::Array GetLsAttrList() const; + mfem::Array GetCsAttrList() const; + + // Add contributions to system matrices from impedance boundaries with nonzero inductance, + // resistance, and/or capacitance. For boundaries with more than R/L/C, impedances add in + // parallel. + void AddStiffnessBdrCoefficients(double coeff, MaterialPropertyCoefficient &fb); + void AddDampingBdrCoefficients(double coeff, MaterialPropertyCoefficient &fb); + void AddMassBdrCoefficients(double coeff, MaterialPropertyCoefficient &fb); +}; + +} // namespace palace + +#endif // PALACE_MODELS_SURFACE_IMPEDANCE_OPERATOR_HPP diff --git a/palace/models/surfacepostoperator.cpp b/palace/models/surfacepostoperator.cpp index d1bd578965..9dd1a3c026 100644 --- a/palace/models/surfacepostoperator.cpp +++ b/palace/models/surfacepostoperator.cpp @@ -1,263 +1,423 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "surfacepostoperator.hpp" - -#include -#include -#include "fem/integrator.hpp" -#include "models/materialoperator.hpp" -#include "utils/communication.hpp" -#include "utils/geodata.hpp" -#include "utils/iodata.hpp" - -namespace palace -{ - -SurfacePostOperator::InterfaceDielectricData::InterfaceDielectricData( - const config::InterfaceDielectricData &data, mfem::ParMesh &mesh) - : ts(data.ts), tandelta(data.tandelta) -{ - // Calculate surface dielectric loss according to the formulas from J. Wenner et al., - // Surface loss simulations of superconducting coplanar waveguide resonators, Appl. Phys. - // Lett. (2011). If only a general layer permittivity is specified and not any special - // metal-air (MA), metal-substrate (MS), or substrate-air (SA) permittivity, compute the - // numerator of the participation ratio according to the regular formula - // p * E_elec = 1/2 t Re{∫ (ε E)ᴴ E_m dS} . - bool has_eps = (std::abs(data.epsilon_r) > 0.0); - bool has_eps_ma = (std::abs(data.epsilon_r_ma) > 0.0); - bool has_eps_ms = (std::abs(data.epsilon_r_ms) > 0.0); - bool has_eps_sa = (std::abs(data.epsilon_r_sa) > 0.0); - MFEM_VERIFY(has_eps + has_eps_ma + has_eps_ms + has_eps_sa == 1, - "Surface dielectric loss postprocessing should only be specialized as one of " - "metal-air, metal-substrate, or substrate-air, or not specialized at all!"); - if (has_eps) - { - type = DielectricInterfaceType::DEFAULT; - epsilon = data.epsilon_r; - } - else if (has_eps_ma) - { - type = DielectricInterfaceType::MA; - epsilon = data.epsilon_r_ma; - } - else if (has_eps_ms) - { - type = DielectricInterfaceType::MS; - epsilon = data.epsilon_r_ms; - } - else if (has_eps_sa) - { - type = DielectricInterfaceType::SA; - epsilon = data.epsilon_r_sa; - } - MFEM_VERIFY(data.ts > 0.0, - "Surface dielectric loss postprocessing requires positive thickness!"); - - // Construct the postprocessing data allowing for multiple groups of attribute with - // different side values. - for (const auto &elem : data.elements) - { - // Store information about the surface side to consider. - mfem::Vector &side = sides.emplace_back(); - if (elem.direction[0] == 0 && elem.direction[1] == 0 && elem.direction[2] == 0) - { - // This is OK if surface is single sided, just push back an empty Vector. - } - else - { - side.SetSize(mesh.SpaceDimension()); - std::copy(elem.direction.begin(), elem.direction.end(), side.begin()); - side /= side.Norml2(); - } - - // Store markers for this element of the postprocessing boundary. 
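To make the "impedances add in parallel" comment above concrete: the stiffness, damping, and mass boundary contributions added by this class (coeff/Ls, coeff/Rs, and coeff*Cs, respectively) together realize the per-square surface admittance of a parallel RLC sheet,

\[
Y_s(\omega) \;=\; \frac{1}{R_s} \;+\; \frac{1}{i\omega L_s} \;+\; i\omega C_s\,,
\]

with the frequency factors supplied by the assembled system, so that the full boundary term works out to i\omega\,Y_s(\omega) split as 1/L_s into the stiffness term, 1/R_s into the damping term, and C_s into the mass term. The scaling = 2.0 applied for cracked attributes (R_s and L_s doubled, C_s halved) divides this admittance per square by two, consistent with the code comment about compensating for the increased area when mesh cracking is used.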
- mesh::AttrToMarker(mesh.bdr_attributes.Size() ? mesh.bdr_attributes.Max() : 0, - elem.attributes, attr_markers.emplace_back()); - } -} - -std::unique_ptr -SurfacePostOperator::InterfaceDielectricData::GetCoefficient( - int i, const mfem::ParGridFunction &U, const MaterialOperator &mat_op) const -{ - switch (type) - { - case DielectricInterfaceType::MA: - return std::make_unique>( - U, mat_op, ts, epsilon, sides[i]); - case DielectricInterfaceType::MS: - return std::make_unique>( - U, mat_op, ts, epsilon, sides[i]); - case DielectricInterfaceType::SA: - return std::make_unique>( - U, mat_op, ts, epsilon, sides[i]); - case DielectricInterfaceType::DEFAULT: - return std::make_unique< - DielectricInterfaceCoefficient>( - U, mat_op, ts, epsilon, sides[i]); - } - return {}; // For compiler warning -} - -SurfacePostOperator::SurfaceChargeData::SurfaceChargeData( - const config::CapacitanceData &data, mfem::ParMesh &mesh) -{ - mesh::AttrToMarker(mesh.bdr_attributes.Size() ? mesh.bdr_attributes.Max() : 0, - data.attributes, attr_markers.emplace_back()); -} - -std::unique_ptr SurfacePostOperator::SurfaceChargeData::GetCoefficient( - int i, const mfem::ParGridFunction &U, const MaterialOperator &mat_op) const -{ - return std::make_unique(U, mat_op); -} - -SurfacePostOperator::SurfaceFluxData::SurfaceFluxData(const config::InductanceData &data, - mfem::ParMesh &mesh) -{ - // Store information about the global direction for orientation. Note the true boundary - // normal is used in calculating the flux, this is just used to determine the sign. - direction.SetSize(mesh.SpaceDimension()); - std::copy(data.direction.begin(), data.direction.end(), direction.begin()); - direction /= direction.Norml2(); - - // Construct the coefficient for this postprocessing boundary (copies the direction - // vector). - mesh::AttrToMarker(mesh.bdr_attributes.Size() ? mesh.bdr_attributes.Max() : 0, - data.attributes, attr_markers.emplace_back()); -} - -std::unique_ptr -SurfacePostOperator::SurfaceFluxData::GetCoefficient(int i, const mfem::ParGridFunction &U, - const MaterialOperator &mat_op) const -{ - return std::make_unique(U, direction, - mat_op.GetLocalToSharedFaceMap()); -} - -SurfacePostOperator::SurfacePostOperator(const IoData &iodata, - const MaterialOperator &mat_op, - mfem::ParFiniteElementSpace &h1_fespace) - : mat_op(mat_op), ones(&h1_fespace) -{ - // Define a constant 1 function on the scalar finite element space for computing surface - // integrals. - ones = 1.0; - - // Surface dielectric loss postprocessing. - for (const auto &[idx, data] : iodata.boundaries.postpro.dielectric) - { - eps_surfs.try_emplace(idx, data, *h1_fespace.GetParMesh()); - } - - // Surface capacitance postprocessing. - for (const auto &[idx, data] : iodata.boundaries.postpro.capacitance) - { - charge_surfs.try_emplace(idx, data, *h1_fespace.GetParMesh()); - } - - // Surface inductance postprocessing. 
- for (const auto &[idx, data] : iodata.boundaries.postpro.inductance) - { - flux_surfs.try_emplace(idx, data, *h1_fespace.GetParMesh()); - } -} - -double SurfacePostOperator::GetInterfaceLossTangent(int idx) const -{ - auto it = eps_surfs.find(idx); - MFEM_VERIFY(it != eps_surfs.end(), - "Unknown dielectric loss postprocessing surface index requested!"); - return it->second.tandelta; -} - -double SurfacePostOperator::GetInterfaceElectricFieldEnergy( - int idx, const mfem::ParComplexGridFunction &E) const -{ - auto it = eps_surfs.find(idx); - MFEM_VERIFY(it != eps_surfs.end(), - "Unknown dielectric loss postprocessing surface index requested!"); - double dot = GetLocalSurfaceIntegral(it->second, E.real()) + - GetLocalSurfaceIntegral(it->second, E.imag()); - Mpi::GlobalSum(1, &dot, E.ParFESpace()->GetComm()); - return dot; -} - -double -SurfacePostOperator::GetInterfaceElectricFieldEnergy(int idx, - const mfem::ParGridFunction &E) const -{ - auto it = eps_surfs.find(idx); - MFEM_VERIFY(it != eps_surfs.end(), - "Unknown dielectric loss postprocessing surface index requested!"); - double dot = GetLocalSurfaceIntegral(it->second, E); - Mpi::GlobalSum(1, &dot, E.ParFESpace()->GetComm()); - return dot; -} - -double -SurfacePostOperator::GetSurfaceElectricCharge(int idx, - const mfem::ParComplexGridFunction &E) const -{ - auto it = charge_surfs.find(idx); - MFEM_VERIFY(it != charge_surfs.end(), - "Unknown capacitance postprocessing surface index requested!"); - std::complex dot(GetLocalSurfaceIntegral(it->second, E.real()), - GetLocalSurfaceIntegral(it->second, E.imag())); - Mpi::GlobalSum(1, &dot, E.ParFESpace()->GetComm()); - return std::copysign(std::abs(dot), dot.real()); -} - -double SurfacePostOperator::GetSurfaceElectricCharge(int idx, - const mfem::ParGridFunction &E) const -{ - auto it = charge_surfs.find(idx); - MFEM_VERIFY(it != charge_surfs.end(), - "Unknown capacitance postprocessing surface index requested!"); - double dot = GetLocalSurfaceIntegral(it->second, E); - Mpi::GlobalSum(1, &dot, E.ParFESpace()->GetComm()); - return dot; -} - -double -SurfacePostOperator::GetSurfaceMagneticFlux(int idx, - const mfem::ParComplexGridFunction &B) const -{ - auto it = flux_surfs.find(idx); - MFEM_VERIFY(it != flux_surfs.end(), - "Unknown inductance postprocessing surface index requested!"); - std::complex dot(GetLocalSurfaceIntegral(it->second, B.real()), - GetLocalSurfaceIntegral(it->second, B.imag())); - Mpi::GlobalSum(1, &dot, B.ParFESpace()->GetComm()); - return std::copysign(std::abs(dot), dot.real()); -} - -double SurfacePostOperator::GetSurfaceMagneticFlux(int idx, - const mfem::ParGridFunction &B) const -{ - auto it = flux_surfs.find(idx); - MFEM_VERIFY(it != flux_surfs.end(), - "Unknown inductance postprocessing surface index requested!"); - double dot = GetLocalSurfaceIntegral(it->second, B); - Mpi::GlobalSum(1, &dot, B.ParFESpace()->GetComm()); - return dot; -} - -double SurfacePostOperator::GetLocalSurfaceIntegral(const SurfaceData &data, - const mfem::ParGridFunction &U) const -{ - // Integrate the coefficient over the boundary attributes making up this surface index. - std::vector> fb; - mfem::LinearForm s(const_cast(ones.FESpace())); - for (int i = 0; i < static_cast(data.attr_markers.size()); i++) - { - fb.emplace_back(data.GetCoefficient(i, U, mat_op)); - s.AddBoundaryIntegrator(new BoundaryLFIntegrator(*fb.back()), data.attr_markers[i]); - } - s.UseFastAssembly(false); - s.Assemble(); - return s * ones; -} - -} // namespace palace +// Copyright Amazon.com, Inc. 
or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "surfacepostoperator.hpp" + +#include +#include +#include "fem/gridfunction.hpp" +#include "fem/integrator.hpp" +#include "linalg/vector.hpp" +#include "models/materialoperator.hpp" +#include "models/strattonchu.hpp" +#include "utils/communication.hpp" +#include "utils/geodata.hpp" +#include "utils/iodata.hpp" +#include "utils/prettyprint.hpp" +#include "utils/timer.hpp" + +namespace palace +{ + +namespace +{ + +template +mfem::Array SetUpBoundaryProperties(const T &data, + const mfem::Array &bdr_attr_marker) +{ + mfem::Array attr_list; + attr_list.Reserve(static_cast(data.attributes.size())); + std::set bdr_warn_list; + for (auto attr : data.attributes) + { + // MFEM_VERIFY(attr > 0 && attr <= bdr_attr_max, + // "Boundary postprocessing attribute tags must be non-negative and " + // "correspond to attributes in the mesh!"); + // MFEM_VERIFY(bdr_attr_marker[attr - 1], + // "Unknown boundary postprocessing attribute " << attr << "!"); + if (attr <= 0 || attr > bdr_attr_marker.Size() || !bdr_attr_marker[attr - 1]) + { + bdr_warn_list.insert(attr); + } + else + { + attr_list.Append(attr); + } + } + if (!bdr_warn_list.empty()) + { + Mpi::Print("\n"); + Mpi::Warning( + "Unknown boundary postprocessing attributes!\nSolver will just ignore them!"); + utils::PrettyPrint(bdr_warn_list, "Boundary attribute list:"); + Mpi::Print("\n"); + } + return attr_list; +} + +} // namespace + +SurfacePostOperator::SurfaceFluxData::SurfaceFluxData( + const config::SurfaceFluxData &data, const mfem::ParMesh &mesh, + const mfem::Array &bdr_attr_marker) +{ + // Store boundary attributes for this postprocessing boundary. + attr_list = SetUpBoundaryProperties(data, bdr_attr_marker); + + // Store the type of flux. + switch (data.type) + { + case SurfaceFlux::ELECTRIC: + type = SurfaceFlux::ELECTRIC; + break; + case SurfaceFlux::MAGNETIC: + type = SurfaceFlux::MAGNETIC; + break; + case SurfaceFlux::POWER: + type = SurfaceFlux::POWER; + break; + } + + // Store information about the global direction for orientation. Note the true boundary + // normal is used in calculating the flux, this is just used to determine the sign. + two_sided = data.two_sided; + if (!two_sided) + { + center.SetSize(mesh.SpaceDimension()); + if (data.no_center) + { + // Compute the center as the bounding box centroid for all boundary elements making up + // this postprocessing boundary. 
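The center stored here (user-provided, or the bounding-box centroid computed just below) is only used to fix the sign of the flux for one-sided surfaces, per the comment above. The natural reading, stated here as an inference from this constructor rather than something spelled out in the diff, is that the flux coefficient orients each element normal away from this center,

\[
\hat{\mathbf{n}}_{\mathrm{oriented}} \;=\;
\operatorname{sign}\!\big(\hat{\mathbf{n}}\cdot(\mathbf{x} - \mathbf{x}_c)\big)\,\hat{\mathbf{n}}\,,
\]

while two_sided surfaces skip the center computation entirely.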
+ mfem::Vector bbmin, bbmax; + mesh::GetAxisAlignedBoundingBox( + mesh, mesh::AttrToMarker(bdr_attr_marker.Size(), attr_list), true, bbmin, bbmax); + for (int d = 0; d < mesh.SpaceDimension(); d++) + { + center(d) = 0.5 * (bbmin(d) + bbmax(d)); + } + } + else + { + std::copy(data.center.begin(), data.center.end(), center.begin()); + } + } +} + +std::unique_ptr +SurfacePostOperator::SurfaceFluxData::GetCoefficient(const mfem::ParGridFunction *E, + const mfem::ParGridFunction *B, + const MaterialOperator &mat_op) const +{ + switch (type) + { + case SurfaceFlux::ELECTRIC: + return std::make_unique< + RestrictedCoefficient>>( + attr_list, E, nullptr, mat_op, two_sided, center); + case SurfaceFlux::MAGNETIC: + return std::make_unique< + RestrictedCoefficient>>( + attr_list, nullptr, B, mat_op, two_sided, center); + case SurfaceFlux::POWER: + return std::make_unique< + RestrictedCoefficient>>( + attr_list, E, B, mat_op, two_sided, center); + } + return {}; +} + +SurfacePostOperator::InterfaceDielectricData::InterfaceDielectricData( + const config::InterfaceDielectricData &data, const mfem::ParMesh &mesh, + const mfem::Array &bdr_attr_marker) +{ + // Store boundary attributes for this postprocessing boundary. + attr_list = SetUpBoundaryProperties(data, bdr_attr_marker); + + // Calculate surface dielectric loss according to the formulas from J. Wenner et al., + // Surface loss simulations of superconducting coplanar waveguide resonators, Appl. Phys. + // Lett. (2011). If only a general layer permittivity is specified and not any special + // metal-air (MA), metal-substrate (MS), or substrate-air (SA) permittivity, compute the + // numerator of the participation ratio according to the regular formula + // p * E_elec = 1/2 t Re{∫ (ε E)ᴴ E_m dS} . + switch (data.type) + { + case InterfaceDielectric::DEFAULT: + type = InterfaceDielectric::DEFAULT; + break; + case InterfaceDielectric::MA: + type = InterfaceDielectric::MA; + break; + case InterfaceDielectric::MS: + type = InterfaceDielectric::MS; + break; + case InterfaceDielectric::SA: + type = InterfaceDielectric::SA; + break; + } + t = data.t; + epsilon = data.epsilon_r; + tandelta = data.tandelta; +} + +std::unique_ptr +SurfacePostOperator::InterfaceDielectricData::GetCoefficient( + const GridFunction &E, const MaterialOperator &mat_op) const +{ + switch (type) + { + case InterfaceDielectric::DEFAULT: + return std::make_unique>>( + attr_list, E, mat_op, t, epsilon); + case InterfaceDielectric::MA: + return std::make_unique< + RestrictedCoefficient>>( + attr_list, E, mat_op, t, epsilon); + case InterfaceDielectric::MS: + return std::make_unique< + RestrictedCoefficient>>( + attr_list, E, mat_op, t, epsilon); + case InterfaceDielectric::SA: + return std::make_unique< + RestrictedCoefficient>>( + attr_list, E, mat_op, t, epsilon); + } + return {}; // For compiler warning +} + +SurfacePostOperator::FarFieldData::FarFieldData(const config::FarFieldPostData &data, + const mfem::ParMesh &mesh, + const mfem::Array &bdr_attr_marker) + : thetaphis(data.thetaphis) +{ + // Store boundary attributes for this postprocessing boundary. + attr_list = SetUpBoundaryProperties(data, bdr_attr_marker); +} + +SurfacePostOperator::SurfacePostOperator(const IoData &iodata, + const MaterialOperator &mat_op, + mfem::ParFiniteElementSpace &h1_fespace, + mfem::ParFiniteElementSpace &nd_fespace) + : mat_op(mat_op), h1_fespace(h1_fespace), nd_fespace(nd_fespace) +{ + // Check that boundary attributes have been specified correctly. 
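Restating the interface dielectric participation formula quoted from Wenner et al. in InterfaceDielectricData above, for a layer of thickness t_j and relative permittivity \varepsilon_r on surface \Gamma_j:

\[
p_j\, E_{\mathrm{elec}} \;=\; \frac{t_j}{2}\,
\operatorname{Re}\!\left\{ \int_{\Gamma_j} (\varepsilon\,\mathbf{E})^{H}\,\mathbf{E}_m \, dS \right\},
\]

where E_m is the field in the thin layer; the MA, MS, and SA specializations approximate E_m differently, following the reference. The stored tandelta is the layer loss tangent returned by GetInterfaceLossTangent, which in the usual participation-ratio treatment multiplies p_j to give that interface's contribution to the loss.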
+ const auto &mesh = *h1_fespace.GetParMesh(); + int bdr_attr_max = mesh.bdr_attributes.Size() ? mesh.bdr_attributes.Max() : 0; + mfem::Array bdr_attr_marker; + if (!iodata.boundaries.postpro.flux.empty() || + !iodata.boundaries.postpro.dielectric.empty() || + !iodata.boundaries.postpro.farfield.empty()) + { + bdr_attr_marker.SetSize(bdr_attr_max); + bdr_attr_marker = 0; + for (auto attr : mesh.bdr_attributes) + { + bdr_attr_marker[attr - 1] = 1; + } + } + + // Surface flux postprocessing. + for (const auto &[idx, data] : iodata.boundaries.postpro.flux) + { + MFEM_VERIFY(iodata.problem.type != ProblemType::ELECTROSTATIC || + data.type == SurfaceFlux::ELECTRIC, + "Magnetic field or power surface flux postprocessing are not available " + "for electrostatic problems!"); + MFEM_VERIFY(iodata.problem.type != ProblemType::MAGNETOSTATIC || + data.type == SurfaceFlux::MAGNETIC, + "Electric field or power surface flux postprocessing are not available " + "for magnetostatic problems!"); + flux_surfs.try_emplace(idx, data, *h1_fespace.GetParMesh(), bdr_attr_marker); + } + + // Interface dielectric postprocessing. + MFEM_VERIFY(iodata.boundaries.postpro.dielectric.empty() || + iodata.problem.type != ProblemType::MAGNETOSTATIC, + "Interface dielectric loss postprocessing is not available for " + "magnetostatic problems!"); + for (const auto &[idx, data] : iodata.boundaries.postpro.dielectric) + { + eps_surfs.try_emplace(idx, data, *h1_fespace.GetParMesh(), bdr_attr_marker); + } + + // FarField postprocessing. + MFEM_VERIFY(iodata.boundaries.postpro.farfield.empty() || + iodata.problem.type == ProblemType::DRIVEN || + iodata.problem.type == ProblemType::EIGENMODE, + "Far-field extraction is only available for driven and eigenmode problems!"); + + // Check that we don't have anisotropic materials. + if (!iodata.boundaries.postpro.farfield.empty()) + { + const auto &mesh = *nd_fespace.GetParMesh(); + int bdr_attr_max = mesh.bdr_attributes.Size() ? mesh.bdr_attributes.Max() : 0; + mfem::Array bdr_attr_marker = + mesh::AttrToMarker(bdr_attr_max, iodata.boundaries.postpro.farfield.attributes); + + std::set domain_attrs; + + for (int i = 0; i < mesh.GetNBE(); i++) + { + if (bdr_attr_marker[mesh.GetBdrAttribute(i) - 1]) + { + int elem_id, _face_id; + mesh.GetBdrElementAdjacentElement(i, elem_id, _face_id); + if (elem_id >= 0) + { + domain_attrs.insert(mesh.GetAttribute(elem_id)); + } + } + } + + for (int attr : domain_attrs) + { + MFEM_VERIFY(mat_op.IsIsotropic(attr), + "FarField requires isotropic materials, but attribute " + + std::to_string(attr) + " is not."); + } + } + + farfield = FarFieldData(iodata.boundaries.postpro.farfield, *nd_fespace.GetParMesh(), + bdr_attr_marker); +} + +std::complex SurfacePostOperator::GetSurfaceFlux(int idx, const GridFunction *E, + const GridFunction *B) const +{ + // For complex-valued fields, output the separate real and imaginary parts for the time- + // harmonic quantity. For power flux (Poynting vector), output only the stationary real + // part and not the part which has double the frequency. + auto it = flux_surfs.find(idx); + MFEM_VERIFY(it != flux_surfs.end(), + "Unknown surface flux postprocessing index requested!"); + const bool has_imag = (E) ? E->HasImag() : B->HasImag(); + const auto &mesh = *h1_fespace.GetParMesh(); + int bdr_attr_max = mesh.bdr_attributes.Size() ? mesh.bdr_attributes.Max() : 0; + mfem::Array attr_marker = mesh::AttrToMarker(bdr_attr_max, it->second.attr_list); + auto f = + it->second.GetCoefficient(E ? &E->Real() : nullptr, B ? 
&B->Real() : nullptr, mat_op);
+  std::complex<double> dot(GetLocalSurfaceIntegral(*f, attr_marker), 0.0);
+  if (has_imag)
+  {
+    f = it->second.GetCoefficient(E ? &E->Imag() : nullptr, B ? &B->Imag() : nullptr,
+                                  mat_op);
+    double doti = GetLocalSurfaceIntegral(*f, attr_marker);
+    if (it->second.type == SurfaceFlux::POWER)
+    {
+      dot += doti;
+    }
+    else
+    {
+      dot.imag(doti);
+    }
+  }
+  Mpi::GlobalSum(1, &dot, (E) ? E->GetComm() : B->GetComm());
+  return dot;
+}
+
+double SurfacePostOperator::GetInterfaceLossTangent(int idx) const
+{
+  auto it = eps_surfs.find(idx);
+  MFEM_VERIFY(it != eps_surfs.end(),
+              "Unknown interface dielectric postprocessing index requested!");
+  return it->second.tandelta;
+}
+
+double SurfacePostOperator::GetInterfaceElectricFieldEnergy(int idx,
+                                                            const GridFunction &E) const
+{
+  auto it = eps_surfs.find(idx);
+  MFEM_VERIFY(it != eps_surfs.end(),
+              "Unknown interface dielectric postprocessing index requested!");
+  const auto &mesh = *h1_fespace.GetParMesh();
+  int bdr_attr_max = mesh.bdr_attributes.Size() ? mesh.bdr_attributes.Max() : 0;
+  mfem::Array<int> attr_marker = mesh::AttrToMarker(bdr_attr_max, it->second.attr_list);
+  auto f = it->second.GetCoefficient(E, mat_op);
+  double dot = GetLocalSurfaceIntegral(*f, attr_marker);
+  Mpi::GlobalSum(1, &dot, E.GetComm());
+  return dot;
+}
+
+double
+SurfacePostOperator::GetLocalSurfaceIntegral(mfem::Coefficient &f,
+                                             const mfem::Array<int> &attr_marker) const
+{
+  // Integrate the coefficient over the boundary attributes making up this surface index.
+  mfem::LinearForm s(&h1_fespace);
+  s.AddBoundaryIntegrator(new BoundaryLFIntegrator(f),
+                          const_cast<mfem::Array<int> &>(attr_marker));
+  s.UseFastAssembly(false);
+  s.UseDevice(false);
+  s.Assemble();
+  s.UseDevice(true);
+  return linalg::LocalSum(s);
+}
+
+std::vector<std::array<std::complex<double>, 3>> SurfacePostOperator::GetFarFieldrE(
+    const std::vector<std::pair<double, double>> &theta_phi_pairs, const GridFunction &E,
+    const GridFunction &B, double omega_re, double omega_im) const
+{
+  if (theta_phi_pairs.empty())
+    return {};
+  BlockTimer bt0(Timer::POSTPRO_FARFIELD);
+
+  // Compute target unit vectors from the given theta and phis.
+  std::vector> r_naughts;
+  r_naughts.reserve(theta_phi_pairs.size());
+  for (const auto &[theta, phi] : theta_phi_pairs)
+  {
+    r_naughts.emplace_back(std::array{
+        std::sin(theta) * std::cos(phi), std::sin(theta) * std::sin(phi), std::cos(theta)});
+  }
+
+  const auto &mesh = *nd_fespace.GetParMesh();
+  int bdr_attr_max = mesh.bdr_attributes.Size() ? mesh.bdr_attributes.Max() : 0;
+  mfem::Array<int> attr_marker = mesh::AttrToMarker(bdr_attr_max, farfield.attr_list);
+
+  // Integrate. Each MPI process computes its contribution and we will reduce
+  // everything at the end. We make them std::vector>
+  // because we want a very simple memory layout so that we can reduce
+  // everything with two MPI calls.
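+  // That is, for N requested (θ, φ) directions the real and imaginary accumulators are
+  // each a contiguous block of 3 * N doubles, so the reduction below is a single
+  // Mpi::GlobalSum call per block.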
+ std::vector> integrals_r(theta_phi_pairs.size()); + std::vector> integrals_i(theta_phi_pairs.size()); + + for (int i = 0; i < mesh.GetNBE(); i++) + { + if (!attr_marker[mesh.GetBdrAttribute(i) - 1]) + continue; + + auto *T = const_cast(mesh).GetBdrElementTransformation(i); + const auto *fe = nd_fespace.GetBE(i); + const auto *ir = + &mfem::IntRules.Get(fe->GetGeomType(), fem::DefaultIntegrationOrder::Get(*T)); + + AddStrattonChuIntegrandAtElement(E, B, mat_op, omega_re, omega_im, r_naughts, *T, *ir, + integrals_r, integrals_i); + } + + double *data_r_ptr = integrals_r.data()->data(); + double *data_i_ptr = integrals_i.data()->data(); + size_t total_elements = integrals_r.size() * 3; + Mpi::GlobalSum(total_elements, data_i_ptr, E.GetComm()); + Mpi::GlobalSum(total_elements, data_r_ptr, E.GetComm()); + + // Finally, we apply cross product to reduced integrals and package the result + // in a neatly accessible vector of arrays of complex numbers. + std::vector, 3>> result(theta_phi_pairs.size()); + StaticVector<3> tmp_r, tmp_i; + for (size_t k = 0; k < theta_phi_pairs.size(); k++) + { + linalg::Cross3(r_naughts[k], integrals_r[k], tmp_r); + linalg::Cross3(r_naughts[k], integrals_i[k], tmp_i); + for (size_t d = 0; d < 3; d++) + { + result[k][d] = std::complex{tmp_r[d], tmp_i[d]}; + } + } + return result; +} + +} // namespace palace diff --git a/palace/models/surfacepostoperator.hpp b/palace/models/surfacepostoperator.hpp index 0e0c8e8bf4..3f5bf3cf26 100644 --- a/palace/models/surfacepostoperator.hpp +++ b/palace/models/surfacepostoperator.hpp @@ -1,116 +1,122 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_MODELS_SURFACE_POST_OPERATOR_HPP -#define PALACE_MODELS_SURFACE_POST_OPERATOR_HPP - -#include -#include -#include -#include -#include "fem/coefficient.hpp" - -namespace palace -{ - -class IoData; -class MaterialOperator; - -namespace config -{ - -struct InterfaceDielectricData; -struct CapacitanceData; -struct InductanceData; - -} // namespace config - -// -// A class handling boundary surface postprocessing. -// -class SurfacePostOperator -{ -private: - // Mapping from surface index to data structure containing surface postprocessing - // information for surface loss, charge, or magnetic flux. 
- struct SurfaceData - { - mutable std::vector> attr_markers; - - virtual ~SurfaceData() = default; - - virtual std::unique_ptr - GetCoefficient(int i, const mfem::ParGridFunction &U, - const MaterialOperator &mat_op) const = 0; - }; - struct InterfaceDielectricData : public SurfaceData - { - DielectricInterfaceType type; - double epsilon, ts, tandelta; - std::vector sides; - - InterfaceDielectricData(const config::InterfaceDielectricData &data, - mfem::ParMesh &mesh); - - std::unique_ptr - GetCoefficient(int i, const mfem::ParGridFunction &U, - const MaterialOperator &mat_op) const override; - }; - struct SurfaceChargeData : public SurfaceData - { - SurfaceChargeData(const config::CapacitanceData &data, mfem::ParMesh &mesh); - - std::unique_ptr - GetCoefficient(int i, const mfem::ParGridFunction &U, - const MaterialOperator &mat_op) const override; - }; - struct SurfaceFluxData : public SurfaceData - { - mfem::Vector direction; - - SurfaceFluxData(const config::InductanceData &data, mfem::ParMesh &mesh); - - std::unique_ptr - GetCoefficient(int i, const mfem::ParGridFunction &U, - const MaterialOperator &mat_op) const override; - }; - std::map eps_surfs; - std::map charge_surfs; - std::map flux_surfs; - - // Reference to material property operator (not owned). - const MaterialOperator &mat_op; - - // Unit function used for computing surface integrals. - mfem::GridFunction ones; - - double GetLocalSurfaceIntegral(const SurfaceData &data, - const mfem::ParGridFunction &U) const; - -public: - SurfacePostOperator(const IoData &iodata, const MaterialOperator &mat_op, - mfem::ParFiniteElementSpace &h1_fespace); - - // Access data structures for the postprocessing surface with the given type. - const auto &GetEps() const { return eps_surfs; } - const auto &GetCap() const { return charge_surfs; } - const auto &GetInd() const { return flux_surfs; } - auto SizeEps() const { return eps_surfs.size(); } - auto SizeCap() const { return charge_surfs.size(); } - auto SizeInd() const { return flux_surfs.size(); } - - // Get surface integrals computing dielectric interface energy, surface charge, or - // surface magnetic flux. - double GetInterfaceLossTangent(int idx) const; - double GetInterfaceElectricFieldEnergy(int idx, - const mfem::ParComplexGridFunction &E) const; - double GetInterfaceElectricFieldEnergy(int idx, const mfem::ParGridFunction &E) const; - double GetSurfaceElectricCharge(int idx, const mfem::ParComplexGridFunction &E) const; - double GetSurfaceElectricCharge(int idx, const mfem::ParGridFunction &E) const; - double GetSurfaceMagneticFlux(int idx, const mfem::ParComplexGridFunction &B) const; - double GetSurfaceMagneticFlux(int idx, const mfem::ParGridFunction &B) const; -}; - -} // namespace palace - -#endif // PALACE_MODELS_SURFACE_POST_OPERATOR_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_MODELS_SURFACE_POST_OPERATOR_HPP +#define PALACE_MODELS_SURFACE_POST_OPERATOR_HPP + +#include +#include +#include +#include +#include "fem/coefficient.hpp" + +namespace palace +{ + +class GridFunction; +class IoData; +class MaterialOperator; + +namespace config +{ + +struct SurfaceFluxData; +struct InterfaceDielectricData; +struct FarFieldPostData; + +} // namespace config + +// +// A class handling boundary surface postprocessing. 
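+// Supported quantities are electric/magnetic/power flux through a boundary, interface
+// dielectric loss and energy participation, and far-field radiated field (rE) extraction
+// from boundary data.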
+// +class SurfacePostOperator +{ +private: + // Mapping from surface index to data structure containing surface postprocessing + // information for surface flux or interface dielectric participation. + struct SurfaceData + { + mfem::Array attr_list; + + virtual ~SurfaceData() = default; + }; + struct SurfaceFluxData : public SurfaceData + { + SurfaceFlux type; + bool two_sided; + mfem::Vector center; + + SurfaceFluxData(const config::SurfaceFluxData &data, const mfem::ParMesh &mesh, + const mfem::Array &bdr_attr_marker); + + std::unique_ptr GetCoefficient(const mfem::ParGridFunction *E, + const mfem::ParGridFunction *B, + const MaterialOperator &mat_op) const; + }; + struct InterfaceDielectricData : public SurfaceData + { + InterfaceDielectric type; + double t, epsilon, tandelta; + + InterfaceDielectricData(const config::InterfaceDielectricData &data, + const mfem::ParMesh &mesh, + const mfem::Array &bdr_attr_marker); + + std::unique_ptr GetCoefficient(const GridFunction &E, + const MaterialOperator &mat_op) const; + }; + struct FarFieldData : public SurfaceData + { + std::vector> thetaphis; + + FarFieldData() = default; + FarFieldData(const config::FarFieldPostData &data, const mfem::ParMesh &mesh, + const mfem::Array &bdr_attr_marker); + + size_t size() const { return thetaphis.size(); } + }; + + // Reference to material property operator (not owned). + const MaterialOperator &mat_op; + + // Reference to scalar finite element space used for computing surface integrals (not + // owned). + mfem::ParFiniteElementSpace &h1_fespace; + + // Reference to vector finite element space used for computing far-field integrals (not + // owned). + mfem::ParFiniteElementSpace &nd_fespace; + + double GetLocalSurfaceIntegral(mfem::Coefficient &f, + const mfem::Array &attr_marker) const; + +public: + // Data structures for postprocessing the surface with the given type. + std::map flux_surfs; + std::map eps_surfs; + FarFieldData farfield; + + SurfacePostOperator(const IoData &iodata, const MaterialOperator &mat_op, + mfem::ParFiniteElementSpace &h1_fespace, + mfem::ParFiniteElementSpace &nd_fespace); + + // Get surface integrals computing electric or magnetic field flux through a boundary. + std::complex GetSurfaceFlux(int idx, const GridFunction *E, + const GridFunction *B) const; + + // Batch version for multiple theta/phi pairs + std::vector, 3>> + GetFarFieldrE(const std::vector> &theta_phi_pairs, + const GridFunction &E, const GridFunction &B, double omega_re, + double omega_im) const; + + // Get surface integrals computing interface dielectric energy. + double GetInterfaceLossTangent(int idx) const; + double GetInterfaceElectricFieldEnergy(int idx, const GridFunction &E) const; + + int GetVDim() const { return mat_op.SpaceDimension(); }; +}; + +} // namespace palace + +#endif // PALACE_MODELS_SURFACE_POST_OPERATOR_HPP diff --git a/palace/models/timeoperator.cpp b/palace/models/timeoperator.cpp index 741b4ef97e..2d92a71aa4 100644 --- a/palace/models/timeoperator.cpp +++ b/palace/models/timeoperator.cpp @@ -1,238 +1,448 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 - -#include "timeoperator.hpp" - -#include -#include "linalg/iterative.hpp" -#include "linalg/jacobi.hpp" -#include "linalg/solver.hpp" -#include "models/spaceoperator.hpp" -#include "utils/communication.hpp" -#include "utils/iodata.hpp" - -namespace palace -{ - -namespace -{ - -class TimeDependentCurlCurlOperator : public mfem::SecondOrderTimeDependentOperator -{ -public: - // MPI communicator. - MPI_Comm comm; - - // System matrices and excitation RHS. - std::unique_ptr K, M, C; - Vector NegJ; - - // Time dependence of current pulse for excitation: -J'(t) = -g'(t) J. This function - // returns g'(t). - std::function &dJcoef; - - // Internal objects for solution of linear systems during time stepping. - double a0_, a1_; - std::unique_ptr kspM, kspA; - std::unique_ptr A, B; - mutable Vector RHS; - - // Bindings to SpaceOperator functions to get the system matrix and preconditioner, and - // construct the linear solver. - std::function ConfigureLinearSolver; - -public: - TimeDependentCurlCurlOperator(const IoData &iodata, SpaceOperator &spaceop, - std::function &djcoef, double t0, - mfem::TimeDependentOperator::Type type) - : mfem::SecondOrderTimeDependentOperator(spaceop.GetNDSpace().GetTrueVSize(), t0, type), - comm(spaceop.GetComm()), dJcoef(djcoef) - { - // Construct the system matrices defining the linear operator. PEC boundaries are - // handled simply by setting diagonal entries of the mass matrix for the corresponding - // dofs. Because the Dirichlet BC is always homogenous, no special elimination is - // required on the RHS. Diagonal entries are set in M (so M is non-singular). - K = spaceop.GetStiffnessMatrix(Operator::DIAG_ZERO); - C = spaceop.GetDampingMatrix(Operator::DIAG_ZERO); - M = spaceop.GetMassMatrix(Operator::DIAG_ONE); - - // Set up RHS vector for the current source term: -g'(t) J, where g(t) handles the time - // dependence. - spaceop.GetExcitationVector(NegJ); - RHS.SetSize(NegJ.Size()); - - // Set up linear solvers. - { - auto pcg = std::make_unique>(comm, 0); - pcg->SetInitialGuess(iodata.solver.linear.initial_guess); - pcg->SetRelTol(iodata.solver.linear.tol); - pcg->SetMaxIter(iodata.solver.linear.max_it); - auto jac = std::make_unique>(); - kspM = std::make_unique(std::move(pcg), std::move(jac)); - kspM->SetOperators(*M, *M); - } - { - // For explicit schemes, recommended to just use cheaper preconditioners. Otherwise, - // use AMS or a direct solver. The system matrix is formed as a sequence of matrix - // vector products, and is only assembled for preconditioning. - ConfigureLinearSolver = [this, &iodata, &spaceop](double a0, double a1) - { - // Configure the system matrix and also the matrix (matrices) from which the - // preconditioner will be constructed. - A = spaceop.GetSystemMatrix(a0, a1, 1.0, K.get(), C.get(), M.get()); - B = spaceop.GetPreconditionerMatrix(a0, a1, 1.0, 0.0); - - // Configure the solver. - if (!kspA) - { - kspA = std::make_unique(iodata, spaceop.GetNDSpaces(), - &spaceop.GetH1Spaces()); - } - kspA->SetOperators(*A, *B); - }; - } - } - - void FormRHS(const Vector &u, const Vector &du, Vector &rhs) const - { - // Multiply: rhs = -(K u + C du) - g'(t) J. - K->Mult(u, rhs); - if (C) - { - C->AddMult(du, rhs, 1.0); - } - linalg::AXPBYPCZ(-1.0, rhs, dJcoef(t), NegJ, 0.0, rhs); - } - - void Mult(const Vector &u, const Vector &du, Vector &ddu) const override - { - // Solve: M ddu = -(K u + C du) - g'(t) J. - if (kspM->NumTotalMult() == 0) - { - // Operators have already been set in constructor. 
- ddu = 0.0; - } - FormRHS(u, du, RHS); - kspM->Mult(RHS, ddu); - } - - void ImplicitSolve(const double a0, const double a1, const Vector &u, const Vector &du, - Vector &k) override - { - // Solve: (a0 K + a1 C + M) k = -(K u + C du) - g'(t) J, where a0 may be 0 in the - // explicit case. At first iteration, construct the solver. Also don't print a newline - // if already done by the mass matrix solve at the first iteration. - if (!kspA || a0 != a0_ || a1 != a1_) - { - // Configure the linear solver, including the system matrix and also the matrix - // (matrices) from which the preconditioner will be constructed. - ConfigureLinearSolver(a0, a1); - a0_ = a0; - a1_ = a1; - k = 0.0; - } - Mpi::Print("\n"); - FormRHS(u, du, RHS); - kspA->Mult(RHS, k); - } -}; - -} // namespace - -TimeOperator::TimeOperator(const IoData &iodata, SpaceOperator &spaceop, - std::function &djcoef) -{ - // Construct discrete curl matrix for B-field time integration. - Curl = &spaceop.GetCurlMatrix(); - - // Allocate space for solution vectors. - E.SetSize(Curl->Width()); - dE.SetSize(Curl->Width()); - En.SetSize(Curl->Width()); - B.SetSize(Curl->Height()); - - // Create ODE solver for 2nd-order IVP. - mfem::TimeDependentOperator::Type type = mfem::TimeDependentOperator::EXPLICIT; - switch (iodata.solver.transient.type) - { - case config::TransientSolverData::Type::GEN_ALPHA: - case config::TransientSolverData::Type::DEFAULT: - { - constexpr double rho_inf = 1.0; - ode = std::make_unique(rho_inf); - type = mfem::TimeDependentOperator::IMPLICIT; - } - break; - case config::TransientSolverData::Type::NEWMARK: - { - constexpr double beta = 0.25, gamma = 0.5; - ode = std::make_unique(beta, gamma); - type = mfem::TimeDependentOperator::IMPLICIT; - } - break; - case config::TransientSolverData::Type::CENTRAL_DIFF: - { - ode = std::make_unique(); - type = mfem::TimeDependentOperator::EXPLICIT; - } - break; - } - - // Set up time-dependent operator for 2nd-order curl-curl equation for E. - op = std::make_unique(iodata, spaceop, djcoef, 0.0, type); -} - -const KspSolver &TimeOperator::GetLinearSolver() const -{ - const auto &curlcurl = dynamic_cast(*op); - MFEM_VERIFY(curlcurl.kspA, - "No linear solver for time-depdendent operator has been constructed!\n"); - return *curlcurl.kspA; -} - -double TimeOperator::GetMaxTimeStep() const -{ - const auto &curlcurl = dynamic_cast(*op); - MPI_Comm comm = curlcurl.comm; - const Operator &M = *curlcurl.M; - const Operator &K = *curlcurl.K; - - // Solver for M⁻¹. - constexpr double lin_tol = 1.0e-9; - constexpr int max_lin_it = 500; - CgSolver pcg(comm, 0); - pcg.SetRelTol(lin_tol); - pcg.SetMaxIter(max_lin_it); - pcg.SetOperator(M); - JacobiSmoother jac; - jac.SetOperator(M); - pcg.SetPreconditioner(jac); - - // Power iteration to estimate largest eigenvalue of undamped system matrix M⁻¹ K. - ProductOperator op(pcg, K); - double lam = linalg::SpectralNorm(comm, op, false); - MFEM_VERIFY(lam > 0.0, "Error during power iteration, λ = " << lam << "!"); - return 2.0 / std::sqrt(lam); -} - -void TimeOperator::Init() -{ - // Always use zero initial conditions. - E = 0.0; - dE = 0.0; - B = 0.0; - ode->Init(*op); -} - -void TimeOperator::Step(double &t, double &dt) -{ - // Single time step for E-field. - En = E; - ode->Step(E, dE, t, dt); - - // Trapezoidal integration for B-field: dB/dt = -∇ x E. - En += E; - Curl->AddMult(En, B, -0.5 * dt); -} - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#include "timeoperator.hpp" + +#include +#include +#include "linalg/iterative.hpp" +#include "linalg/jacobi.hpp" +#include "linalg/solver.hpp" +#include "models/portexcitations.hpp" +#include "models/spaceoperator.hpp" +#include "utils/communication.hpp" +#include "utils/iodata.hpp" + +namespace palace +{ + +namespace +{ + +class TimeDependentFirstOrderOperator : public mfem::TimeDependentOperator +{ +public: + // MPI communicator. + MPI_Comm comm; + + // System matrices and excitation RHS. + std::unique_ptr K, M, C; + Vector NegJ; + + // Time dependence of current pulse for excitation: -J'(t) = -g'(t) J. This function + // returns g'(t). + std::function dJ_coef; + + // Internal objects for solution of linear systems during time stepping. + double dt_, saved_gamma; + std::unique_ptr kspM, kspA; + std::unique_ptr A, B; + mutable Vector RHS; + int size_E, size_B; + + const Operator &Curl; + + // Bindings to SpaceOperator functions to get the system matrix and preconditioner, and + // construct the linear solver. + std::function ConfigureLinearSolver; + +public: + TimeDependentFirstOrderOperator(const IoData &iodata, SpaceOperator &space_op, + std::function dJ_coef, double t0, + mfem::TimeDependentOperator::Type type) + : mfem::TimeDependentOperator(2 * space_op.GetNDSpace().GetTrueVSize() + + space_op.GetRTSpace().GetTrueVSize(), + t0, type), + comm(space_op.GetComm()), dJ_coef(dJ_coef), + size_E(space_op.GetNDSpace().GetTrueVSize()), + size_B(space_op.GetRTSpace().GetTrueVSize()), Curl(space_op.GetCurlMatrix()) + { + // Construct the system matrices defining the linear operator. PEC boundaries are + // handled simply by setting diagonal entries of the mass matrix for the corresponding + // dofs. Because the Dirichlet BC is always homogeneous, no special elimination is + // required on the RHS. Diagonal entries are set in M (so M is non-singular). + K = space_op.GetStiffnessMatrix(Operator::DIAG_ZERO); + C = space_op.GetDampingMatrix(Operator::DIAG_ZERO); + M = space_op.GetMassMatrix(Operator::DIAG_ONE); + + // Already asserted that only that time dependent solver only has a single excitation. + auto excitation_helper = space_op.GetPortExcitations(); + auto excitation_idx = excitation_helper.excitations.begin()->first; + // Set up RHS vector for the current source term: -g'(t) J, where g(t) handles the time + // dependence. + space_op.GetExcitationVector(excitation_idx, NegJ); + RHS.SetSize(2 * size_E + size_B); + RHS.UseDevice(true); + + // Set up linear solvers. + { + auto pcg = std::make_unique>(comm, 0); + pcg->SetInitialGuess(0); + pcg->SetRelTol(iodata.solver.linear.tol); + pcg->SetAbsTol(std::numeric_limits::epsilon()); + pcg->SetMaxIter(iodata.solver.linear.max_it); + auto jac = std::make_unique>(comm); + kspM = std::make_unique(std::move(pcg), std::move(jac)); + kspM->SetOperators(*M, *M); + } + { + // For explicit schemes, recommended to just use cheaper preconditioners. Otherwise, + // use AMS or a direct solver. The system matrix is formed as a sequence of matrix + // vector products, and is only assembled for preconditioning. + ConfigureLinearSolver = [this, &iodata, &space_op](double dt) + { + // Configure the system matrix and also the matrix (matrices) from which the + // preconditioner will be constructed. + A = space_op.GetSystemMatrix(dt * dt, dt, 1.0, K.get(), C.get(), M.get()); + B = space_op.GetPreconditionerMatrix(dt * dt, dt, 1.0, 0.0); + + // Configure the solver. 
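+        // The KSP solver is only constructed on the first call; subsequent calls just
+        // reset its operators to the newly assembled A and B for the current step size.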
+ if (!kspA) + { + kspA = std::make_unique(iodata, space_op.GetNDSpaces(), + &space_op.GetH1Spaces()); + } + kspA->SetOperators(*A, *B); + }; + } + } + + // Form the RHS for the first-order ODE system. + void FormRHS(const Vector &u, Vector &rhs) const + { + Vector u1, u2, u3, rhs1, rhs2, rhs3; + u1.UseDevice(true); + u2.UseDevice(true); + u3.UseDevice(true); + rhs1.UseDevice(true); + rhs2.UseDevice(true); + rhs3.UseDevice(true); + u.Read(); + u1.MakeRef(const_cast(u), 0, size_E); + u2.MakeRef(const_cast(u), size_E, size_E); + u3.MakeRef(const_cast(u), 2 * size_E, size_B); + rhs.ReadWrite(); + rhs1.MakeRef(rhs, 0, size_E); + rhs2.MakeRef(rhs, size_E, size_E); + rhs3.MakeRef(rhs, 2 * size_E, size_B); + + // u1 = Edot, u2 = E, u3 = B + // rhs1 = -(K * u2 + C * u1) - J(t) + // rhs2 = u1 + // rhs3 = -curl u2 + K->Mult(u2, rhs1); + if (C) + { + C->AddMult(u1, rhs1, 1.0); + } + linalg::AXPBYPCZ(-1.0, rhs1, dJ_coef(t), NegJ, 0.0, rhs1); + + rhs2 = u1; + + Curl.Mult(u2, rhs3); + rhs3 *= -1; + } + + // Solve M du = rhs + // |M 0 0| |du1| = |-(K * u2 + C * u1) - J(t) | + // |0 I 0| |du2| | u1 | + // |0 0 I| |du3| = |-curl u2 | + void Mult(const Vector &u, Vector &du) const override + { + if (kspM->NumTotalMult() == 0) + { + // Operators have already been set in constructor. + du = 0.0; + } + FormRHS(u, RHS); + + Vector du1, du2, du3, RHS1, RHS2, RHS3; + du1.UseDevice(true); + du2.UseDevice(true); + du3.UseDevice(true); + RHS1.UseDevice(true); + RHS2.UseDevice(true); + RHS3.UseDevice(true); + du.ReadWrite(); + du1.MakeRef(du, 0, size_E); + du2.MakeRef(du, size_E, size_E); + du3.MakeRef(du, 2 * size_E, size_B); + RHS.ReadWrite(); + RHS1.MakeRef(RHS, 0, size_E); + RHS2.MakeRef(RHS, size_E, size_E); + RHS3.MakeRef(RHS, 2 * size_E, size_B); + + kspM->Mult(RHS1, du1); + du2 = RHS2; + du3 = RHS3; + } + + void ImplicitSolve(double dt, const Vector &u, Vector &k) override + { + // Solve: M k = f(u + dt k, t) + // Use block elimination to avoid solving a 3n x 3n linear system. + if (!kspA || dt != dt_) + { + // Configure the linear solver, including the system matrix and also the matrix + // (matrices) from which the preconditioner will be constructed. + ConfigureLinearSolver(dt); + dt_ = dt; + k = 0.0; + } + Mpi::Print("\n"); + FormRHS(u, RHS); + + Vector k1, k2, k3, RHS1, RHS2, RHS3; + k1.UseDevice(true); + k2.UseDevice(true); + k3.UseDevice(true); + RHS1.UseDevice(true); + RHS2.UseDevice(true); + RHS3.UseDevice(true); + k.ReadWrite(); + k1.MakeRef(k, 0, size_E); + k2.MakeRef(k, size_E, size_E); + k3.MakeRef(k, 2 * size_E, size_B); + RHS.ReadWrite(); + RHS1.MakeRef(RHS, 0, size_E); + RHS2.MakeRef(RHS, size_E, size_E); + RHS3.MakeRef(RHS, 2 * size_E, size_B); + + // A k1 = RHS1 - dt K RHS2 + K->AddMult(RHS2, RHS1, -dt); + kspA->Mult(RHS1, k1); + + // k2 = rhs2 + dt k1 + linalg::AXPBYPCZ(1.0, RHS2, dt, k1, 0.0, k2); + + // k3 = rhs3 - dt curl k2 + k3 = RHS3; + Curl.AddMult(k2, k3, -dt); + } + + void ExplicitMult(const Vector &u, Vector &v) const override { Mult(u, v); } + + // Setup A = M - gamma J = M + gamma C + gamma^2 K + int SUNImplicitSetup(const Vector &y, const Vector &fy, int jok, int *jcur, + double gamma) override + { + // Update Jacobian matrix. + if (!kspA || gamma != saved_gamma) + { + ConfigureLinearSolver(gamma); + } + + // Indicate Jacobian was updated. + *jcur = 1; + + // Save gamma for use in solve. 
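+    // SUNImplicitSolve() below reuses this value in the block elimination, since the
+    // solve callback itself receives only (b, x, tol) and not the step scaling.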
+ saved_gamma = gamma; + + return 0; + } + + // Solve (Mass - dt Jacobian) x = Mass b + int SUNImplicitSolve(const Vector &b, Vector &x, double tol) override + { + Vector b1, b2, b3, x1, x2, x3, RHS1; + b1.UseDevice(true); + b2.UseDevice(true); + b3.UseDevice(true); + x1.UseDevice(true); + x2.UseDevice(true); + x3.UseDevice(true); + RHS1.UseDevice(true); + b.Read(); + b1.MakeRef(const_cast(b), 0, size_E); + b2.MakeRef(const_cast(b), size_E, size_E); + b3.MakeRef(const_cast(b), 2 * size_E, size_B); + x.ReadWrite(); + x1.MakeRef(x, 0, size_E); + x2.MakeRef(x, size_E, size_E); + x3.MakeRef(x, 2 * size_E, size_B); + RHS.ReadWrite(); + RHS1.MakeRef(RHS, 0, size_E); + + // A x1 = M b1 - dt K b2 + M->Mult(b1, RHS1); + K->AddMult(b2, RHS1, -saved_gamma); + kspA->Mult(RHS1, x1); + + // x2 = b2 + dt x1 + linalg::AXPBYPCZ(1.0, b2, saved_gamma, x1, 0.0, x2); + + // x3 = b3 - dt curl x2 + x3 = b3; + Curl.AddMult(x2, x3, -saved_gamma); + + return 0; + } +}; + +} // namespace + +TimeOperator::TimeOperator(const IoData &iodata, SpaceOperator &space_op, + std::function dJ_coef) + : rel_tol(iodata.solver.transient.rel_tol), abs_tol(iodata.solver.transient.abs_tol), + order(iodata.solver.transient.order) +{ + auto excitation_helper = space_op.GetPortExcitations(); + // Should have already asserted that time dependant solver only has a single excitation. + MFEM_VERIFY(excitation_helper.Size() == 1, + fmt::format("Transient evolution currently only allows for a single " + "excitation, received {}", + excitation_helper.Size())); + + // Get sizes. + int size_E = space_op.GetNDSpace().GetTrueVSize(); + int size_B = space_op.GetRTSpace().GetTrueVSize(); + + // Allocate space for solution vectors. + sol.SetSize(2 * size_E + size_B); + sol.UseDevice(true); + E.UseDevice(true); + B.UseDevice(true); + sol.ReadWrite(); + E.MakeRef(sol, size_E, size_E); + B.MakeRef(sol, 2 * size_E, size_B); + + // Create ODE solver for 1st-order IVP. + mfem::TimeDependentOperator::Type type = mfem::TimeDependentOperator::IMPLICIT; + op = std::make_unique(iodata, space_op, dJ_coef, 0.0, + type); + switch (iodata.solver.transient.type) + { + case TimeSteppingScheme::GEN_ALPHA: + { + constexpr double rho_inf = 1.0; + use_mfem_integrator = true; + ode = std::make_unique(rho_inf); + } + break; + case TimeSteppingScheme::RUNGE_KUTTA: + { + constexpr int gamma_opt = 2; + use_mfem_integrator = true; + ode = std::make_unique(gamma_opt); + } + break; + case TimeSteppingScheme::ARKODE: + { +#if defined(MFEM_USE_SUNDIALS) + // SUNDIALS ARKODE solver. + std::unique_ptr arkode; + arkode = std::make_unique(space_op.GetComm(), + mfem::ARKStepSolver::IMPLICIT); + // Initialize ARKODE. + arkode->Init(*op); + // Use implicit setup/solve defined in SUNImplicit*. + arkode->UseMFEMLinearSolver(); + // Implicit solve is linear and J is not time-dependent. + ARKodeSetLinear(arkode->GetMem(), 0); + // Relative and absolute tolerances. + arkode->SetSStolerances(rel_tol, abs_tol); + // Set the order of the RK scheme. + ARKodeSetOrder(arkode->GetMem(), order); + // Set the ODE solver to ARKODE. + ode = std::move(arkode); +#else + MFEM_ABORT("Solver was not built with SUNDIALS support, please choose a " + "different transient solver type!"); +#endif + } + break; + case TimeSteppingScheme::CVODE: + { +#if defined(MFEM_USE_SUNDIALS) + // SUNDIALS CVODE solver. + std::unique_ptr cvode; + cvode = std::make_unique(space_op.GetComm(), CV_BDF); + // Initialize CVODE. + cvode->Init(*op); + // Relative and absolute tolerances for time step control. 
+ cvode->SetSStolerances(rel_tol, abs_tol); + // Use implicit setup/solve defined in SUNImplicit*. + cvode->UseMFEMLinearSolver(); + // Set the max order of the multistep scheme. + // CV_BDF can go up to 5, but >= 3 is not unconditionally stable. + cvode->SetMaxOrder(order); + // Set the max number of steps allowed in one CVODE step() call. + cvode->SetMaxNSteps(10000); + // Set the ODE solver to CVODE. + ode = std::move(cvode); +#else + MFEM_ABORT("Solver was not built with SUNDIALS support, please choose a " + "different transient solver type!"); +#endif + } + break; + } +} + +const KspSolver &TimeOperator::GetLinearSolver() const +{ + const auto &first_order = dynamic_cast(*op); + MFEM_VERIFY(first_order.kspA, + "No linear solver for time-dependent operator has been constructed!\n"); + return *first_order.kspA; +} + +void TimeOperator::Init() +{ + // Always use zero initial conditions. + sol = 0.0; + if (use_mfem_integrator) + { + ode->Init(*op); + } +} + +void TimeOperator::Step(double &t, double &dt) +{ + double dt_input = dt; + ode->Step(sol, t, dt); + // Ensure user-specified dt does not change. + dt = dt_input; +} + +void TimeOperator::PrintStats() +{ +#if defined(MFEM_USE_SUNDIALS) + if (mfem::ARKStepSolver *arkode = dynamic_cast(ode.get())) + { + long int expsteps, accsteps, step_attempts, nfe_evals, nfi_evals, nlinsetups, netfails; + ARKStepGetTimestepperStats(arkode->GetMem(), &expsteps, &accsteps, &step_attempts, + &nfe_evals, &nfi_evals, &nlinsetups, &netfails); + + long int nniters; + ARKodeGetNumNonlinSolvIters(arkode->GetMem(), &nniters); + + Mpi::Print("\nARKODE time-stepper statistics\n"); + Mpi::Print(" Stability-limited steps: {:d}\n", expsteps); + Mpi::Print(" Accuracy-limited steps: {:d}\n", accsteps); + Mpi::Print(" Calls to explicit RHS function: {:d}\n", nfe_evals); + Mpi::Print(" Calls to implicit RHS function: {:d}\n", nfi_evals); + Mpi::Print(" Calls to linear solver setup function: {:d}\n", nlinsetups); + Mpi::Print(" Calls to linear solver solve function: {:d}\n", nniters); + Mpi::Print(" Number of error test failures: {:d}\n", netfails); + } + else if (mfem::CVODESolver *cvode = dynamic_cast(ode.get())) + { + long int nsteps, nfevals, nlinsetups, netfails; + int qlast, qcur; + double hinused, hlast, hcur, tcur; + + // Get integrator stats. + CVodeGetIntegratorStats(cvode->GetMem(), &nsteps, &nfevals, &nlinsetups, &netfails, + &qlast, &qcur, &hinused, &hlast, &hcur, &tcur); + Mpi::Print("\n CVODE time-stepper statistics\n"); + Mpi::Print(" Number of steps: {:d}\n", nsteps); + Mpi::Print(" Calls to RHS function: {:d}\n", nfevals); + Mpi::Print(" Calls to linear solver setup function: {:d}\n", nlinsetups); + Mpi::Print(" Number of error test failures: {:d}\n", netfails); + Mpi::Print("\n"); + } +#endif +} + +} // namespace palace diff --git a/palace/models/timeoperator.hpp b/palace/models/timeoperator.hpp index f43cb274df..806f522e44 100644 --- a/palace/models/timeoperator.hpp +++ b/palace/models/timeoperator.hpp @@ -1,66 +1,63 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_MODELS_TIME_OPERATOR_HPP -#define PALACE_MODELS_TIME_OPERATOR_HPP - -#include -#include -#include -#include "linalg/ksp.hpp" -#include "linalg/operator.hpp" -#include "linalg/vector.hpp" - -namespace palace -{ - -class IoData; -class SpaceOperator; - -// -// A class handling temporal discretization of the governing equations. -// -class TimeOperator -{ -private: - // Solution vector storage. 
- Vector E, dE, En, B; - - // Time integrator for the curl-curl E-field formulation. - std::unique_ptr ode; - - // Time-dependent operator for the E-field. - std::unique_ptr op; - - // Discrete curl for B-field time integration (not owned). - const Operator *Curl; - -public: - TimeOperator(const IoData &iodata, SpaceOperator &spaceop, - std::function &djcoef); - - // Access solution vectors for E- and B-fields. - const Vector &GetE() const { return E; } - const Vector &GetEdot() const { return dE; } - const Vector &GetB() const { return B; } - - // Return the linear solver associated with the implicit or explicit time integrator. - const KspSolver &GetLinearSolver() const; - - // Return if the time integration scheme explicit or implicit. - bool isExplicit() const { return op->isExplicit(); } - - // Estimate the maximum stable time step based on the maximum eigenvalue of the - // undamped system matrix M⁻¹ K. - double GetMaxTimeStep() const; - - // Initialize time integrators and set 0 initial conditions. - void Init(); - - // Perform time step from t -> t + dt. - void Step(double &t, double &dt); -}; - -} // namespace palace - -#endif // PALACE_MODELS_TIME_OPERATOR_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_MODELS_TIME_OPERATOR_HPP +#define PALACE_MODELS_TIME_OPERATOR_HPP + +#include +#include +#include +#include "linalg/ksp.hpp" +#include "linalg/operator.hpp" +#include "linalg/vector.hpp" + +namespace palace +{ + +class IoData; +class SpaceOperator; + +// +// A class handling temporal discretization of the governing equations. +// +class TimeOperator +{ +private: + // Solution vector storage. + Vector E, B, sol; + + // Time integrator for the first order ODE system. + std::unique_ptr ode; + + // Time-dependent operator for the Edot-E ODE system. + std::unique_ptr op; + + // Adaptive time-stepping parameters. + double rel_tol, abs_tol; + int order; + bool use_mfem_integrator = false; + +public: + TimeOperator(const IoData &iodata, SpaceOperator &space_op, + std::function dJ_coef); + + // Access solution vectors for E- and B-fields. + const Vector &GetE() const { return E; } + const Vector &GetB() const { return B; } + + // Return the linear solver associated with the implicit or explicit time integrator. + const KspSolver &GetLinearSolver() const; + + // Initialize time integrators and set 0 initial conditions. + void Init(); + + // Perform time step from t -> t + dt. + void Step(double &t, double &dt); + + // Print ODE integrator statistics. 
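+  // (Statistics are reported for the SUNDIALS ARKODE/CVODE integrators only; for the
+  // native MFEM schemes this is a no-op.)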
+ void PrintStats(); +}; + +} // namespace palace + +#endif // PALACE_MODELS_TIME_OPERATOR_HPP diff --git a/palace/models/waveportoperator.cpp b/palace/models/waveportoperator.cpp index 54fbd1c5cb..6640a69e2b 100644 --- a/palace/models/waveportoperator.cpp +++ b/palace/models/waveportoperator.cpp @@ -3,11 +3,10 @@ #include "waveportoperator.hpp" -#include -#include +#include +#include #include "fem/bilinearform.hpp" #include "fem/coefficient.hpp" -#include "fem/fespace.hpp" #include "fem/integrator.hpp" #include "linalg/arpack.hpp" #include "linalg/iterative.hpp" @@ -35,20 +34,25 @@ void GetEssentialTrueDofs(mfem::ParGridFunction &E0t, mfem::ParGridFunction &E0n mfem::ParGridFunction &port_E0t, mfem::ParGridFunction &port_E0n, mfem::ParTransferMap &port_nd_transfer, mfem::ParTransferMap &port_h1_transfer, - const mfem::Array &dbc_marker, + const mfem::Array &dbc_attr, mfem::Array &port_nd_dbc_tdof_list, mfem::Array &port_h1_dbc_tdof_list) { - mfem::ParFiniteElementSpace &nd_fespace = *E0t.ParFESpace(); - mfem::ParFiniteElementSpace &h1_fespace = *E0n.ParFESpace(); - mfem::ParFiniteElementSpace &port_nd_fespace = *port_E0t.ParFESpace(); - mfem::ParFiniteElementSpace &port_h1_fespace = *port_E0n.ParFESpace(); - - mfem::Array nd_dbc_tdof_list, h1_dbc_tdof_list; + auto &nd_fespace = *E0t.ParFESpace(); + auto &h1_fespace = *E0n.ParFESpace(); + auto &port_nd_fespace = *port_E0t.ParFESpace(); + auto &port_h1_fespace = *port_E0n.ParFESpace(); + const auto &mesh = *nd_fespace.GetParMesh(); + + mfem::Array dbc_marker, nd_dbc_tdof_list, h1_dbc_tdof_list; + mesh::AttrToMarker(mesh.bdr_attributes.Size() ? mesh.bdr_attributes.Max() : 0, dbc_attr, + dbc_marker); nd_fespace.GetEssentialTrueDofs(dbc_marker, nd_dbc_tdof_list); h1_fespace.GetEssentialTrueDofs(dbc_marker, h1_dbc_tdof_list); Vector tE0t(nd_fespace.GetTrueVSize()), tE0n(h1_fespace.GetTrueVSize()); + tE0t.UseDevice(true); + tE0n.UseDevice(true); tE0t = 0.0; tE0n = 0.0; linalg::SetSubVector(tE0t, nd_dbc_tdof_list, 1.0); @@ -60,386 +64,450 @@ void GetEssentialTrueDofs(mfem::ParGridFunction &E0t, mfem::ParGridFunction &E0n Vector port_tE0t(port_nd_fespace.GetTrueVSize()), port_tE0n(port_h1_fespace.GetTrueVSize()); + port_tE0t.UseDevice(true); + port_tE0n.UseDevice(true); port_E0t.ParallelProject(port_tE0t); port_E0n.ParallelProject(port_tE0n); - for (int i = 0; i < port_tE0t.Size(); i++) { - if (port_tE0t[i] != 0.0) + const auto *h_port_tE0t = port_tE0t.HostRead(); + const auto *h_port_tE0n = port_tE0n.HostRead(); + for (int i = 0; i < port_tE0t.Size(); i++) { - port_nd_dbc_tdof_list.Append(i); + if (h_port_tE0t[i] != 0.0) + { + port_nd_dbc_tdof_list.Append(i); + } } - } - for (int i = 0; i < port_tE0n.Size(); i++) - { - if (port_tE0n[i] != 0.0) + for (int i = 0; i < port_tE0n.Size(); i++) { - port_h1_dbc_tdof_list.Append(i); + if (h_port_tE0n[i] != 0.0) + { + port_h1_dbc_tdof_list.Append(i); + } } } } +void GetInitialSpace(const mfem::ParFiniteElementSpace &nd_fespace, + const mfem::ParFiniteElementSpace &h1_fespace, + const mfem::Array &dbc_tdof_list, ComplexVector &v) +{ + // Initial space which satisfies Dirichlet BCs. 
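+  // The whole vector is set to a constant, then the scalar (H1) block and the essential
+  // BC dofs are zeroed, leaving a nonzero tangential (ND) component to seed the
+  // eigensolver; the commented-out SetRandomReal call is an alternative choice.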
+ const int nd_size = nd_fespace.GetTrueVSize(), h1_size = h1_fespace.GetTrueVSize(); + v.SetSize(nd_size + h1_size); + v.UseDevice(true); + v = std::complex(1.0, 0.0); + // linalg::SetRandomReal(nd_fespace.GetComm(), v); + linalg::SetSubVector(v, nd_size, nd_size + h1_size, 0.0); + linalg::SetSubVector(v, dbc_tdof_list, 0.0); +} + +using ComplexHypreParMatrix = std::tuple, + std::unique_ptr>; constexpr bool skip_zeros = false; -std::unique_ptr GetBtt(const MaterialOperator &mat_op, - const mfem::ParFiniteElementSpace &nd_fespace) +ComplexHypreParMatrix GetAtt(const MaterialOperator &mat_op, + const FiniteElementSpace &nd_fespace, + const mfem::Vector &normal, double omega, double sigma) { - // Mass matrix: Bₜₜ = (μ⁻¹ u, v). - constexpr auto MatType = MaterialPropertyType::INV_PERMEABILITY; - constexpr auto ElemType = MeshElementType::BDR_SUBMESH; - MaterialPropertyCoefficient muinv_func(mat_op); - BilinearForm btt(nd_fespace); - btt.AddDomainIntegrator(muinv_func); - return std::make_unique(btt.FullAssemble(skip_zeros), nd_fespace); + // Stiffness matrix (shifted): Aₜₜ = (μ⁻¹ ∇ₜ x u, ∇ₜ x v) - ω² (ε u, v) - σ (μ⁻¹ u, v). + MaterialPropertyCoefficient muinv_func(mat_op.GetBdrAttributeToMaterial(), + mat_op.GetInvPermeability()); + muinv_func.NormalProjectedCoefficient(normal); + MaterialPropertyCoefficient epsilon_func(mat_op.GetBdrAttributeToMaterial(), + mat_op.GetPermittivityReal(), -omega * omega); + epsilon_func.AddCoefficient(mat_op.GetBdrAttributeToMaterial(), + mat_op.GetInvPermeability(), -sigma); + BilinearForm attr(nd_fespace); + attr.AddDomainIntegrator(muinv_func, epsilon_func); + + // Contribution for loss tangent: ε -> ε * (1 - i tan(δ)). + if (!mat_op.HasLossTangent()) + { + return {ParOperator(attr.FullAssemble(skip_zeros), nd_fespace).StealParallelAssemble(), + nullptr}; + } + MaterialPropertyCoefficient negepstandelta_func( + mat_op.GetBdrAttributeToMaterial(), mat_op.GetPermittivityImag(), -omega * omega); + BilinearForm atti(nd_fespace); + atti.AddDomainIntegrator(negepstandelta_func); + return {ParOperator(attr.FullAssemble(skip_zeros), nd_fespace).StealParallelAssemble(), + ParOperator(atti.FullAssemble(skip_zeros), nd_fespace).StealParallelAssemble()}; } -std::unique_ptr GetBtn(const MaterialOperator &mat_op, - const mfem::ParFiniteElementSpace &nd_fespace, - const mfem::ParFiniteElementSpace &h1_fespace) +ComplexHypreParMatrix GetAtn(const MaterialOperator &mat_op, + const FiniteElementSpace &nd_fespace, + const FiniteElementSpace &h1_fespace) { - // Mass matrix: Bₜₙ = (μ⁻¹ ∇ₜ u, v). - constexpr auto MatType = MaterialPropertyType::INV_PERMEABILITY; - constexpr auto ElemType = MeshElementType::BDR_SUBMESH; - MaterialPropertyCoefficient muinv_func(mat_op); - BilinearForm btn(h1_fespace, nd_fespace); - btn.AddDomainIntegrator(muinv_func); - return std::make_unique(btn.FullAssemble(skip_zeros), h1_fespace, nd_fespace, - false); + // Coupling matrix: Aₜₙ = -(μ⁻¹ ∇ₜ u, v). 
+ MaterialPropertyCoefficient muinv_func(mat_op.GetBdrAttributeToMaterial(), + mat_op.GetInvPermeability(), -1.0); + BilinearForm atn(h1_fespace, nd_fespace); + atn.AddDomainIntegrator(muinv_func); + return {ParOperator(atn.FullAssemble(skip_zeros), h1_fespace, nd_fespace, false) + .StealParallelAssemble(), + nullptr}; } -std::array, 3> -GetBnn(const MaterialOperator &mat_op, const mfem::ParFiniteElementSpace &h1_fespace) +ComplexHypreParMatrix GetAnt(const MaterialOperator &mat_op, + const FiniteElementSpace &h1_fespace, + const FiniteElementSpace &nd_fespace) { - // Mass matrix: Bₙₙ = (μ⁻¹ ∇ₜ u, ∇ₜ v) - ω² (ε u, v) = Bₙₙ₁ - ω² Bₙₙ₂. - constexpr auto MatTypeMuInv = MaterialPropertyType::INV_PERMEABILITY; - constexpr auto ElemType = MeshElementType::BDR_SUBMESH; - MaterialPropertyCoefficient muinv_func(mat_op); - BilinearForm bnn1(h1_fespace); - bnn1.AddDomainIntegrator(muinv_func); - - constexpr auto MatTypeEpsReal = MaterialPropertyType::PERMITTIVITY_REAL; - NormalProjectedCoefficient epsilon_func( - std::make_unique>(mat_op)); - BilinearForm bnn2r(h1_fespace); - bnn2r.AddDomainIntegrator(epsilon_func); + // Coupling matrix: Aₙₜ = -(ε u, ∇ₜ v). + MaterialPropertyCoefficient epsilon_func(mat_op.GetBdrAttributeToMaterial(), + mat_op.GetPermittivityReal(), 1.0); + + BilinearForm antr(nd_fespace, h1_fespace); + antr.AddDomainIntegrator(epsilon_func); // Contribution for loss tangent: ε -> ε * (1 - i tan(δ)). if (!mat_op.HasLossTangent()) { - return {std::make_unique(bnn1.FullAssemble(skip_zeros), h1_fespace), - std::make_unique(bnn2r.FullAssemble(skip_zeros), h1_fespace), + return {ParOperator(antr.FullAssemble(skip_zeros), nd_fespace, h1_fespace, false) + .StealParallelAssemble(), nullptr}; } - constexpr auto MatTypeEpsImag = MaterialPropertyType::PERMITTIVITY_IMAG; - NormalProjectedCoefficient negepstandelta_func( - std::make_unique>(mat_op)); - BilinearForm bnn2i(h1_fespace); - bnn2i.AddDomainIntegrator(negepstandelta_func); - return {std::make_unique(bnn1.FullAssemble(skip_zeros), h1_fespace), - std::make_unique(bnn2r.FullAssemble(skip_zeros), h1_fespace), - std::make_unique(bnn2i.FullAssemble(skip_zeros), h1_fespace)}; + MaterialPropertyCoefficient negepstandelta_func(mat_op.GetBdrAttributeToMaterial(), + mat_op.GetPermittivityImag(), 1.0); + BilinearForm anti(nd_fespace, h1_fespace); + anti.AddDomainIntegrator(negepstandelta_func); + return {ParOperator(antr.FullAssemble(skip_zeros), nd_fespace, h1_fespace, false) + .StealParallelAssemble(), + ParOperator(anti.FullAssemble(skip_zeros), nd_fespace, h1_fespace, false) + .StealParallelAssemble()}; } -std::array, 3> -GetAtt(const MaterialOperator &mat_op, const mfem::ParFiniteElementSpace &nd_fespace) +ComplexHypreParMatrix GetAnn(const MaterialOperator &mat_op, + const FiniteElementSpace &h1_fespace, + const mfem::Vector &normal) { - // Stiffness matrix: Aₜₜ = (μ⁻¹ ∇ₜ x u, ∇ₜ x v) - ω² (ε u, v) = Aₜₜ₁ - ω² Aₜₜ₂. - constexpr auto MatTypeMuInv = MaterialPropertyType::INV_PERMEABILITY; - constexpr auto ElemType = MeshElementType::BDR_SUBMESH; - NormalProjectedCoefficient muinv_func( - std::make_unique>(mat_op)); - BilinearForm att1(nd_fespace); - att1.AddDomainIntegrator(muinv_func); - - constexpr auto MatTypeEpsReal = MaterialPropertyType::PERMITTIVITY_REAL; - MaterialPropertyCoefficient epsilon_func(mat_op); - BilinearForm att2r(nd_fespace); - att2r.AddDomainIntegrator(epsilon_func); + // Mass matrix: Aₙₙ = -(ε u, v). 
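+  // Only the component of ε along the port normal enters the normal-normal block, hence
+  // the NormalProjectedCoefficient(normal) calls below.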
+ MaterialPropertyCoefficient epsilon_func(mat_op.GetBdrAttributeToMaterial(), + mat_op.GetPermittivityReal(), -1.0); + epsilon_func.NormalProjectedCoefficient(normal); + BilinearForm annr(h1_fespace); + annr.AddDomainIntegrator(epsilon_func); // Contribution for loss tangent: ε -> ε * (1 - i tan(δ)). if (!mat_op.HasLossTangent()) { - return {std::make_unique(att1.FullAssemble(skip_zeros), nd_fespace), - std::make_unique(att2r.FullAssemble(skip_zeros), nd_fespace), + return {ParOperator(annr.FullAssemble(skip_zeros), h1_fespace).StealParallelAssemble(), nullptr}; } - constexpr auto MatTypeEpsImag = MaterialPropertyType::PERMITTIVITY_IMAG; - MaterialPropertyCoefficient negepstandelta_func(mat_op); - BilinearForm att2i(nd_fespace); - att2i.AddDomainIntegrator(negepstandelta_func); - return {std::make_unique(att1.FullAssemble(skip_zeros), nd_fespace), - std::make_unique(att2r.FullAssemble(skip_zeros), nd_fespace), - std::make_unique(att2i.FullAssemble(skip_zeros), nd_fespace)}; + MaterialPropertyCoefficient negepstandelta_func(mat_op.GetBdrAttributeToMaterial(), + mat_op.GetPermittivityImag(), -1.0); + negepstandelta_func.NormalProjectedCoefficient(normal); + BilinearForm anni(h1_fespace); + anni.AddDomainIntegrator(negepstandelta_func); + return {ParOperator(annr.FullAssemble(skip_zeros), h1_fespace).StealParallelAssemble(), + ParOperator(anni.FullAssemble(skip_zeros), h1_fespace).StealParallelAssemble()}; } -std::array, 6> -GetSystemMatrices(std::unique_ptr Btt, std::unique_ptr Btn, - std::unique_ptr Bnn1, std::unique_ptr Bnn2r, - std::unique_ptr Bnn2i, std::unique_ptr Att1, - std::unique_ptr Att2r, std::unique_ptr Att2i, - const mfem::Array &nd_dbc_tdof_list, - const mfem::Array &h1_dbc_tdof_list) +ComplexHypreParMatrix GetBtt(const MaterialOperator &mat_op, + const FiniteElementSpace &nd_fespace) { - // Construct the 2x2 block matrices for the eigenvalue problem A e = λ B e. We pre-compute - // the matrices such that: - // A = A₁ - ω² A₂, B = A₁ - ω² A₂ + 1/Θ² B₃ - ω²/Θ² B₄. - std::unique_ptr BtnT(Btn->ParallelAssemble().Transpose()); - - mfem::Array2D blocks(2, 2); - blocks(0, 0) = &Btt->ParallelAssemble(); - blocks(0, 1) = &Btn->ParallelAssemble(); - blocks(1, 0) = BtnT.get(); - blocks(1, 1) = &Bnn1->ParallelAssemble(); - std::unique_ptr A1(mfem::HypreParMatrixFromBlocks(blocks)); - - auto &Ztt = Btt->ParallelAssemble(); - Ztt *= 0.0; - - blocks = nullptr; - blocks(0, 0) = &Ztt; - blocks(1, 1) = &Bnn2r->ParallelAssemble(); - std::unique_ptr A2r(mfem::HypreParMatrixFromBlocks(blocks)); - - std::unique_ptr A2i; - if (Bnn2i) - { - blocks(1, 1) = &Bnn2i->ParallelAssemble(); - A2i.reset(mfem::HypreParMatrixFromBlocks(blocks)); - } - - auto &Znn = Bnn1->ParallelAssemble(); - Znn *= 0.0; - - blocks = nullptr; - blocks(0, 0) = &Att1->ParallelAssemble(); - blocks(1, 1) = &Znn; - std::unique_ptr B3(mfem::HypreParMatrixFromBlocks(blocks)); - - blocks(0, 0) = &Att2r->ParallelAssemble(); - blocks(1, 1) = &Znn; - std::unique_ptr B4r(mfem::HypreParMatrixFromBlocks(blocks)); + // Mass matrix: Bₜₜ = (μ⁻¹ u, v). 
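+  // Bₜₜ supplies the (t,t) block of the right-hand side operator B in the generalized
+  // eigenvalue problem A e = λ B e (assembled in GetSystemMatrixB below).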
+ MaterialPropertyCoefficient muinv_func(mat_op.GetBdrAttributeToMaterial(), + mat_op.GetInvPermeability()); + BilinearForm btt(nd_fespace); + btt.AddDomainIntegrator(muinv_func); + return {ParOperator(btt.FullAssemble(skip_zeros), nd_fespace).StealParallelAssemble(), + nullptr}; +} - std::unique_ptr B4i; - if (Att2i) +ComplexHypreParMatrix +GetSystemMatrixA(const mfem::HypreParMatrix *Attr, const mfem::HypreParMatrix *Atti, + const mfem::HypreParMatrix *Atnr, const mfem::HypreParMatrix *Atni, + const mfem::HypreParMatrix *Antr, const mfem::HypreParMatrix *Anti, + const mfem::HypreParMatrix *Annr, const mfem::HypreParMatrix *Anni, + const mfem::Array &dbc_tdof_list) +{ + // Construct the 2x2 block matrices for the eigenvalue problem A e = λ B e. + mfem::Array2D blocks(2, 2); + blocks(0, 0) = Attr; + blocks(0, 1) = Atnr; + blocks(1, 0) = Antr; + blocks(1, 1) = Annr; + std::unique_ptr Ar(mfem::HypreParMatrixFromBlocks(blocks)); + + std::unique_ptr Ai; + if (Atti) { - blocks(0, 0) = &Att2i->ParallelAssemble(); - B4i.reset(mfem::HypreParMatrixFromBlocks(blocks)); + blocks(0, 0) = Atti; + blocks(0, 1) = Atni; + blocks(1, 0) = Anti; + blocks(1, 1) = Anni; + Ai.reset(mfem::HypreParMatrixFromBlocks(blocks)); } // Eliminate boundary true dofs not associated with this wave port or constrained by - // Dirichlet BCs. It is not guaranteed that any HypreParMatrix has a full diagonal in its - // sparsity pattern, so we add a zero diagonal before elimination to guarantee this for A1 - // and B3. - mfem::Array dbc_tdof_list; - int nd_tdof_offset = Btt->Height(); - dbc_tdof_list.Reserve(nd_dbc_tdof_list.Size() + h1_dbc_tdof_list.Size()); - for (auto tdof : nd_dbc_tdof_list) - { - dbc_tdof_list.Append(tdof); - } - for (auto tdof : h1_dbc_tdof_list) - { - dbc_tdof_list.Append(tdof + nd_tdof_offset); - } - - mfem::Vector d(B3->Height()); - d = 0.0; - mfem::SparseMatrix diag(d); - mfem::HypreParMatrix Diag(B3->GetComm(), B3->GetGlobalNumRows(), B3->GetRowStarts(), - &diag); - A1.reset(mfem::Add(1.0, *A1, 1.0, Diag)); - B3.reset(mfem::Add(1.0, *B3, 1.0, Diag)); - - A1->EliminateBC(dbc_tdof_list, Operator::DIAG_ZERO); - A2r->EliminateBC(dbc_tdof_list, Operator::DIAG_ZERO); - if (A2i) - { - A2i->EliminateBC(dbc_tdof_list, Operator::DIAG_ZERO); - } - B3->EliminateBC(dbc_tdof_list, Operator::DIAG_ONE); - B4r->EliminateBC(dbc_tdof_list, Operator::DIAG_ZERO); - if (B4i) + // Dirichlet BCs. + Ar->EliminateBC(dbc_tdof_list, Operator::DIAG_ONE); + if (Ai) { - B4i->EliminateBC(dbc_tdof_list, Operator::DIAG_ZERO); + Ai->EliminateBC(dbc_tdof_list, Operator::DIAG_ZERO); } - return {std::move(A1), std::move(A2r), std::move(A2i), - std::move(B3), std::move(B4r), std::move(B4i)}; + return {std::move(Ar), std::move(Ai)}; } -void GetInitialSpace(const mfem::ParFiniteElementSpace &nd_fespace, - const mfem::ParFiniteElementSpace &h1_fespace, - const mfem::Array &nd_dbc_tdof_list, - const mfem::Array &h1_dbc_tdof_list, ComplexVector &v) +ComplexHypreParMatrix GetSystemMatrixB(const mfem::HypreParMatrix *Bttr, + const mfem::HypreParMatrix *Btti, + const mfem::HypreParMatrix *Dnn, + const mfem::Array &dbc_tdof_list) { - // Initial space chosen as such that B v₀ = y₀, with y₀ = [y₀ₜ, 0, ... 0]ᵀ ⟂ null(A) - // (with Aₜₜ nonsingular). See Lee, Sun, and Cendes, 1991 for reference. - // Note: When the eigenvalue solver uses a standard ℓ²-inner product instead of B-inner - // product (since we use a general non-Hermitian solver due to complex symmetric B), then - // we just use v0 = y0 directly. 
- v.SetSize(nd_fespace.GetTrueVSize() + h1_fespace.GetTrueVSize()); - // linalg::SetRandomReal(nd_fespace.GetComm(), v); - v = std::complex(1.0, 0.0); - linalg::SetSubVector(v, nd_dbc_tdof_list, 0.0); - for (int i = nd_fespace.GetTrueVSize(); - i < nd_fespace.GetTrueVSize() + h1_fespace.GetTrueVSize(); i++) + // Construct the 2x2 block matrices for the eigenvalue problem A e = λ B e. + mfem::Array2D blocks(2, 2); + blocks(0, 0) = Bttr; + blocks(0, 1) = nullptr; + blocks(1, 0) = nullptr; + blocks(1, 1) = Dnn; + std::unique_ptr Br(mfem::HypreParMatrixFromBlocks(blocks)); + + std::unique_ptr Bi; + if (Btti) + { + blocks(0, 0) = Btti; + Bi.reset(mfem::HypreParMatrixFromBlocks(blocks)); + } + + // Eliminate boundary true dofs not associated with this wave port or constrained by + // Dirichlet BCs. + Br->EliminateBC(dbc_tdof_list, Operator::DIAG_ZERO); + if (Bi) { - v.Real()[i] = v.Imag()[i] = 0.0; + Bi->EliminateBC(dbc_tdof_list, Operator::DIAG_ZERO); } + + return {std::move(Br), std::move(Bi)}; } -void NormalizeWithSign(const mfem::ParGridFunction &S0t, mfem::ParComplexGridFunction &E0t, - mfem::ParComplexGridFunction &E0n, mfem::LinearForm &sr, - mfem::LinearForm &si) +void Normalize(const GridFunction &S0t, GridFunction &E0t, GridFunction &E0n, + mfem::LinearForm &sr, mfem::LinearForm &si) { // Normalize grid functions to a chosen polarization direction and unit power, |E x H⋆| ⋅ // n, integrated over the port surface (+n is the direction of propagation). The n x H // coefficients are updated implicitly as the only store references to the Et, En grid - // functions as well as kₙ, ω. We choose a (rather arbitrary) sign constraint to at least - // make results for the same port consistent between frequencies/meshes. - sr = 0.0; - si = 0.0; - sr.Assemble(); - si.Assemble(); - - // |E x H⋆| ⋅ n = |E ⋅ (-n x H⋆)| - double sign = sr * S0t; - std::complex dot(-(sr * E0t.real()) - (si * E0t.imag()), - -(sr * E0t.imag()) + (si * E0t.real())); - std::array data = {sign, dot.real(), dot.imag()}; - Mpi::GlobalSum(3, data.data(), S0t.ParFESpace()->GetComm()); - sign = (data[0] < 0.0) ? -1.0 : 1.0; - dot = {data[1], data[2]}; - - double scale = sign / std::sqrt(std::abs(dot)); - E0t.real() *= scale; // Updates the n x H coefficients depending on Et, En too - E0t.imag() *= scale; - E0n.real() *= scale; - E0n.imag() *= scale; - sr *= scale; // Update linear forms for postprocessing - si *= scale; - - // This parallel communication is not required since wave port boundaries are true - // one-sided boundaries. - // port_E0t->real().ExchangeFaceNbrData(); // Ready for parallel comm on shared faces - // port_E0t->imag().ExchangeFaceNbrData(); // for n x H coefficients evaluation - // port_E0n->real().ExchangeFaceNbrData(); - // port_E0n->imag().ExchangeFaceNbrData(); + // functions. We choose a (rather arbitrary) phase constraint to at least make results for + // the same port consistent between frequencies/meshes. + + // |E x H⋆| ⋅ n = |E ⋅ (-n x H⋆)|. This also updates the n x H coefficients depending on + // Et, En. Update linear forms for postprocessing too. 
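+  // The scale computed below is |dot[0]| / (dot[0] √|dot[1]|): its modulus 1/√|dot[1]|
+  // normalizes the mode to unit power |E x H⋆| ⋅ n, while its phase -arg(dot[0]) pins the
+  // mode's phase to the reference polarization direction S0t.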
+ std::complex dot[2] = { + {sr * S0t.Real(), si * S0t.Real()}, + {-(sr * E0t.Real()) - (si * E0t.Imag()), -(sr * E0t.Imag()) + (si * E0t.Real())}}; + Mpi::GlobalSum(2, dot, S0t.ParFESpace()->GetComm()); + auto scale = std::abs(dot[0]) / (dot[0] * std::sqrt(std::abs(dot[1]))); + ComplexVector::AXPBY(scale, E0t.Real(), E0t.Imag(), 0.0, E0t.Real(), E0t.Imag()); + ComplexVector::AXPBY(scale, E0n.Real(), E0n.Imag(), 0.0, E0n.Real(), E0n.Imag()); + ComplexVector::AXPBY(scale, sr, si, 0.0, sr, si); + + // This parallel communication is not required since wave port boundaries are true one- + // sided boundaries. + // E0t.Real().ExchangeFaceNbrData(); // Ready for parallel comm on shared faces for n x H + // E0t.Imag().ExchangeFaceNbrData(); // coefficients evaluation + // E0n.Real().ExchangeFaceNbrData(); + // E0n.Imag().ExchangeFaceNbrData(); } -// Computes boundary modal n x H, where +n is the direction of wave propagation: n x H = +// Helper for BdrSubmeshEVectorCoefficient and BdrSubmeshHVectorCoefficient. +enum class ValueType +{ + REAL, + IMAG +}; + +// Return as a vector coefficient the boundary mode electric field. +template +class BdrSubmeshEVectorCoefficient : public mfem::VectorCoefficient +{ +private: + const GridFunction &Et, &En; + const mfem::ParSubMesh &submesh; + const std::unordered_map &submesh_parent_elems; + mfem::IsoparametricTransformation T_loc; + const double scaling; + +public: + BdrSubmeshEVectorCoefficient(const GridFunction &Et, const GridFunction &En, + const mfem::ParSubMesh &submesh, + const std::unordered_map &submesh_parent_elems, + double scaling = 1.0) + : mfem::VectorCoefficient(Et.Real().VectorDim()), Et(Et), En(En), submesh(submesh), + submesh_parent_elems(submesh_parent_elems), scaling(scaling) + { + } + + void Eval(mfem::Vector &V, mfem::ElementTransformation &T, + const mfem::IntegrationPoint &ip) override + { + // Always do the GridFunction evaluation in the submesh. + mfem::ElementTransformation *T_submesh = nullptr; + if (T.mesh == submesh.GetParent()) + { + MFEM_ASSERT(T.ElementType == mfem::ElementTransformation::BDR_ELEMENT, + "BdrSubmeshEVectorCoefficient requires ElementType::BDR_ELEMENT when not " + "used on a SubMesh!"); + auto it = submesh_parent_elems.find(T.ElementNo); + if (it == submesh_parent_elems.end()) + { + // Just return zero for a parent boundary element not in the submesh. + V.SetSize(vdim); + V = 0.0; + return; + } + else + { + submesh.GetElementTransformation(it->second, &T_loc); + T_loc.SetIntPoint(&ip); + T_submesh = &T_loc; + } + } + else if (T.mesh == &submesh) + { + MFEM_ASSERT(T.ElementType == mfem::ElementTransformation::ELEMENT, + "BdrSubmeshEVectorCoefficient requires ElementType::ELEMENT when used on " + "a SubMesh!"); + T_submesh = &T; + } + else + { + MFEM_ABORT("Invalid mesh for BdrSubmeshEVectorCoefficient!"); + } + + // Compute Eₜ + n ⋅ Eₙ . The normal returned by GetNormal points out of the + // computational domain, so we reverse it (direction of propagation is into the domain). 
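+    // Concretely, with n_out the normal returned by GetNormal below, the evaluated field
+    // is Eₜ - Eₙ n_out, which equals Eₜ + Eₙ n for the reversed normal n pointing into
+    // the domain.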
+ double normal_data[3]; + mfem::Vector normal(normal_data, vdim); + BdrGridFunctionCoefficient::GetNormal(*T_submesh, normal); + if constexpr (Type == ValueType::REAL) + { + Et.Real().GetVectorValue(*T_submesh, ip, V); + auto Vn = En.Real().GetValue(*T_submesh, ip); + V.Add(-Vn, normal); + } + else + { + Et.Imag().GetVectorValue(*T_submesh, ip, V); + auto Vn = En.Imag().GetValue(*T_submesh, ip); + V.Add(-Vn, normal); + } + V *= scaling; + } +}; + +// Computes boundary mode n x H, where +n is the direction of wave propagation: n x H = // -1/(iωμ) (ikₙ Eₜ + ∇ₜ Eₙ), using the tangential and normal electric field component grid -// functions evaluated on the (single-sided) boundary element. The intent of this vector -// grid function is to be dotted with a function E which is only in the tangential -// component, so the fact that we use the full ∇ Eₙ in the element is fine. We use only the -// real part of kn. -template +// functions evaluated on the (single-sided) boundary element. +template class BdrSubmeshHVectorCoefficient : public mfem::VectorCoefficient { private: - const mfem::ParComplexGridFunction &Et, &En; + const GridFunction &Et, &En; const MaterialOperator &mat_op; - - mfem::ParSubMesh &submesh; - const mfem::ParMesh &parent; - std::unordered_map submesh_elem_ids; - + const mfem::ParSubMesh &submesh; + const std::unordered_map &submesh_parent_elems; + mfem::IsoparametricTransformation T_loc; std::complex kn; double omega; - mfem::ParSubMesh &GetSubMesh(mfem::ParMesh &mesh) - { - MFEM_ASSERT( - mfem::ParSubMesh::IsParSubMesh(&mesh), - "BdrSubmeshHVectorCoefficient requires the input grid function coefficients " - "to be defined on a SubMesh!"); - mfem::ParSubMesh &submesh = *static_cast(&mesh); - MFEM_ASSERT(submesh.GetFrom() == mfem::SubMesh::From::Boundary, - "BdrSubmeshHVectorCoefficient requires a SubMesh created using " - "CreateFromBoundary!"); - return submesh; - } - public: - BdrSubmeshHVectorCoefficient(const mfem::ParComplexGridFunction &Et, - const mfem::ParComplexGridFunction &En, - const MaterialOperator &mat_op) - : mfem::VectorCoefficient(Et.ParFESpace()->GetParMesh()->SpaceDimension()), Et(Et), - En(En), mat_op(mat_op), submesh(GetSubMesh(*Et.ParFESpace()->GetParMesh())), - parent(*submesh.GetParent()), kn(0.0), omega(0.0) + BdrSubmeshHVectorCoefficient(const GridFunction &Et, const GridFunction &En, + const MaterialOperator &mat_op, + const mfem::ParSubMesh &submesh, + const std::unordered_map &submesh_parent_elems, + std::complex kn, double omega) + : mfem::VectorCoefficient(Et.Real().VectorDim()), Et(Et), En(En), mat_op(mat_op), + submesh(submesh), submesh_parent_elems(submesh_parent_elems), kn(kn), omega(omega) { - // Construct mapping from parent (boundary) element indices to submesh (domain) - // elements. - const mfem::Array &parent_element_ids = submesh.GetParentElementIDMap(); - for (int i = 0; i < parent_element_ids.Size(); i++) - { - submesh_elem_ids[parent_element_ids[i]] = i; - } } void Eval(mfem::Vector &V, mfem::ElementTransformation &T, const mfem::IntegrationPoint &ip) override { - mfem::ElementTransformation *submesh_T = nullptr; - int attr = 0; - if (T.mesh == &parent) + // Always do the GridFunction evaluation in the submesh. 
+ mfem::ElementTransformation *T_submesh = nullptr; + if (T.mesh == submesh.GetParent()) { MFEM_ASSERT(T.ElementType == mfem::ElementTransformation::BDR_ELEMENT, "BdrSubmeshHVectorCoefficient requires ElementType::BDR_ELEMENT when not " "used on a SubMesh!"); - auto it = submesh_elem_ids.find(T.ElementNo); - if (it == submesh_elem_ids.end()) + auto it = submesh_parent_elems.find(T.ElementNo); + if (it == submesh_parent_elems.end()) { - // Just return zero for a boundary face not in the submesh. + // Just return zero for a parent boundary element not in the submesh. V.SetSize(vdim); V = 0.0; return; } else { - submesh_T = submesh.GetElementTransformation(it->second); - submesh_T->SetIntPoint(&ip); + submesh.GetElementTransformation(it->second, &T_loc); + T_loc.SetIntPoint(&ip); + T_submesh = &T_loc; } - - int i, o, iel1, iel2; - parent.GetBdrElementFace(T.ElementNo, &i, &o); - parent.GetFaceElements(i, &iel1, &iel2); - attr = parent.GetAttribute(iel1); } else if (T.mesh == &submesh) { MFEM_ASSERT(T.ElementType == mfem::ElementTransformation::ELEMENT, "BdrSubmeshHVectorCoefficient requires ElementType::ELEMENT when used on " "a SubMesh!"); - submesh_T = &T; - - int i, o, iel1, iel2; - parent.GetBdrElementFace(submesh.GetParentElementIDMap()[T.ElementNo], &i, &o); - parent.GetFaceElements(i, &iel1, &iel2); - attr = parent.GetAttribute(iel1); + T_submesh = &T; } else { - MFEM_ABORT("Invalid use of BdrSubmeshHVectorCoefficient on an unrecognized mesh!"); + MFEM_ABORT("Invalid mesh for BdrSubmeshHVectorCoefficient!"); } - // Compute Re/Im{-1/i (ikₙ Eₜ + ∇ₜ Eₙ)}. - mfem::Vector U; - if constexpr (RealPart) + // Get the attribute in the neighboring domain element of the parent mesh. + int attr = [&T, this]() + { + int i = -1, o, iel1, iel2; + if (T.mesh == submesh.GetParent()) + { + MFEM_ASSERT( + T.ElementType == mfem::ElementTransformation::BDR_ELEMENT, + "BdrSubmeshHVectorCoefficient requires ElementType::BDR_ELEMENT when not " + "used on a SubMesh!"); + T.mesh->GetBdrElementFace(T.ElementNo, &i, &o); + } + else if (T.mesh == &submesh) + { + MFEM_ASSERT(T.ElementType == mfem::ElementTransformation::ELEMENT, + "BdrSubmeshHVectorCoefficient requires ElementType::ELEMENT when used " + "on a SubMesh!"); + submesh.GetParent()->GetBdrElementFace(submesh.GetParentElementIDMap()[T.ElementNo], + &i, &o); + } + else + { + MFEM_ABORT("Invalid mesh for BdrSubmeshHVectorCoefficient!"); + } + submesh.GetParent()->GetFaceElements(i, &iel1, &iel2); + return submesh.GetParent()->GetAttribute(iel1); + }(); + + // Compute Re/Im{-1/i (ikₙ Eₜ + ∇ₜ Eₙ)} (t-gradient evaluated in boundary element). 
+ double U_data[3]; + mfem::Vector U(U_data, vdim); + if constexpr (Type == ValueType::REAL) { - Et.real().GetVectorValue(*submesh_T, ip, U); + Et.Real().GetVectorValue(*T_submesh, ip, U); U *= -kn.real(); - mfem::Vector dU; - En.imag().GetGradient(*submesh_T, dU); + double dU_data[3]; + mfem::Vector dU(dU_data, vdim); + En.Imag().GetGradient(*T_submesh, dU); U -= dU; } else { - Et.imag().GetVectorValue(*submesh_T, ip, U); + Et.Imag().GetVectorValue(*T_submesh, ip, U); U *= -kn.real(); - mfem::Vector dU; - En.real().GetGradient(*submesh_T, dU); + double dU_data[3]; + mfem::Vector dU(dU_data, vdim); + En.Real().GetGradient(*T_submesh, dU); U += dU; } @@ -448,154 +516,114 @@ class BdrSubmeshHVectorCoefficient : public mfem::VectorCoefficient mat_op.GetInvPermeability(attr).Mult(U, V); V *= (1.0 / omega); } - - void SetFrequency(double w, std::complex k) - { - omega = w; - kn = k; - } }; } // namespace -WavePortData::WavePortData(const config::WavePortData &data, const MaterialOperator &mat_op, - const mfem::ParFiniteElementSpace &nd_fespace, - const mfem::ParFiniteElementSpace &h1_fespace, - const mfem::Array &dbc_marker) +WavePortData::WavePortData(const config::WavePortData &data, + const config::SolverData &solver, const MaterialOperator &mat_op, + mfem::ParFiniteElementSpace &nd_fespace, + mfem::ParFiniteElementSpace &h1_fespace, + const mfem::Array &dbc_attr) + : mat_op(mat_op), excitation(data.excitation), active(data.active) { - excitation = data.excitation; mode_idx = data.mode_idx; d_offset = data.d_offset; + kn0 = 0.0; + omega0 = 0.0; // Construct the SubMesh. MFEM_VERIFY(!data.attributes.empty(), "Wave port boundary found with no attributes!"); - mfem::ParMesh &mesh = *nd_fespace.GetParMesh(); - attr_list.Reserve(data.attributes.size()); - for (auto attr : data.attributes) - { - attr_list.Append(attr); - } - mesh::AttrToMarker(nd_fespace.GetParMesh()->bdr_attributes.Size() - ? 
nd_fespace.GetParMesh()->bdr_attributes.Max() - : 0, - attr_list, attr_marker); - port_mesh = std::make_unique( - mfem::ParSubMesh::CreateFromBoundary(mesh, attr_list)); - - int p_nd = nd_fespace.GetMaxElementOrder(); - int p_h1 = h1_fespace.GetMaxElementOrder(); - port_nd_fec = std::make_unique(p_nd, mesh.Dimension() - 1); - port_h1_fec = std::make_unique(p_h1, mesh.Dimension() - 1); - port_nd_fespace = - std::make_unique(port_mesh.get(), port_nd_fec.get()); - port_h1_fespace = - std::make_unique(port_mesh.get(), port_h1_fec.get()); - - mfem::ParGridFunction E0t(const_cast(&nd_fespace)), - E0n(const_cast(&h1_fespace)); - port_E0t = std::make_unique(port_nd_fespace.get()); - port_E0n = std::make_unique(port_h1_fespace.get()); + const auto &mesh = *nd_fespace.GetParMesh(); + attr_list.Append(data.attributes.data(), data.attributes.size()); + port_mesh = std::make_unique(std::make_unique( + mfem::ParSubMesh::CreateFromBoundary(mesh, attr_list))); + port_normal = mesh::GetSurfaceNormal(*port_mesh); + + port_nd_fec = std::make_unique(nd_fespace.GetMaxElementOrder(), + port_mesh->Dimension()); + port_h1_fec = std::make_unique(h1_fespace.GetMaxElementOrder(), + port_mesh->Dimension()); + port_nd_fespace = std::make_unique(*port_mesh, port_nd_fec.get()); + port_h1_fespace = std::make_unique(*port_mesh, port_h1_fec.get()); + + GridFunction E0t(nd_fespace), E0n(h1_fespace); + port_E0t = std::make_unique(*port_nd_fespace, true); + port_E0n = std::make_unique(*port_h1_fespace, true); + port_E = std::make_unique(*port_nd_fespace, true); port_nd_transfer = std::make_unique( - mfem::ParSubMesh::CreateTransferMap(E0t, port_E0t->real())); + mfem::ParSubMesh::CreateTransferMap(E0t.Real(), port_E0t->Real())); port_h1_transfer = std::make_unique( - mfem::ParSubMesh::CreateTransferMap(E0n, port_E0n->real())); - - // Extract Dirichlet BC true dofs for the port FE spaces. - mfem::Array port_nd_dbc_tdof_list, port_h1_dbc_tdof_list; - GetEssentialTrueDofs(E0t, E0n, port_E0t->real(), port_E0n->real(), *port_nd_transfer, - *port_h1_transfer, dbc_marker, port_nd_dbc_tdof_list, - port_h1_dbc_tdof_list); - - // Construct operators for the generalized eigenvalue problem: - // [Aₜₜ 0] [eₜ] = -kₙ² [Bₜₜ Bₜₙ] [eₜ] - // [0 0] [eₙ] [Bₜₙᵀ Bₙₙ] [eₙ] - // for the wave port of the given index. The transformed variables are related to the true - // field by Eₜ = eₜ/kₙ and Eₙ = ieₙ. This is solved on the global mesh so the result is a - // grid function over the entire space, not just the port boundary (so that it can be - // queried from functions which use the global mesh). - // - // We will actually solve the shifted problem A e = λ B e, where: - // [Bₜₜ Bₜₙ] [eₜ] = λ [Bₜₜ + 1/Θ² Aₜₜ Bₜₙ] [eₜ] - // [Bₜₙᵀ Bₙₙ] [eₙ] [Bₜₙᵀ Bₙₙ] [eₙ] . - // Here we have λ = Θ²/(Θ²-kₙ²), where Θ² bounds the maximum kₙ² and is taken as Θ² = - // ω² μₘₐₓ εₘₐₓ over the entire simulation domain. - // Reference: Lee, Sun, and Cendes, Full-wave analysis of dielectric waveguides using - // tangential vector finite elements, IEEE Trans. Microwave Theory Tech. - // (1991). - double c_min = mfem::infinity(); - for (auto attr : mesh.attributes) - { - c_min = std::min(c_min, mat_op.GetLightSpeedMin(attr)); - } - MFEM_VERIFY(c_min > 0.0 && c_min < mfem::infinity(), - "Invalid material speed of light detected in WavePortOperator!"); - mu_eps_max = 1.0 / (c_min * c_min); + mfem::ParSubMesh::CreateTransferMap(E0n.Real(), port_E0n->Real())); - // Pre-compute problem matrices such that: - // A = A₁ - ω² A₂, B = A₁ - 1 / (μₘ εₘ) B₄ - ω² A₂ + 1/Θ² B₃ . 
+ // Construct mapping from parent (boundary) element indices to submesh (domain) + // elements. { - std::unique_ptr A1, B4r, B4i; + const auto &port_submesh = static_cast(port_mesh->Get()); + const mfem::Array &parent_elems = port_submesh.GetParentElementIDMap(); + for (int i = 0; i < parent_elems.Size(); i++) { - auto Btt = GetBtt(mat_op, *port_nd_fespace); - auto Btn = GetBtn(mat_op, *port_nd_fespace, *port_h1_fespace); - auto [Bnn1, Bnn2r, Bnn2i] = GetBnn(mat_op, *port_h1_fespace); - auto [Att1, Att2r, Att2i] = GetAtt(mat_op, *port_nd_fespace); - - auto system_mats = GetSystemMatrices( - std::move(Btt), std::move(Btn), std::move(Bnn1), std::move(Bnn2r), - std::move(Bnn2i), std::move(Att1), std::move(Att2r), std::move(Att2i), - port_nd_dbc_tdof_list, port_h1_dbc_tdof_list); - A1 = std::move(system_mats[0]); - A2r = std::move(system_mats[1]); - A2i = std::move(system_mats[2]); - B3 = std::move(system_mats[3]); - B4r = std::move(system_mats[4]); - B4i = std::move(system_mats[5]); + submesh_parent_elems[parent_elems[i]] = i; } + } - // Allocate storage for the eigenvalue problem operators. We have sparsity(A2) = - // sparsity(B3) = sparsity(B4) ⊆ sparsity(A1). Precompute the frequency independent - // contributions to A and B. - P = std::make_unique( - std::make_unique(*A1), nullptr); - if (A2i) + // Extract Dirichlet BC true dofs for the port FE spaces. + { + mfem::Array port_nd_dbc_tdof_list, port_h1_dbc_tdof_list; + GetEssentialTrueDofs(E0t.Real(), E0n.Real(), port_E0t->Real(), port_E0n->Real(), + *port_nd_transfer, *port_h1_transfer, dbc_attr, + port_nd_dbc_tdof_list, port_h1_dbc_tdof_list); + int nd_tdof_offset = port_nd_fespace->GetTrueVSize(); + port_dbc_tdof_list.Reserve(port_nd_dbc_tdof_list.Size() + port_h1_dbc_tdof_list.Size()); + for (auto tdof : port_nd_dbc_tdof_list) { - A = std::make_unique( - std::make_unique(*A1), - std::make_unique(*A2i)); - B = std::make_unique( - std::make_unique(*A1), - std::make_unique(*A2i)); - - auto &Br = *static_cast(B->Real()); - Br.Add(-1.0 / mu_eps_max, *B4r); - - auto &Ai = *static_cast(A->Imag()); - auto &Bi = *static_cast(B->Imag()); - Ai *= 0.0; - Bi *= 0.0; - Bi.Add(-1.0 / mu_eps_max, *B4i); + port_dbc_tdof_list.Append(tdof); } - else + for (auto tdof : port_h1_dbc_tdof_list) { - A = std::make_unique( - std::make_unique(*A1), nullptr); - B = std::make_unique( - std::make_unique(*A1), nullptr); - - auto &Br = *static_cast(B->Real()); - Br.Add(-1.0 / mu_eps_max, *B4r); + port_dbc_tdof_list.Append(tdof + nd_tdof_offset); } } - // Create vector for initial space for eigenvalue solves (for nullspace of [Aₜₜ 0] - // [0 0] ). - GetInitialSpace(*port_nd_fespace, *port_h1_fespace, port_nd_dbc_tdof_list, - port_h1_dbc_tdof_list, v0); - e0.SetSize(v0.Size()); - e0t.SetSize(port_nd_fespace->GetTrueVSize()); - e0n.SetSize(port_h1_fespace->GetTrueVSize()); + // Create vector for initial space for eigenvalue solves and eigenmode solution. + GetInitialSpace(*port_nd_fespace, *port_h1_fespace, port_dbc_tdof_list, v0); + e0.SetSize(port_nd_fespace->GetTrueVSize() + port_h1_fespace->GetTrueVSize()); + e0.UseDevice(true); + + // The operators for the generalized eigenvalue problem are: + // [Aₜₜ Aₜₙ] [eₜ] = -kₙ² [Bₜₜ 0ₜₙ] [eₜ] + // [Aₙₜ Aₙₙ] [eₙ] [0ₙₜ 0ₙₙ] [eₙ] + // for the wave port of the given index. The transformed variables are related to the true + // field by Eₜ = eₜ and Eₙ = eₙ / ikₙ. We will actually solve the shift-and-inverse + // problem (A - σ B)⁻¹ B e = λ e, with λ = 1 / (-kₙ² - σ). 
+ // Reference: Vardapetyan and Demkowicz, Full-wave analysis of dielectric waveguides at a + // given frequency, Math. Comput. (2003). + // See also: Halla and Monk, On the analysis of waveguide modes in an electromagnetic + // transmission line, arXiv:2302.11994 (2023). + const double c_min = mat_op.GetLightSpeedMax().Min(); + MFEM_VERIFY(c_min > 0.0 && c_min < mfem::infinity(), + "Invalid material speed of light detected in WavePortOperator!"); + mu_eps_max = 1.0 / (c_min * c_min) * 1.1; // Add a safety factor for maximum + // propagation constant possible + std::tie(Atnr, Atni) = GetAtn(mat_op, *port_nd_fespace, *port_h1_fespace); + std::tie(Antr, Anti) = GetAnt(mat_op, *port_h1_fespace, *port_nd_fespace); + std::tie(Annr, Anni) = GetAnn(mat_op, *port_h1_fespace, port_normal); + { + // The HypreParMatrix constructor from a SparseMatrix on each process does not copy + // the SparseMatrix data, but that's OK since this Dnn is copied in the block system + // matrix construction. + Vector d(port_h1_fespace->GetTrueVSize()); + d.UseDevice(false); // SparseMatrix constructor uses Vector on host + d = 0.0; + mfem::SparseMatrix diag(d); + auto Dnn = std::make_unique( + port_h1_fespace->GetComm(), port_h1_fespace->Get().GlobalTrueVSize(), + port_h1_fespace->Get().GetTrueDofOffsets(), &diag); + auto [Bttr, Btti] = GetBtt(mat_op, *port_nd_fespace); + auto [Br, Bi] = GetSystemMatrixB(Bttr.get(), Btti.get(), Dnn.get(), port_dbc_tdof_list); + opB = std::make_unique(std::move(Br), std::move(Bi)); + } // Configure a communicator for the processes which have elements for this port. MPI_Comm comm = nd_fespace.GetComm(); @@ -617,75 +645,119 @@ WavePortData::WavePortData(const config::WavePortData &data, const MaterialOpera { // Define the linear solver to be used for solving systems associated with the // generalized eigenvalue problem. 
- constexpr int ksp_print = 0; - constexpr double ksp_tol = 1.0e-8; - constexpr double ksp_max_it = 30; - auto gmres = std::make_unique>(port_comm, ksp_print); + auto gmres = std::make_unique>(port_comm, data.verbose); gmres->SetInitialGuess(false); - gmres->SetRelTol(ksp_tol); - gmres->SetMaxIter(ksp_max_it); - gmres->SetRestartDim(ksp_max_it); - // gmres->SetPrecSide(GmresSolver::PrecSide::RIGHT); + gmres->SetRelTol(data.ksp_tol); + gmres->SetMaxIter(data.ksp_max_its); + gmres->SetRestartDim(data.ksp_max_its); + // gmres->SetPrecSide(PreconditionerSide::RIGHT); - config::LinearSolverData::Type pc_type; + LinearSolver pc_type = solver.linear.type; + if (pc_type == LinearSolver::SUPERLU) + { +#if !defined(MFEM_USE_SUPERLU) + MFEM_ABORT("Solver was not built with SuperLU_DIST support, please choose a " + "different solver!"); +#endif + } + else if (pc_type == LinearSolver::STRUMPACK || pc_type == LinearSolver::STRUMPACK_MP) + { +#if !defined(MFEM_USE_STRUMPACK) + MFEM_ABORT("Solver was not built with STRUMPACK support, please choose a " + "different solver!"); +#endif + } + else if (pc_type == LinearSolver::MUMPS) + { +#if !defined(MFEM_USE_MUMPS) + MFEM_ABORT("Solver was not built with MUMPS support, please choose a " + "different solver!"); +#endif + } + else // Default choice + { #if defined(MFEM_USE_SUPERLU) - pc_type = config::LinearSolverData::Type::SUPERLU; + pc_type = LinearSolver::SUPERLU; #elif defined(MFEM_USE_STRUMPACK) - pc_type = config::LinearSolverData::Type::STRUMPACK; + pc_type = LinearSolver::STRUMPACK; #elif defined(MFEM_USE_MUMPS) - pc_type = config::LinearSolverData::Type::MUMPS; + pc_type = LinearSolver::MUMPS; #else #error "Wave port solver requires building with SuperLU_DIST, STRUMPACK, or MUMPS!" #endif - std::unique_ptr> pc; - if (pc_type == config::LinearSolverData::Type::SUPERLU) - { + } + auto pc = std::make_unique>( + [&]() -> std::unique_ptr + { + if (pc_type == LinearSolver::SUPERLU) + { #if defined(MFEM_USE_SUPERLU) - auto slu = std::make_unique( - port_comm, config::LinearSolverData::SymFactType::DEFAULT, false, ksp_print - 1); - // slu->GetSolver().SetColumnPermutation(mfem::superlu::NATURAL); - pc = std::make_unique>(std::move(slu)); + auto slu = std::make_unique( + port_comm, SymbolicFactorization::DEFAULT, false, true, data.verbose - 1); + // slu->GetSolver().SetColumnPermutation(mfem::superlu::MMD_AT_PLUS_A); + return slu; #endif - } - else if (pc_type == config::LinearSolverData::Type::STRUMPACK) - { + } + else if (pc_type == LinearSolver::STRUMPACK) + { #if defined(MFEM_USE_STRUMPACK) - auto strumpack = std::make_unique( - port_comm, config::LinearSolverData::SymFactType::DEFAULT, - config::LinearSolverData::CompressionType::NONE, 0.0, 0, 0, ksp_print - 1); - // strumpack->SetReorderingStrategy(strumpack::ReorderingStrategy::NATURAL); - pc = std::make_unique>(std::move(strumpack)); + auto strumpack = std::make_unique( + port_comm, SymbolicFactorization::DEFAULT, SparseCompression::NONE, 0.0, 0, + 0, true, data.verbose - 1); + // strumpack->SetReorderingStrategy(strumpack::ReorderingStrategy::AMD); + return strumpack; #endif - } - else // config::LinearSolverData::Type::MUMPS - { + } + else if (pc_type == LinearSolver::MUMPS) + { #if defined(MFEM_USE_MUMPS) - auto mumps = std::make_unique( - port_comm, mfem::MUMPSSolver::SYMMETRIC_INDEFINITE, - config::LinearSolverData::SymFactType::DEFAULT, 0.0, ksp_print - 1); - // mumps->SetReorderingStrategy(mfem::MUMPSSolver::AMD); - pc = std::make_unique>(std::move(mumps)); + auto mumps = std::make_unique( + 
port_comm, mfem::MUMPSSolver::UNSYMMETRIC, SymbolicFactorization::DEFAULT, + 0.0, true, data.verbose - 1); + // mumps->SetReorderingStrategy(mfem::MUMPSSolver::AMD); + return mumps; #endif - } + } + return {}; + }()); + pc->SetSaveAssembled(false); + pc->SetDropSmallEntries(false); ksp = std::make_unique(std::move(gmres), std::move(pc)); // Define the eigenvalue solver. constexpr int print = 0; - config::EigenSolverData::Type type; + EigenSolverBackend type = data.eigen_solver; + if (type == EigenSolverBackend::SLEPC) + { +#if !defined(PALACE_WITH_SLEPC) + MFEM_ABORT("Solver was not built with SLEPc support, please choose a " + "different solver!"); +#endif + } + else if (type == EigenSolverBackend::ARPACK) + { +#if !defined(PALACE_WITH_ARPACK) + MFEM_ABORT("Solver was not built with ARPACK support, please choose a " + "different solver!"); +#endif + } + else // Default choice + { #if defined(PALACE_WITH_SLEPC) - type = config::EigenSolverData::Type::SLEPC; + type = EigenSolverBackend::SLEPC; #elif defined(PALACE_WITH_ARPACK) - type = config::EigenSolverData::Type::ARPACK; + type = EigenSolverBackend::ARPACK; #else #error "Wave port solver requires building with ARPACK or SLEPc!" #endif - if (type == config::EigenSolverData::Type::ARPACK) + } + if (type == EigenSolverBackend::ARPACK) { #if defined(PALACE_WITH_ARPACK) eigen = std::make_unique(port_comm, print); #endif } - else // config::EigenSolverData::Type::SLEPC + else // EigenSolverBackend::SLEPC { #if defined(PALACE_WITH_SLEPC) auto slepc = std::make_unique(port_comm, print); @@ -694,33 +766,21 @@ WavePortData::WavePortData(const config::WavePortData &data, const MaterialOpera eigen = std::move(slepc); #endif } - constexpr double tol = 1.0e-6; eigen->SetNumModes(mode_idx, std::max(2 * mode_idx + 1, 5)); - eigen->SetTol(tol); - eigen->SetWhichEigenpairs(EigenvalueSolver::WhichType::LARGEST_MAGNITUDE); + eigen->SetTol(data.eig_tol); eigen->SetLinearSolver(*ksp); - } - // Coefficients store references to kₙ, ω so they are updated implicitly at each new - // solve. Also, μ⁻¹ is persistent, so no copy is OK. - kn0 = 0.0; - omega0 = 0.0; - port_nxH0r_func = - std::make_unique>(*port_E0t, *port_E0n, mat_op); - port_nxH0i_func = - std::make_unique>(*port_E0t, *port_E0n, mat_op); - port_sr = std::make_unique(port_nd_fespace.get()); - port_si = std::make_unique(port_nd_fespace.get()); - port_sr->AddDomainIntegrator(new VectorFEDomainLFIntegrator(*port_nxH0r_func)); - port_si->AddDomainIntegrator(new VectorFEDomainLFIntegrator(*port_nxH0i_func)); - port_sr->UseFastAssembly(false); - port_si->UseFastAssembly(false); + // We want to ignore evanescent modes (kₙ with large imaginary component). The + // eigenvalue 1 / (-kₙ² - σ) of the shifted problem will be a large-magnitude positive + // real number for an eigenvalue kₙ² with real part close to but not above the cutoff σ. + eigen->SetWhichEigenpairs(EigenvalueSolver::WhichType::LARGEST_REAL); + } // Configure port mode sign convention: 1ᵀ Re{-n x H} >= 0 on the "upper-right quadrant" // of the wave port boundary, in order to deal with symmetry effectively. 
{ Vector bbmin, bbmax; - port_mesh->GetBoundingBox(bbmin, bbmax); + mesh::GetAxisAlignedBoundingBox(*port_mesh, bbmin, bbmax); const int dim = port_mesh->SpaceDimension(); double la = 0.0, lb = 0.0; @@ -760,13 +820,16 @@ WavePortData::WavePortData(const config::WavePortData &data, const MaterialOpera } }; mfem::VectorFunctionCoefficient tfunc(dim, TDirection); - port_S0t = std::make_unique(port_nd_fespace.get()); - port_S0t->ProjectCoefficient(tfunc); + port_S0t = std::make_unique(*port_nd_fespace); + port_S0t->Real().ProjectCoefficient(tfunc); } } WavePortData::~WavePortData() { + // Free the solvers before the communicator on which they are based. + ksp.reset(); + eigen.reset(); if (port_comm != MPI_COMM_NULL) { MPI_Comm_free(&port_comm); @@ -780,37 +843,28 @@ void WavePortData::Initialize(double omega) return; } - // Use pre-computed matrices to construct and solve the generalized eigenvalue problem for - // the desired wave port mode. - double theta2 = mu_eps_max * omega * omega; + // Construct matrices and solve the generalized eigenvalue problem for the desired wave + // port mode. The B matrix is operating frequency-independent and has already been + // constructed. + std::unique_ptr opA; + const double sigma = -omega * omega * mu_eps_max; { - auto &Pr = *static_cast(P->Real()); - Pr *= 0.0; - - auto &Ar = *static_cast(A->Real()); - auto &Br = *static_cast(B->Real()); - Ar.Add(-omega * omega + omega0 * omega0, *A2r); - Br.Add(-omega * omega + omega0 * omega0, *A2r); - Br.Add(1.0 / theta2 - ((omega0 == 0.0) ? 0.0 : 1.0 / (mu_eps_max * omega0 * omega0)), - *B3); - Pr.Add(1.0, Br); - - if (A2i) - { - auto &Ai = *static_cast(A->Imag()); - auto &Bi = *static_cast(B->Imag()); - Ai.Add(-omega * omega + omega0 * omega0, *A2i); - Bi.Add(-omega * omega + omega0 * omega0, *A2i); - Pr.Add(1.0, Bi); - } + auto [Attr, Atti] = GetAtt(mat_op, *port_nd_fespace, port_normal, omega, sigma); + auto [Ar, Ai] = + GetSystemMatrixA(Attr.get(), Atti.get(), Atnr.get(), Atni.get(), Antr.get(), + Anti.get(), Annr.get(), Anni.get(), port_dbc_tdof_list); + opA = std::make_unique(std::move(Ar), std::move(Ai)); } - // Configure and solve the eigenvalue problem for the desired boundary mode. + // Configure and solve the (inverse) eigenvalue problem for the desired boundary mode. + // Linear solves are preconditioned with the real part of the system matrix (ignore loss + // tangent). std::complex lambda; if (port_comm != MPI_COMM_NULL) { - ksp->SetOperators(*B, *P); - eigen->SetOperators(*A, *B, EigenvalueSolver::ScaleType::NONE); + ComplexWrapperOperator opP(opA->Real(), nullptr); // Non-owning constructor + ksp->SetOperators(*opA, opP); + eigen->SetOperators(*opB, *opA, EigenvalueSolver::ScaleType::NONE); eigen->SetInitialSpace(v0); int num_conv = eigen->Solve(); MFEM_VERIFY(num_conv >= mode_idx, "Wave port eigensolver did not converge!"); @@ -819,100 +873,179 @@ void WavePortData::Initialize(double omega) // eigen->GetError(mode_idx - 1, EigenvalueSolver::ErrorType::BACKWARD), // eigen->GetError(mode_idx - 1, EigenvalueSolver::ErrorType::ABSOLUTE)); } - Mpi::Broadcast(1, &lambda, port_root, B3->GetComm()); + Mpi::Broadcast(1, &lambda, port_root, port_mesh->GetComm()); // Extract the eigenmode solution and postprocess. The extracted eigenvalue is λ = - // Θ² / (Θ² - kₙ²). - MFEM_VERIFY(lambda.real() > 1.0 / (1.0 - 1.0e-2), - "Computed wave port mode is or is very close to being evanescent " - << "(λ = " << lambda << ")!"); - kn0 = std::sqrt(theta2 - theta2 / lambda); + // 1 / (-kₙ² - σ). 
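A quick numerical sanity check of the relation above, with illustrative numbers only: since sigma = -omega^2 * mu_eps_max bounds -kn^2 from below for propagating modes, lambda = 1 / (-kn^2 - sigma) is a large positive real number (consistent with the LARGEST_REAL selection configured earlier), and kn is recovered as sqrt(-sigma - 1/lambda), as done just below.

#include <cassert>
#include <cmath>
#include <complex>
#include <initializer_list>

int main()
{
  const double omega = 2.0 * 3.141592653589793 * 5.0e9;  // illustrative frequency (rad/s)
  const double mu_eps_max = 1.1 / (2.0e8 * 2.0e8);        // 1/c_min^2 with safety factor
  const double sigma = -omega * omega * mu_eps_max;       // shift in (A - sigma B)^-1 B
  for (double kn2 : {0.9 * -sigma, 0.1 * -sigma})         // modes near and far from cutoff
  {
    const double lambda = 1.0 / (-kn2 - sigma);
    assert(lambda > 0.0);  // propagating modes map to large positive real eigenvalues
    const std::complex<double> kn = std::sqrt(std::complex<double>(-sigma - 1.0 / lambda));
    assert(std::abs(kn.real() * kn.real() - kn2) < 1.0e-6 * kn2);  // recovers kn^2
  }
  return 0;
}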
+ kn0 = std::sqrt(-sigma - 1.0 / lambda); omega0 = omega; - static_cast *>(port_nxH0r_func.get()) - ->SetFrequency(omega0, kn0); - static_cast *>(port_nxH0i_func.get()) - ->SetFrequency(omega0, kn0); // Separate the computed field out into eₜ and eₙ and and transform back to true - // electric field variables: Eₜ = eₜ/kₙ and Eₙ = ieₙ. - if (port_comm != MPI_COMM_NULL) + // electric field variables: Eₜ = eₜ and Eₙ = eₙ / ikₙ. { - Vector e0tr, e0ti, e0nr, e0ni; - eigen->GetEigenvector(mode_idx - 1, e0); - e0tr.MakeRef(e0.Real(), 0, e0t.Size()); - e0nr.MakeRef(e0.Real(), e0t.Size(), e0n.Size()); - e0ti.MakeRef(e0.Imag(), 0, e0t.Size()); - e0ni.MakeRef(e0.Imag(), e0t.Size(), e0n.Size()); - e0t.Real() = e0tr; - e0t.Imag() = e0ti; - e0n.Real() = e0nr; - e0n.Imag() = e0ni; - e0t *= 1.0 / kn0; - e0n *= 1i; + if (port_comm != MPI_COMM_NULL) + { + eigen->GetEigenvector(mode_idx - 1, e0); + linalg::NormalizePhase(port_comm, e0); + } + else + { + MFEM_ASSERT(e0.Size() == 0, + "Unexpected non-empty port FE space in wave port boundary mode solve!"); + } + e0.Real().Read(); // Ensure memory is allocated on device before aliasing + e0.Imag().Read(); + Vector e0tr(e0.Real(), 0, port_nd_fespace->GetTrueVSize()); + Vector e0nr(e0.Real(), port_nd_fespace->GetTrueVSize(), + port_h1_fespace->GetTrueVSize()); + Vector e0ti(e0.Imag(), 0, port_nd_fespace->GetTrueVSize()); + Vector e0ni(e0.Imag(), port_nd_fespace->GetTrueVSize(), + port_h1_fespace->GetTrueVSize()); + e0tr.UseDevice(true); + e0nr.UseDevice(true); + e0ti.UseDevice(true); + e0ni.UseDevice(true); + ComplexVector::AXPBY(1.0 / (1i * kn0), e0nr, e0ni, 0.0, e0nr, e0ni); + port_E0t->Real().SetFromTrueDofs(e0tr); // Parallel distribute + port_E0t->Imag().SetFromTrueDofs(e0ti); + port_E0n->Real().SetFromTrueDofs(e0nr); + port_E0n->Imag().SetFromTrueDofs(e0ni); } - else + + // Configure the linear forms for computing S-parameters (projection of the field onto the + // port mode). Normalize the mode for a chosen polarization direction and unit power, + // |E x H⋆| ⋅ n, integrated over the port surface (+n is the direction of propagation). { - MFEM_ASSERT(e0.Size() == 0 && e0t.Size() == 0 && e0n.Size() == 0, - "Unexpected non-empty port FE space in wave port boundary mode solve!"); + const auto &port_submesh = static_cast(port_mesh->Get()); + BdrSubmeshHVectorCoefficient port_nxH0r_func( + *port_E0t, *port_E0n, mat_op, port_submesh, submesh_parent_elems, kn0, omega0); + BdrSubmeshHVectorCoefficient port_nxH0i_func( + *port_E0t, *port_E0n, mat_op, port_submesh, submesh_parent_elems, kn0, omega0); + { + port_sr = std::make_unique(&port_nd_fespace->Get()); + port_sr->AddDomainIntegrator(new VectorFEDomainLFIntegrator(port_nxH0r_func)); + port_sr->UseFastAssembly(false); + port_sr->UseDevice(false); + port_sr->Assemble(); + port_sr->UseDevice(true); + } + { + port_si = std::make_unique(&port_nd_fespace->Get()); + port_si->AddDomainIntegrator(new VectorFEDomainLFIntegrator(port_nxH0i_func)); + port_si->UseFastAssembly(false); + port_si->UseDevice(false); + port_si->Assemble(); + port_si->UseDevice(true); + } + Normalize(*port_S0t, *port_E0t, *port_E0n, *port_sr, *port_si); } - port_E0t->real().SetFromTrueDofs(e0t.Real()); // Parallel distribute - port_E0t->imag().SetFromTrueDofs(e0t.Imag()); - port_E0n->real().SetFromTrueDofs(e0n.Real()); - port_E0n->imag().SetFromTrueDofs(e0n.Imag()); - - // Normalize the mode for a chosen polarization direction and unit power, |E x H⋆| ⋅ n, - // integrated over the port surface (+n is the direction of propagation). 
- NormalizeWithSign(*port_S0t, *port_E0t, *port_E0n, *port_sr, *port_si); +} + +std::unique_ptr +WavePortData::GetModeExcitationCoefficientReal() const +{ + const auto &port_submesh = static_cast(port_mesh->Get()); + return std::make_unique< + RestrictedVectorCoefficient>>( + attr_list, *port_E0t, *port_E0n, mat_op, port_submesh, submesh_parent_elems, kn0, + omega0); +} + +std::unique_ptr +WavePortData::GetModeExcitationCoefficientImag() const +{ + const auto &port_submesh = static_cast(port_mesh->Get()); + return std::make_unique< + RestrictedVectorCoefficient>>( + attr_list, *port_E0t, *port_E0n, mat_op, port_submesh, submesh_parent_elems, kn0, + omega0); +} + +std::unique_ptr +WavePortData::GetModeFieldCoefficientReal(double scaling) const +{ + const auto &port_submesh = static_cast(port_mesh->Get()); + return std::make_unique< + RestrictedVectorCoefficient>>( + attr_list, *port_E0t, *port_E0n, port_submesh, submesh_parent_elems, scaling); +} + +std::unique_ptr +WavePortData::GetModeFieldCoefficientImag(double scaling) const +{ + const auto &port_submesh = static_cast(port_mesh->Get()); + return std::make_unique< + RestrictedVectorCoefficient>>( + attr_list, *port_E0t, *port_E0n, port_submesh, submesh_parent_elems, scaling); } double WavePortData::GetExcitationPower() const { // The computed port modes are normalized such that the power integrated over the port is // 1: ∫ (E_inc x H_inc⋆) ⋅ n dS = 1. - return excitation ? 1.0 : 0.0; + return HasExcitation() ? 1.0 : 0.0; } -std::complex WavePortData::GetSParameter(mfem::ParComplexGridFunction &E) const +std::complex WavePortData::GetPower(GridFunction &E, GridFunction &B) const { - // Compute port S-parameter, or the projection of the field onto the port mode: - // (E x H_inc⋆) ⋅ n = E ⋅ (-n x H_inc⋆), integrated over the port surface. - mfem::ParComplexGridFunction port_E(port_nd_fespace.get()); - port_nd_transfer->Transfer(E.real(), port_E.real()); - port_nd_transfer->Transfer(E.imag(), port_E.imag()); - std::complex dot(-((*port_sr) * port_E.real()) - ((*port_si) * port_E.imag()), - -((*port_sr) * port_E.imag()) + ((*port_si) * port_E.real())); - Mpi::GlobalSum(1, &dot, port_nd_fespace->GetComm()); + // Compute port power, (E x H) ⋅ n = E ⋅ (-n x H), integrated over the port surface using + // the computed E and H = μ⁻¹ B fields, where +n is the direction of propagation (into the + // domain). The BdrSurfaceCurrentVectorCoefficient computes -n x H for an outward normal, + // so we multiply by -1. The linear form is reconstructed from scratch each time due to + // changing H. + MFEM_VERIFY(E.HasImag() && B.HasImag(), + "Wave ports expect complex-valued E and B fields in port power " + "calculation!"); + auto &nd_fespace = *E.ParFESpace(); + const auto &mesh = *nd_fespace.GetParMesh(); + BdrSurfaceCurrentVectorCoefficient nxHr_func(B.Real(), mat_op); + BdrSurfaceCurrentVectorCoefficient nxHi_func(B.Imag(), mat_op); + int bdr_attr_max = mesh.bdr_attributes.Size() ? 
mesh.bdr_attributes.Max() : 0; + mfem::Array attr_marker = mesh::AttrToMarker(bdr_attr_max, attr_list); + std::complex dot; + { + mfem::LinearForm pr(&nd_fespace); + pr.AddBoundaryIntegrator(new VectorFEBoundaryLFIntegrator(nxHr_func), attr_marker); + pr.UseFastAssembly(false); + pr.UseDevice(false); + pr.Assemble(); + pr.UseDevice(true); + dot = -(pr * E.Real()) - 1i * (pr * E.Imag()); + } + { + mfem::LinearForm pi(&nd_fespace); + pi.AddBoundaryIntegrator(new VectorFEBoundaryLFIntegrator(nxHi_func), attr_marker); + pi.UseFastAssembly(false); + pi.UseDevice(false); + pi.Assemble(); + pi.UseDevice(true); + dot += -(pi * E.Imag()) + 1i * (pi * E.Real()); + } + Mpi::GlobalSum(1, &dot, nd_fespace.GetComm()); return dot; } -std::complex WavePortData::GetPower(mfem::ParComplexGridFunction &E, - mfem::ParComplexGridFunction &B, - const MaterialOperator &mat_op) const +std::complex WavePortData::GetSParameter(GridFunction &E) const { - // Compute port power, (E x H) ⋅ n = E ⋅ (-n x H), integrated over the port surface - // using the computed E and H = μ⁻¹ B fields. The linear form is reconstructed from - // scratch each time due to changing H. The BdrCurrentVectorCoefficient computes -n x H, - // where n is an outward normal. - auto &nd_fespace = *E.ParFESpace(); - BdrCurrentVectorCoefficient nxHr_func(B.real(), mat_op); - BdrCurrentVectorCoefficient nxHi_func(B.imag(), mat_op); - mfem::LinearForm pr(&nd_fespace), pi(&nd_fespace); - pr.AddBoundaryIntegrator(new VectorFEBoundaryLFIntegrator(nxHr_func), attr_marker); - pi.AddBoundaryIntegrator(new VectorFEBoundaryLFIntegrator(nxHi_func), attr_marker); - pr.UseFastAssembly(false); - pi.UseFastAssembly(false); - pr.Assemble(); - pi.Assemble(); - std::complex dot(-(pr * E.real()) - (pi * E.imag()), - -(pr * E.imag()) + (pi * E.real())); - Mpi::GlobalSum(1, &dot, nd_fespace.GetComm()); + // Compute port S-parameter, or the projection of the field onto the port mode: + // (E x H_inc⋆) ⋅ n = E ⋅ (-n x H_inc⋆), integrated over the port surface. + MFEM_VERIFY(E.HasImag(), + "Wave ports expect complex-valued E and B fields in port S-parameter " + "calculation!"); + port_nd_transfer->Transfer(E.Real(), port_E->Real()); + port_nd_transfer->Transfer(E.Imag(), port_E->Imag()); + std::complex dot(-((*port_sr) * port_E->Real()) - ((*port_si) * port_E->Imag()), + -((*port_sr) * port_E->Imag()) + ((*port_si) * port_E->Real())); + Mpi::GlobalSum(1, &dot, port_nd_fespace->GetComm()); return dot; } -WavePortOperator::WavePortOperator(const IoData &iod, const MaterialOperator &mat, - const mfem::ParFiniteElementSpace &nd_fespace, - const mfem::ParFiniteElementSpace &h1_fespace) - : iodata(iod), mat_op(mat), suppress_output(false) +WavePortOperator::WavePortOperator(const IoData &iodata, const MaterialOperator &mat_op, + mfem::ParFiniteElementSpace &nd_fespace, + mfem::ParFiniteElementSpace &h1_fespace) + : suppress_output(false), + fc(iodata.units.Dimensionalize(1.0)), + kc(1.0 / iodata.units.Dimensionalize(1.0)) { // Set up wave port boundary conditions. 
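As a scalar aside on the pairings used in GetPower and GetSParameter above: the split into four real dot products is just the real/imaginary expansion of E . conj(-(n x H)), with the two real linear forms standing in for the real and imaginary parts of n x H. A check with made-up scalar values:

#include <cassert>
#include <complex>

int main()
{
  const std::complex<double> E(0.3, -1.2), nxH(0.8, 0.4);  // hypothetical samples
  const std::complex<double> direct = E * std::conj(-nxH);
  const double re = -(nxH.real() * E.real()) - (nxH.imag() * E.imag());
  const double im = -(nxH.real() * E.imag()) + (nxH.imag() * E.real());
  assert(std::abs(direct - std::complex<double>(re, im)) < 1.0e-14);
  return 0;
}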
MFEM_VERIFY(nd_fespace.GetParMesh() == h1_fespace.GetParMesh(), @@ -921,20 +1054,20 @@ WavePortOperator::WavePortOperator(const IoData &iod, const MaterialOperator &ma PrintBoundaryInfo(iodata, *nd_fespace.GetParMesh()); } -void WavePortOperator::SetUpBoundaryProperties( - const IoData &iodata, const MaterialOperator &mat_op, - const mfem::ParFiniteElementSpace &nd_fespace, - const mfem::ParFiniteElementSpace &h1_fespace) +void WavePortOperator::SetUpBoundaryProperties(const IoData &iodata, + const MaterialOperator &mat_op, + mfem::ParFiniteElementSpace &nd_fespace, + mfem::ParFiniteElementSpace &h1_fespace) { // Check that wave port boundary attributes have been specified correctly. - int bdr_attr_max = nd_fespace.GetParMesh()->bdr_attributes.Size() - ? nd_fespace.GetParMesh()->bdr_attributes.Max() - : 0; + const auto &mesh = *nd_fespace.GetParMesh(); + int bdr_attr_max = mesh.bdr_attributes.Size() ? mesh.bdr_attributes.Max() : 0; if (!iodata.boundaries.waveport.empty()) { - mfem::Array bdr_attr_marker(bdr_attr_max); + mfem::Array bdr_attr_marker(bdr_attr_max), port_marker(bdr_attr_max); bdr_attr_marker = 0; - for (auto attr : nd_fespace.GetParMesh()->bdr_attributes) + port_marker = 0; + for (auto attr : mesh.bdr_attributes) { bdr_attr_marker[attr - 1] = 1; } @@ -947,6 +1080,9 @@ void WavePortOperator::SetUpBoundaryProperties( "boundaries in the mesh!"); MFEM_VERIFY(bdr_attr_marker[attr - 1], "Unknown port boundary attribute " << attr << "!"); + MFEM_VERIFY(!data.active || !port_marker[attr - 1], + "Boundary attribute is assigned to more than one wave port!"); + port_marker[attr - 1] = 1; } } } @@ -954,10 +1090,13 @@ void WavePortOperator::SetUpBoundaryProperties( // List of all boundaries which will be marked as essential for the purposes of computing // wave port modes. This includes all PEC surfaces, but may also include others like when // a kinetic inductance or other BC is applied for the 3D simulation but should be - // considered as PEC for the 2D problem. - mfem::Array dbc_bcs, dbc_marker; + // considered as PEC for the 2D problem. In addition, we mark as Dirichlet boundaries all + // wave ports other than the wave port being currently considered, in case two wave ports + // are touching and share one or more edges. + mfem::Array dbc_bcs; dbc_bcs.Reserve(static_cast(iodata.boundaries.pec.attributes.size() + - iodata.boundaries.auxpec.attributes.size())); + iodata.boundaries.auxpec.attributes.size() + + iodata.boundaries.conductivity.size())); for (auto attr : iodata.boundaries.pec.attributes) { if (attr <= 0 || attr > bdr_attr_max) @@ -974,90 +1113,101 @@ void WavePortOperator::SetUpBoundaryProperties( } dbc_bcs.Append(attr); } + for (const auto &data : iodata.boundaries.conductivity) + { + for (auto attr : data.attributes) + { + if (attr <= 0 || attr > bdr_attr_max) + { + continue; // Can just ignore if wrong + } + dbc_bcs.Append(attr); + } + } // If user accidentally specifies a surface as both "PEC" and "WavePortPEC", this is fine // so allow for duplicates in the attribute list. dbc_bcs.Sort(); dbc_bcs.Unique(); - mesh::AttrToMarker(bdr_attr_max, dbc_bcs, dbc_marker); // Set up wave port data structures. 
for (const auto &[idx, data] : iodata.boundaries.waveport) { - ports.try_emplace(idx, data, mat_op, nd_fespace, h1_fespace, dbc_marker); - } - MFEM_VERIFY( - ports.empty() || iodata.problem.type == config::ProblemData::Type::DRIVEN, - "Wave port boundaries are only available for frequency domain driven simulations!"); - - // Mark selected boundary attributes from the mesh for wave ports. - port_marker.SetSize(bdr_attr_max); - port_marker = 0; - for (const auto &[idx, data] : ports) - { - for (int i = 0; i < data.GetMarker().Size(); i++) + mfem::Array port_dbc_bcs(dbc_bcs); + for (const auto &[other_idx, other_data] : iodata.boundaries.waveport) { - MFEM_VERIFY(!(port_marker[i] && data.GetMarker()[i]), - "Boundary attribute is assigned to more than one wave port!"); - port_marker[i] = port_marker[i] || data.GetMarker()[i]; + if (other_idx == idx || !other_data.active) + { + continue; + } + for (auto attr : other_data.attributes) + { + if (std::binary_search(data.attributes.begin(), data.attributes.end(), attr)) + { + continue; + } + port_dbc_bcs.Append(attr); + } } + port_dbc_bcs.Sort(); + port_dbc_bcs.Unique(); + ports.try_emplace(idx, data, iodata.solver, mat_op, nd_fespace, h1_fespace, + port_dbc_bcs); } + MFEM_VERIFY( + ports.empty() || iodata.problem.type == ProblemType::DRIVEN || + iodata.problem.type == ProblemType::EIGENMODE, + "Wave port boundaries are only available for frequency domain driven simulations!"); } -void WavePortOperator::PrintBoundaryInfo(const IoData &iodata, mfem::ParMesh &mesh) +void WavePortOperator::PrintBoundaryInfo(const IoData &iodata, const mfem::ParMesh &mesh) { - // Print out BC info for all port attributes. if (ports.empty()) { return; } - Mpi::Print("\nConfiguring Robin impedance BC for wave ports at attributes:\n"); + fmt::memory_buffer buf{}; // Output buffer & buffer append lambda for cleaner code + auto to = [&buf](auto fmt, auto &&...args) + { fmt::format_to(std::back_inserter(buf), fmt, std::forward(args)...); }; + + // Print out BC info for all active port attributes. for (const auto &[idx, data] : ports) { - for (int i = 0; i < data.GetMarker().Size(); i++) + if (!data.active) { - if (!data.GetMarker()[i]) - { - continue; - } - const int attr = i + 1; - mfem::Vector nor; - mesh::GetSurfaceNormal(mesh, attr, nor); - Mpi::Print( - " {:d}: Index = {:d}, mode = {:d}, d = {:.3e} m", attr, idx, data.GetModeIndex(), - iodata.DimensionalizeValue(IoData::ValueType::LENGTH, data.GetOffsetDistance())); - if (mesh.SpaceDimension() == 3) - { - Mpi::Print(", n = ({:+.1f}, {:+.1f}, {:+.1f})", nor(0), nor(1), nor(2)); - } - else - { - Mpi::Print(", n = ({:+.1f}, {:+.1f})", nor(0), nor(1)); - } - Mpi::Print("\n"); + continue; + } + for (auto attr : data.GetAttrList()) + { + to(" {:d}: Index = {:d}, mode = {:d}, d = {:.3e} m, n = ({:+.1f})\n", attr, idx, + data.mode_idx, + iodata.units.Dimensionalize(data.d_offset), + fmt::join(data.port_normal, ",")); } } + if (buf.size() > 0) + { + Mpi::Print("\nConfiguring Robin impedance BC for wave ports at attributes:\n"); + Mpi::Print("{}", fmt::to_string(buf)); + buf.clear(); + } // Print some information for excited wave ports. 
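A small standalone illustration of the fmt::memory_buffer pattern used in PrintBoundaryInfo above: per-attribute lines are buffered first and the section header is only printed when something was actually written. Attribute values and strings here are illustrative, not taken from the code.

#include <initializer_list>
#include <iterator>
#include <fmt/format.h>

int main()
{
  fmt::memory_buffer buf;
  for (int attr : {3, 7})  // stand-ins for wave port boundary attributes
  {
    fmt::format_to(std::back_inserter(buf), " {:d}: Index = {:d}\n", attr, 1);
  }
  if (buf.size() > 0)  // Only print the header if at least one line was buffered
  {
    fmt::print("Configuring wave port excitation source term at attributes:\n{}",
               fmt::to_string(buf));
  }
  return 0;
}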
- bool first = true; for (const auto &[idx, data] : ports) { - if (!data.IsExcited()) + if (!data.HasExcitation()) { continue; } - if (first) - { - Mpi::Print("\nConfiguring wave port excitation source term at attributes:\n"); - first = false; - } - for (int i = 0; i < data.GetMarker().Size(); i++) + for (auto attr : data.GetAttrList()) { - if (data.GetMarker()[i]) - { - Mpi::Print(" {:d}: Index = {:d}\n", i + 1, idx); - } + to(" {:d}: Index = {:d}\n", attr, idx); } } + if (buf.size() > 0) + { + Mpi::Print("\nConfiguring wave port excitation source term at attributes:\n"); + Mpi::Print("{}", fmt::to_string(buf)); + } } const WavePortData &WavePortOperator::GetPort(int idx) const @@ -1067,25 +1217,38 @@ const WavePortData &WavePortOperator::GetPort(int idx) const return it->second; } +mfem::Array WavePortOperator::GetAttrList() const +{ + mfem::Array attr_list; + for (const auto &[idx, data] : ports) + { + if (!data.active) + { + continue; + } + attr_list.Append(data.GetAttrList()); + } + return attr_list; +} + void WavePortOperator::Initialize(double omega) { bool init = false, first = true; for (const auto &[idx, data] : ports) { - init = init || (data.GetOperatingFrequency() != omega); - first = first && (data.GetOperatingFrequency() == 0.0); + init = init || (data.omega0 != omega); + first = first && (data.omega0 == 0.0); } if (!init) { return; } - BlockTimer bt(Timer::WAVEPORT); + BlockTimer bt(Timer::WAVE_PORT); if (!suppress_output) { - const double freq = iodata.DimensionalizeValue(IoData::ValueType::FREQUENCY, omega); Mpi::Print( - "\nCalculating boundary modes at wave ports for ω/2π = {:.3e} GHz ({:.3e})\n", freq, - omega); + "\nCalculating boundary modes at wave ports for ω/2π = {:.3e} GHz ({:.3e})\n", + omega * fc, omega); } for (auto &[idx, data] : ports) { @@ -1098,52 +1261,54 @@ void WavePortOperator::Initialize(double omega) " H1: {:d}, ND: {:d}\n", idx, data.GlobalTrueH1Size(), data.GlobalTrueNDSize()); } - double k0 = 1.0 / iodata.DimensionalizeValue(IoData::ValueType::LENGTH, 1.0); - Mpi::Print(" Port {:d}, mode {:d}: kₙ = {:.3e}{:+.3e}i m⁻¹\n", idx, - data.GetModeIndex(), k0 * data.GetPropagationConstant().real(), - k0 * data.GetPropagationConstant().imag()); + Mpi::Print(" Port {:d}, mode {:d}: kₙ = {:.3e}{:+.3e}i m⁻¹\n", idx, data.mode_idx, + data.kn0.real() * kc, data.kn0.imag() * kc); } } } void WavePortOperator::AddExtraSystemBdrCoefficients(double omega, - SumMatrixCoefficient &fbr, - SumMatrixCoefficient &fbi) + MaterialPropertyCoefficient &fbr, + MaterialPropertyCoefficient &fbi) { // Add wave port boundaries to the bilinear form. This looks a lot like the lumped port // boundary, except the iω / Z_s coefficient goes to ikₙ / μ where kₙ is specific to the // port mode at the given operating frequency (note only the real part of the propagation // constant contributes). 
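To spell out the comment above: i kn / mu = (-Im(kn) + i Re(kn)) / mu, which is why in the loop below the (commented-out) real-valued contribution would carry -kn0.imag() while the imaginary contribution carries kn0.real(). A scalar check with made-up numbers:

#include <cassert>
#include <complex>

int main()
{
  const std::complex<double> kn(42.0, 3.5);  // made-up propagation constant
  const double muinv = 0.5;                  // made-up inverse permeability
  const std::complex<double> coeff = std::complex<double>(0.0, 1.0) * kn * muinv;
  assert(coeff.real() == -kn.imag() * muinv);  // real part of i*kn/mu
  assert(coeff.imag() == kn.real() * muinv);   // imaginary part of i*kn/mu
  return 0;
}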
Initialize(omega); - for (auto &[idx, data] : ports) + for (const auto &[idx, data] : ports) { - constexpr auto MatType = MaterialPropertyType::INV_PERMEABILITY; - constexpr auto ElemType = MeshElementType::BDR_ELEMENT; - fbi.AddCoefficient(std::make_unique>( - mat_op, data.GetPropagationConstant().real()), - data.GetMarker()); + if (!data.active) + { + continue; + } + const MaterialOperator &mat_op = data.mat_op; + MaterialPropertyCoefficient muinv_func(mat_op.GetBdrAttributeToMaterial(), + mat_op.GetInvPermeability()); + muinv_func.RestrictCoefficient(mat_op.GetCeedBdrAttributes(data.GetAttrList())); + // fbr.AddCoefficient(muinv_func.GetAttributeToMaterial(), + // muinv_func.GetMaterialProperties(), + // -data.kn0.imag()); + fbi.AddCoefficient(muinv_func.GetAttributeToMaterial(), + muinv_func.GetMaterialProperties(), data.kn0.real()); } } -void WavePortOperator::AddExcitationBdrCoefficients(double omega, SumVectorCoefficient &fbr, +void WavePortOperator::AddExcitationBdrCoefficients(int excitation_idx, double omega, + SumVectorCoefficient &fbr, SumVectorCoefficient &fbi) { - // Re{-U_inc} = Re{+2 (-iω) n x H_inc}, which is a function of E_inc as computed by the - // modal solution (stored as a grid function and coefficient during initialization). - // Likewise for the imaginary part. + // Re/Im{-U_inc} = Re/Im{+2 (-iω) n x H_inc}, which is a function of E_inc as computed by + // the modal solution (stored as a grid function and coefficient during initialization). Initialize(omega); - for (auto &[idx, data] : ports) + for (const auto &[idx, data] : ports) { - if (!data.IsExcited()) + if (data.excitation != excitation_idx) { continue; } - fbr.AddCoefficient(std::make_unique( - 2.0 * omega, data.GetModeCoefficientImag()), - data.GetMarker()); - fbi.AddCoefficient(std::make_unique( - -2.0 * omega, data.GetModeCoefficientReal()), - data.GetMarker()); + fbr.AddCoefficient(data.GetModeExcitationCoefficientImag(), 2.0 * omega); + fbi.AddCoefficient(data.GetModeExcitationCoefficientReal(), -2.0 * omega); } } diff --git a/palace/models/waveportoperator.hpp b/palace/models/waveportoperator.hpp index c2a7b1f6a8..281eed8e22 100644 --- a/palace/models/waveportoperator.hpp +++ b/palace/models/waveportoperator.hpp @@ -1,181 +1,182 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_MODELS_WAVE_PORT_OPERATOR_HPP -#define PALACE_MODELS_WAVE_PORT_OPERATOR_HPP - -#include -#include -#include -#include -#include "linalg/eps.hpp" -#include "linalg/ksp.hpp" -#include "linalg/operator.hpp" -#include "linalg/vector.hpp" - -namespace palace -{ - -class IoData; -class MaterialOperator; -class SumMatrixCoefficient; -class SumVectorCoefficient; - -namespace config -{ - -struct WavePortData; - -} // namespace config - -// -// Helper class for wave port boundaries in a model. -// -class WavePortData -{ -private: - bool excitation; - int mode_idx; - double d_offset; - - // Attribute list and marker for all boundary attributes making up this port boundary. - // Mutable because some MFEM API calls are not const correct. - mfem::Array attr_list; - mutable mfem::Array attr_marker; - - // SubMesh data structures to define finite element spaces and grid functions on the - // SubMesh corresponding to this port boundary. 
- std::unique_ptr port_mesh; - std::unique_ptr port_nd_fec, port_h1_fec; - std::unique_ptr port_nd_fespace, port_h1_fespace; - std::unique_ptr port_nd_transfer, port_h1_transfer; - - // Operator storage for repeated boundary mode eigenvalue problem solves. - double mu_eps_max; - std::unique_ptr A2r, A2i, B3; - std::unique_ptr A, B, P; - ComplexVector v0, e0, e0t, e0n; - - // Eigenvalue solver for boundary modes. - MPI_Comm port_comm; - int port_root; - std::unique_ptr eigen; - std::unique_ptr ksp; - - // Grid functions storing the last computed electric field mode on the port and the - // associated propagation constant. Also the coefficient for the incident port mode - // (n x H_inc) computed from the electric field mode. - std::unique_ptr port_E0t, port_E0n; - std::unique_ptr port_nxH0r_func, port_nxH0i_func; - std::unique_ptr port_sr, port_si; - std::unique_ptr port_S0t; - std::complex kn0; - double omega0; - -public: - WavePortData(const config::WavePortData &data, const MaterialOperator &mat_op, - const mfem::ParFiniteElementSpace &nd_fespace, - const mfem::ParFiniteElementSpace &h1_fespace, - const mfem::Array &dbc_marker); - ~WavePortData(); - - const mfem::Array &GetMarker() const { return attr_marker; } - mfem::Array &GetMarker() { return attr_marker; } - - void Initialize(double omega); - - HYPRE_BigInt GlobalTrueNDSize() const { return port_nd_fespace->GlobalTrueVSize(); } - HYPRE_BigInt GlobalTrueH1Size() const { return port_h1_fespace->GlobalTrueVSize(); } - - std::complex GetPropagationConstant() const { return kn0; } - double GetOperatingFrequency() const { return omega0; } - - bool IsExcited() const { return excitation; } - int GetModeIndex() const { return mode_idx; } - double GetOffsetDistance() const { return d_offset; } - - const mfem::VectorCoefficient &GetModeCoefficientReal() const { return *port_nxH0r_func; } - mfem::VectorCoefficient &GetModeCoefficientReal() { return *port_nxH0r_func; } - const mfem::VectorCoefficient &GetModeCoefficientImag() const { return *port_nxH0i_func; } - mfem::VectorCoefficient &GetModeCoefficientImag() { return *port_nxH0i_func; } - - std::complex GetCharacteristicImpedance() const - { - MFEM_ABORT("GetImpedance is not yet implemented for wave port boundaries!"); - return 0.0; - } - - double GetExcitationPower() const; - std::complex GetExcitationVoltage() const - { - MFEM_ABORT("GetExcitationVoltage is not yet implemented for wave port boundaries!"); - return 0.0; - } - - std::complex GetSParameter(mfem::ParComplexGridFunction &E) const; - std::complex GetPower(mfem::ParComplexGridFunction &E, - mfem::ParComplexGridFunction &B, - const MaterialOperator &mat_op) const; - std::complex GetVoltage(mfem::ParComplexGridFunction &E) const - { - MFEM_ABORT("GetVoltage is not yet implemented for wave port boundaries!"); - return 0.0; - } -}; - -// -// A class handling wave port boundaries and their postprocessing. -// -class WavePortOperator -{ -private: - // References to configuration file and material property data (not owned). - const IoData &iodata; - const MaterialOperator &mat_op; - - // Flag which forces no printing during WavePortData::Print(). - bool suppress_output; - - // Mapping from port index to data structure containing port information. 
- std::map ports; - mfem::Array port_marker; - void SetUpBoundaryProperties(const IoData &iodata, const MaterialOperator &mat_op, - const mfem::ParFiniteElementSpace &nd_fespace, - const mfem::ParFiniteElementSpace &h1_fespace); - void PrintBoundaryInfo(const IoData &iodata, mfem::ParMesh &mesh); - - // Compute boundary modes for all wave port boundaries at the specified frequency. - void Initialize(double omega); - -public: - WavePortOperator(const IoData &iod, const MaterialOperator &mat, - const mfem::ParFiniteElementSpace &nd_fespace, - const mfem::ParFiniteElementSpace &h1_fespace); - - // Access data structures for the wave port with the given index. - const WavePortData &GetPort(int idx) const; - auto begin() const { return ports.begin(); } - auto end() const { return ports.end(); } - auto rbegin() const { return ports.rbegin(); } - auto rend() const { return ports.rend(); } - auto Size() const { return ports.size(); } - - // Enable or suppress all outputs (log printing and fields to disk). - void SetSuppressOutput(bool suppress) { suppress_output = suppress; } - - // Returns array marking wave port attributes. - const mfem::Array &GetMarker() const { return port_marker; } - - // Add contributions to system matrix from wave ports. - void AddExtraSystemBdrCoefficients(double omega, SumMatrixCoefficient &fbr, - SumMatrixCoefficient &fbi); - - // Add contributions to the right-hand side source term vector for an incident field at - // excited port boundaries. - void AddExcitationBdrCoefficients(double omega, SumVectorCoefficient &fbr, - SumVectorCoefficient &fbi); -}; - -} // namespace palace - -#endif // PALACE_MODELS_WAVE_PORT_OPERATOR_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_MODELS_WAVE_PORT_OPERATOR_HPP +#define PALACE_MODELS_WAVE_PORT_OPERATOR_HPP + +#include +#include +#include +#include +#include +#include "fem/fespace.hpp" +#include "fem/gridfunction.hpp" +#include "fem/mesh.hpp" +#include "linalg/eps.hpp" +#include "linalg/ksp.hpp" +#include "linalg/operator.hpp" +#include "linalg/vector.hpp" + +namespace palace +{ + +class IoData; +class MaterialOperator; +class MaterialPropertyCoefficient; +class SumVectorCoefficient; + +namespace config +{ + +struct WavePortData; +struct SolverData; + +} // namespace config + +// +// Helper class for wave port boundaries in a model. +// +class WavePortData +{ +public: + // Reference to material property data (not owned). + const MaterialOperator &mat_op; + + // Wave port properties. + int mode_idx; + double d_offset; + int excitation; + bool active; + std::complex kn0; + double omega0; + mfem::Vector port_normal; + +private: + // List of all boundary attributes making up this port boundary. + mfem::Array attr_list; + + // SubMesh data structures to define finite element spaces and grid functions on the + // SubMesh corresponding to this port boundary. + std::unique_ptr port_mesh; + std::unique_ptr port_nd_fec, port_h1_fec; + std::unique_ptr port_nd_fespace, port_h1_fespace; + std::unique_ptr port_nd_transfer, port_h1_transfer; + std::unordered_map submesh_parent_elems; + mfem::Array port_dbc_tdof_list; + double mu_eps_max; + + // Operator storage for repeated boundary mode eigenvalue problem solves. + std::unique_ptr Atnr, Atni, Antr, Anti, Annr, Anni; + std::unique_ptr opB; + ComplexVector v0, e0; + + // Eigenvalue solver for boundary modes. 
+ MPI_Comm port_comm; + int port_root; + std::unique_ptr eigen; + std::unique_ptr ksp; + + // Grid functions storing the last computed electric field mode on the port, and stored + // objects for computing functions of the port modes for use as an excitation or in + // postprocessing. + std::unique_ptr port_E0t, port_E0n, port_S0t, port_E; + std::unique_ptr port_sr, port_si; + +public: + WavePortData(const config::WavePortData &data, const config::SolverData &solver, + const MaterialOperator &mat_op, mfem::ParFiniteElementSpace &nd_fespace, + mfem::ParFiniteElementSpace &h1_fespace, const mfem::Array &dbc_attr); + ~WavePortData(); + + [[nodiscard]] constexpr bool HasExcitation() const { return excitation != 0; } + + const auto &GetAttrList() const { return attr_list; } + + void Initialize(double omega); + + HYPRE_BigInt GlobalTrueNDSize() const { return port_nd_fespace->GlobalTrueVSize(); } + HYPRE_BigInt GlobalTrueH1Size() const { return port_h1_fespace->GlobalTrueVSize(); } + + std::unique_ptr GetModeExcitationCoefficientReal() const; + std::unique_ptr GetModeExcitationCoefficientImag() const; + + std::unique_ptr + GetModeFieldCoefficientReal(double scaling = 1.0) const; + std::unique_ptr + GetModeFieldCoefficientImag(double scaling = 1.0) const; + + std::complex GetCharacteristicImpedance() const + { + MFEM_ABORT("GetImpedance is not yet implemented for wave port boundaries!"); + return 0.0; + } + + double GetExcitationPower() const; + std::complex GetExcitationVoltage() const + { + MFEM_ABORT("GetExcitationVoltage is not yet implemented for wave port boundaries!"); + return 0.0; + } + + std::complex GetPower(GridFunction &E, GridFunction &B) const; + std::complex GetSParameter(GridFunction &E) const; + std::complex GetVoltage(GridFunction &E) const + { + MFEM_ABORT("GetVoltage is not yet implemented for wave port boundaries!"); + return 0.0; + } +}; + +// +// A class handling wave port boundaries and their postprocessing. +// +class WavePortOperator +{ +private: + // Mapping from port index to data structure containing port information. + std::map ports; + + // Flag which forces no printing during WavePortData::Print(). + bool suppress_output; + double fc, kc; + + void SetUpBoundaryProperties(const IoData &iodata, const MaterialOperator &mat_op, + mfem::ParFiniteElementSpace &nd_fespace, + mfem::ParFiniteElementSpace &h1_fespace); + void PrintBoundaryInfo(const IoData &iodata, const mfem::ParMesh &mesh); + + // Compute boundary modes for all wave port boundaries at the specified frequency. + void Initialize(double omega); + +public: + WavePortOperator(const IoData &iodata, const MaterialOperator &mat_op, + mfem::ParFiniteElementSpace &nd_fespace, + mfem::ParFiniteElementSpace &h1_fespace); + + // Access data structures for the wave port with the given index. + const WavePortData &GetPort(int idx) const; + auto begin() const { return ports.begin(); } + auto end() const { return ports.end(); } + auto rbegin() const { return ports.rbegin(); } + auto rend() const { return ports.rend(); } + auto Size() const { return ports.size(); } + + // Enable or suppress all outputs (log printing and fields to disk). + void SetSuppressOutput(bool suppress) { suppress_output = suppress; } + + // Returns array of wave port attributes. + mfem::Array GetAttrList() const; + + // Add contributions to system matrix from wave ports. 
+ void AddExtraSystemBdrCoefficients(double omega, MaterialPropertyCoefficient &fbr, + MaterialPropertyCoefficient &fbi); + + // Add contributions to the right-hand side source term vector for an incident field at + // excited port boundaries. + void AddExcitationBdrCoefficients(int excitation_idx, double omega, + SumVectorCoefficient &fbr, SumVectorCoefficient &fbi); +}; + +} // namespace palace + +#endif // PALACE_MODELS_WAVE_PORT_OPERATOR_HPP diff --git a/palace/palace.props b/palace/palace.props index 45ecf81e5c..ed9549c510 100644 --- a/palace/palace.props +++ b/palace/palace.props @@ -4,9 +4,10 @@ $(Platform)\$(Configuration)\ - $(IncludePath) - $(WELSIM_LIBPACK)\lib\mumps;$(WELSIM_LIBPACK)\lib;$(IFORT_COMPILER22)\compiler\lib\intel64_win;$(MSMPI_LIB64);$(INTEL_MKL)\lib\intel64;$(LibraryPath) - $(MSMPI_INC);$(INTEL_MKL)\include;$(WELSIM_LIBPACK)\include\palace\mfem;$(WELSIM_LIBPACK)\include\palace\libCEED;$(WELSIM_LIBPACK)\include\fmt;$(WELSIM_LIBPACK)\include\hypre;$(WELSIM_LIBPACK)\include\zlib;$(WELSIM_LIBPACK)\include\mumps;$(WELSIM_LIBPACK)\include\petsc;$(WELSIM_LIBPACK)\include\eigen3;$(WELSIM_LIBPACK)\include;.\;$(CUDA_PATH)\include;$(ExternalIncludePath) + $(CUDA_PATH)/include;$(IncludePath) + $(WELSIM_LIBPACK)\lib\mumps;$(WELSIM_LIBPACK)\lib;$(IFORT_COMPILER24)\lib;$(MSMPI_LIB64);$(INTEL_MKL)\lib;$(CUDA_PATH)\lib\x64;$(LibraryPath) + $(MSMPI_INC);$(INTEL_MKL)\include;$(WELSIM_LIBPACK)\include\palace\mfem;$(WELSIM_LIBPACK)\include\palace\libCEED;$(WELSIM_LIBPACK)\include\fmt;$(WELSIM_LIBPACK)\include\hypre;$(WELSIM_LIBPACK)\include\zlib;$(WELSIM_LIBPACK)\include\mumps;$(WELSIM_LIBPACK)\include\petsc;$(WELSIM_LIBPACK)\include\eigen3;$(WELSIM_LIBPACK)\include;.\;$(ExternalIncludePath) + $(Platform)\$(Configuration)\ @@ -14,9 +15,10 @@ _UNICODE;UNICODE;CEED_SKIP_VISIBILITY;PALACE_WITH_ARPACK;_CRT_SECURE_NO_WARNINGS;%(PreprocessorDefinitions) + /utf-8 %(AdditionalOptions) - palace\gslib-palace.lib;palace\mfem-palace.lib;palace\mfem-common-palace.lib;palace\libCEED-palace.lib;palace\arpack.lib;palace\parpack.lib;palace\libpalace.lib;msmpi.lib;msmpifec.lib;hypre\HYPRE.lib;fmt\fmt.lib;metis\metis.lib;PETSc\Release\libpetsc.lib;mkl_core_dll.lib;mkl_intel_lp64_dll.lib;mkl_sequential_dll.lib;mkl_scalapack_lp64_dll.lib;mkl_intel_thread_dll.lib;mkl_blacs_lp64_dll.lib;dmumps.lib;dmumps_c.lib;mumps_common_c.lib;mumps_common.lib;pord.lib;libxsmm\xsmm.lib;zlib\zlib.lib;%(AdditionalDependencies) + palace\gslib-palace.lib;palace\mfem-palace.lib;palace\mfem-common-palace.lib;palace\libCEED-palace.lib;palace\arpack.lib;palace\parpack.lib;palace\libpalace.lib;msmpi.lib;msmpifec.lib;hypre\HYPRE.lib;fmt\selfBuild\fmt.lib;metis\metis.lib;metis\GKlib.lib;PETSc\Release\libpetsc.lib;mkl_core_dll.lib;mkl_intel_lp64_dll.lib;mkl_sequential_dll.lib;mkl_scalapack_lp64_dll.lib;mkl_intel_thread_dll.lib;mkl_blacs_lp64_dll.lib;dmumps.lib;mumps_common.lib;pord.lib;libxsmm\xsmm.lib;zlib\zlib.lib;scn\scn.lib;%(AdditionalDependencies) diff --git a/palace/palace.sln b/palace/palace.sln index 7f802d9627..951782b78c 100644 --- a/palace/palace.sln +++ b/palace/palace.sln @@ -7,8 +7,6 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "palace", "palace.vcxproj", EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libpalace", "libpalace.vcxproj", "{CE376632-1B14-426E-9D58-DF59700836C9}" EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libCEED-palace", "..\..\libCEED-palace\libCEED-palace.vcxproj", "{E5F526CC-20F7-4ACA-BC02-FD8EDBF4FC35}" -EndProject 
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mfem-palace", "..\..\mfem-palace\mfem-palace.vcxproj", "{78EF8ACD-F0EF-4312-A016-6AA5F695B476}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mfem-common-palace", "..\..\mfem-palace\mfem-common-palace.vcxproj", "{6D7405DA-71FB-4E1B-9F8B-CF7A1F364E3B}" @@ -17,6 +15,8 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "deps", "deps", "{FF794BCA-0 EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "gslib-palace", "..\..\gslib-palace\gslib-palace.vcxproj", "{327F82B8-6770-403C-B56E-6FCF881FD072}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libCEED-palace", "..\..\libCEED-palace\msbuild\libCEED-palace.vcxproj", "{4DE98561-4F49-4D08-9E09-75DB3EBF36EE}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|x64 = Debug|x64 @@ -37,14 +37,6 @@ Global {CE376632-1B14-426E-9D58-DF59700836C9}.Release|x64.ActiveCfg = Release|x64 {CE376632-1B14-426E-9D58-DF59700836C9}.Release|x64.Build.0 = Release|x64 {CE376632-1B14-426E-9D58-DF59700836C9}.Release|x86.ActiveCfg = Release|x64 - {E5F526CC-20F7-4ACA-BC02-FD8EDBF4FC35}.Debug|x64.ActiveCfg = Debug|x64 - {E5F526CC-20F7-4ACA-BC02-FD8EDBF4FC35}.Debug|x64.Build.0 = Debug|x64 - {E5F526CC-20F7-4ACA-BC02-FD8EDBF4FC35}.Debug|x86.ActiveCfg = Debug|x64 - {E5F526CC-20F7-4ACA-BC02-FD8EDBF4FC35}.Debug|x86.Build.0 = Debug|x64 - {E5F526CC-20F7-4ACA-BC02-FD8EDBF4FC35}.Release|x64.ActiveCfg = Release|x64 - {E5F526CC-20F7-4ACA-BC02-FD8EDBF4FC35}.Release|x64.Build.0 = Release|x64 - {E5F526CC-20F7-4ACA-BC02-FD8EDBF4FC35}.Release|x86.ActiveCfg = Release|x64 - {E5F526CC-20F7-4ACA-BC02-FD8EDBF4FC35}.Release|x86.Build.0 = Release|x64 {78EF8ACD-F0EF-4312-A016-6AA5F695B476}.Debug|x64.ActiveCfg = Debug|x64 {78EF8ACD-F0EF-4312-A016-6AA5F695B476}.Debug|x64.Build.0 = Debug|x64 {78EF8ACD-F0EF-4312-A016-6AA5F695B476}.Debug|x86.ActiveCfg = Debug|x64 @@ -69,15 +61,23 @@ Global {327F82B8-6770-403C-B56E-6FCF881FD072}.Release|x64.Build.0 = Release|x64 {327F82B8-6770-403C-B56E-6FCF881FD072}.Release|x86.ActiveCfg = Release|x64 {327F82B8-6770-403C-B56E-6FCF881FD072}.Release|x86.Build.0 = Release|x64 + {4DE98561-4F49-4D08-9E09-75DB3EBF36EE}.Debug|x64.ActiveCfg = Debug|x64 + {4DE98561-4F49-4D08-9E09-75DB3EBF36EE}.Debug|x64.Build.0 = Debug|x64 + {4DE98561-4F49-4D08-9E09-75DB3EBF36EE}.Debug|x86.ActiveCfg = Debug|x64 + {4DE98561-4F49-4D08-9E09-75DB3EBF36EE}.Debug|x86.Build.0 = Debug|x64 + {4DE98561-4F49-4D08-9E09-75DB3EBF36EE}.Release|x64.ActiveCfg = Release|x64 + {4DE98561-4F49-4D08-9E09-75DB3EBF36EE}.Release|x64.Build.0 = Release|x64 + {4DE98561-4F49-4D08-9E09-75DB3EBF36EE}.Release|x86.ActiveCfg = Release|x64 + {4DE98561-4F49-4D08-9E09-75DB3EBF36EE}.Release|x86.Build.0 = Release|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE EndGlobalSection GlobalSection(NestedProjects) = preSolution - {E5F526CC-20F7-4ACA-BC02-FD8EDBF4FC35} = {FF794BCA-0652-481F-B0D5-A7C9A04749CA} {78EF8ACD-F0EF-4312-A016-6AA5F695B476} = {FF794BCA-0652-481F-B0D5-A7C9A04749CA} {6D7405DA-71FB-4E1B-9F8B-CF7A1F364E3B} = {FF794BCA-0652-481F-B0D5-A7C9A04749CA} {327F82B8-6770-403C-B56E-6FCF881FD072} = {FF794BCA-0652-481F-B0D5-A7C9A04749CA} + {4DE98561-4F49-4D08-9E09-75DB3EBF36EE} = {FF794BCA-0652-481F-B0D5-A7C9A04749CA} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {0FFA0840-D029-3752-A1CB-5699E0B6EA23} diff --git a/palace/palace.vcxproj b/palace/palace.vcxproj index 14a4523981..39fbe48dea 100644 --- 
a/palace/palace.vcxproj +++ b/palace/palace.vcxproj @@ -50,6 +50,10 @@ $(Platform)\$(Configuration)\ + $(Platform)\$(Configuration)\ + + + $(Platform)\$(Configuration)\ diff --git a/palace/palace.vcxproj.filters b/palace/palace.vcxproj.filters index 4d0524d9f0..4262238975 100644 --- a/palace/palace.vcxproj.filters +++ b/palace/palace.vcxproj.filters @@ -1,14 +1,14 @@ - - - - - {4FC737F1-C7A5-4376-A066-2A32D752A2FF} - cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx - - - - - Source Files - - + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx + + + + + Source Files + + \ No newline at end of file diff --git a/palace/palace_d.props b/palace/palace_d.props index 49f6fc15e7..fe535890b2 100644 --- a/palace/palace_d.props +++ b/palace/palace_d.props @@ -5,20 +5,22 @@ $(Platform)\$(Configuration)\ $(ProjectName)_d - $(IncludePath) - $(WELSIM_LIBPACK)\lib;$(MSMPI_LIB64);$(IFORT_COMPILER22)\compiler\lib\intel64_win;$(WELSIM_LIBPACK)\lib\mumps;$(INTEL_MKL)\lib\intel64;$(LibraryPath) - $(MSMPI_INC);$(INTEL_MKL)\include;$(WELSIM_LIBPACK)\include\palace\mfem;$(WELSIM_LIBPACK)\include\palace\libCEED;D:\WelSimLLC-github\palace\palace;$(WELSIM_LIBPACK)\include\hypre;$(WELSIM_LIBPACK)\include\zlib;$(WELSIM_LIBPACK)\include\mumps;$(WELSIM_LIBPACK)\include\eigen3;$(WELSIM_LIBPACK)\include\petsc;$(WELSIM_LIBPACK)\include\fmt;$(WELSIM_LIBPACK)\include;.\;$(CUDA_PATH)\include + $(CUDA_PATH)/include;$(IncludePath) + $(WELSIM_LIBPACK)\lib;$(MSMPI_LIB64);$(IFORT_COMPILER24)\lib;$(WELSIM_LIBPACK)\lib\mumps;$(INTEL_MKL)\lib;$(CUDA_PATH)\lib\x64;$(LibraryPath) + $(MSMPI_INC);$(INTEL_MKL)\include;$(WELSIM_LIBPACK)\include\palace\mfem;$(WELSIM_LIBPACK)\include\palace\libCEED;$(WELSIM_LIBPACK)\include\hypre;$(WELSIM_LIBPACK)\include\zlib;$(WELSIM_LIBPACK)\include\mumps;$(WELSIM_LIBPACK)\include\eigen3;$(WELSIM_LIBPACK)\include\petsc;$(WELSIM_LIBPACK)\include\fmt;$(WELSIM_LIBPACK)\include;.\;D:\WelSimLLC-github\palace\palace;$(VC_IncludePath);$(WindowsSDK_IncludePath); + $(Platform)\$(Configuration)\ postbuild_palace_d - _UNICODE;UNICODE;CEED_SKIP_VISIBILITY;PALACE_WITH_ARPACK;_CRT_SECURE_NO_WARNINGS;LIBXSMM_TARGET_ARCH = LIBXSMM_X86_GENERIC;_SILENCE_STDEXT_ARR_ITERS_DEPRECATION_WARNING;MFEM_DEBUG;%(PreprocessorDefinitions) + _UNICODE;UNICODE;CEED_SKIP_VISIBILITY;PALACE_WITH_ARPACK;_CRT_SECURE_NO_WARNINGS;LIBXSMM_TARGET_ARCH = LIBXSMM_X86_GENERIC;%(PreprocessorDefinitions) Disabled + /utf-8 %(AdditionalOptions) - palace\gslib-palace_d.lib;palace\mfem-palace_d.lib;palace\mfem-common-palace_d.lib;palace\libCEED-palace_d.lib;palace\arpack_d.lib;palace\parpack_d.lib;palace\libpalace_d.lib;msmpi.lib;msmpifec.lib;hypre\HYPRE_d.lib;fmt\fmtd.lib;metis\metis_d.lib;PETSc\Debug\libpetsc.lib;dmumps_d.lib;dmumps_c_d.lib;mumps_common_c_d.lib;mumps_common_d.lib;pord.lib;libxsmm\xsmm_d.lib;zlib\zlib.lib;mkl_core_dll.lib;mkl_intel_lp64_dll.lib;mkl_sequential_dll.lib;mkl_scalapack_lp64_dll.lib;mkl_intel_thread_dll.lib;mkl_blacs_lp64_dll.lib;%(AdditionalDependencies) + 
palace\gslib-palace_d.lib;palace\mfem-palace_d.lib;palace\mfem-common-palace_d.lib;palace\libCEED-palace_d.lib;palace\arpack_d.lib;palace\parpack_d.lib;palace\libpalace_d.lib;msmpi.lib;msmpifec.lib;hypre\HYPRE_d.lib;fmt\selfBuild\fmtd.lib;metis\metis_d.lib;metis\GKlib_d.lib;PETSc\Debug\libpetsc.lib;dmumps_d.lib;mumps_common_d.lib;pord.lib;libxsmm\xsmm_d.lib;zlib\zlib.lib;scn\scn_d.lib;mkl_core_dll.lib;mkl_intel_lp64_dll.lib;mkl_sequential_dll.lib;mkl_scalapack_lp64_dll.lib;mkl_intel_thread_dll.lib;mkl_blacs_lp64_dll.lib;%(AdditionalDependencies) diff --git a/palace/postbuild_libpalace.bat b/palace/postbuild_libpalace.bat index 0c09cce2a3..6930da6260 100644 --- a/palace/postbuild_libpalace.bat +++ b/palace/postbuild_libpalace.bat @@ -1,5 +1,5 @@ -IF NOT DEFINED WELSIM_LIBPACK ( -call env_var.bat -) - -XCOPY x64\Release\libpalace.lib %WELSIM_LIBPACK%\lib\palace /F /C /S /Y /I +IF NOT DEFINED WELSIM_LIBPACK ( +call env_var.bat +) + +XCOPY x64\Release\libpalace.lib %WELSIM_LIBPACK%\lib\palace /F /C /S /Y /I diff --git a/palace/postbuild_libpalace_d.bat b/palace/postbuild_libpalace_d.bat index 10be2dcce6..71a0c24da0 100644 --- a/palace/postbuild_libpalace_d.bat +++ b/palace/postbuild_libpalace_d.bat @@ -1,7 +1,7 @@ -IF NOT DEFINED WELSIM_LIBPACK ( -call env_var.bat -) - -XCOPY x64\Debug\libpalace_d.lib %WELSIM_LIBPACK%\lib\palace /F /C /S /Y /I - - +IF NOT DEFINED WELSIM_LIBPACK ( +call env_var.bat +) + +XCOPY x64\Debug\libpalace_d.lib %WELSIM_LIBPACK%\lib\palace /F /C /S /Y /I + + diff --git a/palace/postbuild_palace.bat b/palace/postbuild_palace.bat index bf04781274..8a17d17fc0 100644 --- a/palace/postbuild_palace.bat +++ b/palace/postbuild_palace.bat @@ -1,7 +1,7 @@ -IF NOT DEFINED WELSIM_LIBPACK ( -call env_var.bat -) - -XCOPY x64\Release\palace.exe %WELSIM_EXEC% /F /C /S /Y /I -XCOPY x64\Release\palace.exe %WELSIM_LIBPACK%\bin\palace /F /C /S /Y /I - +IF NOT DEFINED WELSIM_LIBPACK ( +call env_var.bat +) + +XCOPY x64\Release\palace.exe %WELSIM_EXEC% /F /C /S /Y /I +XCOPY x64\Release\palace.exe %WELSIM_LIBPACK%\bin\palace /F /C /S /Y /I + diff --git a/palace/utils/CMakeLists.txt b/palace/utils/CMakeLists.txt index b582757126..1fa78b7926 100644 --- a/palace/utils/CMakeLists.txt +++ b/palace/utils/CMakeLists.txt @@ -1,15 +1,19 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 - -# -# Add source files and subdirectories. -# - -target_sources(${LIB_TARGET_NAME} - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/configfile.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/dorfler.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/geodata.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/iodata.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/meshio.cpp -) +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + +# +# Add source files and subdirectories. +# + +target_sources(${LIB_TARGET_NAME} + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/configfile.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/dorfler.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/geodata.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/geodata_impl.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/iodata.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/meshio.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/omp.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/tablecsv.cpp +) diff --git a/palace/utils/communication.hpp b/palace/utils/communication.hpp index 3189618da6..697bcac8e7 100644 --- a/palace/utils/communication.hpp +++ b/palace/utils/communication.hpp @@ -1,388 +1,410 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_UTILS_COMMUNICATION_HPP -#define PALACE_UTILS_COMMUNICATION_HPP - -#include -#include -#include -#include -#include - -namespace palace -{ - -namespace mpi -{ - -template -inline MPI_Datatype DataType(); - -template <> -inline MPI_Datatype DataType() -{ - return MPI_CHAR; -} - -template <> -inline MPI_Datatype DataType() -{ - return MPI_SIGNED_CHAR; -} - -template <> -inline MPI_Datatype DataType() -{ - return MPI_UNSIGNED_CHAR; -} - -template <> -inline MPI_Datatype DataType() -{ - return MPI_SHORT; -} - -template <> -inline MPI_Datatype DataType() -{ - return MPI_UNSIGNED_SHORT; -} - -template <> -inline MPI_Datatype DataType() -{ - return MPI_INT; -} - -template <> -inline MPI_Datatype DataType() -{ - return MPI_UNSIGNED; -} - -template <> -inline MPI_Datatype DataType() -{ - return MPI_LONG; -} - -template <> -inline MPI_Datatype DataType() -{ - return MPI_UNSIGNED_LONG; -} - -template <> -inline MPI_Datatype DataType() -{ - return MPI_LONG_LONG; -} - -template <> -inline MPI_Datatype DataType() -{ - return MPI_UNSIGNED_LONG_LONG; -} - -template <> -inline MPI_Datatype DataType() -{ - return MPI_FLOAT; -} - -template <> -inline MPI_Datatype DataType() -{ - return MPI_DOUBLE; -} - -template <> -inline MPI_Datatype DataType() -{ - return MPI_LONG_DOUBLE; -} - -template <> -inline MPI_Datatype DataType>() -{ - return MPI_C_COMPLEX; -} - -template <> -inline MPI_Datatype DataType>() -{ - return MPI_C_DOUBLE_COMPLEX; -} - -template <> -inline MPI_Datatype DataType>() -{ - return MPI_C_LONG_DOUBLE_COMPLEX; -} - -template <> -inline MPI_Datatype DataType() -{ - return MPI_C_BOOL; -} - -template -struct ValueAndLoc -{ - T val; - U loc; -}; - -template <> -inline MPI_Datatype DataType>() -{ - return MPI_FLOAT_INT; -} - -template <> -inline MPI_Datatype DataType>() -{ - return MPI_DOUBLE_INT; -} - -template <> -inline MPI_Datatype DataType>() -{ - return MPI_LONG_DOUBLE_INT; -} - -template <> -inline MPI_Datatype DataType>() -{ - return MPI_SHORT_INT; -} - -template <> -inline MPI_Datatype DataType>() -{ - return MPI_2INT; -} - -template <> -inline MPI_Datatype DataType>() -{ - return MPI_LONG_INT; -} - -} // namespace mpi - -// -// A simple convenience class for easy access to some MPI functionality. This is similar to -// mfem::Mpi and ideally should inherit from it, but the constructor being private instead -// of protected doesn't allow for that. -// -class Mpi -{ -public: - // Singleton creation. - static void Init() { Init(nullptr, nullptr); } - static void Init(int &argc, char **&argv) { Init(&argc, &argv); } - - // Finalize MPI (if it has been initialized and not yet already finalized). - static void Finalize() - { - if (IsInitialized() && !IsFinalized()) - { - MPI_Finalize(); - } - } - - // Return true if MPI has been initialized. - static bool IsInitialized() - { - int is_init; - int ierr = MPI_Initialized(&is_init); - return ierr == MPI_SUCCESS && is_init; - } - - // Return true if MPI has been finalized. - static bool IsFinalized() - { - int is_finalized; - int ierr = MPI_Finalized(&is_finalized); - return ierr == MPI_SUCCESS && is_finalized; - } - - // Call MPI_Abort with the given error code. - static void Abort(int code, MPI_Comm comm = World()) { MPI_Abort(comm, code); } - - // Barrier on the communicator. - static void Barrier(MPI_Comm comm = World()) { MPI_Barrier(comm); } - - // Return processor's rank in the communicator. 
- static int Rank(MPI_Comm comm) - { - int rank; - MPI_Comm_rank(comm, &rank); - return rank; - } - - // Return communicator size. - static int Size(MPI_Comm comm) - { - int size; - MPI_Comm_size(comm, &size); - return size; - } - - // Return communicator size. - static bool Root(MPI_Comm comm) { return Rank(comm) == 0; } - - // Wrapper for MPI_AllReduce. - template - static void GlobalOp(int len, T *buff, MPI_Op op, MPI_Comm comm) - { - MPI_Allreduce(MPI_IN_PLACE, buff, len, mpi::DataType(), op, comm); - } - - // Global minimum (in-place, result is broadcast to all processes). - template - static void GlobalMin(int len, T *buff, MPI_Comm comm) - { - GlobalOp(len, buff, MPI_MIN, comm); - } - - // Global maximum (in-place, result is broadcast to all processes). - template - static void GlobalMax(int len, T *buff, MPI_Comm comm) - { - GlobalOp(len, buff, MPI_MAX, comm); - } - - // Global sum (in-place, result is broadcast to all processes). - template - static void GlobalSum(int len, T *buff, MPI_Comm comm) - { - GlobalOp(len, buff, MPI_SUM, comm); - } - - // Global minimum with index (in-place, result is broadcast to all processes). - template - static void GlobalMinLoc(int len, T *val, U *loc, MPI_Comm comm) - { - std::vector> buffer(len); - for (int i = 0; i < len; i++) - { - buffer[i].val = val[i]; - buffer[i].loc = loc[i]; - } - GlobalOp(len, buffer.data(), MPI_MINLOC, comm); - for (int i = 0; i < len; i++) - { - val[i] = buffer[i].val; - loc[i] = buffer[i].loc; - } - } - - // Global maximum with index (in-place, result is broadcast to all processes). - template - static void GlobalMaxLoc(int len, T *val, U *loc, MPI_Comm comm) - { - std::vector> buffer(len); - for (int i = 0; i < len; i++) - { - buffer[i].val = val[i]; - buffer[i].loc = loc[i]; - } - GlobalOp(len, buffer.data(), MPI_MAXLOC, comm); - for (int i = 0; i < len; i++) - { - val[i] = buffer[i].val; - loc[i] = buffer[i].loc; - } - } - - // Global broadcast from root. - template - static void Broadcast(int len, T *buff, int root, MPI_Comm comm) - { - MPI_Bcast(buff, len, mpi::DataType(), root, comm); - } - - // Print methods only print on the root process of MPI_COMM_WORLD or a given MPI_Comm. - template - static void Print(MPI_Comm comm, fmt::format_string fmt, T &&...args) - { - if (Root(comm)) - { - fmt::print(fmt, std::forward(args)...); - } - } - - template - static void Print(fmt::format_string fmt, T &&...args) - { - Print(World(), fmt, std::forward(args)...); - } - - template - static void Printf(MPI_Comm comm, const char *format, T &&...args) - { - if (Root(comm)) - { - fmt::printf(format, std::forward(args)...); - } - } - - template - static void Printf(const char *format, T &&...args) - { - Printf(World(), format, std::forward(args)...); - } - - template - static void Warning(MPI_Comm comm, fmt::format_string fmt, T &&...args) - { - Print(comm, "\nWarning!\n"); - Print(comm, fmt, std::forward(args)...); - Print(comm, "\n"); - } - - template - static void Warning(fmt::format_string fmt, T &&...args) - { - Warning(World(), fmt, std::forward(args)...); - } - - // Return the global communicator. - static MPI_Comm World() { return MPI_COMM_WORLD; } - -private: - // Prevent direct construction of objects of this class. - Mpi() = default; - ~Mpi() { Finalize(); } - - // Access the singleton instance. - static Mpi &Instance() - { - static Mpi mpi; - return mpi; - } - - static void Init(int *argc, char ***argv) - { - // The Mpi object below needs to be created after MPI_Init() for some MPI - // implementations. 
- MFEM_VERIFY(!IsInitialized(), "MPI should not be initialized more than once!"); -#if defined(MFEM_USE_OPENMP) - int provided, requested = MPI_THREAD_FUNNELED; // MPI_THREAD_MULTIPLE - MPI_Init_thread(argc, argv, requested, &provided); - MFEM_VERIFY(provided >= requested, - "MPI could not provide the requested level of thread support!"); -#else - MPI_Init(argc, argv); -#endif - // Initialize the singleton Instance. - Instance(); - } -}; - -} // namespace palace - -#endif // PALACE_UTILS_COMMUNICATION_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_UTILS_COMMUNICATION_HPP +#define PALACE_UTILS_COMMUNICATION_HPP + +#include +#include +#include +#include +#include +#include + +namespace palace +{ + +namespace mpi +{ + +template +inline MPI_Datatype DataType(); + +template <> +inline MPI_Datatype DataType() +{ + return MPI_CHAR; +} + +template <> +inline MPI_Datatype DataType() +{ + return MPI_SIGNED_CHAR; +} + +template <> +inline MPI_Datatype DataType() +{ + return MPI_UNSIGNED_CHAR; +} + +template <> +inline MPI_Datatype DataType() +{ + return MPI_SHORT; +} + +template <> +inline MPI_Datatype DataType() +{ + return MPI_UNSIGNED_SHORT; +} + +template <> +inline MPI_Datatype DataType() +{ + return MPI_INT; +} + +template <> +inline MPI_Datatype DataType() +{ + return MPI_UNSIGNED; +} + +template <> +inline MPI_Datatype DataType() +{ + return MPI_LONG; +} + +template <> +inline MPI_Datatype DataType() +{ + return MPI_UNSIGNED_LONG; +} + +template <> +inline MPI_Datatype DataType() +{ + return MPI_LONG_LONG; +} + +template <> +inline MPI_Datatype DataType() +{ + return MPI_UNSIGNED_LONG_LONG; +} + +template <> +inline MPI_Datatype DataType() +{ + return MPI_FLOAT; +} + +template <> +inline MPI_Datatype DataType() +{ + return MPI_DOUBLE; +} + +template <> +inline MPI_Datatype DataType() +{ + return MPI_LONG_DOUBLE; +} + +template <> +inline MPI_Datatype DataType>() +{ + return MPI_C_COMPLEX; +} + +template <> +inline MPI_Datatype DataType>() +{ + return MPI_C_DOUBLE_COMPLEX; +} + +template <> +inline MPI_Datatype DataType>() +{ + return MPI_C_LONG_DOUBLE_COMPLEX; +} + +template <> +inline MPI_Datatype DataType() +{ + return MPI_C_BOOL; +} + +template +struct ValueAndLoc +{ + T val; + U loc; +}; + +template <> +inline MPI_Datatype DataType>() +{ + return MPI_FLOAT_INT; +} + +template <> +inline MPI_Datatype DataType>() +{ + return MPI_DOUBLE_INT; +} + +template <> +inline MPI_Datatype DataType>() +{ + return MPI_LONG_DOUBLE_INT; +} + +template <> +inline MPI_Datatype DataType>() +{ + return MPI_SHORT_INT; +} + +template <> +inline MPI_Datatype DataType>() +{ + return MPI_2INT; +} + +template <> +inline MPI_Datatype DataType>() +{ + return MPI_LONG_INT; +} + +} // namespace mpi + +// +// A simple convenience class for easy access to some MPI functionality. This is similar to +// mfem::Mpi and ideally should inherit from it, but the constructor being private instead +// of protected doesn't allow for that. +// +class Mpi +{ +public: + // Singleton creation. + static void Init(int requested = default_thread_required) + { + Init(nullptr, nullptr, requested); + } + static void Init(int &argc, char **&argv, int requested = default_thread_required) + { + Init(&argc, &argv, requested); + } + + // Finalize MPI (if it has been initialized and not yet already finalized). + static void Finalize() + { + if (IsInitialized() && !IsFinalized()) + { + MPI_Finalize(); + } + } + + // Return true if MPI has been initialized. 
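A minimal startup sketch using the new Init overloads, assuming a caller that wants MPI_THREAD_MULTIPLE instead of the compile-time default (MPI_THREAD_FUNNELED when built with OpenMP, MPI_THREAD_SINGLE otherwise); finalization is still handled by the singleton destructor.

int main(int argc, char **argv)
{
  palace::Mpi::Init(argc, argv, MPI_THREAD_MULTIPLE);  // Calls MPI_Init_thread.
  palace::Mpi::Print("Running on {:d} MPI process(es)\n",
                     palace::Mpi::Size(palace::Mpi::World()));
  return 0;  // MPI_Finalize() runs from the Mpi singleton destructor at exit.
}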
+ static bool IsInitialized() + { + int is_init; + int ierr = MPI_Initialized(&is_init); + return ierr == MPI_SUCCESS && is_init; + } + + // Return true if MPI has been finalized. + static bool IsFinalized() + { + int is_finalized; + int ierr = MPI_Finalized(&is_finalized); + return ierr == MPI_SUCCESS && is_finalized; + } + + // Call MPI_Abort with the given error code. + static void Abort(int code, MPI_Comm comm = World()) { MPI_Abort(comm, code); } + + // Barrier on the communicator. + static void Barrier(MPI_Comm comm = World()) { MPI_Barrier(comm); } + + // Return processor's rank in the communicator. + static int Rank(MPI_Comm comm) + { + int rank; + MPI_Comm_rank(comm, &rank); + return rank; + } + + // Return communicator size. + static int Size(MPI_Comm comm) + { + int size; + MPI_Comm_size(comm, &size); + return size; + } + + // Return communicator size. + static bool Root(MPI_Comm comm) { return Rank(comm) == 0; } + + // Wrapper for MPI_AllReduce. + template + static void GlobalOp(int len, T *buff, MPI_Op op, MPI_Comm comm) + { + MPI_Allreduce(MPI_IN_PLACE, buff, len, mpi::DataType(), op, comm); + } + + // Global minimum (in-place, result is broadcast to all processes). + template + static void GlobalMin(int len, T *buff, MPI_Comm comm) + { + GlobalOp(len, buff, MPI_MIN, comm); + } + + // Global maximum (in-place, result is broadcast to all processes). + template + static void GlobalMax(int len, T *buff, MPI_Comm comm) + { + GlobalOp(len, buff, MPI_MAX, comm); + } + + // Global sum (in-place, result is broadcast to all processes). + template + static void GlobalSum(int len, T *buff, MPI_Comm comm) + { + GlobalOp(len, buff, MPI_SUM, comm); + } + + // Global minimum with index (in-place, result is broadcast to all processes). + template + static void GlobalMinLoc(int len, T *val, U *loc, MPI_Comm comm) + { + std::vector> buffer(len); + for (int i = 0; i < len; i++) + { + buffer[i].val = val[i]; + buffer[i].loc = loc[i]; + } + GlobalOp(len, buffer.data(), MPI_MINLOC, comm); + for (int i = 0; i < len; i++) + { + val[i] = buffer[i].val; + loc[i] = buffer[i].loc; + } + } + + // Global maximum with index (in-place, result is broadcast to all processes). + template + static void GlobalMaxLoc(int len, T *val, U *loc, MPI_Comm comm) + { + std::vector> buffer(len); + for (int i = 0; i < len; i++) + { + buffer[i].val = val[i]; + buffer[i].loc = loc[i]; + } + GlobalOp(len, buffer.data(), MPI_MAXLOC, comm); + for (int i = 0; i < len; i++) + { + val[i] = buffer[i].val; + loc[i] = buffer[i].loc; + } + } + + // Global logical or (in-place, result is broadcast to all processes). + static void GlobalOr(int len, bool *buff, MPI_Comm comm) + { + GlobalOp(len, buff, MPI_LOR, comm); + } + + // Global logical and (in-place, result is broadcast to all processes). + static void GlobalAnd(int len, bool *buff, MPI_Comm comm) + { + GlobalOp(len, buff, MPI_LAND, comm); + } + + // Global broadcast from root. + template + static void Broadcast(int len, T *buff, int root, MPI_Comm comm) + { + MPI_Bcast(buff, len, mpi::DataType(), root, comm); + } + + // Print methods only print on the root process of MPI_COMM_WORLD or a given MPI_Comm. 
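A minimal sketch exercising the reduction helpers above, including the newly added GlobalOr; the function name and its inputs are placeholders.

void ReportConvergence(MPI_Comm comm, double local_residual, bool local_failed)
{
  bool failed = local_failed;
  palace::Mpi::GlobalOr(1, &failed, comm);  // True on every rank if any rank failed.

  double res = local_residual;
  int owner = palace::Mpi::Rank(comm);
  palace::Mpi::GlobalMaxLoc(1, &res, &owner, comm);  // Uses the MPI_DOUBLE_INT pairing.

  palace::Mpi::Print(comm, "Max residual {:.3e} on rank {:d}{}\n", res, owner,
                     failed ? " (not converged)" : "");
}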
+ template + static void Print(MPI_Comm comm, fmt::format_string fmt, T &&...args) + { + if (Root(comm)) + { + fmt::print(fmt, std::forward(args)...); + } + } + + template + static void Print(fmt::format_string fmt, T &&...args) + { + Print(World(), fmt, std::forward(args)...); + } + + template + static void Printf(MPI_Comm comm, const char *format, T &&...args) + { + if (Root(comm)) + { + fmt::printf(format, std::forward(args)...); + } + } + + template + static void Printf(const char *format, T &&...args) + { + Printf(World(), format, std::forward(args)...); + } + + template + static void Warning(MPI_Comm comm, fmt::format_string fmt, T &&...args) + { + Print(comm, "\n{}\n", fmt::styled("--> Warning!", fmt::fg(fmt::color::yellow))); + Print(comm, fmt, std::forward(args)...); + Print(comm, "\n"); + } + + template + static void Warning(fmt::format_string fmt, T &&...args) + { + Warning(World(), fmt, std::forward(args)...); + } + + // Return the global communicator. + static MPI_Comm World() { return MPI_COMM_WORLD; } + + // Default level of threading used in MPI_Init_thread unless provided to Init. +#if defined(MFEM_USE_OPENMP) + inline static int default_thread_required = MPI_THREAD_FUNNELED; +#else + inline static int default_thread_required = MPI_THREAD_SINGLE; +#endif + +private: + // Prevent direct construction of objects of this class. + Mpi() = default; + ~Mpi() { Finalize(); } + + // Access the singleton instance. + static Mpi &Instance() + { + static Mpi mpi; + return mpi; + } + + static void Init(int *argc, char ***argv, int requested) + { + // The Mpi object below needs to be created after MPI_Init() for some MPI + // implementations. + MFEM_VERIFY(!IsInitialized(), "MPI should not be initialized more than once!"); + int provided; + MPI_Init_thread(argc, argv, requested, &provided); + MFEM_VERIFY(provided >= requested, + "MPI could not provide the requested level of thread support!"); + // Initialize the singleton Instance. + Instance(); + } +}; + +} // namespace palace + +#endif // PALACE_UTILS_COMMUNICATION_HPP diff --git a/palace/utils/configfile.cpp b/palace/utils/configfile.cpp index bda2ca1ed6..3f9f3e0ee8 100644 --- a/palace/utils/configfile.cpp +++ b/palace/utils/configfile.cpp @@ -4,47 +4,181 @@ #include "configfile.hpp" #include +#include +#include +#include +#include #include #include // This is similar to NLOHMANN_JSON_SERIALIZE_ENUM, but results in an error if an enum -// value corresponding to the string cannot be found. -#define PALACE_JSON_SERIALIZE_ENUM(ENUM_TYPE, ...) \ - template \ - inline void to_json(BasicJsonType &j, const ENUM_TYPE &e) \ - { \ - static_assert(std::is_enum::value, #ENUM_TYPE " must be an enum!"); \ - static const std::pair m[] = __VA_ARGS__; \ - auto it = std::find_if(std::begin(m), std::end(m), \ - [e](const std::pair &ej_pair) \ - { return ej_pair.first == e; }); \ - MFEM_VERIFY(it != std::end(m), \ - "Invalid value for " << #ENUM_TYPE " given when parsing to JSON!"); \ - j = it->second; \ - } \ - template \ - inline void from_json(const BasicJsonType &j, ENUM_TYPE &e) \ - { \ - static_assert(std::is_enum::value, #ENUM_TYPE " must be an enum!"); \ - static const std::pair m[] = __VA_ARGS__; \ - auto it = std::find_if(std::begin(m), std::end(m), \ - [j](const std::pair &ej_pair) \ - { return ej_pair.second == j; }); \ - MFEM_VERIFY(it != std::end(m), \ - "Invalid value (" \ - << j << ") for " \ - << #ENUM_TYPE " given in configuration file when parsing from JSON!"); \ - e = it->first; \ +// value corresponding to the string cannot be found. 
Also adds an overload for stream +// printing enum values. +#define PALACE_JSON_SERIALIZE_ENUM(ENUM_TYPE, ...) \ + template \ + inline void to_json(BasicJsonType &j, const ENUM_TYPE &e) \ + { \ + static_assert(std::is_enum::value, #ENUM_TYPE " must be an enum!"); \ + static const std::pair m[] = __VA_ARGS__; \ + auto it = std::find_if(std::begin(m), std::end(m), \ + [e](const std::pair &ej_pair) \ + { return ej_pair.first == e; }); \ + MFEM_VERIFY(it != std::end(m), \ + "Invalid value for " << #ENUM_TYPE " given when parsing to JSON!"); \ + j = it->second; \ + } \ + template \ + inline void from_json(const BasicJsonType &j, ENUM_TYPE &e) \ + { \ + static_assert(std::is_enum::value, #ENUM_TYPE " must be an enum!"); \ + static const std::pair m[] = __VA_ARGS__; \ + auto it = std::find_if(std::begin(m), std::end(m), \ + [j](const std::pair &ej_pair) \ + { return ej_pair.second == j; }); \ + MFEM_VERIFY(it != std::end(m), \ + "Invalid value (" << j << ") for " \ + << #ENUM_TYPE \ + " given in the configuration file when parsing from JSON!"); \ + e = it->first; \ + } \ + std::ostream &operator<<(std::ostream &os, const ENUM_TYPE &e) \ + { \ + static const std::pair m[] = __VA_ARGS__; \ + os << std::find_if(std::begin(m), std::end(m), \ + [e](const std::pair &ej_pair) \ + { return ej_pair.first == e; }) \ + ->second; \ + return os; \ } -namespace palace::config +using json = nlohmann::json; +namespace palace { +// Helpers for converting enums specified in labels.hpp. Must be done in palace scope rather +// than palace::config scope to ensure argument-dependent-lookup succeeds in json. + +// Helper for converting string keys to enum for CoordinateSystem. +PALACE_JSON_SERIALIZE_ENUM(CoordinateSystem, + {{CoordinateSystem::CARTESIAN, "Cartesian"}, + {CoordinateSystem::CYLINDRICAL, "Cylindrical"}}) + +// Helper for converting string keys to enum for ProblemType. +PALACE_JSON_SERIALIZE_ENUM(ProblemType, {{ProblemType::DRIVEN, "Driven"}, + {ProblemType::EIGENMODE, "Eigenmode"}, + {ProblemType::ELECTROSTATIC, "Electrostatic"}, + {ProblemType::MAGNETOSTATIC, "Magnetostatic"}, + {ProblemType::TRANSIENT, "Transient"}}) + +// Helper for converting string keys to enum for EigenSolverBackend. +PALACE_JSON_SERIALIZE_ENUM(EigenSolverBackend, {{EigenSolverBackend::DEFAULT, "Default"}, + {EigenSolverBackend::SLEPC, "SLEPc"}, + {EigenSolverBackend::ARPACK, "ARPACK"}}) + +// Helper for converting string keys to enum for EigenSolverBackend. +PALACE_JSON_SERIALIZE_ENUM(NonlinearEigenSolver, {{NonlinearEigenSolver::HYBRID, "Hybrid"}, + {NonlinearEigenSolver::SLP, "SLP"}}) + +// Helper for converting string keys to enum for SurfaceFlux. +PALACE_JSON_SERIALIZE_ENUM(SurfaceFlux, {{SurfaceFlux::ELECTRIC, "Electric"}, + {SurfaceFlux::MAGNETIC, "Magnetic"}, + {SurfaceFlux::POWER, "Power"}}) + +// Helper for converting string keys to enum for InterfaceDielectric. +PALACE_JSON_SERIALIZE_ENUM(InterfaceDielectric, {{InterfaceDielectric::DEFAULT, "Default"}, + {InterfaceDielectric::MA, "MA"}, + {InterfaceDielectric::MS, "MS"}, + {InterfaceDielectric::SA, "SA"}}) + +// Helper for converting string keys to enum for FrequencySampling. +PALACE_JSON_SERIALIZE_ENUM(FrequencySampling, {{FrequencySampling::DEFAULT, "Default"}, + {FrequencySampling::LINEAR, "Linear"}, + {FrequencySampling::LOG, "Log"}, + {FrequencySampling::POINT, "Point"}}) + +// Helper for converting string keys to enum for TimeSteppingScheme and Excitation. 
+PALACE_JSON_SERIALIZE_ENUM(TimeSteppingScheme, + {{TimeSteppingScheme::DEFAULT, "Default"}, + {TimeSteppingScheme::GEN_ALPHA, "GeneralizedAlpha"}, + {TimeSteppingScheme::RUNGE_KUTTA, "RungeKutta"}, + {TimeSteppingScheme::CVODE, "CVODE"}, + {TimeSteppingScheme::ARKODE, "ARKODE"}}) +PALACE_JSON_SERIALIZE_ENUM(Excitation, + {{Excitation::SINUSOIDAL, "Sinusoidal"}, + {Excitation::GAUSSIAN, "Gaussian"}, + {Excitation::DIFF_GAUSSIAN, "DifferentiatedGaussian"}, + {Excitation::MOD_GAUSSIAN, "ModulatedGaussian"}, + {Excitation::RAMP_STEP, "Ramp"}, + {Excitation::SMOOTH_STEP, "SmoothStep"}}) + +// Helper for converting string keys to enum for LinearSolver, KrylovSolver, and +// MultigridCoarsening +PALACE_JSON_SERIALIZE_ENUM(LinearSolver, {{LinearSolver::DEFAULT, "Default"}, + {LinearSolver::AMS, "AMS"}, + {LinearSolver::BOOMER_AMG, "BoomerAMG"}, + {LinearSolver::MUMPS, "MUMPS"}, + {LinearSolver::SUPERLU, "SuperLU"}, + {LinearSolver::STRUMPACK, "STRUMPACK"}, + {LinearSolver::STRUMPACK_MP, "STRUMPACK-MP"}, + {LinearSolver::JACOBI, "Jacobi"}}) +PALACE_JSON_SERIALIZE_ENUM(KrylovSolver, {{KrylovSolver::DEFAULT, "Default"}, + {KrylovSolver::CG, "CG"}, + {KrylovSolver::MINRES, "MINRES"}, + {KrylovSolver::GMRES, "GMRES"}, + {KrylovSolver::FGMRES, "FGMRES"}, + {KrylovSolver::BICGSTAB, "BiCGSTAB"}}) +PALACE_JSON_SERIALIZE_ENUM(MultigridCoarsening, + {{MultigridCoarsening::LINEAR, "Linear"}, + {MultigridCoarsening::LOGARITHMIC, "Logarithmic"}}) + +// Helpers for converting string keys to enum for PreconditionerSide, SymbolicFactorization, +// SparseCompression, and Orthogonalization. +PALACE_JSON_SERIALIZE_ENUM(PreconditionerSide, {{PreconditionerSide::DEFAULT, "Default"}, + {PreconditionerSide::RIGHT, "Right"}, + {PreconditionerSide::LEFT, "Left"}}) +PALACE_JSON_SERIALIZE_ENUM(SymbolicFactorization, + {{SymbolicFactorization::DEFAULT, "Default"}, + {SymbolicFactorization::METIS, "METIS"}, + {SymbolicFactorization::PARMETIS, "ParMETIS"}, + {SymbolicFactorization::SCOTCH, "Scotch"}, + {SymbolicFactorization::PTSCOTCH, "PTScotch"}, + {SymbolicFactorization::PORD, "PORD"}, + {SymbolicFactorization::AMD, "AMD"}, + {SymbolicFactorization::RCM, "RCM"}}) +PALACE_JSON_SERIALIZE_ENUM(SparseCompression, + {{SparseCompression::NONE, "None"}, + {SparseCompression::BLR, "BLR"}, + {SparseCompression::HSS, "HSS"}, + {SparseCompression::HODLR, "HODLR"}, + {SparseCompression::ZFP, "ZFP"}, + {SparseCompression::BLR_HODLR, "BLR-HODLR"}, + {SparseCompression::ZFP_BLR_HODLR, "ZFP-BLR-HODLR"}}) +PALACE_JSON_SERIALIZE_ENUM(Orthogonalization, {{Orthogonalization::MGS, "MGS"}, + {Orthogonalization::CGS, "CGS"}, + {Orthogonalization::CGS2, "CGS2"}}) + +// Helpers for converting string keys to enum for Device. 
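Within this translation unit, the macro expansions above make the enums round-trippable through nlohmann::json (found via argument-dependent lookup) and streamable through the generated operator<<; a minimal sketch, with the JSON fragment and function name as placeholders:

#include <iostream>
#include <nlohmann/json.hpp>

void EnumSerializationExample()
{
  using json = nlohmann::json;
  json j = json::parse(R"({"Type": "Eigenmode"})");
  auto type = j.at("Type").get<palace::ProblemType>();  // from_json: "Eigenmode" -> EIGENMODE
  std::cout << type << '\n';                            // operator<<: prints "Eigenmode"
  json out;
  out["Type"] = type;                                   // to_json: EIGENMODE -> "Eigenmode"
}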
+PALACE_JSON_SERIALIZE_ENUM(Device, {{Device::CPU, "CPU"}, + {Device::GPU, "GPU"}, + {Device::DEBUG, "Debug"}}) +} // namespace palace -using json = nlohmann::json; +namespace palace::config +{ namespace { +int AtIndex(json::iterator &port_it, std::string_view errmsg_parent) +{ + MFEM_VERIFY( + port_it->find("Index") != port_it->end(), + fmt::format("Missing {} \"Index\" in the configuration file!", errmsg_parent)); + auto index = port_it->at("Index").get(); + MFEM_VERIFY(index > 0, fmt::format("The {} \"Index\" should be an integer > 0; got {}", + errmsg_parent, index)); + return index; +} + template void ParseSymmetricMatrixData(json &mat, const std::string &name, SymmetricMatrixData &data) @@ -64,12 +198,6 @@ void ParseSymmetricMatrixData(json &mat, const std::string &name, data.v = mat.value("MaterialAxes", data.v); } -// Helper for converting string keys to enum for internal::ElementData::CoordinateSystem. -PALACE_JSON_SERIALIZE_ENUM( - internal::ElementData::CoordinateSystem, - {{internal::ElementData::CoordinateSystem::CARTESIAN, "Cartesian"}, - {internal::ElementData::CoordinateSystem::CYLINDRICAL, "Cylindrical"}}) - // Helper function for extracting element data from the configuration file, either from a // provided keyword argument of from a specified vector. In extracting the direction various // checks are performed for validity of the input combinations. @@ -77,6 +205,7 @@ void ParseElementData(json &elem, const std::string &name, bool required, internal::ElementData &data) { data.attributes = elem.at("Attributes").get>(); // Required + std::sort(data.attributes.begin(), data.attributes.end()); auto it = elem.find(name); if (it != elem.end() && it->is_array()) { @@ -89,7 +218,7 @@ void ParseElementData(json &elem, const std::string &name, bool required, // Fall back to parsing as a string (value is optional). MFEM_VERIFY(elem.find("CoordinateSystem") == elem.end(), "Cannot specify \"CoordinateSystem\" when specifying a direction or side " - "using a string in configuration file!"); + "using a string in the configuration file!"); std::string direction; direction = elem.value(name, direction); for (auto &c : direction) @@ -109,88 +238,78 @@ void ParseElementData(json &elem, const std::string &name, bool required, MFEM_VERIFY(direction.length() == 1 || direction[xpos - 1] == '-' || direction[xpos - 1] == '+', "Missing required sign specification on \"X\" for \"" - << name << "\" in configuration file!"); + << name << "\" in the configuration file!"); MFEM_VERIFY(!yfound && !zfound && !rfound, "\"X\" cannot be combined with \"Y\", \"Z\", or \"R\" for \"" - << name << "\" in configuration file!"); + << name << "\" in the configuration file!"); data.direction[0] = (direction.length() == 1 || direction[xpos - 1] == '+') ? 1.0 : -1.0; - data.coordinate_system = internal::ElementData::CoordinateSystem::CARTESIAN; + data.coordinate_system = CoordinateSystem::CARTESIAN; } if (yfound) { MFEM_VERIFY(direction.length() == 1 || direction[ypos - 1] == '-' || direction[ypos - 1] == '+', "Missing required sign specification on \"Y\" for \"" - << name << "\" in configuration file!"); + << name << "\" in the configuration file!"); MFEM_VERIFY(!xfound && !zfound && !rfound, "\"Y\" cannot be combined with \"X\", \"Z\", or \"R\" for \"" - << name << "\" in configuration file!"); + << name << "\" in the configuration file!"); data.direction[1] = direction.length() == 1 || direction[ypos - 1] == '+' ? 
1.0 : -1.0; - data.coordinate_system = internal::ElementData::CoordinateSystem::CARTESIAN; + data.coordinate_system = CoordinateSystem::CARTESIAN; } if (zfound) { MFEM_VERIFY(direction.length() == 1 || direction[zpos - 1] == '-' || direction[zpos - 1] == '+', "Missing required sign specification on \"Z\" for \"" - << name << "\" in configuration file!"); + << name << "\" in the configuration file!"); MFEM_VERIFY(!xfound && !yfound && !rfound, "\"Z\" cannot be combined with \"X\", \"Y\", or \"R\" for \"" - << name << "\" in configuration file!"); + << name << "\" in the configuration file!"); data.direction[2] = direction.length() == 1 || direction[zpos - 1] == '+' ? 1.0 : -1.0; - data.coordinate_system = internal::ElementData::CoordinateSystem::CARTESIAN; + data.coordinate_system = CoordinateSystem::CARTESIAN; } if (rfound) { MFEM_VERIFY(direction.length() == 1 || direction[rpos - 1] == '-' || direction[rpos - 1] == '+', "Missing required sign specification on \"R\" for \"" - << name << "\" in configuration file!"); + << name << "\" in the configuration file!"); MFEM_VERIFY(!xfound && !yfound && !zfound, "\"R\" cannot be combined with \"X\", \"Y\", or \"Z\" for \"" - << name << "\" in configuration file!"); + << name << "\" in the configuration file!"); data.direction[0] = direction.length() == 1 || direction[rpos - 1] == '+' ? 1.0 : -1.0; data.direction[1] = 0.0; data.direction[2] = 0.0; - data.coordinate_system = internal::ElementData::CoordinateSystem::CYLINDRICAL; + data.coordinate_system = CoordinateSystem::CYLINDRICAL; } } - MFEM_VERIFY(data.coordinate_system != - internal::ElementData::CoordinateSystem::CYLINDRICAL || + MFEM_VERIFY(data.coordinate_system != CoordinateSystem::CYLINDRICAL || (data.direction[1] == 0.0 && data.direction[2] == 0.0), "Parsing azimuthal and longitudinal directions for cylindrical coordinate " "system directions from the configuration file is not currently supported!"); - MFEM_VERIFY(!required || data.direction[0] != 0.0 || data.direction[1] != 0.0 || - data.direction[2] != 0.0, - "Missing \"" << name - << "\" for an object which requires it in configuration file!"); + MFEM_VERIFY( + !required || data.direction[0] != 0.0 || data.direction[1] != 0.0 || + data.direction[2] != 0.0, + "Missing \"" << name + << "\" for an object which requires it in the configuration file!"); } template std::ostream &operator<<(std::ostream &os, const std::vector &data) { - bool first = true; - for (const auto &x : data) - { - os << (first ? x : (' ' << x)); - first = false; - } + os << fmt::format("{}", fmt::join(data, " ")); return os; } template std::ostream &operator<<(std::ostream &os, const std::array &data) { - bool first = true; - for (const auto &x : data) - { - os << (first ? x : (' ' << x)); - first = false; - } + os << fmt::format("{}", fmt::join(data, " ")); return os; } @@ -206,57 +325,60 @@ std::ostream &operator<<(std::ostream &os, const SymmetricMatrixData &data) return os; } -} // namespace +constexpr bool JSON_DEBUG = false; -// Helper for converting string keys to enum for ProblemData::Type. 
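The vector and array stream operators above now delegate to fmt::join instead of tracking the first element by hand; a minimal standalone sketch, assuming <fmt/ranges.h> provides fmt::join as in current fmt releases:

#include <fmt/ranges.h>
#include <iostream>
#include <vector>

int main()
{
  std::vector<int> attributes = {1, 2, 5};
  std::cout << fmt::format("{}", fmt::join(attributes, " ")) << '\n';  // Prints "1 2 5".
  return 0;
}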
-PALACE_JSON_SERIALIZE_ENUM(ProblemData::Type, - {{ProblemData::Type::DRIVEN, "Driven"}, - {ProblemData::Type::EIGENMODE, "Eigenmode"}, - {ProblemData::Type::ELECTROSTATIC, "Electrostatic"}, - {ProblemData::Type::MAGNETOSTATIC, "Magnetostatic"}, - {ProblemData::Type::TRANSIENT, "Transient"}}) +} // namespace void ProblemData::SetUp(json &config) { auto problem = config.find("Problem"); MFEM_VERIFY(problem != config.end(), - "\"Problem\" must be specified in configuration file!"); + "\"Problem\" must be specified in the configuration file!"); MFEM_VERIFY(problem->find("Type") != problem->end(), - "Missing config[\"Problem\"][\"Type\"] in configuration file!"); + "Missing config[\"Problem\"][\"Type\"] in the configuration file!"); type = problem->at("Type"); // Required verbose = problem->value("Verbose", verbose); output = problem->value("Output", output); + // Parse output formats. + auto output_formats_it = problem->find("OutputFormats"); + if (output_formats_it != problem->end()) + { + output_formats.paraview = output_formats_it->value("Paraview", output_formats.paraview); + output_formats.gridfunction = + output_formats_it->value("GridFunction", output_formats.gridfunction); + } + // Check for provided solver configuration data (not required for electrostatics or // magnetostatics since defaults can be used for every option). auto solver = config.find("Solver"); - if (type == ProblemData::Type::DRIVEN) + if (type == ProblemType::DRIVEN) { MFEM_VERIFY(solver->find("Driven") != solver->end(), "config[\"Problem\"][\"Type\"] == \"Driven\" should be accompanied by a " "config[\"Solver\"][\"Driven\"] configuration!"); } - else if (type == ProblemData::Type::EIGENMODE) + else if (type == ProblemType::EIGENMODE) { MFEM_VERIFY(solver->find("Eigenmode") != solver->end(), "config[\"Problem\"][\"Type\"] == \"Eigenmode\" should be accompanied by a " "config[\"Solver\"][\"Eigenmode\"] configuration!"); } - else if (type == ProblemData::Type::ELECTROSTATIC) + else if (type == ProblemType::ELECTROSTATIC) { // MFEM_VERIFY( // solver->find("Electrostatic") != solver->end(), // "config[\"Problem\"][\"Type\"] == \"Electrostatic\" should be accompanied by a " // "config[\"Solver\"][\"Electrostatic\"] configuration!"); } - else if (type == ProblemData::Type::MAGNETOSTATIC) + else if (type == ProblemType::MAGNETOSTATIC) { // MFEM_VERIFY( // solver->find("Magnetostatic") != solver->end(), // "config[\"Problem\"][\"Type\"] == \"Magnetostatic\" should be accompanied by a " // "config[\"Solver\"][\"Magnetostatic\"] configuration!"); } - else if (type == ProblemData::Type::TRANSIENT) + else if (type == ProblemType::TRANSIENT) { MFEM_VERIFY(solver->find("Transient") != solver->end(), "config[\"Problem\"][\"Type\"] == \"Transient\" should be accompanied by a " @@ -267,14 +389,20 @@ void ProblemData::SetUp(json &config) problem->erase("Type"); problem->erase("Verbose"); problem->erase("Output"); + problem->erase("OutputFormats"); MFEM_VERIFY(problem->empty(), "Found an unsupported configuration file keyword under \"Problem\"!\n" << problem->dump(2)); // Debug - // std::cout << "Type: " << type << '\n'; - // std::cout << "Verbose: " << verbose << '\n'; - // std::cout << "Output: " << output << '\n'; + if constexpr (JSON_DEBUG) + { + std::cout << "Type: " << type << '\n'; + std::cout << "Verbose: " << verbose << '\n'; + std::cout << "Output: " << output << '\n'; + std::cout << "OutputFormats.Paraview: " << output_formats.paraview << '\n'; + std::cout << "OutputFormats.GridFunction: " << output_formats.gridfunction << 
'\n'; + } } void RefinementData::SetUp(json &model) @@ -308,7 +436,8 @@ void RefinementData::SetUp(json &model) // Options for a priori refinement. uniform_ref_levels = refinement->value("UniformLevels", uniform_ref_levels); - MFEM_VERIFY(uniform_ref_levels >= 0, + ser_uniform_ref_levels = refinement->value("SerialUniformLevels", ser_uniform_ref_levels); + MFEM_VERIFY(uniform_ref_levels >= 0 && ser_uniform_ref_levels >= 0, "Number of uniform mesh refinement levels must be non-negative!"); auto boxes = refinement->find("Boxes"); if (boxes != refinement->end()) @@ -317,67 +446,37 @@ void RefinementData::SetUp(json &model) "array in the configuration file!"); for (auto it = boxes->begin(); it != boxes->end(); ++it) { - auto xlim = it->find("XLimits"); - auto ylim = it->find("YLimits"); - auto zlim = it->find("ZLimits"); MFEM_VERIFY( - xlim != it->end() && ylim != it->end() && zlim != it->end(), - "Missing \"Boxes\" refinement region \"X/Y/ZLimits\" in configuration file!"); - MFEM_VERIFY(xlim->is_array() && ylim->is_array() && zlim->is_array(), - "config[\"Refinement\"][\"Boxes\"][\"X/Y/ZLimits\"] should specify an " - "array in the " + it->find("Levels") != it->end(), + "Missing \"Boxes\" refinement region \"Levels\" in the configuration file!"); + auto bbmin = it->find("BoundingBoxMin"); + auto bbmax = it->find("BoundingBoxMax"); + MFEM_VERIFY(bbmin != it->end() && bbmax != it->end(), + "Missing \"Boxes\" refinement region \"BoundingBoxMin/Max\" in the " "configuration file!"); - MFEM_VERIFY(it->find("Levels") != it->end(), - "Missing \"Boxes\" refinement region \"Levels\" in configuration file!"); - BoxRefinementData &data = boxlist.emplace_back(); - data.ref_levels = it->at("Levels"); // Required - - std::vector bx = xlim->get>(); // Required - MFEM_VERIFY(bx.size() == 2, - "config[\"Refinement\"][\"Boxes\"][\"XLimits\"] should specify an " - "array of length 2 in the configuration file!"); - if (bx[1] < bx[0]) - { - std::swap(bx[0], bx[1]); - } - data.bbmin.push_back(bx[0]); - data.bbmax.push_back(bx[1]); - - std::vector by = ylim->get>(); // Required - MFEM_VERIFY(by.size() == 2, - "config[\"Refinement\"][\"Boxes\"][\"YLimits\"] should specify an " - "array of length 2 in the configuration file!"); - if (by[1] < by[0]) - { - std::swap(by[0], by[1]); - } - data.bbmin.push_back(by[0]); - data.bbmax.push_back(by[1]); - - std::vector bz = zlim->get>(); // Required - MFEM_VERIFY(bz.size() == 2, - "config[\"Refinement\"][\"Boxes\"][\"ZLimits\"] should specify an " - "array of length 2 in the configuration file!"); - if (bz[1] < bz[0]) - { - std::swap(bz[0], bz[1]); - } - data.bbmin.push_back(bz[0]); - data.bbmax.push_back(bz[1]); + MFEM_VERIFY(bbmin->is_array() && bbmin->is_array(), + "config[\"Refinement\"][\"Boxes\"][\"BoundingBoxMin/Max\"] should " + "specify an array in the configuration file!"); + BoxRefinementData &data = box_list.emplace_back(); + data.ref_levels = it->at("Levels"); // Required + data.bbmin = bbmin->get>(); // Required + data.bbmax = bbmax->get>(); // Required // Cleanup it->erase("Levels"); - it->erase("XLimits"); - it->erase("YLimits"); - it->erase("ZLimits"); + it->erase("BoundingBoxMin"); + it->erase("BoundingBoxMax"); MFEM_VERIFY(it->empty(), "Found an unsupported configuration file keyword under " "config[\"Refinement\"][\"Boxes\"]!\n" << it->dump(2)); // Debug - // std::cout << "Levels: " << data.ref_levels << '\n'; - // std::cout << "BoxMin: " << data.bbmin << '\n'; - // std::cout << "BoxMax: " << data.bbmax << '\n'; + if constexpr (JSON_DEBUG) + { + std::cout 
<< "Levels: " << data.ref_levels << '\n'; + std::cout << "BoundingBoxMin: " << data.bbmin << '\n'; + std::cout << "BoundingBoxMax: " << data.bbmax << '\n'; + } } } auto spheres = refinement->find("Spheres"); @@ -387,6 +486,9 @@ void RefinementData::SetUp(json &model) "an array in the configuration file!"); for (auto it = spheres->begin(); it != spheres->end(); ++it) { + MFEM_VERIFY( + it->find("Levels") != it->end(), + "Missing \"Spheres\" refinement region \"Levels\" in the configuration file!"); auto ctr = it->find("Center"); MFEM_VERIFY(ctr != it->end() && it->find("Radius") != it->end(), "Missing \"Spheres\" refinement region \"Center\" or \"Radius\" in " @@ -394,16 +496,10 @@ void RefinementData::SetUp(json &model) MFEM_VERIFY(ctr->is_array(), "config[\"Refinement\"][\"Spheres\"][\"Center\"] should specify " "an array in the configuration file!"); - MFEM_VERIFY( - it->find("Levels") != it->end(), - "Missing \"Spheres\" refinement region \"Levels\" in configuration file!"); - SphereRefinementData &data = spherelist.emplace_back(); - data.ref_levels = it->at("Levels"); // Required - data.r = it->at("Radius"); // Required - data.center = ctr->get>(); // Required - MFEM_VERIFY(data.center.size() == 3, "config[\"Refinement\"][\"Spheres\"][\"Center\"]" - " should specify an array of length " - "3 in the configuration file!"); + SphereRefinementData &data = sphere_list.emplace_back(); + data.ref_levels = it->at("Levels"); // Required + data.r = it->at("Radius"); // Required + data.center = ctr->get>(); // Required // Cleanup it->erase("Levels"); @@ -414,9 +510,12 @@ void RefinementData::SetUp(json &model) << it->dump(2)); // Debug - // std::cout << "Levels: " << data.ref_levels << '\n'; - // std::cout << "Radius: " << data.r << '\n'; - // std::cout << "Center: " << data.center << '\n'; + if constexpr (JSON_DEBUG) + { + std::cout << "Levels: " << data.ref_levels << '\n'; + std::cout << "Radius: " << data.r << '\n'; + std::cout << "Center: " << data.center << '\n'; + } } } @@ -431,6 +530,7 @@ void RefinementData::SetUp(json &model) refinement->erase("SaveAdaptIterations"); refinement->erase("SaveAdaptMesh"); refinement->erase("UniformLevels"); + refinement->erase("SerialUniformLevels"); refinement->erase("Boxes"); refinement->erase("Spheres"); MFEM_VERIFY(refinement->empty(), @@ -438,79 +538,107 @@ void RefinementData::SetUp(json &model) << refinement->dump(2)); // Debug - // std::cout << "Tol: " << tol << '\n'; - // std::cout << "MaxIts: " << max_it << '\n'; - // std::cout << "MaxSize: " << max_size << '\n'; - // std::cout << "Nonconformal: " << nonconformal << '\n'; - // std::cout << "MaxNCLevels: " << max_nc_levels << '\n'; - // std::cout << "UpdateFraction: " << update_fraction << '\n'; - // std::cout << "MaximumImbalance: " << maximum_imbalance << '\n'; - // std::cout << "SaveAdaptIterations: " << save_adapt_iterations << '\n'; - // std::cout << "SaveAdaptMesh: " << save_adapt_mesh << '\n'; - // std::cout << "UniformLevels: " << uniform_ref_levels << '\n'; + if constexpr (JSON_DEBUG) + { + std::cout << "Tol: " << tol << '\n'; + std::cout << "MaxIts: " << max_it << '\n'; + std::cout << "MaxSize: " << max_size << '\n'; + std::cout << "Nonconformal: " << nonconformal << '\n'; + std::cout << "MaxNCLevels: " << max_nc_levels << '\n'; + std::cout << "UpdateFraction: " << update_fraction << '\n'; + std::cout << "MaximumImbalance: " << maximum_imbalance << '\n'; + std::cout << "SaveAdaptIterations: " << save_adapt_iterations << '\n'; + std::cout << "SaveAdaptMesh: " << save_adapt_mesh << '\n'; + 
std::cout << "UniformLevels: " << uniform_ref_levels << '\n'; + std::cout << "SerialUniformLevels: " << ser_uniform_ref_levels << '\n'; + } } void ModelData::SetUp(json &config) { auto model = config.find("Model"); - MFEM_VERIFY(model != config.end(), "\"Model\" must be specified in configuration file!"); + MFEM_VERIFY(model != config.end(), + "\"Model\" must be specified in the configuration file!"); MFEM_VERIFY(model->find("Mesh") != model->end(), - "Missing config[\"Model\"][\"Mesh\"] file in configuration file!"); + "Missing config[\"Model\"][\"Mesh\"] file in the configuration file!"); mesh = model->at("Mesh"); // Required L0 = model->value("L0", L0); Lc = model->value("Lc", Lc); - partition = model->value("Partition", partition); - reorient_tet = model->value("ReorientTetMesh", reorient_tet); remove_curvature = model->value("RemoveCurvature", remove_curvature); + make_simplex = model->value("MakeSimplex", make_simplex); + make_hex = model->value("MakeHexahedral", make_hex); + reorder_elements = model->value("ReorderElements", reorder_elements); + clean_unused_elements = model->value("CleanUnusedElements", clean_unused_elements); + crack_bdr_elements = model->value("CrackInternalBoundaryElements", crack_bdr_elements); + refine_crack_elements = model->value("RefineCrackElements", refine_crack_elements); + crack_displ_factor = model->value("CrackDisplacementFactor", crack_displ_factor); + add_bdr_elements = model->value("AddInterfaceBoundaryElements", add_bdr_elements); + export_prerefined_mesh = model->value("ExportPrerefinedMesh", export_prerefined_mesh); + reorient_tet_mesh = model->value("ReorientTetMesh", reorient_tet_mesh); + partitioning = model->value("Partitioning", partitioning); refinement.SetUp(*model); // Cleanup model->erase("Mesh"); model->erase("L0"); model->erase("Lc"); - model->erase("Partition"); - model->erase("ReorientTetMesh"); model->erase("RemoveCurvature"); + model->erase("MakeSimplex"); + model->erase("MakeHexahedral"); + model->erase("ReorderElements"); + model->erase("CleanUnusedElements"); + model->erase("CrackInternalBoundaryElements"); + model->erase("RefineCrackElements"); + model->erase("CrackDisplacementFactor"); + model->erase("AddInterfaceBoundaryElements"); + model->erase("ExportPrerefinedMesh"); + model->erase("ReorientTetMesh"); + model->erase("Partitioning"); model->erase("Refinement"); MFEM_VERIFY(model->empty(), "Found an unsupported configuration file keyword under \"Model\"!\n" << model->dump(2)); // Debug - // std::cout << "Mesh: " << mesh << '\n'; - // std::cout << "L0: " << L0 << '\n'; - // std::cout << "Lc: " << Lc << '\n'; - // std::cout << "Partition: " << partition << '\n'; - // std::cout << "ReorientTetMesh: " << reorient_tet << '\n'; - // std::cout << "RemoveCurvature: " << remove_curvature << '\n'; + if constexpr (JSON_DEBUG) + { + std::cout << "Mesh: " << mesh << '\n'; + std::cout << "L0: " << L0 << '\n'; + std::cout << "Lc: " << Lc << '\n'; + std::cout << "RemoveCurvature: " << remove_curvature << '\n'; + std::cout << "MakeSimplex: " << make_simplex << '\n'; + std::cout << "MakeHexahedral: " << make_hex << '\n'; + std::cout << "ReorderElements: " << reorder_elements << '\n'; + std::cout << "CleanUnusedElements: " << clean_unused_elements << '\n'; + std::cout << "CrackInternalBoundaryElements: " << crack_bdr_elements << '\n'; + std::cout << "RefineCrackElements: " << refine_crack_elements << '\n'; + std::cout << "CrackDisplacementFactor: " << crack_displ_factor << '\n'; + std::cout << "AddInterfaceBoundaryElements: " << 
add_bdr_elements << '\n'; + std::cout << "ExportPrerefinedMesh: " << export_prerefined_mesh << '\n'; + std::cout << "ReorientTetMesh: " << reorient_tet_mesh << '\n'; + std::cout << "Partitioning: " << partitioning << '\n'; + } } -void MaterialDomainData::SetUp(json &domains) +void DomainMaterialData::SetUp(json &domains) { auto materials = domains.find("Materials"); MFEM_VERIFY(materials != domains.end() && materials->is_array(), - "\"Materials\" must be specified as an array in configuration file!"); + "\"Materials\" must be specified as an array in the configuration file!"); for (auto it = materials->begin(); it != materials->end(); ++it) { MFEM_VERIFY( it->find("Attributes") != it->end(), - "Missing \"Attributes\" list for \"Materials\" domain in configuration file!"); - MaterialData &data = vecdata.emplace_back(); + "Missing \"Attributes\" list for \"Materials\" domain in the configuration file!"); + MaterialData &data = emplace_back(); data.attributes = it->at("Attributes").get>(); // Required + std::sort(data.attributes.begin(), data.attributes.end()); ParseSymmetricMatrixData(*it, "Permeability", data.mu_r); ParseSymmetricMatrixData(*it, "Permittivity", data.epsilon_r); ParseSymmetricMatrixData(*it, "LossTan", data.tandelta); ParseSymmetricMatrixData(*it, "Conductivity", data.sigma); data.lambda_L = it->value("LondonDepth", data.lambda_L); - // Debug - // std::cout << "Attributes: " << data.attributes << '\n'; - // std::cout << "Permeability: " << data.mu_r << '\n'; - // std::cout << "Permittivity: " << data.epsilon_r << '\n'; - // std::cout << "LossTan: " << data.tandelta << '\n'; - // std::cout << "Conductivity: " << data.sigma << '\n'; - // std::cout << "LondonDepth: " << data.lambda_L << '\n'; - // Cleanup it->erase("Attributes"); it->erase("Permeability"); @@ -522,41 +650,55 @@ void MaterialDomainData::SetUp(json &domains) MFEM_VERIFY(it->empty(), "Found an unsupported configuration file keyword under \"Materials\"!\n" << it->dump(2)); + + // Debug + if constexpr (JSON_DEBUG) + { + std::cout << "Attributes: " << data.attributes << '\n'; + std::cout << "Permeability: " << data.mu_r << '\n'; + std::cout << "Permittivity: " << data.epsilon_r << '\n'; + std::cout << "LossTan: " << data.tandelta << '\n'; + std::cout << "Conductivity: " << data.sigma << '\n'; + std::cout << "LondonDepth: " << data.lambda_L << '\n'; + } } } -void DomainDielectricPostData::SetUp(json &postpro) +void DomainEnergyPostData::SetUp(json &postpro) { - auto dielectric = postpro.find("Dielectric"); - if (dielectric == postpro.end()) + auto energy = postpro.find("Energy"); + if (energy == postpro.end()) { return; } - MFEM_VERIFY(dielectric->is_array(), - "\"Dielectric\" should specify an array in the configuration file!"); - for (auto it = dielectric->begin(); it != dielectric->end(); ++it) + MFEM_VERIFY(energy->is_array(), + "\"Energy\" should specify an array in the configuration file!"); + for (auto it = energy->begin(); it != energy->end(); ++it) { - MFEM_VERIFY(it->find("Index") != it->end(), - "Missing \"Dielectric\" domain \"Index\" in configuration file!"); + auto index = AtIndex(it, "\"Energy\" domain"); MFEM_VERIFY( it->find("Attributes") != it->end(), - "Missing \"Attributes\" list for \"Dielectric\" domain in configuration file!"); - auto ret = mapdata.insert(std::make_pair(it->at("Index"), DomainDielectricData())); - MFEM_VERIFY(ret.second, "Repeated \"Index\" found when processing \"Dielectric\" " - "domains in configuration file!"); - DomainDielectricData &data = ret.first->second; + "Missing 
\"Attributes\" list for \"Energy\" domain in the configuration file!"); + auto ret = mapdata.insert(std::make_pair(index, DomainEnergyData())); + MFEM_VERIFY(ret.second, "Repeated \"Index\" found when processing \"Energy\" domains " + "in the configuration file!"); + auto &data = ret.first->second; data.attributes = it->at("Attributes").get>(); // Required - - // Debug - // std::cout << "Index: " << ret.first->first << '\n'; - // std::cout << "Attributes: " << data.attributes << '\n'; + std::sort(data.attributes.begin(), data.attributes.end()); // Cleanup it->erase("Index"); it->erase("Attributes"); MFEM_VERIFY(it->empty(), - "Found an unsupported configuration file keyword under \"Dielectric\"!\n" + "Found an unsupported configuration file keyword under \"Energy\"!\n" << it->dump(2)); + + // Debug + if constexpr (JSON_DEBUG) + { + std::cout << "Index: " << ret.first->first << '\n'; + std::cout << "Attributes: " << data.attributes << '\n'; + } } } @@ -571,34 +713,30 @@ void ProbePostData::SetUp(json &postpro) "\"Probe\" should specify an array in the configuration file!"); for (auto it = probe->begin(); it != probe->end(); ++it) { - MFEM_VERIFY(it->find("Index") != it->end(), - "Missing \"Probe\" point \"Index\" in configuration file!"); - MFEM_VERIFY(it->find("X") != it->end() && it->find("Y") != it->end() && - it->find("Z") != it->end(), - "Missing \"Probe\" point \"X\", \"Y\", or \"Z\" in configuration file!"); - auto ret = mapdata.insert(std::make_pair(it->at("Index"), ProbeData())); - MFEM_VERIFY( - ret.second, - "Repeated \"Index\" found when processing \"Probe\" points in configuration file!"); - ProbeData &data = ret.first->second; - data.x = it->at("X"); // Required - data.y = it->at("Y"); // Required - data.z = it->at("Z"); // Required - - // Debug - // std::cout << "Index: " << ret.first->first << '\n'; - // std::cout << "X: " << data.x << '\n'; - // std::cout << "Y: " << data.y << '\n'; - // std::cout << "Z: " << data.z << '\n'; + auto index = AtIndex(it, "\"Probe\" point"); + auto ctr = it->find("Center"); + MFEM_VERIFY(ctr != it->end() && ctr->is_array(), + "Missing \"Probe\" point \"Center\" or \"Center\" should specify an array " + "in the configuration file!"); + auto ret = mapdata.insert(std::make_pair(index, ProbeData())); + MFEM_VERIFY(ret.second, "Repeated \"Index\" found when processing \"Probe\" points in " + "the configuration file!"); + auto &data = ret.first->second; + data.center = ctr->get>(); // Required // Cleanup it->erase("Index"); - it->erase("X"); - it->erase("Y"); - it->erase("Z"); + it->erase("Center"); MFEM_VERIFY(it->empty(), "Found an unsupported configuration file keyword under \"Probe\"!\n" << it->dump(2)); + + // Debug + if constexpr (JSON_DEBUG) + { + std::cout << "Index: " << ret.first->first << '\n'; + std::cout << "Center: " << data.center << '\n'; + } } } @@ -609,17 +747,20 @@ void DomainPostData::SetUp(json &domains) { return; } - dielectric.SetUp(*postpro); + energy.SetUp(*postpro); probe.SetUp(*postpro); // Store all unique postprocessing domain attributes. 
- for (const auto &[idx, data] : dielectric) + for (const auto &[idx, data] : energy) { - attributes.insert(data.attributes.begin(), data.attributes.end()); + attributes.insert(attributes.end(), data.attributes.begin(), data.attributes.end()); } + std::sort(attributes.begin(), attributes.end()); + attributes.erase(std::unique(attributes.begin(), attributes.end()), attributes.end()); + attributes.shrink_to_fit(); // Cleanup - postpro->erase("Dielectric"); + postpro->erase("Energy"); postpro->erase("Probe"); MFEM_VERIFY(postpro->empty(), "Found an unsupported configuration file keyword under \"Postprocessing\"!\n" @@ -630,18 +771,22 @@ void DomainData::SetUp(json &config) { auto domains = config.find("Domains"); MFEM_VERIFY(domains != config.end(), - "\"Domains\" must be specified in configuration file!"); + "\"Domains\" must be specified in the configuration file!"); materials.SetUp(*domains); postpro.SetUp(*domains); // Store all unique domain attributes. for (const auto &data : materials) { - attributes.insert(data.attributes.begin(), data.attributes.end()); + attributes.insert(attributes.end(), data.attributes.begin(), data.attributes.end()); } + std::sort(attributes.begin(), attributes.end()); + attributes.erase(std::unique(attributes.begin(), attributes.end()), attributes.end()); + attributes.shrink_to_fit(); for (const auto &attr : postpro.attributes) { - MFEM_VERIFY(attributes.find(attr) != attributes.end(), + MFEM_VERIFY(std::lower_bound(attributes.begin(), attributes.end(), attr) != + attributes.end(), "Domain postprocessing can only be enabled on domains which have a " "corresponding \"Materials\" entry!"); } @@ -674,8 +819,9 @@ void PecBoundaryData::SetUp(json &boundaries) MFEM_ABORT( "Configuration file should not specify both \"PEC\" and \"Ground\" boundaries!"); } - MFEM_VERIFY(pec->find("Attributes") != pec->end(), - "Missing \"Attributes\" list for \"PEC\" boundary in configuration file!"); + MFEM_VERIFY( + pec->find("Attributes") != pec->end(), + "Missing \"Attributes\" list for \"PEC\" boundary in the configuration file!"); attributes = pec->at("Attributes").get>(); // Required std::sort(attributes.begin(), attributes.end()); @@ -686,12 +832,10 @@ void PecBoundaryData::SetUp(json &boundaries) << pec->dump(2)); // Debug - // std::cout << "PEC:"; - // for (auto attr : attributes) - // { - // std::cout << ' ' << attr; - // } - // std::cout << '\n'; + if constexpr (JSON_DEBUG) + { + std::cout << "PEC:" << attributes << '\n'; + } } void PmcBoundaryData::SetUp(json &boundaries) @@ -714,8 +858,9 @@ void PmcBoundaryData::SetUp(json &boundaries) MFEM_ABORT("Configuration file should not specify both \"PMC\" and \"ZeroCharge\" " "boundaries!"); } - MFEM_VERIFY(pmc->find("Attributes") != pmc->end(), - "Missing \"Attributes\" list for \"PMC\" boundary in configuration file!"); + MFEM_VERIFY( + pmc->find("Attributes") != pmc->end(), + "Missing \"Attributes\" list for \"PMC\" boundary in the configuration file!"); attributes = pmc->at("Attributes").get>(); // Required std::sort(attributes.begin(), attributes.end()); @@ -726,12 +871,10 @@ void PmcBoundaryData::SetUp(json &boundaries) << pmc->dump(2)); // Debug - // std::cout << "PMC:"; - // for (auto attr : attributes) - // { - // std::cout << ' ' << attr; - // } - // std::cout << '\n'; + if constexpr (JSON_DEBUG) + { + std::cout << "PMC:" << attributes << '\n'; + } } void WavePortPecBoundaryData::SetUp(json &boundaries) @@ -741,9 +884,9 @@ void WavePortPecBoundaryData::SetUp(json &boundaries) { return; } - MFEM_VERIFY( - 
pec->find("Attributes") != pec->end(), - "Missing \"Attributes\" list for \"WavePortPEC\" boundary in configuration file!"); + MFEM_VERIFY(pec->find("Attributes") != pec->end(), + "Missing \"Attributes\" list for \"WavePortPEC\" boundary in the " + "configuration file!"); attributes = pec->at("Attributes").get>(); // Required std::sort(attributes.begin(), attributes.end()); @@ -754,12 +897,10 @@ void WavePortPecBoundaryData::SetUp(json &boundaries) << pec->dump(2)); // Debug - // std::cout << "WavePortPEC:"; - // for (auto attr : attributes) - // { - // std::cout << ' ' << attr; - // } - // std::cout << '\n'; + if constexpr (JSON_DEBUG) + { + std::cout << "WavePortPEC:" << attributes << '\n'; + } } void FarfieldBoundaryData::SetUp(json &boundaries) @@ -771,7 +912,7 @@ void FarfieldBoundaryData::SetUp(json &boundaries) } MFEM_VERIFY( absorbing->find("Attributes") != absorbing->end(), - "Missing \"Attributes\" list for \"Absorbing\" boundary in configuration file!"); + "Missing \"Attributes\" list for \"Absorbing\" boundary in the configuration file!"); attributes = absorbing->at("Attributes").get>(); // Required std::sort(attributes.begin(), attributes.end()); order = absorbing->value("Order", order); @@ -786,13 +927,11 @@ void FarfieldBoundaryData::SetUp(json &boundaries) << absorbing->dump(2)); // Debug - // std::cout << "Absorbing:"; - // for (auto attr : attributes) - // { - // std::cout << ' ' << attr; - // } - // std::cout << '\n'; - // std::cout << "Order: " << order << '\n'; + if constexpr (JSON_DEBUG) + { + std::cout << "Absorbing:" << attributes << '\n'; + std::cout << "Order: " << order << '\n'; + } } void ConductivityBoundaryData::SetUp(json &boundaries) @@ -806,26 +945,20 @@ void ConductivityBoundaryData::SetUp(json &boundaries) "\"Conductivity\" should specify an array in the configuration file!"); for (auto it = conductivity->begin(); it != conductivity->end(); ++it) { - MFEM_VERIFY( - it->find("Attributes") != it->end(), - "Missing \"Attributes\" list for \"Conductivity\" boundary in configuration file!"); + MFEM_VERIFY(it->find("Attributes") != it->end(), + "Missing \"Attributes\" list for \"Conductivity\" boundary in the " + "configuration file!"); MFEM_VERIFY( it->find("Conductivity") != it->end(), - "Missing \"Conductivity\" boundary \"Conductivity\" in configuration file!"); - ConductivityData &data = vecdata.emplace_back(); + "Missing \"Conductivity\" boundary \"Conductivity\" in the configuration file!"); + ConductivityData &data = emplace_back(); data.attributes = it->at("Attributes").get>(); // Required - data.sigma = it->at("Conductivity"); // Required + std::sort(data.attributes.begin(), data.attributes.end()); + data.sigma = it->at("Conductivity"); // Required data.mu_r = it->value("Permeability", data.mu_r); data.h = it->value("Thickness", data.h); data.external = it->value("External", data.external); - // Debug - // std::cout << "Attributes: " << data.attributes << '\n'; - // std::cout << "Conductivity: " << data.sigma << '\n'; - // std::cout << "Permeability: " << data.mu_r << '\n'; - // std::cout << "Thickness: " << data.h << '\n'; - // std::cout << "External: " << data.external << '\n'; - // Cleanup it->erase("Attributes"); it->erase("Conductivity"); @@ -835,6 +968,16 @@ void ConductivityBoundaryData::SetUp(json &boundaries) MFEM_VERIFY(it->empty(), "Found an unsupported configuration file keyword under \"Conductivity\"!\n" << it->dump(2)); + + // Debug + if constexpr (JSON_DEBUG) + { + std::cout << "Attributes: " << data.attributes << '\n'; + std::cout << 
"Conductivity: " << data.sigma << '\n'; + std::cout << "Permeability: " << data.mu_r << '\n'; + std::cout << "Thickness: " << data.h << '\n'; + std::cout << "External: " << data.external << '\n'; + } } } @@ -849,21 +992,16 @@ void ImpedanceBoundaryData::SetUp(json &boundaries) "\"Impedance\" should specify an array in the configuration file!"); for (auto it = impedance->begin(); it != impedance->end(); ++it) { - MFEM_VERIFY( - it->find("Attributes") != it->end(), - "Missing \"Attributes\" list for \"Impedance\" boundary in configuration file!"); - ImpedanceData &data = vecdata.emplace_back(); + MFEM_VERIFY(it->find("Attributes") != it->end(), + "Missing \"Attributes\" list for \"Impedance\" boundary in the " + "configuration file!"); + ImpedanceData &data = emplace_back(); data.attributes = it->at("Attributes").get>(); // Required + std::sort(data.attributes.begin(), data.attributes.end()); data.Rs = it->value("Rs", data.Rs); data.Ls = it->value("Ls", data.Ls); data.Cs = it->value("Cs", data.Cs); - // Debug - // std::cout << "Attributes: " << data.attributes << '\n'; - // std::cout << "Rs: " << data.Rs << '\n'; - // std::cout << "Ls: " << data.Ls << '\n'; - // std::cout << "Cs: " << data.Cs << '\n'; - // Cleanup it->erase("Attributes"); it->erase("Rs"); @@ -872,6 +1010,39 @@ void ImpedanceBoundaryData::SetUp(json &boundaries) MFEM_VERIFY(it->empty(), "Found an unsupported configuration file keyword under \"Impedance\"!\n" << it->dump(2)); + + // Debug + if constexpr (JSON_DEBUG) + { + std::cout << "Attributes: " << data.attributes << '\n'; + std::cout << "Rs: " << data.Rs << '\n'; + std::cout << "Ls: " << data.Ls << '\n'; + std::cout << "Cs: " << data.Cs << '\n'; + } + } +} + +int ParsePortExcitation(json::iterator &port_it, int default_excitation) +{ + auto it_excitation = port_it->find("Excitation"); + if (it_excitation == port_it->end()) + { + // Keep default; don't set input flag. + return default_excitation; + } + else if (it_excitation->is_boolean()) + { + return int(it_excitation->get()); // 0 false; 1 true + } + else if (it_excitation->is_number_unsigned()) + { + return it_excitation->get(); + } + else + { + MFEM_ABORT(fmt::format("\"Excitation\" on port index {:d} could not be parsed " + "as a bool or unsigned (non-negative) integer; got {}", + int(port_it->at("Index")), it_excitation->dump(2))); } } @@ -883,6 +1054,7 @@ void LumpedPortBoundaryData::SetUp(json &boundaries) { return; } + if (port == boundaries.end()) { port = terminal; @@ -895,30 +1067,35 @@ void LumpedPortBoundaryData::SetUp(json &boundaries) MFEM_ABORT("Configuration file should not specify both \"LumpedPort\" and \"Terminal\" " "boundaries!"); } - MFEM_VERIFY( - port->is_array(), - "\"LumpedPort\" and \"Terminal\" should specify an array in the configuration file!"); + + std::string label = (terminal != boundaries.end()) ? 
"\"Terminal\"" : "\"LumpedPort\""; + MFEM_VERIFY(port->is_array(), + label << " should specify an array in the configuration file!"); for (auto it = port->begin(); it != port->end(); ++it) { - MFEM_VERIFY( - it->find("Index") != it->end(), - "Missing \"LumpedPort\" or \"Terminal\" boundary \"Index\" in configuration file!"); - auto ret = mapdata.insert(std::make_pair(it->at("Index"), LumpedPortData())); - MFEM_VERIFY(ret.second, "Repeated \"Index\" found when processing \"LumpedPort\" or " - "\"Terminal\" boundaries in configuration file!"); - LumpedPortData &data = ret.first->second; + auto index = AtIndex(it, label); + auto ret = mapdata.insert(std::make_pair(index, LumpedPortData())); + MFEM_VERIFY(ret.second, fmt::format("Repeated \"Index\" found when processing {} " + "boundaries in the configuration file!", + label)); + auto &data = ret.first->second; data.R = it->value("R", data.R); data.L = it->value("L", data.L); data.C = it->value("C", data.C); data.Rs = it->value("Rs", data.Rs); data.Ls = it->value("Ls", data.Ls); data.Cs = it->value("Cs", data.Cs); - data.excitation = it->value("Excitation", data.excitation); + + data.excitation = ParsePortExcitation(it, data.excitation); + data.active = it->value("Active", data.active); if (it->find("Attributes") != it->end()) { - MFEM_VERIFY(it->find("Elements") == it->end(), - "Cannot specify both top-level \"Attributes\" list and \"Elements\" for " - "\"LumpedPort\" or \"Terminal\" boundary in configuration file!"); + MFEM_VERIFY( + it->find("Elements") == it->end(), + fmt::format( + "Cannot specify both top-level \"Attributes\" list and \"Elements\" for " + "{} in the configuration file!", + label)); auto &elem = data.elements.emplace_back(); ParseElementData(*it, "Direction", terminal == boundaries.end(), elem); } @@ -926,13 +1103,15 @@ void LumpedPortBoundaryData::SetUp(json &boundaries) { auto elements = it->find("Elements"); MFEM_VERIFY(elements != it->end(), - "Missing top-level \"Attributes\" list or \"Elements\" for " - "\"LumpedPort\" or \"Terminal\" boundary in configuration file!"); + fmt::format("Missing top-level \"Attributes\" list or \"Elements\" for " + "{} in the configuration file!", + label)); for (auto elem_it = elements->begin(); elem_it != elements->end(); ++elem_it) { MFEM_VERIFY(elem_it->find("Attributes") != elem_it->end(), - "Missing \"Attributes\" list for \"LumpedPort\" or \"Terminal\" " - "boundary element in configuration file!"); + fmt::format("Missing \"Attributes\" list for {} element in " + "the configuration file!", + label)); auto &elem = data.elements.emplace_back(); ParseElementData(*elem_it, "Direction", terminal == boundaries.end(), elem); @@ -940,35 +1119,21 @@ void LumpedPortBoundaryData::SetUp(json &boundaries) elem_it->erase("Attributes"); elem_it->erase("Direction"); elem_it->erase("CoordinateSystem"); - MFEM_VERIFY(elem_it->empty(), - "Found an unsupported configuration file keyword under \"LumpedPort\" " - "or \"Terminal\" boundary element!\n" - << elem_it->dump(2)); + MFEM_VERIFY(elem_it->empty(), fmt::format("Found an unsupported configuration file " + "keyword under {} element!\n{}", + label, elem_it->dump(2))); } } - if (it->find("Voltage") != it->end()) { + if (it->find("Voltage") != it->end()) + { data.voltage = it->value("Voltage", data.voltage); } - else { + else + { data.voltage = it->value("Voltage", 1.0); } - // Debug - // std::cout << "Index: " << ret.first->first << '\n'; - // std::cout << "R: " << data.R << '\n'; - // std::cout << "L: " << data.L << '\n'; - // std::cout << "C: " 
<< data.C << '\n'; - // std::cout << "Rs: " << data.Rs << '\n'; - // std::cout << "Ls: " << data.Ls << '\n'; - // std::cout << "Cs: " << data.Cs << '\n'; - // std::cout << "Excitation: " << data.excitation << '\n'; - // for (const auto &elem : data.elements) - // { - // std::cout << "Attributes: " << elem.attributes << '\n'; - // std::cout << "Direction: " << elem.direction << '\n'; - // } - // Cleanup it->erase("Index"); it->erase("R"); @@ -979,13 +1144,113 @@ void LumpedPortBoundaryData::SetUp(json &boundaries) it->erase("Cs"); it->erase("Voltage"); it->erase("Excitation"); + it->erase("Active"); it->erase("Attributes"); it->erase("Direction"); it->erase("CoordinateSystem"); it->erase("Elements"); - MFEM_VERIFY(it->empty(), "Found an unsupported configuration file keyword under " - "\"LumpedPort\" or \"Terminal\"!\n" - << it->dump(2)); + MFEM_VERIFY(it->empty(), + fmt::format("Found an unsupported configuration file keyword under {}!\n{}", + label, it->dump(2))); + + // Debug + if constexpr (JSON_DEBUG) + { + std::cout << "Index: " << ret.first->first << '\n'; + std::cout << "R: " << data.R << '\n'; + std::cout << "L: " << data.L << '\n'; + std::cout << "C: " << data.C << '\n'; + std::cout << "Rs: " << data.Rs << '\n'; + std::cout << "Ls: " << data.Ls << '\n'; + std::cout << "Cs: " << data.Cs << '\n'; + std::cout << "Voltage: " << data.voltage << '\n'; + std::cout << "Excitation: " << data.excitation << '\n'; + std::cout << "Active: " << data.active << '\n'; + for (const auto &elem : data.elements) + { + std::cout << "Attributes: " << elem.attributes << '\n'; + std::cout << "Direction: " << elem.direction << '\n'; + std::cout << "CoordinateSystem: " << elem.coordinate_system << '\n'; + } + } + } +} + +void PeriodicBoundaryData::SetUp(json &boundaries) +{ + auto periodic = boundaries.find("Periodic"); + if (periodic == boundaries.end()) + { + return; + } + auto floquet = periodic->find("FloquetWaveVector"); + if (floquet != periodic->end()) + { + MFEM_VERIFY(floquet->is_array(), + "\"FloquetWaveVector\" should specify an array in the configuration file!"); + wave_vector = floquet->get>(); + } + + // Debug + if constexpr (JSON_DEBUG) + { + std::cout << "FloquetWaveVector: " << wave_vector << '\n'; + } + + auto pairs = periodic->find("BoundaryPairs"); + MFEM_VERIFY(pairs->is_array(), + "\"BoundaryPairs\" should specify an array in the configuration file!"); + for (auto it = pairs->begin(); it != pairs->end(); ++it) + { + MFEM_VERIFY(it->find("DonorAttributes") != it->end(), + "Missing \"DonorAttributes\" list for \"Periodic\" boundary in the " + "configuration file!"); + MFEM_VERIFY(it->find("ReceiverAttributes") != it->end(), + "Missing \"ReceiverAttributes\" list for \"Periodic\" boundary in the " + "configuration file!"); + + PeriodicData &data = boundary_pairs.emplace_back(); + data.donor_attributes = it->at("DonorAttributes").get>(); // Required + data.receiver_attributes = + it->at("ReceiverAttributes").get>(); // Required + auto translation = it->find("Translation"); + if (translation != it->end()) + { + MFEM_VERIFY(translation->is_array(), + "\"Translation\" should specify an array in the configuration file!"); + std::array translation_array = translation->get>(); + for (int i = 0; i < 3; i++) + { + data.affine_transform[i * 4 + i] = 1.0; + data.affine_transform[i * 4 + 3] = translation_array[i]; + } + data.affine_transform[3 * 4 + 3] = 1.0; + } + auto transformation = it->find("AffineTransformation"); + if (transformation != it->end()) + { + MFEM_VERIFY( + 
transformation->is_array(), + "\"AffineTransformation\" should specify an array in the configuration file!"); + data.affine_transform = transformation->get>(); + } + + // Cleanup + it->erase("DonorAttributes"); + it->erase("ReceiverAttributes"); + it->erase("Translation"); + it->erase("AffineTransformation"); + MFEM_VERIFY(it->empty(), + "Found an unsupported configuration file keyword under \"Periodic\"!\n" + << it->dump(2)); + + // Debug + if constexpr (JSON_DEBUG) + { + std::cout << "DonorAttributes: " << data.donor_attributes << '\n'; + std::cout << "ReceiverAttributes: " << data.receiver_attributes << '\n'; + std::cout << "AffineTransformation: " << data.affine_transform << '\n'; + } } } @@ -998,40 +1263,63 @@ void WavePortBoundaryData::SetUp(json &boundaries) } MFEM_VERIFY(port->is_array(), "\"WavePort\" should specify an array in the configuration file!"); + for (auto it = port->begin(); it != port->end(); ++it) { - MFEM_VERIFY(it->find("Index") != it->end(), - "Missing \"WavePort\" boundary \"Index\" in configuration file!"); MFEM_VERIFY( it->find("Attributes") != it->end(), - "Missing \"Attributes\" list for \"WavePort\" boundary in configuration file!"); - auto ret = mapdata.insert(std::make_pair(it->at("Index"), WavePortData())); + "Missing \"Attributes\" list for \"WavePort\" boundary in the configuration file!"); + auto index = AtIndex(it, "\"WavePort\" boundary"); + auto ret = mapdata.insert(std::make_pair(index, WavePortData())); MFEM_VERIFY(ret.second, "Repeated \"Index\" found when processing \"WavePort\" " - "boundaries in configuration file!"); - WavePortData &data = ret.first->second; + "boundaries in the configuration file!"); + auto &data = ret.first->second; data.attributes = it->at("Attributes").get>(); // Required + std::sort(data.attributes.begin(), data.attributes.end()); data.mode_idx = it->value("Mode", data.mode_idx); MFEM_VERIFY(data.mode_idx > 0, "\"WavePort\" boundary \"Mode\" must be positive (1-based)!"); data.d_offset = it->value("Offset", data.d_offset); - data.excitation = it->value("Excitation", data.excitation); + data.eigen_solver = it->value("SolverType", data.eigen_solver); - // Debug - // std::cout << "Index: " << ret.first->first << '\n'; - // std::cout << "Attributes: " << data.attributes << '\n'; - // std::cout << "Mode: " << data.mode_idx << '\n'; - // std::cout << "Offset: " << data.d_offset << '\n'; - // std::cout << "Excitation: " << data.excitation << '\n'; + data.excitation = ParsePortExcitation(it, data.excitation); + data.active = it->value("Active", data.active); + data.ksp_max_its = it->value("MaxIts", data.ksp_max_its); + data.ksp_tol = it->value("KSPTol", data.ksp_tol); + data.eig_tol = it->value("EigenTol", data.eig_tol); + data.verbose = it->value("Verbose", data.verbose); // Cleanup it->erase("Index"); it->erase("Attributes"); it->erase("Mode"); it->erase("Offset"); + it->erase("SolverType"); it->erase("Excitation"); + it->erase("Active"); + it->erase("MaxIts"); + it->erase("KSPTol"); + it->erase("EigenTol"); + it->erase("Verbose"); MFEM_VERIFY(it->empty(), "Found an unsupported configuration file keyword under \"WavePort\"!\n" << it->dump(2)); + + // Debug + if constexpr (JSON_DEBUG) + { + std::cout << "Index: " << ret.first->first << '\n'; + std::cout << "Attributes: " << data.attributes << '\n'; + std::cout << "Mode: " << data.mode_idx << '\n'; + std::cout << "Offset: " << data.d_offset << '\n'; + std::cout << "SolverType: " << data.eigen_solver << '\n'; + std::cout << "Excitation: " << data.excitation << '\n'; + std::cout << 
"Active: " << data.active << '\n'; + std::cout << "MaxIts: " << data.ksp_max_its << '\n'; + std::cout << "KSPTol: " << data.ksp_tol << '\n'; + std::cout << "EigenTol: " << data.eig_tol << '\n'; + std::cout << "Verbose: " << data.verbose << '\n'; + } } } @@ -1046,17 +1334,16 @@ void SurfaceCurrentBoundaryData::SetUp(json &boundaries) "\"SurfaceCurrent\" should specify an array in the configuration file!"); for (auto it = source->begin(); it != source->end(); ++it) { - MFEM_VERIFY(it->find("Index") != it->end(), - "Missing \"SurfaceCurrent\" source \"Index\" in configuration file!"); - auto ret = mapdata.insert(std::make_pair(it->at("Index"), SurfaceCurrentData())); + auto index = AtIndex(it, "\"SurfaceCurrent\" source"); + auto ret = mapdata.insert(std::make_pair(index, SurfaceCurrentData())); MFEM_VERIFY(ret.second, "Repeated \"Index\" found when processing \"SurfaceCurrent\" " - "boundaries in configuration file!"); - SurfaceCurrentData &data = ret.first->second; + "boundaries in the configuration file!"); + auto &data = ret.first->second; if (it->find("Attributes") != it->end()) { MFEM_VERIFY(it->find("Elements") == it->end(), "Cannot specify both top-level \"Attributes\" list and \"Elements\" for " - "\"SurfaceCurrent\" boundary in configuration file!"); + "\"SurfaceCurrent\" boundary in the configuration file!"); auto &elem = data.elements.emplace_back(); ParseElementData(*it, "Direction", true, elem); } @@ -1066,7 +1353,7 @@ void SurfaceCurrentBoundaryData::SetUp(json &boundaries) MFEM_VERIFY( elements != it->end(), "Missing top-level \"Attributes\" list or \"Elements\" for \"SurfaceCurrent\" " - "boundary in configuration file!"); + "boundary in the configuration file!"); for (auto elem_it = elements->begin(); elem_it != elements->end(); ++elem_it) { MFEM_VERIFY( @@ -1086,14 +1373,6 @@ void SurfaceCurrentBoundaryData::SetUp(json &boundaries) } } - // Debug - // std::cout << "Index: " << ret.first->first << '\n'; - // for (const auto &elem : data.elements) - // { - // std::cout << "Attributes: " << elem.attributes << '\n'; - // std::cout << "Direction: " << elem.direction << '\n'; - // } - // Cleanup it->erase("Index"); it->erase("Attributes"); @@ -1104,83 +1383,72 @@ void SurfaceCurrentBoundaryData::SetUp(json &boundaries) it->empty(), "Found an unsupported configuration file keyword under \"SurfaceCurrent\"!\n" << it->dump(2)); + + // Debug + if constexpr (JSON_DEBUG) + { + std::cout << "Index: " << ret.first->first << '\n'; + for (const auto &elem : data.elements) + { + std::cout << "Attributes: " << elem.attributes << '\n'; + std::cout << "Direction: " << elem.direction << '\n'; + std::cout << "CoordinateSystem: " << elem.coordinate_system << '\n'; + } + } } } -void CapacitancePostData::SetUp(json &postpro) +void SurfaceFluxPostData::SetUp(json &postpro) { - auto capacitance = postpro.find("Capacitance"); - if (capacitance == postpro.end()) + auto flux = postpro.find("SurfaceFlux"); + if (flux == postpro.end()) { return; } - MFEM_VERIFY(capacitance->is_array(), - "\"Capacitance\" should specify an array in the configuration file!"); - for (auto it = capacitance->begin(); it != capacitance->end(); ++it) - { - MFEM_VERIFY(it->find("Index") != it->end(), - "Missing \"Capacitance\" boundary \"Index\" in configuration file!"); - MFEM_VERIFY( - it->find("Attributes") != it->end(), - "Missing \"Attributes\" list for \"Capacitance\" boundary in configuration file!"); - auto ret = mapdata.insert(std::make_pair(it->at("Index"), CapacitanceData())); - MFEM_VERIFY(ret.second, "Repeated 
\"Index\" found when processing \"Capacitance\" " - "boundaries in configuration file!"); - CapacitanceData &data = ret.first->second; + MFEM_VERIFY(flux->is_array(), + "\"SurfaceFlux\" should specify an array in the configuration file!"); + for (auto it = flux->begin(); it != flux->end(); ++it) + { + auto index = AtIndex(it, "\"SurfaceFlux\" boundary"); + MFEM_VERIFY(it->find("Attributes") != it->end() && it->find("Type") != it->end(), + "Missing \"Attributes\" list or \"Type\" for \"SurfaceFlux\" boundary " + "in the configuration file!"); + auto ret = mapdata.insert(std::make_pair(index, SurfaceFluxData())); + MFEM_VERIFY(ret.second, "Repeated \"Index\" found when processing \"SurfaceFlux\" " + "boundaries in the configuration file!"); + auto &data = ret.first->second; data.attributes = it->at("Attributes").get>(); // Required - - // Debug - // std::cout << "Index: " << ret.first->first << '\n'; - // std::cout << "Attributes: " << data.attributes << '\n'; + std::sort(data.attributes.begin(), data.attributes.end()); + data.type = it->at("Type"); // Required + data.two_sided = it->value("TwoSided", data.two_sided); + auto ctr = it->find("Center"); + if (ctr != it->end()) + { + MFEM_VERIFY(ctr->is_array(), + "\"Center\" should specify an array in the configuration file!"); + data.center = ctr->get>(); + data.no_center = false; + } // Cleanup it->erase("Index"); it->erase("Attributes"); + it->erase("Type"); + it->erase("TwoSided"); + it->erase("Center"); MFEM_VERIFY(it->empty(), - "Found an unsupported configuration file keyword under \"Capacitance\"!\n" + "Found an unsupported configuration file keyword under \"SurfaceFlux\"!\n" << it->dump(2)); - } -} - -void InductancePostData::SetUp(json &postpro) -{ - auto inductance = postpro.find("Inductance"); - if (inductance == postpro.end()) - { - return; - } - MFEM_VERIFY(inductance->is_array(), - "\"Inductance\" should specify an array in the configuration file!"); - for (auto it = inductance->begin(); it != inductance->end(); ++it) - { - MFEM_VERIFY(it->find("Index") != it->end(), - "Missing \"Inductance\" boundary \"Index\" in configuration file!"); - MFEM_VERIFY(it->find("Attributes") != it->end() && it->find("Direction") != it->end(), - "Missing \"Attributes\" list or \"Direction\" for \"Inductance\" boundary " - "in configuration file!"); - auto ret = mapdata.insert(std::make_pair(it->at("Index"), InductanceData())); - MFEM_VERIFY(ret.second, "Repeated \"Index\" found when processing \"Inductance\" " - "boundaries in configuration file!"); - InductanceData &data = ret.first->second; - ParseElementData(*it, "Direction", true, data); - MFEM_VERIFY(data.coordinate_system == - internal::ElementData::CoordinateSystem::CARTESIAN, - "\"Direction\" for \"Inductance\" boundary only supports Cartesian " - "coordinate systems!"); // Debug - // std::cout << "Index: " << ret.first->first << '\n'; - // std::cout << "Attributes: " << data.attributes << '\n'; - // std::cout << "Direction: " << data.direction << '\n'; - - // Cleanup - it->erase("Index"); - it->erase("Attributes"); - it->erase("Direction"); - it->erase("CoordinateSystem"); - MFEM_VERIFY(it->empty(), - "Found an unsupported configuration file keyword under \"Inductance\"!\n" - << it->dump(2)); + if constexpr (JSON_DEBUG) + { + std::cout << "Index: " << ret.first->first << '\n'; + std::cout << "Attributes: " << data.attributes << '\n'; + std::cout << "Type: " << data.type << '\n'; + std::cout << "TwoSided: " << data.two_sided << '\n'; + std::cout << "Center: " << data.center << '\n'; + } } } 
@@ -1195,101 +1463,214 @@ void InterfaceDielectricPostData::SetUp(json &postpro) "\"Dielectric\" should specify an array in the configuration file!"); for (auto it = dielectric->begin(); it != dielectric->end(); ++it) { - MFEM_VERIFY(it->find("Index") != it->end(), - "Missing \"Dielectric\" boundary \"Index\" in configuration file!"); - // One (and only one) of epsilon_r, epsilon_r_ma, epsilon_r_ms, and epsilon_r_sa - // are required for surfaces. - MFEM_VERIFY((it->find("Permittivity") != it->end()) + - (it->find("PermittivityMA") != it->end()) + - (it->find("PermittivityMS") != it->end()) + - (it->find("PermittivitySA") != it->end()) == - 1, - "Only one of \"Dielectric\" boundary \"Permittivity\", " - "\"PermittivityMA\", \"PermittivityMS\", or \"PermittivitySA\" should be " - "specified for interface dielectric loss in configuration file!"); - MFEM_VERIFY(it->find("Thickness") != it->end(), - "Missing \"Dielectric\" boundary \"Thickness\" in configuration file!"); - auto ret = mapdata.insert(std::make_pair(it->at("Index"), InterfaceDielectricData())); + auto index = AtIndex(it, "\"Dielectric\" boundary"); + MFEM_VERIFY(it->find("Attributes") != it->end() && it->find("Thickness") != it->end() && + it->find("Permittivity") != it->end(), + "Missing \"Dielectric\" boundary \"Attributes\" list, \"Thickness\", or " + "\"Permittivity\" in the configuration file!"); + auto ret = mapdata.insert(std::make_pair(index, InterfaceDielectricData())); MFEM_VERIFY(ret.second, "Repeated \"Index\" found when processing \"Dielectric\" " - "boundaries in configuration file!"); - InterfaceDielectricData &data = ret.first->second; - data.ts = it->at("Thickness"); // Required for surfaces + "boundaries in the configuration file!"); + auto &data = ret.first->second; + data.attributes = it->at("Attributes").get>(); // Required + std::sort(data.attributes.begin(), data.attributes.end()); + data.type = it->value("Type", data.type); + data.t = it->at("Thickness"); // Required + data.epsilon_r = it->at("Permittivity"); // Required data.tandelta = it->value("LossTan", data.tandelta); - data.epsilon_r = it->value("Permittivity", data.epsilon_r); - data.epsilon_r_ma = it->value("PermittivityMA", data.epsilon_r_ma); - data.epsilon_r_ms = it->value("PermittivityMS", data.epsilon_r_ms); - data.epsilon_r_sa = it->value("PermittivitySA", data.epsilon_r_sa); - if (it->find("Attributes") != it->end()) - { - MFEM_VERIFY(it->find("Elements") == it->end(), - "Cannot specify both top-level \"Attributes\" list and \"Elements\" for " - "\"Dielectric\" boundary in configuration file!"); - auto &elem = data.elements.emplace_back(); - ParseElementData(*it, "Side", false, elem); - MFEM_VERIFY(elem.coordinate_system == - internal::ElementData::CoordinateSystem::CARTESIAN, - "\"Side\" for \"Dielectric\" boundary only supports Cartesian coordinate " - "systems!"); - } - else - { - auto elements = it->find("Elements"); - MFEM_VERIFY(elements != it->end(), - "Missing top-level \"Attributes\" list or \"Elements\" for " - "\"Dielectric\" boundary in configuration file!"); - for (auto elem_it = elements->begin(); elem_it != elements->end(); ++elem_it) - { - MFEM_VERIFY(elem_it->find("Attributes") != elem_it->end(), - "Missing \"Attributes\" list for \"Dielectric\" boundary element in " - "configuration file!"); - auto &elem = data.elements.emplace_back(); - ParseElementData(*elem_it, "Side", false, elem); - MFEM_VERIFY(elem.coordinate_system == - internal::ElementData::CoordinateSystem::CARTESIAN, - "\"Side\" for \"Dielectric\" boundary only 
supports Cartesian " - "coordinate systems!"); - - // Cleanup - elem_it->erase("Attributes"); - elem_it->erase("Side"); - elem_it->erase("CoordinateSystem"); - MFEM_VERIFY(elem_it->empty(), "Found an unsupported configuration file keyword " - "under \"Dielectric\" boundary element!\n" - << elem_it->dump(2)); - } - } - - // Debug - // std::cout << "Index: " << ret.first->first << '\n'; - // std::cout << "LossTan: " << data.tandelta << '\n'; - // std::cout << "Permittivity: " << data.epsilon_r << '\n'; - // std::cout << "PermittivityMA: " << data.epsilon_r_ma << '\n'; - // std::cout << "PermittivityMS: " << data.epsilon_r_ms << '\n'; - // std::cout << "PermittivitySA: " << data.epsilon_r_sa << '\n'; - // std::cout << "Thickness: " << data.ts << '\n'; - // for (const auto &elem : data.elements) - // { - // std::cout << "Attributes: " << elem.attributes << '\n'; - // std::cout << "Side: " << elem.side << '\n'; - // } // Cleanup it->erase("Index"); - it->erase("LossTan"); - it->erase("Permittivity"); - it->erase("PermittivityMA"); - it->erase("PermittivityMS"); - it->erase("PermittivitySA"); - it->erase("Thickness"); it->erase("Attributes"); - it->erase("Side"); - it->erase("CoordinateSystem"); + it->erase("Type"); + it->erase("Thickness"); + it->erase("Permittivity"); + it->erase("LossTan"); MFEM_VERIFY(it->empty(), "Found an unsupported configuration file keyword under \"Dielectric\"!\n" << it->dump(2)); + + // Debug + if constexpr (JSON_DEBUG) + { + std::cout << "Index: " << ret.first->first << '\n'; + std::cout << "Attributes: " << data.attributes << '\n'; + std::cout << "Type: " << data.type << '\n'; + std::cout << "Thickness: " << data.t << '\n'; + std::cout << "Permittivity: " << data.epsilon_r << '\n'; + std::cout << "LossTan: " << data.tandelta << '\n'; + } } } +void FarFieldPostData::SetUp(json &postpro) +{ + auto farfield = postpro.find("FarField"); + if (farfield == postpro.end()) + { + return; + } + + MFEM_VERIFY(farfield->find("Attributes") != farfield->end(), + "Missing \"Attributes\" list for \"FarField\" postprocessing in the " + "configuration file!"); + + attributes = farfield->at("Attributes").get>(); // Required + std::sort(attributes.begin(), attributes.end()); + // Generate NSample points with the following properties: + // - If NSample >= 2, the generated points are precisely NSample, otherwise NSample = 2. + // - The poles, the equator, and the XZ plane are always included. + // - The points are almost uniformly on a sphere, with a small bias due to satisfying the + // previous condition. + // - The points are on rings of constant theta. + + auto nsample_json = farfield->find("NSample"); + int nsample = 0; + if (nsample_json != farfield->end()) + { + nsample = nsample_json->get(); + if (nsample > 0) + { + // Always include poles. + thetaphis.emplace_back(0.0, 0.0); // North pole. + thetaphis.emplace_back(M_PI, 0.0); // South pole. + + if (nsample > 2) + { + int remaining = nsample - 2; + + // Distribute all remaining points across rings with number weighted by the + // local circumference. + int n_theta = std::max(1, static_cast(std::sqrt(remaining))); + n_theta = std::min(n_theta, remaining); // Can't have more rings than points. + + std::vector points_per_level(n_theta); + std::vector sin_theta_values(n_theta); + double total_sin_theta = 0.0; + + // Calculate sin(theta) for each ring and total (sin(theta) is proportional to the + // circumference). 
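A stand-alone sketch of the ring weighting just described: rings at theta_i = acos(1 - 2(i + 1)/(n_theta + 1)) receive a share of the remaining points proportional to sin(theta_i), with the last ring absorbing rounding error so the total is exact. The sample count here is illustrative, and this is simplified relative to the loops below, which also pin one ring to the equator and force phi = 0 and phi = pi points onto the XZ plane:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main()
{
  const int remaining = 30;  // Illustrative count, after the two poles have been reserved.
  int n_theta = std::max(1, static_cast<int>(std::sqrt(remaining)));
  n_theta = std::min(n_theta, remaining);  // Never more rings than points.
  std::vector<double> sin_theta(n_theta);
  double total_sin_theta = 0.0;
  for (int i = 0; i < n_theta; ++i)
  {
    const double theta = std::acos(1.0 - 2.0 * (i + 1) / (n_theta + 1.0));
    sin_theta[i] = std::sin(theta);
    total_sin_theta += sin_theta[i];
  }
  int assigned = 0;
  for (int i = 0; i < n_theta; ++i)
  {
    const int n = (i + 1 < n_theta)
                      ? static_cast<int>(remaining * sin_theta[i] / total_sin_theta + 0.5)
                      : remaining - assigned;  // Last ring keeps the total exact.
    assigned += n;
    std::printf("ring %d: %d points\n", i, n);
  }
  return 0;
}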
+ for (int i = 0; i < n_theta; ++i) + { + double theta = std::acos(1.0 - 2.0 * (i + 1) / (n_theta + 1.0)); + sin_theta_values[i] = std::sin(theta); + total_sin_theta += sin_theta_values[i]; + } + + // Distribute points proportional to sin(theta). + int assigned_points = 0; + for (int i = 0; i < n_theta - 1; ++i) + { + points_per_level[i] = + static_cast(remaining * sin_theta_values[i] / total_sin_theta + 0.5); + assigned_points += points_per_level[i]; + } + // Assign remaining points to last ring to ensure exact total. + points_per_level[n_theta - 1] = remaining - assigned_points; + + for (int i = 1; i <= n_theta; ++i) + { + // Ensure equator and XZ plane inclusion. + bool is_equator = (i == (n_theta + 1) / 2); + double theta = is_equator ? M_PI / 2 : std::acos(1.0 - 2.0 * i / (n_theta + 1.0)); + int points_in_level = points_per_level[i - 1]; + + for (int j = 0; j < points_in_level; ++j) + { + double phi = 2.0 * M_PI * j / points_in_level; + + // Force XZ plane points (phi = 0 or π). + if (j == 0) + { + phi = 0.0; + } + else if (j == points_in_level / 2) + { + phi = M_PI; + } + + thetaphis.emplace_back(theta, phi); + } + } + } + + if (nsample > 2) + MFEM_ASSERT(thetaphis.size() == nsample, + "Sampled number of points is not NSample!"); + } + } + + auto thetaphis_json = farfield->find("ThetaPhis"); + if (thetaphis_json != farfield->end()) + { + MFEM_VERIFY(thetaphis_json->is_array(), + "\"ThetaPhis\" should specify an array in the configuration file!"); + + // JSON does not support the notion of pair, so we read the theta and phis as vectors + // of vectors, and then cast them to vectors of pairs. + // + // Convert to radians in the process. + auto vec_of_vec = thetaphis_json->get>>(); + for (const auto &vec : vec_of_vec) + { + thetaphis.emplace_back(vec[0] * M_PI / 180, vec[1] * M_PI / 180); + } + } + + // Remove duplicate entries with numerical tolerance. + constexpr double tol = 1e-6; + std::sort(thetaphis.begin(), thetaphis.end()); + auto it = std::unique(thetaphis.begin(), thetaphis.end(), + [tol](const auto &a, const auto &b) + { + // At poles (theta ≈ 0 or π), phi is irrelevant. + if ((std::abs(a.first) < tol || std::abs(a.first - M_PI) < tol) && + (std::abs(b.first) < tol || std::abs(b.first - M_PI) < tol)) + { + return std::abs(a.first - b.first) < tol; + } + + // Check direct match. + if (std::abs(a.first - b.first) < tol) + { + double phi_diff = std::abs(a.second - b.second); + return phi_diff < tol || std::abs(phi_diff - 2.0 * M_PI) < tol; + } + + // Check theta periodicity: (θ, φ) ≡ (π-θ, φ+π). 
+ if (std::abs(a.first - (M_PI - b.first)) < tol) + { + double phi_diff = std::abs(a.second - (b.second + M_PI)); + if (phi_diff > M_PI) + phi_diff = 2.0 * M_PI - phi_diff; + return phi_diff < tol; + } + + return false; + }); + thetaphis.erase(it, thetaphis.end()); + + if (thetaphis.empty()) + { + MFEM_WARNING("No target points specified under farfield \"FarField\"!\n"); + } + + // Cleanup + farfield->erase("Attributes"); + farfield->erase("NSample"); + farfield->erase("ThetaPhis"); + MFEM_VERIFY(farfield->empty(), + "Found an unsupported configuration file keyword under \"FarField\"!\n" + << farfield->dump(2)); + + // Debug + if constexpr (JSON_DEBUG) + { + std::cout << "Attributes: " << attributes << '\n'; + std::cout << "NSample: " << nsample << '\n'; + std::cout << "ThetaPhis: " << thetaphis << '\n'; + } +} void BoundaryPostData::SetUp(json &boundaries) { auto postpro = boundaries.find("Postprocessing"); @@ -1297,31 +1678,31 @@ void BoundaryPostData::SetUp(json &boundaries) { return; } - capacitance.SetUp(*postpro); - inductance.SetUp(*postpro); + flux.SetUp(*postpro); dielectric.SetUp(*postpro); + farfield.SetUp(*postpro); // Store all unique postprocessing boundary attributes. - for (const auto &[idx, data] : capacitance) - { - attributes.insert(data.attributes.begin(), data.attributes.end()); - } - for (const auto &[idx, data] : inductance) + for (const auto &[idx, data] : flux) { - attributes.insert(data.attributes.begin(), data.attributes.end()); + attributes.insert(attributes.end(), data.attributes.begin(), data.attributes.end()); } for (const auto &[idx, data] : dielectric) { - for (const auto &elem : data.elements) - { - attributes.insert(elem.attributes.begin(), elem.attributes.end()); - } + attributes.insert(attributes.end(), data.attributes.begin(), data.attributes.end()); } + attributes.insert(attributes.end(), farfield.attributes.begin(), + farfield.attributes.end()); + + std::sort(attributes.begin(), attributes.end()); + attributes.erase(std::unique(attributes.begin(), attributes.end()), attributes.end()); + attributes.shrink_to_fit(); + // Cleanup - postpro->erase("Capacitance"); - postpro->erase("Inductance"); + postpro->erase("SurfaceFlux"); postpro->erase("Dielectric"); + postpro->erase("FarField"); MFEM_VERIFY(postpro->empty(), "Found an unsupported configuration file keyword under \"Postprocessing\"!\n" << postpro->dump(2)); @@ -1331,7 +1712,7 @@ void BoundaryData::SetUp(json &config) { auto boundaries = config.find("Boundaries"); MFEM_VERIFY(boundaries != config.end(), - "\"Boundaries\" must be specified in configuration file!"); + "\"Boundaries\" must be specified in the configuration file!"); pec.SetUp(*boundaries); pmc.SetUp(*boundaries); auxpec.SetUp(*boundaries); @@ -1339,42 +1720,108 @@ void BoundaryData::SetUp(json &config) conductivity.SetUp(*boundaries); impedance.SetUp(*boundaries); lumpedport.SetUp(*boundaries); + periodic.SetUp(*boundaries); waveport.SetUp(*boundaries); current.SetUp(*boundaries); postpro.SetUp(*boundaries); + // Ensure unique indexing of lumpedport, waveport, current. 
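Besides checking index uniqueness, the block below also normalizes port excitations: "Excitation" was parsed earlier in the patch as a bool (0/1) or an unsigned integer, and once the consistency checks pass, a value of 1 on a port is rewritten to that port's own "Index" so every excitation ends up index-valued. A reduced sketch of just that rewrite, using a single illustrative map and no validation:

#include <cassert>
#include <map>

int main()
{
  // "Excitation": true parses to 1; the remap gives it the owning port's index instead.
  std::map<int, int> port_excitation = {{7, 1}, {9, 0}};  // Port 9 is unexcited (0).
  for (auto &[port_idx, excitation] : port_excitation)
  {
    if (excitation == 1)
    {
      excitation = port_idx;
    }
  }
  assert(port_excitation.at(7) == 7 && port_excitation.at(9) == 0);
  return 0;
}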
+ { + std::map index_map; + std::map> excitation_map; + const std::string lumpedport_str = "\"LumpedPort\""; + const std::string waveport_str = "WavePort"; + const std::string current_str = "SurfaceCurrent"; + + for (const auto &data : lumpedport) + { + auto result = index_map.insert({data.first, lumpedport_str}); + MFEM_VERIFY(result.second, "Duplicate \"Index\": " << data.first << " in " + << index_map[data.first] << "!"); + excitation_map[data.second.excitation].emplace_back(data.first); + } + for (const auto &data : waveport) + { + auto result = index_map.insert({data.first, waveport_str}); + MFEM_VERIFY(result.second, "Duplicate \"Index\": " << data.first << " in " + << index_map[data.first] << "!"); + excitation_map[data.second.excitation].emplace_back(data.first); + } + for (const auto &data : current) + { + auto result = index_map.insert({data.first, current_str}); + MFEM_VERIFY(result.second, "Duplicate \"Index\": " << data.first << " in " + << index_map[data.first] << "!"); + } + // Typical usecase: If each excitation is simple, S-parameters will be calculated. + // If there were multiple excitations specified, check their indices match the + // port indices. If there was only one, assign it. + excitation_map.erase(0); // zeroth index is unexcited. + bool calc_s_params = std::all_of(excitation_map.begin(), excitation_map.end(), + [](const auto &x) { return x.second.size() == 1; }); + if (calc_s_params && !excitation_map.empty()) + { + // If there's one excitation, needs to be 1 (set with bool) or the port index. + const auto &ext1 = *excitation_map.begin(); + MFEM_VERIFY( + (excitation_map.size() == 1 && + (ext1.first == 1 || ext1.second[0] == ext1.first)) || + std::all_of(excitation_map.begin(), excitation_map.end(), + [](const auto &x) { return x.first == x.second[0]; }), + "\"Excitation\" must match \"Index\" for single ports to avoid ambiguity!"); + + for (auto &[port_idx, lp] : lumpedport) + { + if (lp.excitation == 1) + { + lp.excitation = port_idx; + } + } + for (auto &[port_idx, wp] : waveport) + { + if (wp.excitation == 1) + { + wp.excitation = port_idx; + } + } + } + } + // Store all unique boundary attributes. 
- attributes.insert(pec.attributes.begin(), pec.attributes.end()); - attributes.insert(pmc.attributes.begin(), pmc.attributes.end()); - attributes.insert(auxpec.attributes.begin(), auxpec.attributes.end()); - attributes.insert(farfield.attributes.begin(), farfield.attributes.end()); + attributes.insert(attributes.end(), pec.attributes.begin(), pec.attributes.end()); + attributes.insert(attributes.end(), pmc.attributes.begin(), pmc.attributes.end()); + attributes.insert(attributes.end(), auxpec.attributes.begin(), auxpec.attributes.end()); + attributes.insert(attributes.end(), farfield.attributes.begin(), + farfield.attributes.end()); for (const auto &data : conductivity) { - attributes.insert(data.attributes.begin(), data.attributes.end()); + attributes.insert(attributes.end(), data.attributes.begin(), data.attributes.end()); } for (const auto &data : impedance) { - attributes.insert(data.attributes.begin(), data.attributes.end()); + attributes.insert(attributes.end(), data.attributes.begin(), data.attributes.end()); } for (const auto &[idx, data] : lumpedport) { for (const auto &elem : data.elements) { - attributes.insert(elem.attributes.begin(), elem.attributes.end()); + attributes.insert(attributes.end(), elem.attributes.begin(), elem.attributes.end()); } } for (const auto &[idx, data] : waveport) { - attributes.insert(data.attributes.begin(), data.attributes.end()); + attributes.insert(attributes.end(), data.attributes.begin(), data.attributes.end()); } for (const auto &[idx, data] : current) { for (const auto &elem : data.elements) { - attributes.insert(elem.attributes.begin(), elem.attributes.end()); + attributes.insert(attributes.end(), elem.attributes.begin(), elem.attributes.end()); } } - attributes.insert(postpro.attributes.begin(), postpro.attributes.end()); + std::sort(attributes.begin(), attributes.end()); + attributes.erase(std::unique(attributes.begin(), attributes.end()), attributes.end()); + attributes.shrink_to_fit(); // Cleanup boundaries->erase("PEC"); @@ -1384,6 +1831,8 @@ void BoundaryData::SetUp(json &config) boundaries->erase("Conductivity"); boundaries->erase("Impedance"); boundaries->erase("LumpedPort"); + boundaries->erase("Periodic"); + boundaries->erase("FloquetWaveVector"); boundaries->erase("WavePort"); boundaries->erase("SurfaceCurrent"); boundaries->erase("Ground"); @@ -1395,6 +1844,61 @@ void BoundaryData::SetUp(json &config) << boundaries->dump(2)); } +std::vector ConstructLinearRange(double start, double end, double delta) +{ + auto n_step = GetNumSteps(start, end, delta); + std::vector f(n_step); + std::iota(f.begin(), f.end(), 0); + std::for_each(f.begin(), f.end(), [=](double &x) { x = start + x * delta; }); + return f; +} +std::vector ConstructLinearRange(double start, double end, int n_sample) +{ + std::vector f(n_sample); + for (int i = 0; i < n_sample; i++) + { + f[i] = start + (double(i) / (n_sample - 1)) * (end - start); + } + return f; +} +std::vector ConstructLogRange(double start, double end, int n_sample) +{ + MFEM_VERIFY(start > 0 && end > 0, + "\"Type\": \"Log\" only valid for non-zero start and end!"); + std::vector f(n_sample); + double log_start = std::log10(start); + double log_end = std::log10(end); + for (int i = 0; i < n_sample; i++) + { + double log_val = log_start + (double(i) / (n_sample - 1)) * (log_end - log_start); + f[i] = std::pow(10.0, log_val); + } + return f; +} + +// Helper to find entry closest to x in vec, up to tol. If no match, returns end. 
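A brief usage sketch for this helper as defined below, assuming the element type is double and the input vector is already sorted (the lookup relies on std::lower_bound); the values are illustrative only:

std::vector<double> freqs = {1.0, 2.0, 3.0};                 // Sorted sample frequencies.
auto hit = FindNearestValue(freqs, 2.0 + 1.0e-9, 1.0e-6);    // Iterator to 2.0 (within tol).
auto miss = FindNearestValue(freqs, 2.5, 1.0e-6);            // freqs.end(), no match in tol.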
+auto FindNearestValue(const std::vector &vec, double x, double tol) +{ + // Find the first element not less than x. + auto it = std::lower_bound(vec.begin(), vec.end(), x); + // Check if we found an exact match or a close enough value. + if (it != vec.end() && std::abs(*it - x) <= tol) + { + return it; + } + // If we're not at the beginning, check the previous element too. + if (it != vec.begin()) + { + auto prev = std::prev(it); + if (std::abs(*prev - x) <= tol) + { + return prev; + } + } + // No value within tol found + return vec.end(); +} + void DrivenSolverData::SetUp(json &solver) { auto driven = solver.find("Driven"); @@ -1402,58 +1906,212 @@ void DrivenSolverData::SetUp(json &solver) { return; } - MFEM_VERIFY(driven->find("MinFreq") != driven->end() && - driven->find("MaxFreq") != driven->end() && - driven->find("FreqStep") != driven->end(), - "Missing \"Driven\" solver \"MinFreq\", \"MaxFreq\", or \"FreqStep\" in " - "configuration file!"); - min_f = driven->at("MinFreq"); // Required - max_f = driven->at("MaxFreq"); // Required - delta_f = driven->at("FreqStep"); // Required - delta_post = driven->value("SaveStep", delta_post); - only_port_post = driven->value("SaveOnlyPorts", only_port_post); + + restart = driven->value("Restart", restart); adaptive_tol = driven->value("AdaptiveTol", adaptive_tol); - adaptive_nmax = driven->value("AdaptiveMaxSamples", adaptive_nmax); - adaptive_ncand = driven->value("AdaptiveMaxCandidates", adaptive_ncand); - adaptive_metric_aposteriori = - driven->value("AdaptiveAPosterioriError", adaptive_metric_aposteriori); - rst = driven->value("Restart", rst); + adaptive_max_size = driven->value("AdaptiveMaxSamples", adaptive_max_size); + adaptive_memory = driven->value("AdaptiveConvergenceMemory", adaptive_memory); + + MFEM_VERIFY(!(restart != 1 && adaptive_tol > 0.0), + "\"Restart\" is incompatible with adaptive frequency sweep!"); + + std::vector save_f, prom_f; // samples to be saved to paraview and added to prom + // Backwards compatible top level interface. + if (driven->find("MinFreq") != driven->end() && + driven->find("MaxFreq") != driven->end() && driven->find("FreqStep") != driven->end()) + { + double min_f = driven->at("MinFreq"); // Required + double max_f = driven->at("MaxFreq"); // Required + double delta_f = driven->at("FreqStep"); // Required + sample_f = ConstructLinearRange(min_f, max_f, delta_f); + if (int save_step = driven->value("SaveStep", 0); save_step > 0) + { + for (std::size_t n = 0; n < sample_f.size(); n += save_step) + { + save_f.emplace_back(sample_f[n]); + } + } + } + if (auto freq_samples = driven->find("Samples"); freq_samples != driven->end()) + { + for (auto &r : *freq_samples) + { + auto type = r.value("Type", r.find("Freq") != r.end() ? 
FrequencySampling::POINT + : FrequencySampling::DEFAULT); + auto f = [&]() + { + switch (type) + { + case FrequencySampling::LINEAR: + { + auto min_f = r.at("MinFreq"); + auto max_f = r.at("MaxFreq"); + auto delta_f = r.value("FreqStep", 0.0); + auto n_sample = r.value("NSample", 0); + MFEM_VERIFY(delta_f > 0 ^ n_sample > 0, + "Only one of \"FreqStep\" or \"NSample\" can be specified for " + "\"Type\": \"Linear\"!"); + if (delta_f > 0) + { + return ConstructLinearRange(min_f, max_f, delta_f); + } + if (n_sample > 0) + { + return ConstructLinearRange(min_f, max_f, n_sample); + } + } + case FrequencySampling::LOG: + { + auto min_f = r.at("MinFreq"); + auto max_f = r.at("MaxFreq"); + auto n_sample = r.at("NSample"); + return ConstructLogRange(min_f, max_f, n_sample); + } + case FrequencySampling::POINT: + return r.at("Freq").get>(); + } + return std::vector{}; + }(); + sample_f.insert(sample_f.end(), f.begin(), f.end()); + + if (auto save_step = r.value("SaveStep", 0); save_step > 0) + { + for (std::size_t n = 0; n < f.size(); n += save_step) + { + save_f.emplace_back(f[n]); + } + } + if (auto prom_sample = r.value("AddToPROM", false); prom_sample) + { + if (adaptive_tol == 0) + { + MFEM_WARNING("Ignoring \"AddToPROM\" for non-adaptive simulation!"); + } + prom_f.insert(prom_f.end(), f.begin(), f.end()); + } + + // Debug + if constexpr (JSON_DEBUG) + { + std::cout << "Type: " << type << '\n'; + std::cout << "Freq: " << f << '\n'; + std::cout << "FreqStep: " << r.value("FreqStep", 0.0) << '\n'; + std::cout << "NSample: " << r.value("NSample", 0.0) << '\n'; + std::cout << "SaveStep: " << r.value("SaveStep", 0) << '\n'; + std::cout << "AddToPROM: " << r.value("AddToPROM", false) << '\n'; + } + + // Cleanup + r.erase("Type"); + r.erase("MinFreq"); + r.erase("MaxFreq"); + r.erase("FreqStep"); + r.erase("NSample"); + r.erase("Freq"); + r.erase("SaveStep"); + r.erase("AddToPROM"); + MFEM_VERIFY(r.empty(), + "Found an unsupported configuration file keyword in \"Samples\"!\n" + << r.dump(2)); + } + } + + // Deduplicate all samples, and find indices of save and prom samples. + constexpr double delta_eps = 1.0e-9; // Precision in frequency comparisons (Hz) + auto equal_f = [=](auto x, auto y) { return std::abs(x - y) < delta_eps; }; + auto deduplicate = [&equal_f](auto &f) + { + std::sort(f.begin(), f.end()); + f.erase(std::unique(f.begin(), f.end(), equal_f), f.end()); + }; + + // Enforce explicit saves exactly match the sample frequencies. + deduplicate(sample_f); + auto explicit_save_f = driven->value("Save", std::vector()); + for (auto &f : explicit_save_f) + { + auto it = FindNearestValue(sample_f, f, delta_eps); + MFEM_VERIFY(it != sample_f.end(), + "Entry " << f << " in \"Save\" must be an explicitly sampled frequency!"); + f = *it; + } + save_f.insert(save_f.end(), explicit_save_f.begin(), explicit_save_f.end()); + deduplicate(save_f); + deduplicate(prom_f); + + // Given the matched ordering, and values are assigned by copying, can do a + // paired-iterator scan. + for (auto it_sample = sample_f.begin(), it_save = save_f.begin(); it_save != save_f.end(); + ++it_save) + { + while (*it_sample != *it_save) // safe because save samples is a subset of samples + { + ++it_sample; + MFEM_VERIFY(it_sample != sample_f.end(), + "Save frequency " << *it_save << " not found in sample frequencies!"); + } + save_indices.emplace_back(std::distance(sample_f.begin(), it_sample)); + } + // PROM sampling always begins with the minimum and maximum frequencies. Exclude them from + // extra samples. 
Can use equality comparison given no floating point operations have been + done. + prom_indices = {0, sample_f.size() - 1}; + if (prom_f.size() > 0 && prom_f.back() == sample_f.back()) + { + prom_f.pop_back(); + } + if (prom_f.size() > 0 && prom_f.front() == sample_f.front()) + { + prom_f.erase(prom_f.begin(), std::next(prom_f.begin())); + } + for (auto it_sample = sample_f.begin(), it_prom = prom_f.begin(); it_prom != prom_f.end(); + ++it_prom) + { + while (*it_sample != *it_prom) // safe because prom samples is a subset of samples + { + ++it_sample; + MFEM_VERIFY(it_sample != sample_f.end(), "PROM sample frequency " + << *it_prom + << " not found in sample frequencies!"); + } + prom_indices.emplace_back(std::distance(sample_f.begin(), it_sample)); + } + + MFEM_VERIFY(!sample_f.empty(), "No frequency samples specified in \"Driven\"!"); + + // Debug + if constexpr (JSON_DEBUG) + { + std::cout << "MinFreq: " << driven->value("MinFreq", 0.0) << '\n'; + std::cout << "MaxFreq: " << driven->value("MaxFreq", 0.0) << '\n'; + std::cout << "FreqStep: " << driven->value("FreqStep", 0) << '\n'; + std::cout << "Samples: " << sample_f << '\n'; + std::cout << "SaveSamples: " << save_f << '\n'; + std::cout << "PROMSamples: " << prom_f << '\n'; + std::cout << "SaveIndices: " << save_indices << '\n'; + std::cout << "PromIndices: " << prom_indices << '\n'; + std::cout << "Restart: " << restart << '\n'; + std::cout << "AdaptiveTol: " << adaptive_tol << '\n'; + std::cout << "AdaptiveMaxSamples: " << adaptive_max_size << '\n'; + std::cout << "AdaptiveConvergenceMemory: " << adaptive_memory << '\n'; + } // Cleanup driven->erase("MinFreq"); driven->erase("MaxFreq"); driven->erase("FreqStep"); + driven->erase("Samples"); + driven->erase("Save"); driven->erase("SaveStep"); - driven->erase("SaveOnlyPorts"); + driven->erase("Restart"); driven->erase("AdaptiveTol"); driven->erase("AdaptiveMaxSamples"); - driven->erase("AdaptiveMaxCandidates"); - driven->erase("AdaptiveAPosterioriError"); - driven->erase("Restart"); + driven->erase("AdaptiveConvergenceMemory"); MFEM_VERIFY(driven->empty(), "Found an unsupported configuration file keyword under \"Driven\"!\n" << driven->dump(2)); - - // Debug - // std::cout << "MinFreq: " << min_f << '\n'; - // std::cout << "MaxFreq: " << max_f << '\n'; - // std::cout << "FreqStep: " << delta_f << '\n'; - // std::cout << "SaveStep: " << delta_post << '\n'; - // std::cout << "SaveOnlyPorts: " << only_port_post << '\n'; - // std::cout << "AdaptiveTol: " << adaptive_tol << '\n'; - // std::cout << "AdaptiveMaxSamples: " << adaptive_nmax << '\n'; - // std::cout << "AdaptiveMaxCandidates: " << adaptive_ncand << '\n'; - // std::cout << "AdaptiveAPosterioriError: " << adaptive_metric_aposteriori << '\n'; - // std::cout << "Restart: " << rst << '\n'; } -// Helper for converting string keys to enum for EigenSolverData::Type.
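
// Illustrative sketch (not part of the patch): the frequency-sample pipeline above in
// miniature. ConstructLinearRange() is not shown in this hunk (GetNumSteps(), further down
// in this file, handles the endpoint tolerance); BuildRange() below is only a crude
// stand-in with the same inclusive-endpoint intent. The sketch shows range construction,
// tolerance-based deduplication, a tolerant lookup in the spirit of FindNearestValue, and
// the paired-iterator scan that maps a sorted subset of frequencies back to indices into
// the full sample list. All names and data here are hypothetical.
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <iterator>
#include <vector>

// Stand-in for ConstructLinearRange(): inclusive sweep from start to (approximately) end.
std::vector<double> BuildRange(double start, double end, double delta)
{
  std::vector<double> f;
  for (double x = start; x < end + 0.5 * delta; x += delta)
  {
    f.push_back(x);
  }
  return f;
}

int main()
{
  constexpr double delta_eps = 1.0e-9;  // Same precision used for comparisons above.
  auto equal_f = [=](double x, double y) { return std::abs(x - y) < delta_eps; };

  // Two overlapping sample ranges, concatenated as the "Samples" blocks above are.
  std::vector<double> sample_f = BuildRange(1.0, 3.0, 1.0);  // {1, 2, 3}
  auto extra = BuildRange(3.0, 5.0, 1.0);                    // {3, 4, 5}
  sample_f.insert(sample_f.end(), extra.begin(), extra.end());

  // Deduplicate with a tolerance: sort, then std::unique with a custom predicate.
  std::sort(sample_f.begin(), sample_f.end());
  sample_f.erase(std::unique(sample_f.begin(), sample_f.end(), equal_f), sample_f.end());
  // sample_f is now {1, 2, 3, 4, 5}.

  // Tolerant lookup in the spirit of FindNearestValue above.
  double x = 3.0;
  auto it = std::lower_bound(sample_f.begin(), sample_f.end(), x);
  assert(it != sample_f.end() && std::abs(*it - x) <= delta_eps);

  // Paired-iterator scan: both lists are sorted and the save values were copied out of
  // sample_f, so one forward pass recovers the index of every save frequency.
  std::vector<double> save_f = {2.0, 5.0};
  std::vector<std::size_t> save_indices;
  auto it_sample = sample_f.begin();
  for (double f : save_f)
  {
    while (it_sample != sample_f.end() && *it_sample != f)
    {
      ++it_sample;
    }
    assert(it_sample != sample_f.end());
    save_indices.push_back(
        static_cast<std::size_t>(std::distance(sample_f.begin(), it_sample)));
  }
  // save_indices is {1, 4}.
  return 0;
}
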
-PALACE_JSON_SERIALIZE_ENUM(EigenSolverData::Type, - {{EigenSolverData::Type::DEFAULT, "Default"}, - {EigenSolverData::Type::SLEPC, "SLEPc"}, - {EigenSolverData::Type::ARPACK, "ARPACK"}, - {EigenSolverData::Type::FEAST, "FEAST"}}) - void EigenSolverData::SetUp(json &solver) { auto eigenmode = solver.find("Eigenmode"); @@ -1461,9 +2119,10 @@ void EigenSolverData::SetUp(json &solver) { return; } - MFEM_VERIFY(eigenmode->find("Target") != eigenmode->end(), - "Missing \"Eigenmode\" solver \"Target\" in configuration file!"); - target = eigenmode->at("Target"); // Required + MFEM_VERIFY(eigenmode->find("Target") != eigenmode->end() || + solver.find("Driven") != solver.end(), + "Missing \"Eigenmode\" solver \"Target\" in the configuration file!"); + target = eigenmode->value("Target", target); // Required (only for eigenmode simulations) tol = eigenmode->value("Tol", tol); max_it = eigenmode->value("MaxIts", max_it); max_size = eigenmode->value("MaxSize", max_size); @@ -1471,21 +2130,30 @@ void EigenSolverData::SetUp(json &solver) n_post = eigenmode->value("Save", n_post); type = eigenmode->value("Type", type); pep_linear = eigenmode->value("PEPLinear", pep_linear); - feast_contour_np = eigenmode->value("ContourNPoints", feast_contour_np); - if (type == EigenSolverData::Type::FEAST && feast_contour_np > 1) - { - MFEM_VERIFY(eigenmode->find("ContourTargetUpper") != eigenmode->end() && - eigenmode->find("ContourAspectRatio") != eigenmode->end(), - "Missing \"Eigenmode\" solver \"ContourTargetUpper\" or " - "\"ContourAspectRatio\" for FEAST solver in configuration file!"); - } - feast_contour_ub = eigenmode->value("ContourTargetUpper", feast_contour_ub); - feast_contour_ar = eigenmode->value("ContourAspectRatio", feast_contour_ar); - feast_moments = eigenmode->value("ContourMoments", feast_moments); scale = eigenmode->value("Scaling", scale); init_v0 = eigenmode->value("StartVector", init_v0); init_v0_const = eigenmode->value("StartVectorConstant", init_v0_const); mass_orthog = eigenmode->value("MassOrthogonal", mass_orthog); + nonlinear_type = eigenmode->value("NonlinearType", nonlinear_type); + refine_nonlinear = eigenmode->value("RefineNonlinear", refine_nonlinear); + linear_tol = eigenmode->value("LinearTol", linear_tol); + target_upper = eigenmode->value("TargetUpper", target_upper); + preconditioner_lag = eigenmode->value("PreconditionerLag", preconditioner_lag); + preconditioner_lag_tol = eigenmode->value("PreconditionerLagTol", preconditioner_lag_tol); + max_restart = eigenmode->value("MaxRestart", max_restart); + + target_upper = (target_upper < 0) ? 
3 * target : target_upper; // default = 3 * target + MFEM_VERIFY(target > 0.0, "config[\"Eigenmode\"][\"Target\"] must be strictly positive!"); + MFEM_VERIFY(target_upper > target, "config[\"Eigenmode\"][\"TargetUpper\"] must be " + "greater than config[\"Eigenmode\"][\"Target\"]!"); + MFEM_VERIFY(preconditioner_lag >= 0, + "config[\"Eigenmode\"][\"PreconditionerLag\"] must be non-negative!"); + MFEM_VERIFY(preconditioner_lag_tol >= 0, + "config[\"Eigenmode\"][\"PreconditionerLagTol\"] must be non-negative!"); + MFEM_VERIFY(max_restart >= 0, + "config[\"Eigenmode\"][\"MaxRestart\"] must be non-negative!"); + + MFEM_VERIFY(n > 0, "\"N\" must be greater than 0!"); // Cleanup eigenmode->erase("Target"); @@ -1496,35 +2164,44 @@ void EigenSolverData::SetUp(json &solver) eigenmode->erase("Save"); eigenmode->erase("Type"); eigenmode->erase("PEPLinear"); - eigenmode->erase("ContourNPoints"); - eigenmode->erase("ContourTargetUpper"); - eigenmode->erase("ContourAspectRatio"); - eigenmode->erase("ContourMoments"); eigenmode->erase("Scaling"); eigenmode->erase("StartVector"); eigenmode->erase("StartVectorConstant"); eigenmode->erase("MassOrthogonal"); + eigenmode->erase("NonlinearType"); + eigenmode->erase("RefineNonlinear"); + eigenmode->erase("LinearTol"); + eigenmode->erase("TargetUpper"); + eigenmode->erase("PreconditionerLag"); + eigenmode->erase("PreconditionerLagTol"); + eigenmode->erase("MaxRestart"); MFEM_VERIFY(eigenmode->empty(), "Found an unsupported configuration file keyword under \"Eigenmode\"!\n" << eigenmode->dump(2)); // Debug - // std::cout << "Target: " << target << '\n'; - // std::cout << "Tol: " << tol << '\n'; - // std::cout << "MaxIts: " << max_it << '\n'; - // std::cout << "MaxSize: " << max_size << '\n'; - // std::cout << "N: " << n << '\n'; - // std::cout << "Save: " << n_post << '\n'; - // std::cout << "Type: " << type << '\n'; - // std::cout << "PEPLinear: " << pep_linear << '\n'; - // std::cout << "ContourNPoints: " << feast_contour_np << '\n'; - // std::cout << "ContourTargetUpper: " << feast_contour_ub << '\n'; - // std::cout << "ContourAspectRatio: " << feast_contour_ar << '\n'; - // std::cout << "ContourMoments: " << feast_moments << '\n'; - // std::cout << "Scaling: " << scale << '\n'; - // std::cout << "StartVector: " << init_v0 << '\n'; - // std::cout << "StartVectorConstant: " << init_v0_const << '\n'; - // std::cout << "MassOrthogonal: " << mass_orthog << '\n'; + if constexpr (JSON_DEBUG) + { + std::cout << "Target: " << target << '\n'; + std::cout << "Tol: " << tol << '\n'; + std::cout << "MaxIts: " << max_it << '\n'; + std::cout << "MaxSize: " << max_size << '\n'; + std::cout << "N: " << n << '\n'; + std::cout << "Save: " << n_post << '\n'; + std::cout << "Type: " << type << '\n'; + std::cout << "PEPLinear: " << pep_linear << '\n'; + std::cout << "Scaling: " << scale << '\n'; + std::cout << "StartVector: " << init_v0 << '\n'; + std::cout << "StartVectorConstant: " << init_v0_const << '\n'; + std::cout << "MassOrthogonal: " << mass_orthog << '\n'; + std::cout << "NonlinearType: " << nonlinear_type << '\n'; + std::cout << "RefineNonlinear: " << refine_nonlinear << '\n'; + std::cout << "LinearTol: " << linear_tol << '\n'; + std::cout << "TargetUpper: " << target_upper << '\n'; + std::cout << "PreconditionerLag: " << preconditioner_lag << '\n'; + std::cout << "PreconditionerLagTol: " << preconditioner_lag_tol << '\n'; + std::cout << "MaxRestart: " << max_restart << '\n'; + } } void ElectrostaticSolverData::SetUp(json &solver) @@ -1543,7 +2220,10 @@ void 
ElectrostaticSolverData::SetUp(json &solver) << electrostatic->dump(2)); // Debug - // std::cout << "Save: " << n_post << '\n'; + if constexpr (JSON_DEBUG) + { + std::cout << "Save: " << n_post << '\n'; + } } void MagnetostaticSolverData::SetUp(json &solver) @@ -1562,25 +2242,12 @@ void MagnetostaticSolverData::SetUp(json &solver) << magnetostatic->dump(2)); // Debug - // std::cout << "Save: " << n_post << '\n'; + if constexpr (JSON_DEBUG) + { + std::cout << "Save: " << n_post << '\n'; + } } -// Helper for converting string keys to enum for TransientSolverData::Type and -// TransientSolverData::ExcitationType. -PALACE_JSON_SERIALIZE_ENUM(TransientSolverData::Type, - {{TransientSolverData::Type::DEFAULT, "Default"}, - {TransientSolverData::Type::GEN_ALPHA, "GeneralizedAlpha"}, - {TransientSolverData::Type::NEWMARK, "NewmarkBeta"}, - {TransientSolverData::Type::CENTRAL_DIFF, "CentralDifference"}}) -PALACE_JSON_SERIALIZE_ENUM( - TransientSolverData::ExcitationType, - {{TransientSolverData::ExcitationType::SINUSOIDAL, "Sinusoidal"}, - {TransientSolverData::ExcitationType::GAUSSIAN, "Gaussian"}, - {TransientSolverData::ExcitationType::DIFF_GAUSSIAN, "DifferentiatedGaussian"}, - {TransientSolverData::ExcitationType::MOD_GAUSSIAN, "ModulatedGaussian"}, - {TransientSolverData::ExcitationType::RAMP_STEP, "Ramp"}, - {TransientSolverData::ExcitationType::SMOOTH_STEP, "SmoothStep"}}) - void TransientSolverData::SetUp(json &solver) { auto transient = solver.find("Transient"); @@ -1588,12 +2255,13 @@ void TransientSolverData::SetUp(json &solver) { return; } - MFEM_VERIFY(transient->find("Excitation") != transient->end(), - "Missing \"Transient\" solver \"Excitation\" type in configuration file!"); MFEM_VERIFY( - transient->find("MaxTime") != transient->end() && - transient->find("TimeStep") != transient->end(), - "Missing \"Transient\" solver \"MaxTime\" or \"TimeStep\" in configuration file!"); + transient->find("Excitation") != transient->end(), + "Missing \"Transient\" solver \"Excitation\" type in the configuration file!"); + MFEM_VERIFY(transient->find("MaxTime") != transient->end() && + transient->find("TimeStep") != transient->end(), + "Missing \"Transient\" solver \"MaxTime\" or \"TimeStep\" in the " + "configuration file!"); type = transient->value("Type", type); excitation = transient->at("Excitation"); // Required pulse_f = transient->value("ExcitationFreq", pulse_f); @@ -1601,7 +2269,34 @@ void TransientSolverData::SetUp(json &solver) max_t = transient->at("MaxTime"); // Required delta_t = transient->at("TimeStep"); // Required delta_post = transient->value("SaveStep", delta_post); - only_port_post = transient->value("SaveOnlyPorts", only_port_post); + order = transient->value("Order", order); + rel_tol = transient->value("RelTol", rel_tol); + abs_tol = transient->value("AbsTol", abs_tol); + MFEM_VERIFY(delta_t > 0, "\"TimeStep\" must be greater than 0.0!"); + + if (type == TimeSteppingScheme::GEN_ALPHA || type == TimeSteppingScheme::RUNGE_KUTTA) + { + if (transient->contains("Order")) + { + MFEM_WARNING("GeneralizedAlpha and RungeKutta transient solvers do not use " + "config[\"Transient\"][\"Order\"]!"); + } + if (transient->contains("RelTol") || transient->contains("AbsTol")) + { + MFEM_WARNING( + "GeneralizedAlpha and RungeKutta transient solvers do not use\n" + "config[\"Transient\"][\"RelTol\"] and config[\"Transient\"][\"AbsTol\"]!"); + } + } + else + { + MFEM_VERIFY(rel_tol > 0, + "config[\"Transient\"][\"RelTol\"] must be strictly positive!"); + MFEM_VERIFY(abs_tol > 0, + 
"config[\"Transient\"][\"AbsTol\"] must be strictly positive!"); + MFEM_VERIFY(order >= 2 && order <= 5, + "config[\"Transient\"][\"Order\"] must be between 2 and 5!"); + } // Cleanup transient->erase("Type"); @@ -1611,69 +2306,29 @@ void TransientSolverData::SetUp(json &solver) transient->erase("MaxTime"); transient->erase("TimeStep"); transient->erase("SaveStep"); - transient->erase("SaveOnlyPorts"); + transient->erase("Order"); + transient->erase("RelTol"); + transient->erase("AbsTol"); MFEM_VERIFY(transient->empty(), "Found an unsupported configuration file keyword under \"Transient\"!\n" << transient->dump(2)); // Debug - // std::cout << "Type: " << type << '\n'; - // std::cout << "Excitation: " << excitation << '\n'; - // std::cout << "ExcitationFreq: " << pulse_freq << '\n'; - // std::cout << "ExcitationWidth: " << pulse_tau << '\n'; - // std::cout << "MaxTime: " << max_t << '\n'; - // std::cout << "TimeStep: " << delta_t << '\n'; - // std::cout << "SaveStep: " << delta_post << '\n'; - // std::cout << "SaveOnlyPorts: " << only_port_post << '\n'; + if constexpr (JSON_DEBUG) + { + std::cout << "Type: " << type << '\n'; + std::cout << "Excitation: " << excitation << '\n'; + std::cout << "ExcitationFreq: " << pulse_f << '\n'; + std::cout << "ExcitationWidth: " << pulse_tau << '\n'; + std::cout << "MaxTime: " << max_t << '\n'; + std::cout << "TimeStep: " << delta_t << '\n'; + std::cout << "SaveStep: " << delta_post << '\n'; + std::cout << "Order: " << order << '\n'; + std::cout << "RelTol: " << rel_tol << '\n'; + std::cout << "AbsTol: " << abs_tol << '\n'; + } } -// Helpers for converting string keys to enum for LinearSolverData::Type, -// LinearSolverData::KspType, LinearSolverData::SideType, -// LinearSolverData::MultigridCoarsenType, LinearSolverData::SymFactType, -// LinearSolverData::CompressionType, and LinearSolverData::OrthogType. 
-PALACE_JSON_SERIALIZE_ENUM(LinearSolverData::Type, - {{LinearSolverData::Type::DEFAULT, "Default"}, - {LinearSolverData::Type::AMS, "AMS"}, - {LinearSolverData::Type::BOOMER_AMG, "BoomerAMG"}, - {LinearSolverData::Type::MUMPS, "MUMPS"}, - {LinearSolverData::Type::SUPERLU, "SuperLU"}, - {LinearSolverData::Type::STRUMPACK, "STRUMPACK"}, - {LinearSolverData::Type::STRUMPACK_MP, "STRUMPACK-MP"}}) -PALACE_JSON_SERIALIZE_ENUM(LinearSolverData::KspType, - {{LinearSolverData::KspType::DEFAULT, "Default"}, - {LinearSolverData::KspType::CG, "CG"}, - {LinearSolverData::KspType::MINRES, "MINRES"}, - {LinearSolverData::KspType::GMRES, "GMRES"}, - {LinearSolverData::KspType::FGMRES, "FGMRES"}, - {LinearSolverData::KspType::BICGSTAB, "BiCGSTAB"}}) -PALACE_JSON_SERIALIZE_ENUM(LinearSolverData::SideType, - {{LinearSolverData::SideType::DEFAULT, "Default"}, - {LinearSolverData::SideType::RIGHT, "Right"}, - {LinearSolverData::SideType::LEFT, "Left"}}) -PALACE_JSON_SERIALIZE_ENUM(LinearSolverData::MultigridCoarsenType, - {{LinearSolverData::MultigridCoarsenType::LINEAR, "Linear"}, - {LinearSolverData::MultigridCoarsenType::LOGARITHMIC, - "Logarithmic"}}) -PALACE_JSON_SERIALIZE_ENUM(LinearSolverData::SymFactType, - {{LinearSolverData::SymFactType::DEFAULT, "Default"}, - {LinearSolverData::SymFactType::METIS, "METIS"}, - {LinearSolverData::SymFactType::PARMETIS, "ParMETIS"}, - {LinearSolverData::SymFactType::SCOTCH, "Scotch"}, - {LinearSolverData::SymFactType::PTSCOTCH, "PTScotch"}}) -PALACE_JSON_SERIALIZE_ENUM(LinearSolverData::CompressionType, - {{LinearSolverData::CompressionType::NONE, "None"}, - {LinearSolverData::CompressionType::BLR, "BLR"}, - {LinearSolverData::CompressionType::HSS, "HSS"}, - {LinearSolverData::CompressionType::HODLR, "HODLR"}, - {LinearSolverData::CompressionType::ZFP, "ZFP"}, - {LinearSolverData::CompressionType::BLR_HODLR, "BLR-HODLR"}, - {LinearSolverData::CompressionType::ZFP_BLR_HODLR, - "ZFP-BLR-HODLR"}}) -PALACE_JSON_SERIALIZE_ENUM(LinearSolverData::OrthogType, - {{LinearSolverData::OrthogType::MGS, "MGS"}, - {LinearSolverData::OrthogType::CGS, "CGS"}, - {LinearSolverData::OrthogType::CGS2, "CGS2"}}) - void LinearSolverData::SetUp(json &solver) { auto linear = solver.find("Linear"); @@ -1682,7 +2337,7 @@ void LinearSolverData::SetUp(json &solver) return; } type = linear->value("Type", type); - ksp_type = linear->value("KSPType", ksp_type); + krylov_solver = linear->value("KSPType", krylov_solver); tol = linear->value("Tol", tol); max_it = linear->value("MaxIts", max_it); max_size = linear->value("MaxSize", max_size); @@ -1690,7 +2345,8 @@ void LinearSolverData::SetUp(json &solver) // Options related to multigrid. mg_max_levels = linear->value("MGMaxLevels", mg_max_levels); - mg_coarsen_type = linear->value("MGCoarsenType", mg_coarsen_type); + mg_coarsening = linear->value("MGCoarsenType", mg_coarsening); + mg_use_mesh = linear->value("MGUseMesh", mg_use_mesh); mg_cycle_it = linear->value("MGCycleIts", mg_cycle_it); mg_smooth_aux = linear->value("MGAuxiliarySmoother", mg_smooth_aux); mg_smooth_it = linear->value("MGSmoothIts", mg_smooth_it); @@ -1702,23 +2358,29 @@ void LinearSolverData::SetUp(json &solver) // Preconditioner-specific options. 
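
// Illustrative sketch (not part of the patch): the PALACE_JSON_SERIALIZE_ENUM helpers
// removed above map JSON strings to enum values; the macro itself is defined elsewhere in
// Palace. nlohmann::json ships an equivalent macro, shown here on a made-up enum to
// illustrate the pattern. With nlohmann's macro, an unrecognized string maps to the first
// listed value, which is why a "Default" entry is convenient to list first.
#include <cassert>
#include <string>
#include <nlohmann/json.hpp>

enum class SweepKind
{
  DEFAULT,
  LINEAR,
  LOG
};

NLOHMANN_JSON_SERIALIZE_ENUM(SweepKind, {{SweepKind::DEFAULT, "Default"},
                                         {SweepKind::LINEAR, "Linear"},
                                         {SweepKind::LOG, "Log"}})

int main()
{
  nlohmann::json j = "Linear";
  assert(j.get<SweepKind>() == SweepKind::LINEAR);  // String -> enum.
  nlohmann::json k = SweepKind::LOG;
  assert(k.get<std::string>() == "Log");            // Enum -> string.
  return 0;
}
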
pc_mat_real = linear->value("PCMatReal", pc_mat_real); pc_mat_shifted = linear->value("PCMatShifted", pc_mat_shifted); - pc_side_type = linear->value("PCSide", pc_side_type); - sym_fact_type = linear->value("ColumnOrdering", sym_fact_type); + complex_coarse_solve = linear->value("ComplexCoarseSolve", complex_coarse_solve); + drop_small_entries = linear->value("DropSmallEntries", drop_small_entries); + reorder_reuse = linear->value("ReorderingReuse", reorder_reuse); + pc_side = linear->value("PCSide", pc_side); + sym_factorization = linear->value("ColumnOrdering", sym_factorization); strumpack_compression_type = linear->value("STRUMPACKCompressionType", strumpack_compression_type); strumpack_lr_tol = linear->value("STRUMPACKCompressionTol", strumpack_lr_tol); strumpack_lossy_precision = linear->value("STRUMPACKLossyPrecision", strumpack_lossy_precision); strumpack_butterfly_l = linear->value("STRUMPACKButterflyLevels", strumpack_butterfly_l); - superlu_3d = linear->value("SuperLU3D", superlu_3d); - ams_vector = linear->value("AMSVector", ams_vector); + superlu_3d = linear->value("SuperLU3DCommunicator", superlu_3d); + ams_vector_interp = linear->value("AMSVectorInterpolation", ams_vector_interp); + ams_singular_op = linear->value("AMSSingularOperator", ams_singular_op); + amg_agg_coarsen = linear->value("AMGAggressiveCoarsening", amg_agg_coarsen); // Other linear solver options. divfree_tol = linear->value("DivFreeTol", divfree_tol); divfree_max_it = linear->value("DivFreeMaxIts", divfree_max_it); estimator_tol = linear->value("EstimatorTol", estimator_tol); estimator_max_it = linear->value("EstimatorMaxIts", estimator_max_it); - gs_orthog_type = linear->value("GSOrthogonalization", gs_orthog_type); + estimator_mg = linear->value("EstimatorMG", estimator_mg); + gs_orthog = linear->value("GSOrthogonalization", gs_orthog); // Cleanup linear->erase("Type"); @@ -1730,6 +2392,7 @@ void LinearSolverData::SetUp(json &solver) linear->erase("MGMaxLevels"); linear->erase("MGCoarsenType"); + linear->erase("MGUseMesh"); linear->erase("MGCycleIts"); linear->erase("MGAuxiliarySmoother"); linear->erase("MGSmoothIts"); @@ -1740,62 +2403,76 @@ void LinearSolverData::SetUp(json &solver) linear->erase("PCMatReal"); linear->erase("PCMatShifted"); + linear->erase("ComplexCoarseSolve"); + linear->erase("DropSmallEntries"); + linear->erase("ReorderingReuse"); linear->erase("PCSide"); linear->erase("ColumnOrdering"); linear->erase("STRUMPACKCompressionType"); linear->erase("STRUMPACKCompressionTol"); linear->erase("STRUMPACKLossyPrecision"); linear->erase("STRUMPACKButterflyLevels"); - linear->erase("SuperLU3D"); - linear->erase("AMSVector"); + linear->erase("SuperLU3DCommunicator"); + linear->erase("AMSVectorInterpolation"); + linear->erase("AMSSingularOperator"); + linear->erase("AMGAggressiveCoarsening"); linear->erase("DivFreeTol"); linear->erase("DivFreeMaxIts"); linear->erase("EstimatorTol"); linear->erase("EstimatorMaxIts"); + linear->erase("EstimatorMG"); linear->erase("GSOrthogonalization"); MFEM_VERIFY(linear->empty(), "Found an unsupported configuration file keyword under \"Linear\"!\n" << linear->dump(2)); // Debug - // std::cout << "Type: " << type << '\n'; - // std::cout << "KSPType: " << ksp_type << '\n'; - // std::cout << "Tol: " << tol << '\n'; - // std::cout << "MaxIts: " << max_it << '\n'; - // std::cout << "MaxSize: " << max_size << '\n'; - // std::cout << "InitialGuess: " << initial_guess << '\n'; - - // std::cout << "MGMaxLevels: " << mg_max_levels << '\n'; - // std::cout << "MGCoarsenType: " 
<< mg_coarsen_type << '\n'; - // std::cout << "MGCycleIts: " << mg_cycle_it << '\n'; - // std::cout << "MGAuxiliarySmoother: " << mg_smooth_aux << '\n'; - // std::cout << "MGSmoothIts: " << mg_smooth_it << '\n'; - // std::cout << "MGSmoothOrder: " << mg_smooth_order << '\n'; - // std::cout << "MGSmoothEigScaleMax: " << mg_smooth_sf_max << '\n'; - // std::cout << "MGSmoothEigScaleMin: " << mg_smooth_sf_min << '\n'; - // std::cout << "MGSmoothChebyshev4th: " << mg_smooth_cheby_4th << '\n'; - - // std::cout << "PCMatReal: " << pc_mat_real << '\n'; - // std::cout << "PCMatShifted: " << pc_mat_shifted << '\n'; - // std::cout << "PCSide: " << pc_side_type << '\n'; - // std::cout << "ColumnOrdering: " << sym_fact_type << '\n'; - // std::cout << "STRUMPACKCompressionType: " << strumpack_compression_type << '\n'; - // std::cout << "STRUMPACKCompressionTol: " << strumpack_lr_tol << '\n'; - // std::cout << "STRUMPACKLossyPrecision: " << strumpack_lossy_precision << '\n'; - // std::cout << "STRUMPACKButterflyLevels: " << strumpack_butterfly_l << '\n'; - // std::cout << "SuperLU3D: " << superlu_3d << '\n'; - // std::cout << "AMSVector: " << ams_vector << '\n'; - // std::cout << "DivFreeTol: " << divfree_tol << '\n'; - // std::cout << "DivFreeMaxIts: " << divfree_max_it << '\n'; - // std::cout << "GSOrthogonalization: " << gs_orthog_type << '\n'; + if constexpr (JSON_DEBUG) + { + std::cout << "Type: " << type << '\n'; + std::cout << "KSPType: " << krylov_solver << '\n'; + std::cout << "Tol: " << tol << '\n'; + std::cout << "MaxIts: " << max_it << '\n'; + std::cout << "MaxSize: " << max_size << '\n'; + std::cout << "InitialGuess: " << initial_guess << '\n'; + + std::cout << "MGMaxLevels: " << mg_max_levels << '\n'; + std::cout << "MGCoarsenType: " << mg_coarsening << '\n'; + std::cout << "MGUseMesh: " << mg_use_mesh << '\n'; + std::cout << "MGCycleIts: " << mg_cycle_it << '\n'; + std::cout << "MGAuxiliarySmoother: " << mg_smooth_aux << '\n'; + std::cout << "MGSmoothIts: " << mg_smooth_it << '\n'; + std::cout << "MGSmoothOrder: " << mg_smooth_order << '\n'; + std::cout << "MGSmoothEigScaleMax: " << mg_smooth_sf_max << '\n'; + std::cout << "MGSmoothEigScaleMin: " << mg_smooth_sf_min << '\n'; + std::cout << "MGSmoothChebyshev4th: " << mg_smooth_cheby_4th << '\n'; + + std::cout << "PCMatReal: " << pc_mat_real << '\n'; + std::cout << "PCMatShifted: " << pc_mat_shifted << '\n'; + std::cout << "ComplexCoarseSolve: " << complex_coarse_solve << '\n'; + std::cout << "DropSmallEntries: " << drop_small_entries << '\n'; + std::cout << "ReorderingReuse: " << reorder_reuse << '\n'; + std::cout << "PCSide: " << pc_side << '\n'; + std::cout << "ColumnOrdering: " << sym_factorization << '\n'; + std::cout << "STRUMPACKCompressionType: " << strumpack_compression_type << '\n'; + std::cout << "STRUMPACKCompressionTol: " << strumpack_lr_tol << '\n'; + std::cout << "STRUMPACKLossyPrecision: " << strumpack_lossy_precision << '\n'; + std::cout << "STRUMPACKButterflyLevels: " << strumpack_butterfly_l << '\n'; + std::cout << "SuperLU3DCommunicator: " << superlu_3d << '\n'; + std::cout << "AMSVectorInterpolation: " << ams_vector_interp << '\n'; + std::cout << "AMSSingularOperator: " << ams_singular_op << '\n'; + std::cout << "AMGAggressiveCoarsening: " << amg_agg_coarsen << '\n'; + + std::cout << "DivFreeTol: " << divfree_tol << '\n'; + std::cout << "DivFreeMaxIts: " << divfree_max_it << '\n'; + std::cout << "EstimatorTol: " << estimator_tol << '\n'; + std::cout << "EstimatorMaxIts: " << estimator_max_it << '\n'; + std::cout << 
"EstimatorMG: " << estimator_mg << '\n'; + std::cout << "GSOrthogonalization: " << gs_orthog << '\n'; + } } -// Helpers for converting string keys to enum for SolverData::Device. -PALACE_JSON_SERIALIZE_ENUM(SolverData::Device, {{SolverData::Device::CPU, "CPU"}, - {SolverData::Device::GPU, "GPU"}, - {SolverData::Device::DEBUG, "Debug"}}) - void SolverData::SetUp(json &config) { auto solver = config.find("Solver"); @@ -1805,6 +2482,8 @@ void SolverData::SetUp(json &config) } order = solver->value("Order", order); pa_order_threshold = solver->value("PartialAssemblyOrder", pa_order_threshold); + q_order_jac = solver->value("QuadratureOrderJacobian", q_order_jac); + q_order_extra = solver->value("QuadratureOrderExtra", q_order_extra); device = solver->value("Device", device); ceed_backend = solver->value("Backend", ceed_backend); @@ -1818,6 +2497,8 @@ void SolverData::SetUp(json &config) // Cleanup solver->erase("Order"); solver->erase("PartialAssemblyOrder"); + solver->erase("QuadratureOrderJacobian"); + solver->erase("QuadratureOrderExtra"); solver->erase("Device"); solver->erase("Backend"); @@ -1832,10 +2513,29 @@ void SolverData::SetUp(json &config) << solver->dump(2)); // Debug - // std::cout << "Order: " << order << '\n'; - // std::cout << "PartialAssemblyOrder: " << pa_order_threshold << '\n'; - // std::cout << "Device: " << device << '\n'; - // std::cout << "Backend: " << ceed_backend << '\n'; + if constexpr (JSON_DEBUG) + { + std::cout << "Order: " << order << '\n'; + std::cout << "PartialAssemblyOrder: " << pa_order_threshold << '\n'; + std::cout << "QuadratureOrderJacobian: " << q_order_jac << '\n'; + std::cout << "QuadratureOrderExtra: " << q_order_extra << '\n'; + std::cout << "Device: " << device << '\n'; + std::cout << "Backend: " << ceed_backend << '\n'; + } +} + +int GetNumSteps(double start, double end, double delta) +{ + if (end < start) + { + return 1; + } + constexpr double delta_eps = 1.0e-9; // 9 digits of precision comparing endpoint + double dn = std::abs(end - start) / std::abs(delta); + int n_step = 1 + static_cast(dn); + double dfinal = start + n_step * delta; + return n_step + ((delta < 0.0 && dfinal - end > -delta_eps * end) || + (delta > 0.0 && dfinal - end < delta_eps * end)); } } // namespace palace::config diff --git a/palace/utils/configfile.hpp b/palace/utils/configfile.hpp index 1985a98ef6..15f5e940c3 100644 --- a/palace/utils/configfile.hpp +++ b/palace/utils/configfile.hpp @@ -1,902 +1,990 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_UTILS_CONFIG_FILE_HPP -#define PALACE_UTILS_CONFIG_FILE_HPP - -#include -#include -#include -#include -#include -#include - -namespace palace::config -{ - -using json = nlohmann::json; - -// -// Data structures for storing configuration file data. 
-// - -namespace internal -{ - -template -struct DataVector -{ -protected: - std::vector vecdata = {}; - -public: - [[nodiscard]] auto size() const { return vecdata.size(); } - [[nodiscard]] auto empty() const { return vecdata.empty(); } - [[nodiscard]] auto begin() const { return vecdata.begin(); } - [[nodiscard]] auto end() const { return vecdata.end(); } - [[nodiscard]] auto begin() { return vecdata.begin(); } - [[nodiscard]] auto end() { return vecdata.end(); } - [[nodiscard]] auto front() const { return vecdata.front(); } - [[nodiscard]] auto back() const { return vecdata.back(); } - [[nodiscard]] auto front() { return vecdata.front(); } - [[nodiscard]] auto back() { return vecdata.back(); } -}; - -template -struct DataMap -{ -protected: - // Map keys are the object indices for postprocessing. - std::map mapdata = {}; - -public: - [[nodiscard]] auto &operator[](int i) { return mapdata[i]; } - [[nodiscard]] auto &at(int i) { return mapdata.at(i); } - [[nodiscard]] auto size() const { return mapdata.size(); } - [[nodiscard]] auto empty() const { return mapdata.empty(); } - [[nodiscard]] auto begin() const { return mapdata.begin(); } - [[nodiscard]] auto end() const { return mapdata.end(); } - [[nodiscard]] auto begin() { return mapdata.begin(); } - [[nodiscard]] auto end() { return mapdata.end(); } -}; - -// An ElementData consists of a list of attributes making up a single element of a -// potentially multielement boundary, and a direction and/or a normal defining the incident -// field. These are used for lumped ports, terminals, surface currents, and other boundary -// postprocessing objects. -struct ElementData -{ - // Vector defining the direction for this port. In a Cartesian system, "X", "Y", and "Z" - // map to (1,0,0), (0,1,0), and (0,0,1), respectively. - std::array direction{{0.0, 0.0, 0.0}}; - - // Coordinate system that the normal vector is expressed in. - enum class CoordinateSystem - { - CARTESIAN, - CYLINDRICAL - }; - CoordinateSystem coordinate_system = CoordinateSystem::CARTESIAN; - - // List of boundary attributes for this element. - std::vector attributes = {}; -}; - -} // namespace internal - -struct ProblemData -{ -public: - // Simulation type. - enum class Type - { - DRIVEN, - EIGENMODE, - ELECTROSTATIC, - MAGNETOSTATIC, - TRANSIENT - }; - Type type = Type::DRIVEN; - - // Level of printing. - int verbose = 1; - - // Output path for storing results. - std::string output = ""; - - void SetUp(json &config); -}; - -struct BoxRefinementData -{ - // Refinement levels. - int ref_levels = 0; - - // Region bounding box limits [m]. - std::vector bbmin = {}, bbmax = {}; -}; - -struct SphereRefinementData -{ - // Refinement levels. - int ref_levels = 0; - - // Sphere radius [m]. - double r = 0.0; - - // Sphere center [m]. - std::vector center = {}; -}; - -struct RefinementData -{ -public: - // Non-dimensional tolerance used to specify convergence of adaptive mesh refinement. - double tol = 1e-2; - - // Maximum number of iterations to perform during adaptive mesh refinement. - int max_it = 0; - - // If a refinement results in a greater number of DOFs than this value, no future - // refinement will be allowed. - int max_size = 0; - - // Whether or not to perform nonconformal adaptation. - bool nonconformal = true; - - // Maximum difference in nonconformal refinements between two adjacent elements. Zero - // implies there is no constraint on local non-conformity. - int max_nc_levels = 1; - - // Dörfler update fraction. 
The set of marked elements is the minimum set that contains - // update_fraction of the total error. - double update_fraction = 0.7; - - // Maximum allowable ratio of number of elements across processors before rebalancing is - // performed. - double maximum_imbalance = 1.1; - - // Whether to save off results of each adaptation iteration as a subfolder within the post - // processing directory. - bool save_adapt_iterations = true; - - // Whether to write a (serial) mesh to file after mesh modification during AMR. - bool save_adapt_mesh = false; - - // Parallel uniform mesh refinement levels. - int uniform_ref_levels = 0; - -private: - // Refinement data for mesh regions. - std::vector boxlist = {}; - std::vector spherelist = {}; - -public: - auto &GetBox(int i) { return boxlist[i]; } - const auto &GetBoxes() const { return boxlist; } - auto &GetBoxes() { return boxlist; } - - auto &GetSphere(int i) { return spherelist[i]; } - const auto &GetSpheres() const { return spherelist; } - auto &GetSpheres() { return spherelist; } - - void SetUp(json &model); -}; - -struct ModelData -{ -public: - // Mesh file. - std::string mesh = ""; - - // Mesh length unit and optional characteristic length scale for nondimensionalization - // [m]. - double L0 = 1.0e-6; - double Lc = -1.0; - - // Partitioning file (if specified, does not compute a new partitioning). - std::string partition = ""; - - // Call MFEM's ReorientTetMesh as a check of mesh orientation after partitioning. - bool reorient_tet = false; - - // Remove high-order curvature information from the mesh. - bool remove_curvature = false; - - // Object controlling mesh refinement. - RefinementData refinement = {}; - - void SetUp(json &config); -}; - -// Store symmetric matrix data as set of outer products: Σᵢ sᵢ * vᵢ * vᵢᵀ. -template -struct SymmetricMatrixData -{ -public: - std::array s; - std::array, N> v; - - SymmetricMatrixData(double diag) - { - s.fill(diag); - std::size_t i = 0; - for (auto &x : v) - { - x.fill(0.0); - x[i++] = 1.0; - } - } -}; - -struct MaterialData -{ -public: - // Relative permeability. - SymmetricMatrixData<3> mu_r = 1.0; - - // Relative permittivity. - SymmetricMatrixData<3> epsilon_r = 1.0; - - // Loss tangent. - SymmetricMatrixData<3> tandelta = 0.0; - - // Conductivity [S/m]. - SymmetricMatrixData<3> sigma = 0.0; - - // London penetration depth [m]. - double lambda_L = 0.0; - - // List of domain attributes for this material. - std::vector attributes = {}; -}; - -struct MaterialDomainData : public internal::DataVector -{ -public: - void SetUp(json &domains); -}; - -struct DomainDielectricData -{ -public: - // List of domain attributes for this domain dielectric postprocessing index. - std::vector attributes = {}; -}; - -struct DomainDielectricPostData : public internal::DataMap -{ -public: - void SetUp(json &postpro); -}; - -struct ProbeData -{ -public: - // Physical space coordinates for the probe location [m]. - double x = 0.0; - double y = 0.0; - double z = 0.0; -}; - -struct ProbePostData : public internal::DataMap -{ -public: - void SetUp(json &postpro); -}; - -struct DomainPostData -{ -public: - // Set of all postprocessing domain attributes. - std::set attributes; - - // Domain postprocessing objects. - DomainDielectricPostData dielectric; - ProbePostData probe; - - void SetUp(json &domains); -}; - -struct DomainData -{ -public: - // Set of all domain attributes. - std::set attributes = {}; - - // Domain objects. 
- MaterialDomainData materials = {}; - DomainPostData postpro = {}; - - void SetUp(json &config); -}; - -struct PecBoundaryData -{ -public: - // List of boundary attributes with PEC boundary conditions. - std::vector attributes = {}; - - [[nodiscard]] auto empty() const { return attributes.empty(); } - - void SetUp(json &boundaries); -}; - -struct PmcBoundaryData -{ -public: - // List of boundary attributes with PMC boundary conditions. - std::vector attributes = {}; - - [[nodiscard]] auto empty() const { return attributes.empty(); } - - void SetUp(json &boundaries); -}; - -struct WavePortPecBoundaryData -{ -public: - // List of boundary attributes with PEC boundary conditions for wave ports. - std::vector attributes; - - [[nodiscard]] auto empty() const { return attributes.empty(); } - - void SetUp(json &boundaries); -}; - -struct FarfieldBoundaryData -{ -public: - // Approximation order for farfield ABC. - int order = 1; - - // List of boundary attributes with farfield absortbing boundary conditions. - std::vector attributes = {}; - - [[nodiscard]] auto empty() const { return attributes.empty(); } - - void SetUp(json &boundaries); -}; - -struct ConductivityData -{ -public: - // Electrical conductivity of the conductor [S/m]. - double sigma = 0.0; - - // Conductor relative permeability. - double mu_r = 1.0; - - // Optional conductor thickness [m]. - double h = 0.0; - - // Optional flag for an external boundary surface, relevant for the thickness correction. - bool external = false; - - // List of boundary attributes for this surface conductivity boundary condition. - std::vector attributes = {}; -}; - -struct ConductivityBoundaryData : public internal::DataVector -{ -public: - void SetUp(json &boundaries); -}; - -struct ImpedanceData -{ -public: - // Boundary surface resistance, inductance, and capacitance [Ω/sq, H/sq, F/sq]. - double Rs = 0.0; - double Ls = 0.0; - double Cs = 0.0; - - // List of boundary attributes for this impedance boundary condition. - std::vector attributes = {}; -}; - -struct ImpedanceBoundaryData : public internal::DataVector -{ -public: - void SetUp(json &boundaries); -}; - -struct LumpedPortData -{ -public: - // Port circuit resistance, inductance, and capacitance [Ω/sq, H/sq, F/sq]. - double R = 0.0; - double L = 0.0; - double C = 0.0; - - // Port surface resistance, inductance, and capacitance [Ω/sq, H/sq, F/sq]. - double Rs = 0.0; - double Ls = 0.0; - double Cs = 0.0; - - // Voltage for terminal BC [V] - double voltage = 0.0; - - // Flag for source term in driven and transient simulations. - bool excitation = false; - - // For each lumped port index, each element contains a list of attributes making up a - // single element of a potentially multielement lumped port. - std::vector elements = {}; -}; - -struct LumpedPortBoundaryData : public internal::DataMap -{ -public: - void SetUp(json &boundaries); -}; - -struct WavePortData -{ -public: - // Flag for source term in driven and transient simulations. - bool excitation = false; - - // Mode index for the numeric wave port. - int mode_idx = 1; - - // Port offset for de-embedding [m]. - double d_offset = 0.0; - - // List of boundary attributes for this wave port. - std::vector attributes; -}; - -struct WavePortBoundaryData : public internal::DataMap -{ -public: - void SetUp(json &boundaries); -}; - -struct SurfaceCurrentData -{ -public: - // For each surface current source index, each element contains a list of attributes - // making up a single element of a potentially multielement current source. 
- std::vector elements = {}; -}; - -struct SurfaceCurrentBoundaryData : public internal::DataMap -{ -public: - void SetUp(json &boundaries); -}; - -struct CapacitanceData -{ -public: - // List of boundary attributes for this capacitance postprocessing index. - std::vector attributes; -}; - -struct CapacitancePostData : public internal::DataMap -{ -public: - void SetUp(json &postpro); -}; - -struct InductanceData : public internal::ElementData -{ - using internal::ElementData::ElementData; -}; - -struct InductancePostData : public internal::DataMap -{ -public: - void SetUp(json &postpro); -}; - -struct InterfaceDielectricData -{ -public: - // Dielectric interface thickness [m]. - double ts = 0.0; - - // Loss tangent. - double tandelta = 0.0; - - // Relative permittivity. - double epsilon_r = 0.0; - - // Optional relative permittivity for metal-substrate, metal-air, and substrate-air - // layers. - double epsilon_r_ma = 0.0; - double epsilon_r_ms = 0.0; - double epsilon_r_sa = 0.0; - - // For each dielectric postprocessing index, each element contains a list of attributes - // sharing the same side value. - std::vector elements = {}; -}; - -struct InterfaceDielectricPostData : public internal::DataMap -{ -public: - void SetUp(json &postpro); -}; - -struct BoundaryPostData -{ -public: - // Set of all postprocessing boundary attributes. - std::set attributes; - - // Boundary postprocessing objects. - CapacitancePostData capacitance = {}; - InductancePostData inductance = {}; - InterfaceDielectricPostData dielectric = {}; - - void SetUp(json &boundaries); -}; - -struct BoundaryData -{ -public: - // Set of all boundary attributes. - std::set attributes = {}; - - // Boundary objects. - PecBoundaryData pec = {}; - PmcBoundaryData pmc = {}; - WavePortPecBoundaryData auxpec = {}; - FarfieldBoundaryData farfield = {}; - ConductivityBoundaryData conductivity = {}; - ImpedanceBoundaryData impedance = {}; - LumpedPortBoundaryData lumpedport = {}; - WavePortBoundaryData waveport = {}; - SurfaceCurrentBoundaryData current = {}; - BoundaryPostData postpro = {}; - - void SetUp(json &config); -}; - -struct DrivenSolverData -{ -public: - // Lower bound of frequency sweep [GHz]. - double min_f = 0.0; - - // Upper bound of frequency sweep [GHz]. - double max_f = 0.0; - - // Step size for frequency sweep [GHz]. - double delta_f = 0.0; - - // Step increment for saving fields to disk. - int delta_post = 0; - - // Only perform postprocessing on port boundaries, skipping domain interior. - bool only_port_post = false; - - // Error tolerance for enabling adaptive frequency sweep. - double adaptive_tol = 0.0; - - // Maximum number of frequency samples for adaptive frequency sweep. - int adaptive_nmax = 0; - - // Number of candidate points for error metric calculation in adaptive - // frequency sweep. - int adaptive_ncand = 0; - - // Use error metric based on an a posteriori residual error estimate. Otherwise just use - // the 2-norm of the HDM residual. - bool adaptive_metric_aposteriori = false; - - // Restart iteration for a partial sweep. - int rst = 1; - - void SetUp(json &solver); -}; - -struct EigenSolverData -{ -public: - // Target for shift-and-invert spectral transformation [GHz]. - double target = 0.0; - - // Eigenvalue solver relative tolerance. - double tol = 1.0e-6; - - // Maximum iterations for eigenvalue solver. - int max_it = -1; - - // Eigensolver subspace dimension or maximum dimension before restart. - int max_size = -1; - - // Desired number of eigenmodes. 
- int n = 1; - - // Number of modes to write to disk. - int n_post = 0; - - // Use operator scaling in order to increase numerical robustness. - bool scale = true; - - // Compute and set a starting vector for the eigenvalue solver. - bool init_v0 = true; - bool init_v0_const = false; - - // Orthogonalize basis vectors using a mass matrix inner product, instead of generating - // using a standard ℓ² (Euclidean) norm. - bool mass_orthog = false; - - // Eigenvalue solver type. - enum class Type - { - DEFAULT, - SLEPC, - ARPACK, - FEAST - }; - Type type = Type::DEFAULT; - - // For SLEPc eigenvalue solver, use linearized formulation for quadratic eigenvalue - // problems. - bool pep_linear = true; - - // Number of integration points used for the FEAST eigenvalue solver contour. - int feast_contour_np = 4; - - // Parameters for the FEAST eigenvalue solver contour. - double feast_contour_ub = 0.0; - double feast_contour_ar = 1.0; - - // Use more than just the standard single moment for FEAST subspace construction. - int feast_moments = 1; - - void SetUp(json &solver); -}; - -struct ElectrostaticSolverData -{ -public: - // Number of fields to write to disk. - int n_post = 0; - - void SetUp(json &solver); -}; - -struct MagnetostaticSolverData -{ -public: - // Number of fields to write to disk. - int n_post = 0; - - void SetUp(json &solver); -}; - -struct TransientSolverData -{ -public: - // Time integration scheme type. - enum class Type - { - DEFAULT, - GEN_ALPHA, - NEWMARK, - CENTRAL_DIFF - }; - Type type = Type::DEFAULT; - - // Excitation type for port excitation. - enum class ExcitationType - { - SINUSOIDAL, - GAUSSIAN, - DIFF_GAUSSIAN, - MOD_GAUSSIAN, - RAMP_STEP, - SMOOTH_STEP - }; - ExcitationType excitation = ExcitationType::SINUSOIDAL; - - // Excitation parameters: frequency [GHz] and pulse width [ns]. - double pulse_f = 0.0; - double pulse_tau = 0.0; - - // Upper bound of time interval [ns]. - double max_t = 1.0; - - // Step size for time stepping [ns]. - double delta_t = 1.0e-2; - - // Step increment for saving fields to disk. - int delta_post = 0; - - // Only perform postprocessing on port boundaries, skipping domain interior. - bool only_port_post = false; - - void SetUp(json &solver); -}; - -struct LinearSolverData -{ -public: - // Solver type. - enum class Type - { - DEFAULT, - AMS, - BOOMER_AMG, - MUMPS, - SUPERLU, - STRUMPACK, - STRUMPACK_MP - }; - Type type = Type::DEFAULT; - - // Krylov solver type. - enum class KspType - { - DEFAULT, - CG, - MINRES, - GMRES, - FGMRES, - BICGSTAB - }; - KspType ksp_type = KspType::DEFAULT; - - // Iterative solver relative tolerance. - double tol = 1.0e-6; - - // Maximum iterations for iterative solver. - int max_it = 100; - - // Maximum Krylov space dimension for GMRES/FGMRES iterative solvers. - int max_size = -1; - - // Reuse previous solution as initial guess for Krylov solvers. - int initial_guess = -1; - - // Maximum number of levels for geometric multigrid (set to 1 to disable multigrid). - int mg_max_levels = 100; - - // Type of coarsening for p-multigrid. - enum class MultigridCoarsenType - { - LINEAR, - LOGARITHMIC - }; - MultigridCoarsenType mg_coarsen_type = MultigridCoarsenType::LOGARITHMIC; - - // Number of iterations for preconditioners which support it. For multigrid, this is the - // number of V-cycles per Krylov solver iteration. - int mg_cycle_it = 1; - - // Use auxiliary space smoothers on geometric multigrid levels. 
- int mg_smooth_aux = -1; - - // Number of pre-/post-smoothing iterations at each geometric or algebraic multigrid - // level. - int mg_smooth_it = 1; - - // Order of polynomial smoothing for geometric multigrid. - int mg_smooth_order = -1; - - // Safety factors for eigenvalue estimates associated with Chebyshev smoothing for - // geometric multigrid. - double mg_smooth_sf_max = 1.0; - double mg_smooth_sf_min = 0.0; - - // Smooth based on 4th-kind Chebyshev polynomials for geometric multigrid, otherwise - // use standard 1st-kind polynomials. - bool mg_smooth_cheby_4th = true; - - // For frequency domain applications, precondition linear systems with a real-valued - // approximation to the system matrix. - bool pc_mat_real = false; - - // For frequency domain applications, precondition linear systems with a shifted matrix - // (makes the preconditoner matrix SPD). - int pc_mat_shifted = -1; - - // Choose left or right preconditioning. - enum class SideType - { - DEFAULT, - RIGHT, - LEFT - }; - SideType pc_side_type = SideType::DEFAULT; - - // Specify details for the column ordering method in the symbolic factorization for sparse - // direct solvers. - enum class SymFactType - { - DEFAULT, - METIS, - PARMETIS, - SCOTCH, - PTSCOTCH - }; - SymFactType sym_fact_type = SymFactType::DEFAULT; - - // Low-rank and butterfly compression parameters for sparse direct solvers which support - // it (mainly STRUMPACK). - enum class CompressionType - { - NONE, - BLR, - HSS, - HODLR, - ZFP, - BLR_HODLR, - ZFP_BLR_HODLR - }; - CompressionType strumpack_compression_type = CompressionType::NONE; - double strumpack_lr_tol = 1.0e-3; - int strumpack_lossy_precision = 16; - int strumpack_butterfly_l = 1; - - // Option to enable 3D process grid for SuperLU_DIST solver. - bool superlu_3d = false; - - // Option to use vector or scalar Pi-space corrections for the AMS preconditioner. - bool ams_vector = false; - - // Relative tolerance for solving linear systems in divergence-free projector. - double divfree_tol = 1.0e-12; - - // Maximum number of iterations for solving linear systems in divergence-free projector. - int divfree_max_it = 1000; - - // Relative tolerance for solving linear systems in the error estimator. - double estimator_tol = 1.0e-6; - - // Maximum number of iterations for solving linear systems in the error estimator. - int estimator_max_it = 100; - - // Enable different variants of Gram-Schmidt orthogonalization for GMRES/FGMRES iterative - // solvers and SLEPc eigenvalue solver. - enum class OrthogType - { - MGS, - CGS, - CGS2 - }; - OrthogType gs_orthog_type = OrthogType::MGS; - - void SetUp(json &solver); -}; - -struct SolverData -{ -public: - // Approximation order. - int order = 1; - - // Order above which to use partial assembly instead of full assembly. - int pa_order_threshold = 100; - - // Device used to configure MFEM. - enum class Device - { - CPU, - GPU, - DEBUG - }; - Device device = Device::CPU; - - // Backend for libCEED (https://libceed.org/en/latest/gettingstarted/#backends). - std::string ceed_backend = ""; - - // Solver objects. - DrivenSolverData driven = {}; - EigenSolverData eigenmode = {}; - ElectrostaticSolverData electrostatic = {}; - MagnetostaticSolverData magnetostatic = {}; - TransientSolverData transient = {}; - LinearSolverData linear = {}; - - void SetUp(json &config); -}; - -} // namespace palace::config - -#endif // PALACE_UTILS_CONFIGFILE_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_UTILS_CONFIG_FILE_HPP +#define PALACE_UTILS_CONFIG_FILE_HPP + +#include +#include +#include +#include +#include +#include +#include "labels.hpp" + +namespace palace::config +{ + +using json = nlohmann::json; + +// +// Data structures for storing configuration file data. +// +namespace internal +{ + +template +struct DataVector +{ +protected: + std::vector vecdata = {}; + +public: + template + decltype(auto) emplace_back(Args &&...args) + { + return vecdata.emplace_back(std::forward(args)...); + } + [[nodiscard]] const auto &operator[](int i) const { return vecdata[i]; } + [[nodiscard]] auto &operator[](int i) { return vecdata[i]; } + [[nodiscard]] const auto &at(int i) const { return vecdata.at(i); } + [[nodiscard]] auto &at(int i) { return vecdata.at(i); } + [[nodiscard]] auto size() const { return vecdata.size(); } + [[nodiscard]] auto empty() const { return vecdata.empty(); } + [[nodiscard]] auto begin() const { return vecdata.begin(); } + [[nodiscard]] auto end() const { return vecdata.end(); } + [[nodiscard]] auto begin() { return vecdata.begin(); } + [[nodiscard]] auto end() { return vecdata.end(); } + [[nodiscard]] auto front() const { return vecdata.front(); } + [[nodiscard]] auto back() const { return vecdata.back(); } + [[nodiscard]] auto front() { return vecdata.front(); } + [[nodiscard]] auto back() { return vecdata.back(); } +}; + +template +struct DataMap +{ +protected: + // Map keys are the object indices for postprocessing. + std::map mapdata = {}; + +public: + [[nodiscard]] const auto &operator[](int i) const { return mapdata[i]; } + [[nodiscard]] auto &operator[](int i) { return mapdata[i]; } + [[nodiscard]] const auto &at(int i) const { return mapdata.at(i); } + [[nodiscard]] auto &at(int i) { return mapdata.at(i); } + [[nodiscard]] auto size() const { return mapdata.size(); } + [[nodiscard]] auto empty() const { return mapdata.empty(); } + [[nodiscard]] auto begin() const { return mapdata.begin(); } + [[nodiscard]] auto end() const { return mapdata.end(); } + [[nodiscard]] auto begin() { return mapdata.begin(); } + [[nodiscard]] auto end() { return mapdata.end(); } +}; + +// An ElementData consists of a list of attributes making up a single element of a +// potentially multielement boundary, and a direction and/or a normal defining the incident +// field. These are used for lumped ports, terminals, surface currents, and other boundary +// postprocessing objects. +struct ElementData +{ + // Vector defining the direction for this port. In a Cartesian system, "X", "Y", and "Z" + // map to (1,0,0), (0,1,0), and (0,0,1), respectively. + std::array direction{{0.0, 0.0, 0.0}}; + + CoordinateSystem coordinate_system = CoordinateSystem::CARTESIAN; + + // List of boundary attributes for this element. + std::vector attributes = {}; +}; + +} // namespace internal + +// Problem & Model Config. + +struct OutputFormatsData +{ +public: + // Enable Paraview output format. + bool paraview = true; + + // Enable MFEM GLVis grid function output format. + bool gridfunction = false; +}; + +struct ProblemData +{ +public: + // Simulation type. + ProblemType type = ProblemType::DRIVEN; + + // Level of printing. + int verbose = 1; + + // Output path for storing results. + std::string output = ""; + + // Output formats configuration. + OutputFormatsData output_formats = {}; + + void SetUp(json &config); +}; + +struct BoxRefinementData +{ + // Refinement levels. + int ref_levels = 0; + + // Region bounding box limits [m]. 
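
// Illustrative sketch (not part of the patch): internal::DataMap above keys each entry by
// the integer index given in the configuration file (e.g. a port's index), so operator[]
// both creates and looks up the object for that index. Minimal standalone analogue with
// hypothetical types:
#include <cassert>
#include <map>

struct PortDataSketch
{
  double R = 0.0;
  int excitation = 0;
};

template <typename T>
struct DataMapSketch
{
  std::map<int, T> mapdata;  // Map keys are the object indices from the config file.
  T &operator[](int i) { return mapdata[i]; }
  auto begin() { return mapdata.begin(); }
  auto end() { return mapdata.end(); }
  auto size() const { return mapdata.size(); }
};

int main()
{
  DataMapSketch<PortDataSketch> lumpedport;
  lumpedport[1].R = 50.0;        // Port with config index 1.
  lumpedport[3].excitation = 1;  // Indices need not be contiguous.
  assert(lumpedport.size() == 2);
  return 0;
}
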
+ std::array bbmin{{0.0, 0.0, 0.0}}, bbmax{{0.0, 0.0, 0.0}}; +}; + +struct SphereRefinementData +{ + // Refinement levels. + int ref_levels = 0; + + // Sphere radius [m]. + double r = 0.0; + + // Sphere center [m]. + std::array center{{0.0, 0.0, 0.0}}; +}; + +struct RefinementData +{ +public: + // Non-dimensional tolerance used to specify convergence of adaptive mesh refinement. + double tol = 1.0e-2; + + // Maximum number of iterations to perform during adaptive mesh refinement. + int max_it = 0; + + // If a refinement results in a greater number of DOFs than this value, no future + // refinement will be allowed. + int max_size = 0; + + // Whether or not to perform nonconformal adaptation. + bool nonconformal = true; + + // Maximum difference in nonconformal refinements between two adjacent elements. Zero + // implies there is no constraint on local nonconformity. + int max_nc_levels = 1; + + // Dörfler update fraction. The set of marked elements is the minimum set that contains + // update_fraction of the total error. + double update_fraction = 0.7; + + // Maximum allowable ratio of number of elements across processors before rebalancing is + // performed. + double maximum_imbalance = 1.1; + + // Whether to save off results of each adaptation iteration as a subfolder within the post + // processing directory. + bool save_adapt_iterations = true; + + // Whether to write a (serial) mesh to file after mesh modification during AMR. + bool save_adapt_mesh = false; + + // Parallel uniform mesh refinement levels. + int uniform_ref_levels = 0; + + // Serial uniform mesh refinement levels. + int ser_uniform_ref_levels = 0; + +private: + // Refinement data for mesh regions. + std::vector box_list = {}; + std::vector sphere_list = {}; + +public: + auto &GetBox(int i) { return box_list[i]; } + const auto &GetBoxes() const { return box_list; } + auto &GetBoxes() { return box_list; } + + auto &GetSphere(int i) { return sphere_list[i]; } + const auto &GetSpheres() const { return sphere_list; } + auto &GetSpheres() { return sphere_list; } + + void SetUp(json &model); +}; + +struct ModelData +{ +public: + // Mesh file. + std::string mesh = ""; + + // Mesh length unit and optional characteristic length scale for nondimensionalization + // [m]. + double L0 = 1.0e-6; + double Lc = -1.0; + + // Remove high-order curvature information from the mesh. + bool remove_curvature = false; + + // Convert mesh to simplex elements. + bool make_simplex = false; + + // Convert mesh to hexahedral elements (using tet-to-hex algorithm). + bool make_hex = false; + + // Reorder elements based on spatial location after loading the serial mesh, which can + // potentially increase memory coherency. + bool reorder_elements = false; + + // Remove elements (along with any associated unattached boundary elements) from the mesh + // which do not have any material properties specified. + bool clean_unused_elements = true; + + // Split, or "crack", boundary elements lying on internal boundaries to decouple the + // elements on either side. + bool crack_bdr_elements = true; + + // When required, refine elements neighboring a split or crack in order to enable the + // decoupling. + bool refine_crack_elements = true; + + // Factor for displacing duplicated interior boundary elements, usually added just for + // visualization. + double crack_displ_factor = 1.0e-12; + + // Add new boundary elements for faces are on the computational domain boundary or which + // have attached elements on either side with different domain attributes. 
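
// Illustrative sketch (not part of the patch): the Dörfler marking rule described by
// update_fraction in RefinementData above, where the marked set is the smallest set of
// elements whose error indicators sum to the requested fraction of the total error. This
// is only the idea, not Palace's implementation; names and data are hypothetical.
#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

std::vector<std::size_t> DorflerMarkSketch(const std::vector<double> &eta, double fraction)
{
  // Sort element indices by decreasing error indicator.
  std::vector<std::size_t> order(eta.size());
  std::iota(order.begin(), order.end(), std::size_t(0));
  std::sort(order.begin(), order.end(),
            [&](std::size_t i, std::size_t j) { return eta[i] > eta[j]; });

  // Take the smallest prefix whose accumulated error reaches fraction * total.
  const double target = fraction * std::accumulate(eta.begin(), eta.end(), 0.0);
  std::vector<std::size_t> marked;
  double sum = 0.0;
  for (auto i : order)
  {
    if (sum >= target)
    {
      break;
    }
    marked.push_back(i);
    sum += eta[i];
  }
  return marked;
}
// Example: for eta = {4, 3, 2, 1} and fraction = 0.7 the target is 7, so elements 0 and 1
// (errors 4 + 3 = 7) are marked.
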
+ bool add_bdr_elements = true; + + // Export mesh after pre-processing but before cracking. + bool export_prerefined_mesh = false; + + // Call MFEM's ReorientTetMesh as a check of mesh orientation after partitioning. + bool reorient_tet_mesh = false; + + // Partitioning file (if specified, does not compute a new partitioning). + std::string partitioning = ""; + + // Object controlling mesh refinement. + RefinementData refinement = {}; + + void SetUp(json &config); +}; + +// Domain Config. + +// Store symmetric matrix data as set of outer products: Σᵢ sᵢ * vᵢ * vᵢᵀ. +template +struct SymmetricMatrixData +{ +public: + std::array s; + std::array, N> v; + + SymmetricMatrixData(double diag) + { + s.fill(diag); + std::size_t i = 0; + for (auto &x : v) + { + x.fill(0.0); + x[i++] = 1.0; + } + } +}; + +struct MaterialData +{ +public: + // Relative permeability. + SymmetricMatrixData<3> mu_r = 1.0; + + // Relative permittivity. + SymmetricMatrixData<3> epsilon_r = 1.0; + + // Loss tangent. + SymmetricMatrixData<3> tandelta = 0.0; + + // Conductivity [S/m]. + SymmetricMatrixData<3> sigma = 0.0; + + // London penetration depth [m]. + double lambda_L = 0.0; + + // List of domain attributes for this material. + std::vector attributes = {}; +}; + +struct DomainMaterialData : public internal::DataVector +{ +public: + void SetUp(json &domains); +}; + +struct DomainEnergyData +{ +public: + // List of domain attributes for this domain postprocessing index. + std::vector attributes = {}; +}; + +struct DomainEnergyPostData : public internal::DataMap +{ +public: + void SetUp(json &postpro); +}; + +struct ProbeData +{ +public: + // Physical space coordinates for the probe location [m]. + std::array center{{0.0, 0.0, 0.0}}; +}; + +struct ProbePostData : public internal::DataMap +{ +public: + void SetUp(json &postpro); +}; + +struct DomainPostData +{ +public: + // List of all postprocessing domain attributes. + std::vector attributes = {}; + + // Domain postprocessing objects. + DomainEnergyPostData energy; + ProbePostData probe; + + void SetUp(json &domains); +}; + +struct DomainData +{ +public: + // List of all domain attributes (excluding postprocessing). + std::vector attributes = {}; + + // Domain objects. + DomainMaterialData materials = {}; + DomainPostData postpro = {}; + + void SetUp(json &config); +}; + +// Boundary Configuration. + +struct PecBoundaryData +{ +public: + // List of boundary attributes with PEC boundary conditions. + std::vector attributes = {}; + + [[nodiscard]] auto empty() const { return attributes.empty(); } + + void SetUp(json &boundaries); +}; + +struct PmcBoundaryData +{ +public: + // List of boundary attributes with PMC boundary conditions. + std::vector attributes = {}; + + [[nodiscard]] auto empty() const { return attributes.empty(); } + + void SetUp(json &boundaries); +}; + +struct WavePortPecBoundaryData +{ +public: + // List of boundary attributes with PEC boundary conditions for wave ports. + std::vector attributes = {}; + + [[nodiscard]] auto empty() const { return attributes.empty(); } + + void SetUp(json &boundaries); +}; + +struct FarfieldBoundaryData +{ +public: + // Approximation order for farfield ABC. + int order = 1; + + // List of boundary attributes with farfield absorbing boundary conditions. + std::vector attributes = {}; + + [[nodiscard]] auto empty() const { return attributes.empty(); } + + void SetUp(json &boundaries); +}; + +struct ConductivityData +{ +public: + // Electrical conductivity of the conductor [S/m]. 
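
// Illustrative sketch (not part of the patch): expanding the outer-product storage
// Σᵢ sᵢ * vᵢ * vᵢᵀ used by SymmetricMatrixData above into a dense 3x3 matrix. A minimal
// local copy of the struct is used so the snippet stands alone.
#include <array>
#include <cstddef>

template <std::size_t N>
struct SymmetricMatrixSketch
{
  std::array<double, N> s{};                 // Scale factors sᵢ.
  std::array<std::array<double, N>, N> v{};  // Vectors vᵢ.
};

std::array<std::array<double, 3>, 3> ToDense(const SymmetricMatrixSketch<3> &m)
{
  std::array<std::array<double, 3>, 3> M{};
  for (std::size_t k = 0; k < 3; k++)  // Accumulate the scaled outer products.
  {
    for (std::size_t i = 0; i < 3; i++)
    {
      for (std::size_t j = 0; j < 3; j++)
      {
        M[i][j] += m.s[k] * m.v[k][i] * m.v[k][j];
      }
    }
  }
  return M;
}
// With s filled by a scalar "diag" and v set to the Cartesian unit vectors (as the
// SymmetricMatrixData constructor above does), this recovers diag times the identity.
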
+ double sigma = 0.0; + + // Conductor relative permeability. + double mu_r = 1.0; + + // Optional conductor thickness [m]. + double h = 0.0; + + // Optional flag for an external boundary surface, relevant for the thickness correction. + bool external = false; + + // List of boundary attributes for this surface conductivity boundary condition. + std::vector attributes = {}; +}; + +struct ConductivityBoundaryData : public internal::DataVector +{ +public: + void SetUp(json &boundaries); +}; + +struct ImpedanceData +{ +public: + // Boundary surface resistance, inductance, and capacitance [Ω/sq, H/sq, F/sq]. + double Rs = 0.0; + double Ls = 0.0; + double Cs = 0.0; + + // List of boundary attributes for this impedance boundary condition. + std::vector attributes = {}; +}; + +struct ImpedanceBoundaryData : public internal::DataVector +{ +public: + void SetUp(json &boundaries); +}; + +struct LumpedPortData +{ +public: + // Port circuit resistance, inductance, and capacitance [Ω/sq, H/sq, F/sq]. + double R = 0.0; + double L = 0.0; + double C = 0.0; + + // Port surface resistance, inductance, and capacitance [Ω/sq, H/sq, F/sq]. + double Rs = 0.0; + double Ls = 0.0; + double Cs = 0.0; + + // Voltage for terminal BC [V] + double voltage = 0.0; + + // Input excitation for driven & transient solver: + // - Wave/Lumped ports with same index are excited together. + // - 1-based index if excited; 0 if not excited. + int excitation = 0; + + // Flag for boundary damping term in driven and transient simulations. + bool active = true; + + // For each lumped port index, each element contains a list of attributes making up a + // single element of a potentially multielement lumped port. + std::vector elements = {}; +}; + +struct LumpedPortBoundaryData : public internal::DataMap +{ +public: + void SetUp(json &boundaries); +}; + +struct PeriodicData +{ +public: + // Vector defining the affine transformation matrix for this periodic boundary condition. + std::array affine_transform = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + + // List of boundary donor attributes for this periodic boundary condition. + std::vector donor_attributes = {}; + + // List of boundary receiver attributes for this periodic boundary condition. + std::vector receiver_attributes = {}; +}; + +struct PeriodicBoundaryData +{ +public: + // Vector of periodic boundary pairs. + std::vector boundary_pairs = {}; + + // Floquet/Bloch wavevector specifying the phase delay in the X/Y/Z directions. + std::array wave_vector = {0.0, 0.0, 0.0}; + + void SetUp(json &boundaries); +}; + +struct WavePortData +{ +public: + // Mode index for the numeric wave port. + int mode_idx = 1; + + // Port offset for de-embedding [m]. + double d_offset = 0.0; + + // Eigenvalue solver type for boundary mode calculation. + EigenSolverBackend eigen_solver = EigenSolverBackend::DEFAULT; + + // Input excitation for driven & transient solver: + // - Wave/Lumped ports with same index are excited together. + // - 1-based index if excited; 0 if not excited. + int excitation = 0; + + // Flag for boundary damping term in driven and transient simulations. + bool active = true; + + // List of boundary attributes for this wave port. + std::vector attributes = {}; + + // Maximum number of iterations in linear solver. + int ksp_max_its = 45; + + // Tolerance for linear solver. + double ksp_tol = 1e-8; + + // Tolerance for eigenvalue solver. + double eig_tol = 1e-6; + + // Print level for linear and eigenvalue solvers. 
+ int verbose = 0; +}; + +struct WavePortBoundaryData : public internal::DataMap +{ +public: + void SetUp(json &boundaries); +}; + +struct SurfaceCurrentData +{ +public: + // For each surface current source index, each element contains a list of attributes + // making up a single element of a potentially multielement current source. + std::vector elements = {}; +}; + +struct SurfaceCurrentBoundaryData : public internal::DataMap +{ +public: + void SetUp(json &boundaries); +}; + +struct SurfaceFluxData +{ +public: + // Surface flux type. + SurfaceFlux type = SurfaceFlux::ELECTRIC; + + // Flag for whether or not to consider the boundary as an infinitely thin two-sided + // boundary for postprocessing. + bool two_sided = false; + + // Coordinates of a point away from which to compute the outward flux (for orienting the + // surface normal) [m]. + std::array center{{0.0, 0.0, 0.0}}; + + // Flag which indicates whether or not the center point was specified. + bool no_center = true; + + // List of boundary attributes for this surface flux postprocessing index. + std::vector attributes = {}; +}; + +struct SurfaceFluxPostData : public internal::DataMap +{ +public: + void SetUp(json &postpro); +}; + +struct InterfaceDielectricData +{ +public: + // Type of interface dielectric for computing electric field energy participation ratios. + InterfaceDielectric type = InterfaceDielectric::DEFAULT; + + // Dielectric interface thickness [m]. + double t = 0.0; + + // Relative permittivity. + double epsilon_r = 0.0; + + // Loss tangent. + double tandelta = 0.0; + + // List of boundary attributes for this interface dielectric postprocessing index. + std::vector attributes = {}; +}; + +struct InterfaceDielectricPostData : public internal::DataMap +{ +public: + void SetUp(json &postpro); +}; + +struct FarFieldPostData +{ +public: + // List of boundary attributes to use for the surface integral. + std::vector attributes = {}; + + // List of (theta, phi) where the wave-zone fields should be evaluated. + // Units are radians. + std::vector> thetaphis = {}; + + void SetUp(json &postpro); + + bool empty() const { return thetaphis.empty(); }; +}; + +struct BoundaryPostData +{ +public: + // List of all postprocessing boundary attributes. + std::vector attributes = {}; + + // Boundary postprocessing objects. + SurfaceFluxPostData flux = {}; + InterfaceDielectricPostData dielectric = {}; + FarFieldPostData farfield = {}; + + void SetUp(json &boundaries); +}; + +struct BoundaryData +{ +public: + // List of all boundary attributes (excluding postprocessing). + std::vector attributes = {}; + + // List of all boundary attributes affected by mesh cracking. + std::unordered_set cracked_attributes = {}; + + // Boundary objects. + PecBoundaryData pec = {}; + PmcBoundaryData pmc = {}; + WavePortPecBoundaryData auxpec = {}; + FarfieldBoundaryData farfield = {}; + ConductivityBoundaryData conductivity = {}; + ImpedanceBoundaryData impedance = {}; + LumpedPortBoundaryData lumpedport = {}; + WavePortBoundaryData waveport = {}; + SurfaceCurrentBoundaryData current = {}; + PeriodicBoundaryData periodic = {}; + BoundaryPostData postpro = {}; + + void SetUp(json &config); +}; + +// Solver Configuration. + +struct DrivenSolverData +{ +public: + // Explicit frequency samples [GHz]. + std::vector sample_f = {}; + + // Indices of frequency samples to explicitly add to the PROM. + std::vector prom_indices; + + // Indices of frequency samples on which to save fields to disk. 
+ std::vector save_indices; + + // Restart iteration for a partial sweep. 1-based indexing. So 1 <= restart <= nr_freq * + // nr_excitations. + int restart = 1; + + // Error tolerance for enabling adaptive frequency sweep. + double adaptive_tol = 0.0; + + // Maximum number of frequency samples for adaptive frequency sweep. + int adaptive_max_size = 20; + + // Memory required for adaptive sampling convergence. + int adaptive_memory = 2; + + void SetUp(json &solver); +}; + +struct EigenSolverData +{ +public: + // Target for shift-and-invert spectral transformation [GHz]. + double target = 0.0; + + // Eigenvalue solver relative tolerance. + double tol = 1.0e-6; + + // Maximum iterations for eigenvalue solver. + int max_it = -1; + + // Eigenvalue solver subspace dimension or maximum dimension before restart. + int max_size = -1; + + // Desired number of eigenmodes. + int n = 1; + + // Number of modes to write to disk. + int n_post = 0; + + // Use operator scaling in order to increase numerical robustness. + bool scale = true; + + // Compute and set a starting vector for the eigenvalue solver. + bool init_v0 = true; + bool init_v0_const = false; + + // Orthogonalize basis vectors using a mass matrix inner product, instead of generating + // using a standard ℓ² (Euclidean) norm. + bool mass_orthog = false; + + // Eigenvalue solver type. + EigenSolverBackend type = EigenSolverBackend::DEFAULT; + + // For SLEPc eigenvalue solver, use linearized formulation for quadratic eigenvalue + // problems. + bool pep_linear = true; + + // Nonlinear eigenvalue solver type. + NonlinearEigenSolver nonlinear_type = NonlinearEigenSolver::HYBRID; + + // For nonlinear problems, refine the linearized solution with a nonlinear eigensolver. + bool refine_nonlinear = true; + + // For nonlinear problems using the hybrid approach, relative tolerance of the linear + // eigenvalue solver used to generate the initial guess. + double linear_tol = 1e-3; + + // Upper end of the target range for nonlinear eigenvalue solver [GHz]. A value <0 + // will use the default (3 * target). + double target_upper = -1; + + // Update frequency of the preconditioner in the quasi-Newton nonlinear eigenvalue solver. + int preconditioner_lag = 10; + + // Relative tolerance below which the preconditioner is not updated, regardless of the + // lag. + double preconditioner_lag_tol = 1e-4; + + // Maximum number of failed attempts with a given initial guess in the quasi-Newton + // nonlinear eigenvalue solver. + int max_restart = 2; + + void SetUp(json &solver); +}; + +struct ElectrostaticSolverData +{ +public: + // Number of fields to write to disk. + int n_post = 0; + + void SetUp(json &solver); +}; + +struct MagnetostaticSolverData +{ +public: + // Number of fields to write to disk. + int n_post = 0; + + void SetUp(json &solver); +}; + +struct TransientSolverData +{ +public: + // Time integration scheme type. + TimeSteppingScheme type = TimeSteppingScheme::DEFAULT; + + // Excitation type for port excitation. + Excitation excitation = Excitation::SINUSOIDAL; + + // Excitation parameters: frequency [GHz] and pulse width [ns]. + double pulse_f = 0.0; + double pulse_tau = 0.0; + + // Upper bound of time interval [ns]. + double max_t = 1.0; + + // Step size for time stepping [ns]. + double delta_t = 1.0e-2; + + // Step increment for saving fields to disk. + int delta_post = 0; + + // RK scheme order for SUNDIALS ARKODE integrators. + // Max order for SUNDIALS CVODE integrator. + // Not used for generalized α and Runge-Kutta integrators. 
+ int order = 2; + + // Adaptive time-stepping tolerances for CVODE and ARKODE. + double rel_tol = 1e-4; + double abs_tol = 1e-9; + + void SetUp(json &solver); +}; + +struct LinearSolverData +{ +public: + // Solver type. + LinearSolver type = LinearSolver::DEFAULT; + + // Krylov solver type. + KrylovSolver krylov_solver = KrylovSolver::DEFAULT; + + // Iterative solver relative tolerance. + double tol = 1.0e-6; + + // Maximum iterations for iterative solver. + int max_it = 100; + + // Maximum Krylov space dimension for GMRES/FGMRES iterative solvers. + int max_size = -1; + + // Reuse previous solution as initial guess for Krylov solvers. + int initial_guess = -1; + + // Maximum number of levels for geometric multigrid (set to 1 to disable multigrid). + int mg_max_levels = 100; + + // Type of coarsening for p-multigrid. + MultigridCoarsening mg_coarsening = MultigridCoarsening::LOGARITHMIC; + + // Controls whether or not to include in the geometric multigrid hierarchy the mesh levels + // from uniform refinement. + bool mg_use_mesh = true; + + // Number of iterations for preconditioners which support it. For multigrid, this is the + // number of V-cycles per Krylov solver iteration. + int mg_cycle_it = 1; + + // Use auxiliary space smoothers on geometric multigrid levels. + int mg_smooth_aux = -1; + + // Number of pre-/post-smoothing iterations at each geometric or algebraic multigrid + // level. + int mg_smooth_it = 1; + + // Order of polynomial smoothing for geometric multigrid. + int mg_smooth_order = -1; + + // Safety factors for eigenvalue estimates associated with Chebyshev smoothing for + // geometric multigrid. + double mg_smooth_sf_max = 1.0; + double mg_smooth_sf_min = 0.0; + + // Smooth based on 4th-kind Chebyshev polynomials for geometric multigrid, otherwise + // use standard 1st-kind polynomials. + bool mg_smooth_cheby_4th = true; + + // For frequency domain applications, precondition linear systems with a real-valued + // approximation to the system matrix. + bool pc_mat_real = false; + + // For frequency domain applications, precondition linear systems with a shifted matrix + // (makes the preconditioner matrix SPD). + int pc_mat_shifted = -1; + + // For frequency domain applications, use the complex-valued system matrix in the sparse + // direct solver. + bool complex_coarse_solve = false; + + // Drop small entries (< numerical ε) in the system matrix used in the sparse direct + // solver. + bool drop_small_entries = false; + + // Reuse the sparsity pattern (reordering) for repeated factorizations. + bool reorder_reuse = true; + + // Choose left or right preconditioning. + PreconditionerSide pc_side = PreconditionerSide::DEFAULT; + + // Specify details for the column ordering method in the symbolic factorization for sparse + // direct solvers. + SymbolicFactorization sym_factorization = SymbolicFactorization::DEFAULT; + + // Low-rank and butterfly compression parameters for sparse direct solvers which support + // it (mainly STRUMPACK). + SparseCompression strumpack_compression_type = SparseCompression::NONE; + + double strumpack_lr_tol = 1.0e-3; + int strumpack_lossy_precision = 16; + int strumpack_butterfly_l = 1; + + // Option to enable 3D process grid for SuperLU_DIST solver. + bool superlu_3d = false; + + // Option to use vector or scalar Pi-space corrections for the AMS preconditioner. + bool ams_vector_interp = false; + + // Option to tell the AMS solver that the operator is singular, like for magnetostatic + // problems. 
+ int ams_singular_op = -1; + + // Option to use aggressive coarsening for Hypre AMG solves (with BoomerAMG or AMS). + // Typically use this when the operator is positive definite. + int amg_agg_coarsen = -1; + + // Relative tolerance for solving linear systems in divergence-free projector. + double divfree_tol = 1.0e-12; + + // Maximum number of iterations for solving linear systems in divergence-free projector. + int divfree_max_it = 1000; + + // Relative tolerance for solving linear systems in the error estimator. + double estimator_tol = 1.0e-6; + + // Maximum number of iterations for solving linear systems in the error estimator. + int estimator_max_it = 10000; + + // Use geometric multigrid + AMG for error estimator linear solver preconditioner (instead + // of just Jacobi). + bool estimator_mg = false; + + // Enable different variants of Gram-Schmidt orthogonalization for GMRES/FGMRES iterative + // solvers and SLEPc eigenvalue solver. + Orthogonalization gs_orthog = Orthogonalization::MGS; + + void SetUp(json &solver); +}; + +struct SolverData +{ +public: + // Approximation order. + int order = 1; + + // Order above which to use partial assembly instead of full assembly. + int pa_order_threshold = 1; + + // Include the order of det(J) in the order of accuracy for quadrature rule selection. + bool q_order_jac = false; + + // Additional quadrature order of accuracy (in addition to 2p or 2p + order(|J|)) for + // quadrature rule selection. + int q_order_extra = 0; + + // Device used to configure MFEM. + Device device = Device::CPU; + + // Backend for libCEED (https://libceed.org/en/latest/gettingstarted/#backends). + std::string ceed_backend = ""; + + // Solver objects. + DrivenSolverData driven = {}; + EigenSolverData eigenmode = {}; + ElectrostaticSolverData electrostatic = {}; + MagnetostaticSolverData magnetostatic = {}; + TransientSolverData transient = {}; + LinearSolverData linear = {}; + + void SetUp(json &config); +}; + +// Calculate the number of steps from [start, end) in increments of delta. Will only include +// end if it is a multiple of delta beyond start. +int GetNumSteps(double start, double end, double delta); + +} // namespace palace::config + +#endif // PALACE_UTILS_CONFIG_FILE_HPP diff --git a/palace/utils/constants.hpp b/palace/utils/constants.hpp index 0e9b8f29bf..4d70f81176 100644 --- a/palace/utils/constants.hpp +++ b/palace/utils/constants.hpp @@ -1,42 +1,38 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_UTILS_CONSTANTS_HPP -#define PALACE_UTILS_CONSTANTS_HPP - -#include - -namespace palace::electromagnetics -{ - -// -// Define physical constants for nondimensionalization. -// - -// Permittivity of free space [F/m]. -static constexpr double epsilon0_ = 8.8541878176e-12; - -// Permeability of free space [H/m]. -static constexpr double mu0_ = 4.0e-7 * M_PI; - -// Speed of light in free space [m/s]. -static -#if defined(PALACE_WITH_CONSTEXPR_SQRT) - constexpr -#else - const -#endif - double c0_ = 1.0 / std::sqrt(epsilon0_ * mu0_); - -// Impedance of free space [Ω]. -static -#if defined(PALACE_WITH_CONSTEXPR_SQRT) - constexpr -#else - const -#endif - double Z0_ = std::sqrt(mu0_ / epsilon0_); - -} // namespace palace::electromagnetics - -#endif // PALACE_UTILS_CONSTANTS_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
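To make the GetNumSteps contract declared at the end of configfile.hpp above concrete, here is a minimal standalone sketch (the real implementation lives elsewhere in the source and may differ; the tolerance value is an assumption): points are counted on [start, end) in increments of delta, and end itself is counted only when it falls on a multiple of delta beyond start.

#include <cmath>

int GetNumStepsSketch(double start, double end, double delta)
{
  if (end < start || delta <= 0.0)
  {
    return 1;  // Degenerate range: only the starting point.
  }
  // Points start + k * delta, k = 0, 1, ..., lying in [start, end), plus end itself when
  // (end - start) / delta is an integer: both cases reduce to floor(n) + 1, with a small
  // tolerance guarding against round-off when end is an exact multiple of delta.
  constexpr double eps = 1.0e-9;
  const double n = (end - start) / delta;
  return 1 + static_cast<int>(std::floor(n + eps));
}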
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_UTILS_CONSTANTS_HPP +#define PALACE_UTILS_CONSTANTS_HPP + +#include + +namespace palace::electromagnetics +{ + +// +// Define physical constants for nondimensionalization. +// +// SI units switched definition of the Ampere in 2019 so that mu0 is now defined in terms of +// the fine-structure constant alpha. Updates values: Mohr, P. J.; Newell, D. B.; Taylor, B. +// N.; Tiesinga, E. CODATA Recommended Values of the Fundamental Physical Constants: 2022. +// Rev. Mod. Phys. 2025, 97 (2), 025002. https://doi.org/10.1103/RevModPhys.97.025002. + +// Speed of light in free space [m/s]. +// Exact. +static constexpr double c0_ = 299'792'458; + +// Permeability of free space [H/m]. +// CODATA value with error: 1.256 637 061 27(20) * 10^(−6) +static constexpr double mu0_ = 1.256'637'061'27e-6; + +// Permittivity of free space [F/m]. +// CODATA value with error: 8.854 187 8188(14) * 10^(−12) +static constexpr double epsilon0_ = 1.0 / (mu0_ * c0_ * c0_); + +// Impedance of free space [Ω]. +// CODATA value with error: 376.730 313 412(59) +static constexpr double Z0_ = mu0_ * c0_; // = sqrt(mu0 / epsilon0) + +} // namespace palace::electromagnetics + +#endif // PALACE_UTILS_CONSTANTS_HPP diff --git a/palace/utils/device.cpp b/palace/utils/device.cpp new file mode 100644 index 0000000000..288b70050c --- /dev/null +++ b/palace/utils/device.cpp @@ -0,0 +1,36 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "device.hpp" +#include "communication.hpp" + +#include + +namespace palace::utils +{ + +int GetDeviceCount() +{ +#if defined(MFEM_USE_CUDA) || defined(MFEM_USE_HIP) + return mfem::Device::GetDeviceCount(); +#else + return 0; +#endif +} + +int GetDeviceId(MPI_Comm comm, int ngpu) +{ + // Assign devices round-robin over MPI ranks if GPU support is enabled. +#if defined(MFEM_USE_CUDA) || defined(MFEM_USE_HIP) + MPI_Comm node_comm; + MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, Mpi::Rank(comm), MPI_INFO_NULL, + &node_comm); + int node_size = Mpi::Rank(node_comm); + MPI_Comm_free(&node_comm); + return node_size % ngpu; +#else + return 0; +#endif +} + +} // namespace palace::utils diff --git a/palace/utils/device.hpp b/palace/utils/device.hpp new file mode 100644 index 0000000000..d6c1c46ce5 --- /dev/null +++ b/palace/utils/device.hpp @@ -0,0 +1,20 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef DEVICE_HPP +#define DEVICE_HPP + +#include "communication.hpp" + +namespace palace::utils +{ + +// Return the number of devices available. +int GetDeviceCount(); + +// Assign devices round-robin over MPI ranks. +int GetDeviceId(MPI_Comm comm, int ngpu); + +} // namespace palace::utils + +#endif // DEVICE_HPP diff --git a/palace/utils/diagnostic.hpp b/palace/utils/diagnostic.hpp index c212053c89..7c4e9a8e9a 100644 --- a/palace/utils/diagnostic.hpp +++ b/palace/utils/diagnostic.hpp @@ -1,18 +1,21 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
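As a quick standalone consistency check (illustrative only, not part of the patch) that the derived constants in the new constants.hpp above reproduce the quoted CODATA 2022 numbers: epsilon0 = 1/(mu0*c0^2) and Z0 = mu0*c0. The tolerances below are loose bounds chosen for illustration.

#include <cassert>
#include <cmath>

int main()
{
  constexpr double c0 = 299'792'458.0;         // Exact by SI definition.
  constexpr double mu0 = 1.256'637'061'27e-6;  // CODATA 2022.
  const double epsilon0 = 1.0 / (mu0 * c0 * c0);
  const double Z0 = mu0 * c0;
  assert(std::abs(epsilon0 - 8.854'187'8188e-12) < 1.0e-20);  // CODATA: 8.854 187 8188(14)e-12
  assert(std::abs(Z0 - 376.730'313'412) < 1.0e-6);            // CODATA: 376.730 313 412(59)
  return 0;
}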
-// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_UTILS_DIAGNOSTIC_HPP -#define PALACE_UTILS_DIAGNOSTIC_HPP - -#ifdef _MSC_VER -#define PalacePragmaDiagnosticPush _Pragma("warning(push)") -#define PalacePragmaDiagnosticPop _Pragma("warning(pop)") -#define PalacePragmaDiagnosticDisableDeprecated _Pragma("warning(disable : 4996)") -#else -#define PalacePragmaDiagnosticPush _Pragma("GCC diagnostic push") -#define PalacePragmaDiagnosticPop _Pragma("GCC diagnostic pop") -#define PalacePragmaDiagnosticDisableDeprecated \ - _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") -#endif - -#endif // PALACE_UTILS_DIAGNOSTIC_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_UTILS_DIAGNOSTIC_HPP +#define PALACE_UTILS_DIAGNOSTIC_HPP + +#if defined(_MSC_VER) +#define PalacePragmaDiagnosticPush _Pragma("warning(push)") +#define PalacePragmaDiagnosticPop _Pragma("warning(pop)") +#define PalacePragmaDiagnosticDisableDeprecated _Pragma("warning(disable:4996)") +#define PalacePragmaDiagnosticDisableUnused _Pragma("warning(disable:4505)") +#else +#define PalacePragmaDiagnosticPush _Pragma("GCC diagnostic push") +#define PalacePragmaDiagnosticPop _Pragma("GCC diagnostic pop") +#define PalacePragmaDiagnosticDisableDeprecated \ + _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") +#define PalacePragmaDiagnosticDisableUnused \ + _Pragma("GCC diagnostic ignored \"-Wunused-function\"") +#endif + +#endif // PALACE_UTILS_DIAGNOSTIC_HPP diff --git a/palace/utils/dorfler.cpp b/palace/utils/dorfler.cpp index 057f0418e3..ed4e8bd225 100644 --- a/palace/utils/dorfler.cpp +++ b/palace/utils/dorfler.cpp @@ -1,201 +1,201 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "dorfler.hpp" - -#include -#include -#include -#include - -namespace palace::utils -{ - -std::array ComputeDorflerThreshold(MPI_Comm comm, const Vector &e, - double fraction) -{ - // Precompute the sort and partial sum to make evaluating a candidate partition fast. - e.HostRead(); - std::vector estimates(e.begin(), e.end()); - std::sort(estimates.begin(), estimates.end()); - - // Accumulate the squares of the estimates. - std::vector sum(estimates.size()); - for (auto &x : estimates) - { - x *= x; - } - std::partial_sum(estimates.begin(), estimates.end(), sum.begin()); - for (auto &x : estimates) - { - x = std::sqrt(x); - } - - // The pivot is the first point which leaves (1-θ) of the total sum after it. - const double local_total = sum.size() > 0 ? sum.back() : 0.0; - auto pivot = std::lower_bound(sum.begin(), sum.end(), (1 - fraction) * local_total); - auto index = std::distance(sum.begin(), pivot); - double error_threshold = estimates.size() > 0 ? estimates[index] : 0.0; - - // Compute the number of elements, and amount of error, marked by threshold value e. - auto Marked = [&estimates, &sum, &local_total](double e) -> std::pair - { - if (local_total > 0) - { - const auto lb = std::lower_bound(estimates.begin(), estimates.end(), e); - const auto elems_marked = std::distance(lb, estimates.end()); - const double error_unmarked = - lb != estimates.begin() ? 
sum[sum.size() - elems_marked - 1] : 0; - const double error_marked = local_total - error_unmarked; - return {elems_marked, error_marked}; - } - else - { - return {0, 0.0}; - } - }; - - // Each processor will compute a different threshold: if a given processor has lots of low - // error elements, their value will be lower and if a processor has high error, their - // value will be higher. Thus using the value from the low error processor will give too - // many elements, and using the value from the high error processor will give too few. The - // correct threshold value will be an intermediate between the min and max over - // processors. - double min_threshold = error_threshold; - double max_threshold = error_threshold; - Mpi::GlobalMin(1, &min_threshold, comm); - Mpi::GlobalMax(1, &max_threshold, comm); - struct - { - std::size_t total; - std::size_t min_marked; - std::size_t max_marked; - } elements; - elements.total = estimates.size(); - struct - { - double total; - double min_marked; - double max_marked; - } error; - error.total = local_total; - std::tie(elements.max_marked, error.max_marked) = Marked(min_threshold); - std::tie(elements.min_marked, error.min_marked) = Marked(max_threshold); - Mpi::GlobalSum(3, &elements.total, comm); - Mpi::GlobalSum(3, &error.total, comm); - const double max_indicator = [&]() - { - double max_indicator = estimates.size() > 0 ? estimates.back() : 0.0; - Mpi::GlobalMax(1, &max_indicator, comm); - return max_indicator; - }(); - MFEM_ASSERT(min_threshold <= max_threshold, - "Error in Dorfler marking: min: " << min_threshold << " max " << max_threshold - << "!"); - auto [elem_marked, error_marked] = Marked(error_threshold); - - // Keep track of the number of elements marked by the threshold bounds. If the top and - // bottom values are equal (or separated by only 1), there's no point further bisecting. - // The maximum iterations is just to prevert runaway. - constexpr int max_it = 100; - for (int i = 0; i < max_it; i++) - { - error_threshold = (min_threshold + max_threshold) / 2; - std::tie(elem_marked, error_marked) = Marked(error_threshold); - - // All processors need the values used for the stopping criteria. - Mpi::GlobalSum(1, &elem_marked, comm); - Mpi::GlobalSum(1, &error_marked, comm); - MFEM_ASSERT(elem_marked > 0, "Some elements must have been marked!"); - MFEM_ASSERT(error_marked > 0, "Some error must have been marked!"); - const auto candidate_fraction = error_marked / error.total; - if constexpr (false) - { - Mpi::Print( - "Marking threshold: {:e} < {:e} < {:e}\nMarked elements: {:d} <= {:d} <= {:d}\n", - min_threshold, error_threshold, max_threshold, elements.min_marked, elem_marked, - elements.max_marked); - } - - // Set the tolerance based off of the largest local indicator value. These tolerance - // values extremely tight because this loop is fast, and getting the marking correct is - // important. - constexpr double frac_tol = 2 * std::numeric_limits::epsilon(); - const double error_tol = 2 * std::numeric_limits::epsilon() * max_indicator; - if (std::abs(max_threshold - min_threshold) < error_tol || - std::abs(candidate_fraction - fraction) < frac_tol || - elements.max_marked <= (elements.min_marked + 1)) - { - // Candidate fraction matches to tolerance, or the number of marked elements is no - // longer changing. 
- if constexpr (false) - { - Mpi::Print("ΔFraction: {:.3e} (tol = {:.3e})\nΔThreshold: {:.3e} (tol = " - "{:.3e})\nΔElements: {:d}\n", - candidate_fraction - fraction, frac_tol, max_threshold - min_threshold, - error_tol, elements.max_marked - elements.min_marked); - } - break; - } - - // Update in preparation for next iteration. The logic here looks inverted compared to a - // usual binary search, because a smaller value marks a larger number of elements and - // thus a greater fraction of error. - if (candidate_fraction > fraction) - { - // This candidate marked too much, raise the lower bound. - min_threshold = error_threshold; - elements.max_marked = elem_marked; - error.max_marked = error_marked; - } - else if (candidate_fraction < fraction) - { - // This candidate marked too little, lower the upper bound. - max_threshold = error_threshold; - elements.min_marked = elem_marked; - error.min_marked = error_marked; - } - } - - // Always choose the lower threshold value, thereby marking the larger number of elements - // and fraction of the total error. Would rather over mark than under mark, as Dörfler - // marking is the smallest set that covers at least the specified fraction of the error. - error_threshold = min_threshold; - error_marked = error.max_marked; - MFEM_ASSERT(error_threshold > 0.0, - "Error threshold result from marking must be positive!"); - MFEM_VERIFY(error_marked >= fraction * error.total, - "Marked error = " << error_marked << ", total error =" << error.total - << ". Dorfler marking predicate failed!"); - return {error_threshold, error_marked / error.total}; -} - -std::array ComputeDorflerCoarseningThreshold(const mfem::ParMesh &mesh, - const Vector &e, double fraction) -{ - MFEM_VERIFY(mesh.Nonconforming(), "Can only perform coarsening on a Nonconforming mesh!"); - const auto &derefinement_table = mesh.pncmesh->GetDerefinementTable(); - mfem::Array elem_error(e.Size()); - for (int i = 0; i < e.Size(); i++) - { - elem_error[i] = e[i]; - } - mesh.pncmesh->SynchronizeDerefinementData(elem_error, derefinement_table); - Vector coarse_error(derefinement_table.Size()); - mfem::Array row; - for (int i = 0; i < derefinement_table.Size(); i++) - { - derefinement_table.GetRow(i, row); - coarse_error[i] = std::sqrt(std::accumulate( - row.begin(), row.end(), 0.0, - [&elem_error](double s, int i) { return s += std::pow(elem_error[i], 2.0); })); - } - - // Given the coarse errors, we use the Dörfler marking strategy to identify the - // smallest set of original elements that make up (1 - θ) of the total error. The - // complement of this set is then the largest number of elements that make up θ of the - // total error. - return ComputeDorflerThreshold(mesh.GetComm(), coarse_error, 1.0 - fraction); -} - -} // namespace palace::utils +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "dorfler.hpp" + +#include +#include +#include +#include + +namespace palace::utils +{ + +std::array ComputeDorflerThreshold(MPI_Comm comm, const Vector &e, + double fraction) +{ + // Precompute the sort and partial sum to make evaluating a candidate partition fast. + e.HostRead(); + std::vector estimates(e.begin(), e.end()); + std::sort(estimates.begin(), estimates.end()); + + // Accumulate the squares of the estimates. 
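// Illustrative walk-through of the threshold selection below (example values only, not
// from the Palace sources). Suppose the local estimates are {1, 2, 2, 3} and
// fraction = 0.7:
//   sorted estimates : {1, 2, 2, 3}
//   squared          : {1, 4, 4, 9}
//   partial sums     : {1, 5, 9, 18}, so local_total = 18
//   pivot            : first partial sum >= (1 - 0.7) * 18 = 5.4, i.e. index 2
//   error_threshold  : estimates[2] = 2, so elements with estimate >= 2 are marked,
//                      carrying 4 + 4 + 9 = 17 of the 18 total squared error (~94% >= 70%).
// The bisection over [min_threshold, max_threshold] further below then reconciles the
// per-process thresholds into a single global value.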
+ std::vector sum(estimates.size()); + for (auto &x : estimates) + { + x *= x; + } + std::partial_sum(estimates.begin(), estimates.end(), sum.begin()); + for (auto &x : estimates) + { + x = std::sqrt(x); + } + + // The pivot is the first point which leaves (1-θ) of the total sum after it. + const double local_total = sum.size() > 0 ? sum.back() : 0.0; + auto pivot = std::lower_bound(sum.begin(), sum.end(), (1 - fraction) * local_total); + auto index = std::distance(sum.begin(), pivot); + double error_threshold = estimates.size() > 0 ? estimates[index] : 0.0; + + // Compute the number of elements, and amount of error, marked by threshold value e. + auto Marked = [&estimates, &sum, &local_total](double e) -> std::pair + { + if (local_total > 0) + { + const auto lb = std::lower_bound(estimates.begin(), estimates.end(), e); + const auto elems_marked = std::distance(lb, estimates.end()); + const double error_unmarked = + lb != estimates.begin() ? sum[sum.size() - elems_marked - 1] : 0; + const double error_marked = local_total - error_unmarked; + return {elems_marked, error_marked}; + } + else + { + return {0, 0.0}; + } + }; + + // Each processor will compute a different threshold: if a given processor has lots of low + // error elements, their value will be lower and if a processor has high error, their + // value will be higher. Thus using the value from the low error processor will give too + // many elements, and using the value from the high error processor will give too few. The + // correct threshold value will be an intermediate between the min and max over + // processors. + double min_threshold = error_threshold; + double max_threshold = error_threshold; + Mpi::GlobalMin(1, &min_threshold, comm); + Mpi::GlobalMax(1, &max_threshold, comm); + struct + { + std::size_t total; + std::size_t min_marked; + std::size_t max_marked; + } elements; + elements.total = estimates.size(); + struct + { + double total; + double min_marked; + double max_marked; + } error; + error.total = local_total; + std::tie(elements.max_marked, error.max_marked) = Marked(min_threshold); + std::tie(elements.min_marked, error.min_marked) = Marked(max_threshold); + Mpi::GlobalSum(3, &elements.total, comm); + Mpi::GlobalSum(3, &error.total, comm); + const double max_indicator = [&]() + { + double max_indicator = estimates.size() > 0 ? estimates.back() : 0.0; + Mpi::GlobalMax(1, &max_indicator, comm); + return max_indicator; + }(); + MFEM_ASSERT(min_threshold <= max_threshold, + "Error in Dorfler marking: min: " << min_threshold << " max " << max_threshold + << "!"); + auto [elem_marked, error_marked] = Marked(error_threshold); + + // Keep track of the number of elements marked by the threshold bounds. If the top and + // bottom values are equal (or separated by only 1), there's no point further bisecting. + // The maximum iterations is just to prevert runaway. + constexpr int max_it = 100; + for (int i = 0; i < max_it; i++) + { + error_threshold = (min_threshold + max_threshold) / 2; + std::tie(elem_marked, error_marked) = Marked(error_threshold); + + // All processors need the values used for the stopping criteria. 
+ Mpi::GlobalSum(1, &elem_marked, comm); + Mpi::GlobalSum(1, &error_marked, comm); + MFEM_ASSERT(elem_marked > 0, "Some elements must have been marked!"); + MFEM_ASSERT(error_marked > 0, "Some error must have been marked!"); + const auto candidate_fraction = error_marked / error.total; + if constexpr (false) + { + Mpi::Print( + "Marking threshold: {:e} < {:e} < {:e}\nMarked elements: {:d} <= {:d} <= {:d}\n", + min_threshold, error_threshold, max_threshold, elements.min_marked, elem_marked, + elements.max_marked); + } + + // Set the tolerance based off of the largest local indicator value. These tolerance + // values extremely tight because this loop is fast, and getting the marking correct is + // important. + constexpr double frac_tol = 2 * std::numeric_limits::epsilon(); + const double error_tol = 2 * std::numeric_limits::epsilon() * max_indicator; + if (std::abs(max_threshold - min_threshold) < error_tol || + std::abs(candidate_fraction - fraction) < frac_tol || + elements.max_marked <= (elements.min_marked + 1)) + { + // Candidate fraction matches to tolerance, or the number of marked elements is no + // longer changing. + if constexpr (false) + { + Mpi::Print("ΔFraction: {:.3e} (tol = {:.3e})\nΔThreshold: {:.3e} (tol = " + "{:.3e})\nΔElements: {:d}\n", + candidate_fraction - fraction, frac_tol, max_threshold - min_threshold, + error_tol, elements.max_marked - elements.min_marked); + } + break; + } + + // Update in preparation for next iteration. The logic here looks inverted compared to a + // usual binary search, because a smaller value marks a larger number of elements and + // thus a greater fraction of error. + if (candidate_fraction > fraction) + { + // This candidate marked too much, raise the lower bound. + min_threshold = error_threshold; + elements.max_marked = elem_marked; + error.max_marked = error_marked; + } + else if (candidate_fraction < fraction) + { + // This candidate marked too little, lower the upper bound. + max_threshold = error_threshold; + elements.min_marked = elem_marked; + error.min_marked = error_marked; + } + } + + // Always choose the lower threshold value, thereby marking the larger number of elements + // and fraction of the total error. Would rather over mark than under mark, as Dörfler + // marking is the smallest set that covers at least the specified fraction of the error. + error_threshold = min_threshold; + error_marked = error.max_marked; + MFEM_ASSERT(error_threshold > 0.0, + "Error threshold result from marking must be positive!"); + MFEM_VERIFY(error_marked >= fraction * error.total, + "Marked error = " << error_marked << ", total error =" << error.total + << ". 
Dorfler marking predicate failed!"); + return {error_threshold, error_marked / error.total}; +} + +std::array ComputeDorflerCoarseningThreshold(const mfem::ParMesh &mesh, + const Vector &e, double fraction) +{ + MFEM_VERIFY(mesh.Nonconforming(), "Can only perform coarsening on a Nonconforming mesh!"); + const auto &derefinement_table = mesh.pncmesh->GetDerefinementTable(); + mfem::Array elem_error(e.Size()); + for (int i = 0; i < e.Size(); i++) + { + elem_error[i] = e[i]; + } + mesh.pncmesh->SynchronizeDerefinementData(elem_error, derefinement_table); + Vector coarse_error(derefinement_table.Size()); + mfem::Array row; + for (int i = 0; i < derefinement_table.Size(); i++) + { + derefinement_table.GetRow(i, row); + coarse_error[i] = std::sqrt( + std::accumulate(row.begin(), row.end(), 0.0, [&elem_error](double s, int i) + { return s += std::pow(elem_error[i], 2.0); })); + } + + // Given the coarse errors, we use the Dörfler marking strategy to identify the + // smallest set of original elements that make up (1 - θ) of the total error. The + // complement of this set is then the largest number of elements that make up θ of the + // total error. + return ComputeDorflerThreshold(mesh.GetComm(), coarse_error, 1.0 - fraction); +} + +} // namespace palace::utils diff --git a/palace/utils/dorfler.hpp b/palace/utils/dorfler.hpp index 66e6277405..ecb158f14b 100644 --- a/palace/utils/dorfler.hpp +++ b/palace/utils/dorfler.hpp @@ -1,41 +1,41 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_UTILS_DORFLER_HPP -#define PALACE_UTILS_DORFLER_HPP - -#include -#include "linalg/vector.hpp" -#include "utils/communication.hpp" - -namespace mfem -{ - -class ParMesh; - -} // namespace mfem - -namespace palace::utils -{ - -// Given a vector of estimates, e, and a fraction, compute a partition value E, such that -// that the set of all estimates with value greater than E, K_E, is the smallest set to -// achieve sum_{K_E} e² >= fraction * sum e². Namely the smallest set of elements that -// will mark the top fraction of the sum of the squared error. Returns as the second element -// in the pair the actual fraction of the total error. -// Reference: Dörfler, A convergent adaptive algorithm for Poisson’s equation, SIAM J. -// Numer. Anal. (1996). -std::array ComputeDorflerThreshold(MPI_Comm comm, const Vector &e, - double fraction); - -// Given a nonconforming mesh, target fraction and error estimates, compute a threshold -// value and actual fraction that will mark the largest number of elements that make up the -// specified fraction of the total coarsening opportunities. This is analogous to -// ComputeDorflerThreshold, but operates only the list of available derefinement -// opportunities within the mesh. -std::array ComputeDorflerCoarseningThreshold(const mfem::ParMesh &mesh, - const Vector &e, double fraction); - -} // namespace palace::utils - -#endif // PALACE_UTILS_DORFLER_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
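A minimal usage sketch for the marking interface declared in dorfler.hpp just below (standalone illustration; MarkElementsSketch and the estimates vector are placeholders, not Palace code):

#include <mfem.hpp>

#include "linalg/vector.hpp"
#include "utils/dorfler.hpp"

// Collect the indices of local elements whose error indicator meets the Dörfler
// threshold for the requested fraction of the total (squared) error.
void MarkElementsSketch(mfem::ParMesh &mesh, const palace::Vector &estimates,
                        double fraction, mfem::Array<int> &marked)
{
  auto [threshold, marked_fraction] =
      palace::utils::ComputeDorflerThreshold(mesh.GetComm(), estimates, fraction);
  (void)marked_fraction;  // Actual fraction of the total error that was marked.
  estimates.HostRead();   // Ensure the data is accessible on the host.
  marked.DeleteAll();
  for (int i = 0; i < estimates.Size(); i++)
  {
    if (estimates[i] >= threshold)
    {
      marked.Append(i);  // Candidate for refinement.
    }
  }
}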
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef PALACE_UTILS_DORFLER_HPP
+#define PALACE_UTILS_DORFLER_HPP
+
+#include <array>
+#include "linalg/vector.hpp"
+#include "utils/communication.hpp"
+
+namespace mfem
+{
+
+class ParMesh;
+
+}  // namespace mfem
+
+namespace palace::utils
+{
+
+// Given a vector of estimates, e, and a fraction, compute a partition value E, such that
+// the set of all estimates with value greater than E, K_E, is the smallest set to
+// achieve sum_{K_E} e² >= fraction * sum e². Namely, the smallest set of elements that
+// will mark the top fraction of the sum of the squared error. The second element of the
+// returned array is the actual fraction of the total error.
+// Reference: Dörfler, A convergent adaptive algorithm for Poisson’s equation, SIAM J.
+// Numer. Anal. (1996).
+std::array<double, 2> ComputeDorflerThreshold(MPI_Comm comm, const Vector &e,
+                                              double fraction);
+
+// Given a nonconforming mesh, target fraction and error estimates, compute a threshold
+// value and actual fraction that will mark the largest number of elements that make up the
+// specified fraction of the total coarsening opportunities. This is analogous to
+// ComputeDorflerThreshold, but operates only on the list of available derefinement
+// opportunities within the mesh.
+std::array<double, 2> ComputeDorflerCoarseningThreshold(const mfem::ParMesh &mesh,
+                                                        const Vector &e, double fraction);
+
+}  // namespace palace::utils
+
+#endif  // PALACE_UTILS_DORFLER_HPP
diff --git a/palace/utils/excitations.hpp b/palace/utils/excitations.hpp
index 83a5a7a213..fcd6aee543 100644
--- a/palace/utils/excitations.hpp
+++ b/palace/utils/excitations.hpp
@@ -1,114 +1,114 @@
-// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
-// SPDX-License-Identifier: Apache-2.0
-
-#ifndef PALACE_UTILS_EXCITATIONS_HPP
-#define PALACE_UTILS_EXCITATIONS_HPP
-
-#include
-
-namespace palace::excitations
-{
-
-//
-// Define temporal excitation functions for transient simulations.
-// - -inline double pulse_sinusoidal(double t, double omega, double t0) -{ - // g(t) = sin(ω*(t-t0)) - return std::sin(omega * (t - t0)); -} - -inline double dpulse_sinusoidal(double t, double omega, double t0) -{ - // g(t) = sin(ω*(t-t0)) - return omega * std::cos(omega * (t - t0)); -} - -inline double pulse_gaussian(double t, double tau, double t0) -{ - // g(t) = exp(-0.5*(t-t0)²/τ²) - double ts = t - t0; - return std::exp(-0.5 * ts * ts / (tau * tau)); -} - -inline double dpulse_gaussian(double t, double tau, double t0) -{ - // g(t) = exp(-0.5*(t-t0)²/τ²) - double ootau2 = 1.0 / (tau * tau); - double ts = t - t0; - return -ts * ootau2 * std::exp(-0.5 * ts * ts * ootau2); -} - -inline double pulse_gaussian_diff(double t, double tau, double t0) -{ - // g(t) = -(t-t0)/τ²*exp(-0.5*(t-t0)²/τ²) - double ootau2 = 1.0 / (tau * tau); - double ts = t - t0; - return -ts * ootau2 * std::exp(-0.5 * ts * ts * ootau2); -} - -inline double dpulse_gaussian_diff(double t, double tau, double t0) -{ - // g(t) = -(t-t0)/τ²*exp(-0.5*(t-t0)²/τ²) - double ootau2 = 1.0 / (tau * tau); - double ts = t - t0; - double ts2 = ts * ts; - return -ootau2 * (1.0 - ts2 * ootau2) * std::exp(-0.5 * ts2 * ootau2); -} - -inline double pulse_gaussian_mod(double t, double omega, double tau, double t0) -{ - // g(t) = sin(ω*(t-t0))*exp(-0.5*(t-t0)²/τ²) - double ts = t - t0; - return std::sin(omega * ts) * std::exp(-0.5 * ts * ts / (tau * tau)); -} - -inline double dpulse_gaussian_mod(double t, double omega, double tau, double t0) -{ - // g(t) = sin(ω*(t-t0))*exp(-0.5*(t-t0)²/τ²) - double ootau2 = 1.0 / (tau * tau); - double ts = t - t0; - return (-ts * ootau2 * std::sin(omega * ts) + omega * std::cos(omega * ts)) * - std::exp(-0.5 * ts * ts * ootau2); -} - -inline double pulse_ramp(double t, double tau, double t0) -{ - // g(t) = 0, t <= t0 - // (t-t0)/τ, t0 < t <= τ - // 1, t > τ+t0 - return (t <= t0) ? 0.0 : ((t - t0 >= tau) ? 1.0 : (t - t0) / tau); -} - -inline double dpulse_ramp(double t, double tau, double t0) -{ - // g(t) = 0, t <= t0 - // (t-t0)/τ, t0 < t <= τ - // 1, t > τ - return (t <= t0) ? 0.0 : ((t - t0 >= tau) ? 0.0 : 1.0 / tau); -} - -inline double pulse_smootherstep(double t, double tau, double t0) -{ - // g(t) = 0, t <= t0 - // 6*((t-t0)/τ)⁵-15*((t-t0)/τ)⁴+10*((t-t0)/τ)³, t0 < t <= τ+t0 - // 1, t > τ+t0 - double ts = (t <= t0) ? 0.0 : ((t - t0 >= tau) ? 1.0 : (t - t0) / tau); - double ts2 = ts * ts; - return ts * ts2 * (6.0 * ts2 - 15.0 * ts + 10.0); -} - -inline double dpulse_smootherstep(double t, double tau, double t0) -{ - // g(t) = 0, t <= t0 - // 6*((t-t0)/τ)⁵-15*((t-t0)/τ)⁴+10*((t-t0)/τ)³, t0 < t <= τ - // 1, t > τ - double ts = (t <= t0) ? 0.0 : ((t - t0 >= tau) ? 1.0 : (t - t0) / tau); - double ts2 = ts * ts; - return ts2 / tau * (30.0 * ts2 - 60.0 * ts + 30.0); -} - -} // namespace palace::excitations - -#endif // PALACE_UTILS_EXCITATIONS_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_UTILS_EXCITATIONS_HPP +#define PALACE_UTILS_EXCITATIONS_HPP + +#include + +namespace palace::excitations +{ + +// +// Define temporal excitation functions for transient simulations. 
+// + +inline double pulse_sinusoidal(double t, double omega, double t0) +{ + // g(t) = sin(ω*(t-t0)) + return std::sin(omega * (t - t0)); +} + +inline double dpulse_sinusoidal(double t, double omega, double t0) +{ + // g(t) = sin(ω*(t-t0)) + return omega * std::cos(omega * (t - t0)); +} + +inline double pulse_gaussian(double t, double tau, double t0) +{ + // g(t) = exp(-0.5*(t-t0)²/τ²) + double ts = t - t0; + return std::exp(-0.5 * ts * ts / (tau * tau)); +} + +inline double dpulse_gaussian(double t, double tau, double t0) +{ + // g(t) = exp(-0.5*(t-t0)²/τ²) + double ootau2 = 1.0 / (tau * tau); + double ts = t - t0; + return -ts * ootau2 * std::exp(-0.5 * ts * ts * ootau2); +} + +inline double pulse_gaussian_diff(double t, double tau, double t0) +{ + // g(t) = -(t-t0)/τ²*exp(-0.5*(t-t0)²/τ²) + double ootau2 = 1.0 / (tau * tau); + double ts = t - t0; + return -ts * ootau2 * std::exp(-0.5 * ts * ts * ootau2); +} + +inline double dpulse_gaussian_diff(double t, double tau, double t0) +{ + // g(t) = -(t-t0)/τ²*exp(-0.5*(t-t0)²/τ²) + double ootau2 = 1.0 / (tau * tau); + double ts = t - t0; + double ts2 = ts * ts; + return -ootau2 * (1.0 - ts2 * ootau2) * std::exp(-0.5 * ts2 * ootau2); +} + +inline double pulse_gaussian_mod(double t, double omega, double tau, double t0) +{ + // g(t) = sin(ω*(t-t0))*exp(-0.5*(t-t0)²/τ²) + double ts = t - t0; + return std::sin(omega * ts) * std::exp(-0.5 * ts * ts / (tau * tau)); +} + +inline double dpulse_gaussian_mod(double t, double omega, double tau, double t0) +{ + // g(t) = sin(ω*(t-t0))*exp(-0.5*(t-t0)²/τ²) + double ootau2 = 1.0 / (tau * tau); + double ts = t - t0; + return (-ts * ootau2 * std::sin(omega * ts) + omega * std::cos(omega * ts)) * + std::exp(-0.5 * ts * ts * ootau2); +} + +inline double pulse_ramp(double t, double tau, double t0) +{ + // g(t) = 0, t <= t0 + // (t-t0)/τ, t0 < t <= τ + // 1, t > τ+t0 + return (t <= t0) ? 0.0 : ((t - t0 >= tau) ? 1.0 : (t - t0) / tau); +} + +inline double dpulse_ramp(double t, double tau, double t0) +{ + // g(t) = 0, t <= t0 + // (t-t0)/τ, t0 < t <= τ + // 1, t > τ + return (t <= t0) ? 0.0 : ((t - t0 >= tau) ? 0.0 : 1.0 / tau); +} + +inline double pulse_smootherstep(double t, double tau, double t0) +{ + // g(t) = 0, t <= t0 + // 6*((t-t0)/τ)⁵-15*((t-t0)/τ)⁴+10*((t-t0)/τ)³, t0 < t <= τ+t0 + // 1, t > τ+t0 + double ts = (t <= t0) ? 0.0 : ((t - t0 >= tau) ? 1.0 : (t - t0) / tau); + double ts2 = ts * ts; + return ts * ts2 * (6.0 * ts2 - 15.0 * ts + 10.0); +} + +inline double dpulse_smootherstep(double t, double tau, double t0) +{ + // g(t) = 0, t <= t0 + // 6*((t-t0)/τ)⁵-15*((t-t0)/τ)⁴+10*((t-t0)/τ)³, t0 < t <= τ + // 1, t > τ + double ts = (t <= t0) ? 0.0 : ((t - t0 >= tau) ? 1.0 : (t - t0) / tau); + double ts2 = ts * ts; + return ts2 / tau * (30.0 * ts2 - 60.0 * ts + 30.0); +} + +} // namespace palace::excitations + +#endif // PALACE_UTILS_EXCITATIONS_HPP diff --git a/palace/utils/filesystem.hpp b/palace/utils/filesystem.hpp index 1c03be80b4..4654605f7e 100644 --- a/palace/utils/filesystem.hpp +++ b/palace/utils/filesystem.hpp @@ -1,19 +1,24 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
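Each pulse_* function in the new excitations.hpp above is paired with a dpulse_* function giving its time derivative. A standalone spot check (illustrative only; the parameter values are arbitrary) compares the analytic derivative of the modulated Gaussian against a central finite difference:

#include <cassert>
#include <cmath>

#include "utils/excitations.hpp"

int main()
{
  using namespace palace::excitations;
  const double omega = 30.0, tau = 0.4, t0 = 1.0;  // Arbitrary test parameters.
  const double h = 1.0e-6;                         // Finite-difference step.
  for (double t : {0.5, 1.0, 1.7})
  {
    const double fd = (pulse_gaussian_mod(t + h, omega, tau, t0) -
                       pulse_gaussian_mod(t - h, omega, tau, t0)) /
                      (2.0 * h);
    // The analytic derivative should agree with the central difference to O(h²).
    assert(std::abs(fd - dpulse_gaussian_mod(t, omega, tau, t0)) < 1.0e-6);
  }
  return 0;
}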
-// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_UTILS_FILESYSTEM_HPP -#define PALACE_UTILS_FILESYSTEM_HPP - -#if defined(__cpp_lib_filesystem) || defined(__has_include) && __has_include() -#include -#elif defined(__cpp_lib_experimental_filesystem) || \ - defined(__has_include) && __has_include() -// clang-format off -#include -namespace std { namespace filesystem = experimental::filesystem; } -// clang-format on -#else -#error "Could not find system header or !" -#endif - -#endif // PALACE_UTILS_FILESYSTEM_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_UTILS_FILESYSTEM_HPP +#define PALACE_UTILS_FILESYSTEM_HPP + +#if defined(__cpp_lib_filesystem) || defined(__has_include) && __has_include() +#include +namespace palace +{ +namespace fs = std::filesystem; +} // namespace palace +#elif defined(__cpp_lib_experimental_filesystem) || \ + defined(__has_include) && __has_include() +#include +namespace palace +{ +namespace fs = std::experimental::filesystem; +} // namespace palace +#else +#error "Could not find system header or !" +#endif + +#endif // PALACE_UTILS_FILESYSTEM_HPP diff --git a/palace/utils/geodata.cpp b/palace/utils/geodata.cpp index 536a3d3faa..fe83baf0a9 100644 --- a/palace/utils/geodata.cpp +++ b/palace/utils/geodata.cpp @@ -1,1983 +1,2790 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "geodata.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include "utils/communication.hpp" -#include "utils/diagnostic.hpp" -#include "utils/filesystem.hpp" -#include "utils/iodata.hpp" -#include "utils/meshio.hpp" -#include "utils/timer.hpp" - -namespace palace -{ - -using Vector3dMap = Eigen::Map; -using CVector3dMap = Eigen::Map; - -namespace -{ - -// Floating point precision for mesh IO. This precision is important, make sure nothing is -// lost! -constexpr auto MSH_FLT_PRECISION = std::numeric_limits::max_digits10; - -// Load the serial mesh from disk. -std::unique_ptr LoadMesh(const std::string &, bool); - -// Optionally reorder mesh elements based on MFEM's internal reordeing tools for improved -// cache usage. -void ReorderMesh(mfem::Mesh &); - -// Generate element-based mesh partitioning, using either a provided file or METIS. -std::unique_ptr GetMeshPartitioning(mfem::Mesh &, int, const std::string & = ""); - -// Cleanup the provided serial mesh by removing unnecessary domain and elements, adding -// boundary elements for material interfaces and exterior boundaries, and adding boundary -// elements for subdomain interfaces. -std::map> CheckMesh(mfem::Mesh &, const std::unique_ptr &, - const IoData &, bool, bool, bool); - -// Given a serial mesh on the root processor and element partitioning, create a parallel -// mesh over the given communicator. -std::unique_ptr DistributeMesh(MPI_Comm, const std::unique_ptr &, - const std::unique_ptr & = nullptr, - const std::string & = ""); - -// Get list of domain and boundary attribute markers used in configuration file for mesh -// cleaning. -void GetUsedAttributeMarkers(const IoData &, int, int, mfem::Array &, - mfem::Array &); - -// Rebalance a conformal mesh across processor ranks, using the MeshPartitioner. Gathers the -// mesh onto the root rank before scattering the partitioned mesh. 
-void RebalanceConformalMesh(std::unique_ptr &, double, const std::string &); - -struct ElementTypeInfo -{ - bool has_simplices; - bool has_tensors; - bool has_wedges; - bool has_pyramids; -}; - -// Simplified helper for describing the element types in a mesh. -ElementTypeInfo CheckElements(mfem::Mesh &mesh) -{ - // MeshGenerator is reduced over the communicator. This checks for geometries on any - // processor. - auto meshgen = mesh.MeshGenerator(); - return {bool(meshgen & 1), bool(meshgen & 2), bool(meshgen & 4), bool(meshgen & 8)}; -} - -} // namespace - -namespace mesh -{ - -std::unique_ptr ReadMesh(MPI_Comm comm, const IoData &iodata, bool reorder, - bool clean, bool add_bdr, bool unassembled) -{ - // If possible on root, read the serial mesh (converting format if necessary), and do all - // necessary serial preprocessing. When finished, distribute the mesh to all processes. - // Count disk I/O time separately for the mesh read from file. - - // If not adapting, or performing conformal adaptation, can use the mesh partitioner. - std::unique_ptr smesh; - const auto &refinement = iodata.model.refinement; - const bool use_amr = refinement.max_it > 0; - const bool use_mesh_partitioner = !use_amr || !refinement.nonconformal; - { - BlockTimer bt(Timer::IO); - if (Mpi::Root(comm) || !use_mesh_partitioner) - { - // Optionally reorder elements (and vertices) based on spatial location after loading - // the serial mesh. - smesh = LoadMesh(iodata.model.mesh, iodata.model.remove_curvature); - if (reorder) - { - smesh = LoadMesh(iodata.model.mesh, iodata.model.remove_curvature); - if (reorder) - { - ReorderMesh(*smesh); - } - } - MFEM_VERIFY(!(smesh->Nonconforming() && use_mesh_partitioner), - "Cannot use mesh partitioner on a nonconforming mesh"); - } - Mpi::Barrier(comm); - } - - std::unique_ptr partitioning; - if (Mpi::Root(comm) || !use_mesh_partitioner) - { - // Check the the AMR specification and the mesh elements are compatible. - const auto element_types = CheckElements(*smesh); - MFEM_VERIFY(!use_amr || !element_types.has_tensors || refinement.nonconformal, - "If there are tensor elements, AMR must be nonconformal"); - MFEM_VERIFY(!use_amr || !element_types.has_pyramids || refinement.nonconformal, - "If there are pyramid elements, AMR must be nonconformal"); - MFEM_VERIFY(!use_amr || !element_types.has_wedges || refinement.nonconformal, - "If there are wedge elements, AMR must be nonconformal"); - - // Generate the mesh partitioning. - partitioning = GetMeshPartitioning(*smesh, Mpi::Size(comm), iodata.model.partition); - - // Clean up unused domain elements from the mesh, add new boundary elements for material - // interfaces if not present, and optionally (when running unassembled) add subdomain - // interface boundary elements. Can only clean up conforming meshes, assumes that any - // nonconformal mesh was generated by adaptation and thus does not need checking. 
- if (smesh->Conforming()) - { - static_cast( - CheckMesh(*smesh, partitioning, iodata, clean, add_bdr, unassembled)); - } - } - - std::unique_ptr pmesh; - if (use_mesh_partitioner) - { - pmesh = DistributeMesh(comm, smesh, partitioning, iodata.problem.output); - } - else - { - if (refinement.nonconformal && use_amr) - { - smesh->EnsureNCMesh(true); - } - pmesh = std::make_unique(comm, *smesh, partitioning.get()); - } - - if constexpr (false) - { - std::string tmp = iodata.problem.output; - if (tmp.back() != '/') - { - tmp += '/'; - } - tmp += "tmp/"; - if (Mpi::Root(comm) && !std::filesystem::exists(tmp)) - { - std::filesystem::create_directories(tmp); - } - int width = 1 + static_cast(std::log10(Mpi::Size(comm) - 1)); - std::unique_ptr gsmesh = - LoadMesh(iodata.model.mesh, iodata.model.remove_curvature); - std::unique_ptr gpartitioning = GetMeshPartitioning(*gsmesh, Mpi::Size(comm)); - mfem::ParMesh gpmesh(comm, *gsmesh, gpartitioning.get(), 0); - { - std::string pfile = - mfem::MakeParFilename(tmp + "part.", Mpi::Rank(comm), ".mesh", width); - std::ofstream fo(pfile); - // mfem::ofgzstream fo(pfile, true); // Use zlib compression if available - fo.precision(MSH_FLT_PRECISION); - gpmesh.ParPrint(fo); - } - { - std::string pfile = - mfem::MakeParFilename(tmp + "final.", Mpi::Rank(comm), ".mesh", width); - std::ofstream fo(pfile); - // mfem::ofgzstream fo(pfile, true); // Use zlib compression if available - fo.precision(MSH_FLT_PRECISION); - pmesh->ParPrint(fo); - } - } - - return pmesh; -} - -void RefineMesh(const IoData &iodata, std::vector> &mesh) -{ - // Prepare for uniform and region-based refinement. - MFEM_VERIFY(mesh.size() == 1, - "Input mesh vector before refinement has more than a single mesh!"); - int uniform_ref_levels = iodata.model.refinement.uniform_ref_levels; - int max_region_ref_levels = 0; - for (const auto &box : iodata.model.refinement.GetBoxes()) - { - if (max_region_ref_levels < box.ref_levels) - { - max_region_ref_levels = box.ref_levels; - } - } - for (const auto &sphere : iodata.model.refinement.GetSpheres()) - { - if (max_region_ref_levels < sphere.ref_levels) - { - max_region_ref_levels = sphere.ref_levels; - } - } - if (iodata.solver.linear.mg_max_levels > 1) - { - mesh.reserve(1 + uniform_ref_levels + max_region_ref_levels); - } - - // Prior to MFEM's PR #1046, the tetrahedral mesh required reorientation after all mesh - // refinement in order to define higher-order Nedelec spaces on it. This is technically - // not required after MFEM's PR #1046, but in case you want to be absolutely sure, we - // reorient only the coarse mesh so that the refinements are still true refinements of - // the original mesh (required for geometric multigrid). Otherwise, it happens after - // refinement. - if (iodata.model.reorient_tet && mesh.capacity() > 1) - { - PalacePragmaDiagnosticPush - PalacePragmaDiagnosticDisableDeprecated - mesh[0]->ReorientTetMesh(); - PalacePragmaDiagnosticPop - } - - // Uniformly refine the mesh further in parallel, saving the level meshes for geometric - // coarsening later on if desired. - for (int l = 0; l < uniform_ref_levels; l++) - { - if (mesh.capacity() > 1) - { - mesh.emplace_back(std::make_unique(*mesh.back())); - } - mesh.back()->UniformRefinement(); - } - - // Proceed with region-based refinement, level-by-level for all regions. Currently support - // box and sphere region shapes. Any overlap between regions is ignored (take the union, - // don't double-refine). 
- if (max_region_ref_levels > 0 && - (mesh[0]->MeshGenerator() & 2 || mesh[0]->MeshGenerator() & 4 || - mesh[0]->MeshGenerator() & 8)) - { - // XX TODO: Region-based refinement won't work if the ParMesh has been constructed from - // a conforming mesh, but nonconforming refinement is needed. Unclear if the - // current mesh distribution scheme will work even for a conforming serial mesh - // which is a NCMesh after Mesh::EnsureNCMesh is called. - MFEM_ABORT("Region-based refinement is currently only supported for simplex meshes!"); - } - const bool use_nodes = (mesh.back()->GetNodes() != nullptr); - const int ref = use_nodes ? mesh.back()->GetNodes()->FESpace()->GetMaxElementOrder() : 1; - const int dim = mesh.back()->SpaceDimension(); - int region_ref_level = 0; - while (region_ref_level < max_region_ref_levels) - { - // Mark elements for refinement in all regions. An element is marked for refinement if - // any of its vertices are inside any refinement region for the given level. - mfem::Array refs; - for (int i = 0; i < mesh.back()->GetNE(); i++) - { - bool refine = false; - mfem::DenseMatrix pointmat; - if (use_nodes) - { - mfem::ElementTransformation *T = mesh.back()->GetElementTransformation(i); - mfem::Geometry::Type geo = mesh.back()->GetElementGeometry(i); - mfem::RefinedGeometry *RefG = mfem::GlobGeometryRefiner.Refine(geo, ref); - T->Transform(RefG->RefPts, pointmat); - } - else - { - mfem::Array verts; - mesh.back()->GetElementVertices(i, verts); - pointmat.SetSize(dim, verts.Size()); - for (int j = 0; j < verts.Size(); j++) - { - const double *coord = mesh.back()->GetVertex(verts[j]); - for (int d = 0; d < dim; d++) - { - pointmat(d, j) = coord[d]; - } - } - } - for (const auto &box : iodata.model.refinement.GetBoxes()) - { - if (region_ref_level < box.ref_levels) - { - for (int j = 0; j < pointmat.Width(); j++) - { - // Check if the point is inside the box. - int d = 0; - for (; d < pointmat.Height(); d++) - { - if (pointmat(d, j) < box.bbmin[d] || pointmat(d, j) > box.bbmax[d]) - { - break; - } - } - if (d == dim) - { - refine = true; - break; - } - } - if (refine) - { - break; - } - } - } - if (refine) - { - refs.Append(mfem::Refinement(i)); - continue; - } - for (const auto &sphere : iodata.model.refinement.GetSpheres()) - { - if (region_ref_level < sphere.ref_levels) - { - for (int j = 0; j < pointmat.Width(); j++) - { - // Check if the point is inside the sphere. - double dist = 0.0; - for (int d = 0; d < pointmat.Height(); d++) - { - double s = pointmat(d, j) - sphere.center[d]; - dist += s * s; - } - if (dist <= sphere.r * sphere.r) - { - refine = true; - break; - } - } - if (refine) - { - break; - } - } - } - if (refine) - { - refs.Append(mfem::Refinement(i)); - } - } - - // Do the refinement. For tensor element meshes, this may make the mesh nonconforming - // (adds hanging nodes). - if (mesh.capacity() > 1) - { - mesh.emplace_back(std::make_unique(*mesh.back())); - } - mesh.back()->GeneralRefinement(refs, -1); - region_ref_level++; - } - - // Prior to MFEM's PR #1046, the tetrahedral mesh required reorientation after all mesh - // refinement in order to define higher-order Nedelec spaces on it. This is technically - // not required after MFEM's PR #1046, but in case you want to be absolutely sure, we - // reorient only the mesh after refinement if there is a single mesh (doesn't work with - // h-refinement geometric multigrid). 
- if (iodata.model.reorient_tet && mesh.size() == 1) - { - PalacePragmaDiagnosticPush - PalacePragmaDiagnosticDisableDeprecated - mesh[0]->ReorientTetMesh(); - PalacePragmaDiagnosticPop - } - - // Print some mesh information. - mfem::Vector bbmin, bbmax; - mesh[0]->GetBoundingBox(bbmin, bbmax); - const double Lc = iodata.DimensionalizeValue(IoData::ValueType::LENGTH, 1.0); - Mpi::Print(mesh[0]->GetComm(), "\nMesh curvature order: {}\nMesh bounding box:\n", - mesh[0]->GetNodes() - ? std::to_string(mesh[0]->GetNodes()->FESpace()->GetMaxElementOrder()) - : "None"); - if (mesh[0]->SpaceDimension() == 3) - { - Mpi::Print(mesh[0]->GetComm(), - " (Xmin, Ymin, Zmin) = ({:+.3e}, {:+.3e}, {:+.3e}) m\n" - " (Xmax, Ymax, Zmax) = ({:+.3e}, {:+.3e}, {:+.3e}) m\n", - bbmin[0] * Lc, bbmin[1] * Lc, bbmin[2] * Lc, bbmax[0] * Lc, bbmax[1] * Lc, - bbmax[2] * Lc); - } - else - { - Mpi::Print(mesh[0]->GetComm(), - " (Xmin, Ymin) = ({:+.3e}, {:+.3e}) m\n" - " (Xmax, Ymax) = ({:+.3e}, {:+.3e}) m\n", - bbmin[0] * Lc, bbmin[1] * Lc, bbmax[0] * Lc, bbmax[1] * Lc); - } - Mpi::Print(mesh[0]->GetComm(), "\n{}", (mesh.size() > 1) ? "Coarse " : ""); - mesh[0]->PrintInfo(); - if (mesh.size() > 1) - { - Mpi::Print(mesh[0]->GetComm(), "\nRefined "); - mesh.back()->PrintInfo(); - } -} - -namespace -{ - -void ScaleMesh(mfem::Mesh &mesh, double L) -{ - for (int i = 0; i < mesh.GetNV(); i++) - { - double *v = mesh.GetVertex(i); - std::transform(v, v + mesh.SpaceDimension(), v, [L](double val) { return val * L; }); - } - if (mesh.GetNodes()) - { - *mesh.GetNodes() *= L; - } -} - -} // namespace - -void DimensionalizeMesh(mfem::Mesh &mesh, double L) -{ - ScaleMesh(mesh, L); -} - -void NondimensionalizeMesh(mfem::Mesh &mesh, double L) -{ - ScaleMesh(mesh, 1.0 / L); -} - -void AttrToMarker(int max_attr, const mfem::Array &attrs, mfem::Array &marker) -{ - MFEM_VERIFY(attrs.Size() == 0 || attrs.Max() <= max_attr, - "Invalid attribute number present (" << attrs.Max() << ")!"); - marker.SetSize(max_attr); - if (attrs.Size() == 1 && attrs[0] == -1) - { - marker = 1; - } - else - { - marker = 0; - for (auto attr : attrs) - { - MFEM_VERIFY(attr > 0, "Attribute number less than one!"); - MFEM_VERIFY(marker[attr - 1] == 0, "Repeate attribute in attribute list!"); - marker[attr - 1] = 1; - } - } -} - -void AttrToMarker(int max_attr, const std::vector &attrs, mfem::Array &marker) -{ - MFEM_VERIFY(attrs.empty() || *std::max_element(attrs.begin(), attrs.end()) <= max_attr, - "Invalid attribute number present (" - << *std::max_element(attrs.begin(), attrs.end()) << ")!"); - marker.SetSize(max_attr); - if (attrs.size() == 1 && attrs[0] == -1) - { - marker = 1; - } - else - { - marker = 0; - for (auto attr : attrs) - { - MFEM_VERIFY(attr > 0, "Attribute number less than one!"); - MFEM_VERIFY(marker[attr - 1] == 0, "Repeate attribute in attribute list!"); - marker[attr - 1] = 1; - } - } -} - -void GetAxisAlignedBoundingBox(mfem::ParMesh &mesh, int attr, bool bdr, mfem::Vector &min, - mfem::Vector &max) -{ - mfem::Array marker(bdr ? 
mesh.bdr_attributes.Max() : mesh.attributes.Max()); - marker = 0; - marker[attr - 1] = 1; - GetAxisAlignedBoundingBox(mesh, marker, bdr, min, max); -} - -void GetAxisAlignedBoundingBox(mfem::ParMesh &mesh, const mfem::Array &marker, - bool bdr, mfem::Vector &min, mfem::Vector &max) -{ - int dim = mesh.SpaceDimension(); - min.SetSize(dim); - max.SetSize(dim); - for (int d = 0; d < dim; d++) - { - min(d) = mfem::infinity(); - max(d) = -mfem::infinity(); - } - if (!mesh.GetNodes()) - { - auto BBUpdate = [&mesh, &dim, &min, &max](mfem::Array &verts) -> void - { - for (int j = 0; j < verts.Size(); j++) - { - const double *coord = mesh.GetVertex(verts[j]); - for (int d = 0; d < dim; d++) - { - if (coord[d] < min(d)) - { - min(d) = coord[d]; - } - if (coord[d] > max(d)) - { - max(d) = coord[d]; - } - } - } - }; - if (bdr) - { - for (int i = 0; i < mesh.GetNBE(); i++) - { - if (!marker[mesh.GetBdrAttribute(i) - 1]) - { - continue; - } - mfem::Array verts; - mesh.GetBdrElementVertices(i, verts); - BBUpdate(verts); - } - } - else - { - for (int i = 0; i < mesh.GetNE(); i++) - { - if (!marker[mesh.GetAttribute(i) - 1]) - { - continue; - } - mfem::Array verts; - mesh.GetElementVertices(i, verts); - BBUpdate(verts); - } - } - } - else - { - const int ref = mesh.GetNodes()->FESpace()->GetMaxElementOrder(); - auto BBUpdate = [&ref, &min, &max](mfem::ElementTransformation *T, - mfem::Geometry::Type &geo) -> void - { - mfem::DenseMatrix pointmat; - mfem::RefinedGeometry *RefG = mfem::GlobGeometryRefiner.Refine(geo, ref); - T->Transform(RefG->RefPts, pointmat); - for (int j = 0; j < pointmat.Width(); j++) - { - for (int d = 0; d < pointmat.Height(); d++) - { - if (pointmat(d, j) < min(d)) - { - min(d) = pointmat(d, j); - } - if (pointmat(d, j) > max(d)) - { - max(d) = pointmat(d, j); - } - } - } - }; - if (bdr) - { - for (int i = 0; i < mesh.GetNBE(); i++) - { - if (!marker[mesh.GetBdrAttribute(i) - 1]) - { - continue; - } - mfem::ElementTransformation *T = mesh.GetBdrElementTransformation(i); - mfem::Geometry::Type geo = mesh.GetBdrElementGeometry(i); - BBUpdate(T, geo); - } - } - else - { - for (int i = 0; i < mesh.GetNE(); i++) - { - if (!marker[mesh.GetAttribute(i) - 1]) - { - continue; - } - mfem::ElementTransformation *T = mesh.GetElementTransformation(i); - mfem::Geometry::Type geo = mesh.GetElementGeometry(i); - BBUpdate(T, geo); - } - } - } - auto *Min = min.HostReadWrite(); - auto *Max = max.HostReadWrite(); - Mpi::GlobalMin(dim, Min, mesh.GetComm()); - Mpi::GlobalMax(dim, Max, mesh.GetComm()); -} - -double BoundingBox::Area() const -{ - return 4.0 * - CVector3dMap(normals[0].data()).cross(CVector3dMap(normals[1].data())).norm(); -} - -double BoundingBox::Volume() const -{ - return planar ? 0.0 : 2.0 * CVector3dMap(normals[2].data()).norm() * Area(); -} - -std::array BoundingBox::Lengths() const -{ - return {2.0 * CVector3dMap(normals[0].data()).norm(), - 2.0 * CVector3dMap(normals[1].data()).norm(), - 2.0 * CVector3dMap(normals[2].data()).norm()}; -} - -std::array BoundingBox::Deviation(const std::array &direction) const -{ - const auto eig_dir = CVector3dMap(direction.data()); - std::array deviation_deg; - for (std::size_t i = 0; i < 3; i++) - { - deviation_deg[i] = - std::acos(std::min(1.0, std::abs(eig_dir.normalized().dot( - CVector3dMap(normals[i].data()).normalized())))) * - (180.0 / M_PI); - } - return deviation_deg; -} - -namespace -{ - -// Compute a lexicographic comparison of Eigen Vector3d. 
-bool EigenLE(const Eigen::Vector3d &x, const Eigen::Vector3d &y) -{ - return std::lexicographical_compare(x.begin(), x.end(), y.begin(), y.end()); -}; - -// Helper for collecting a point cloud from a mesh, used in calculating bounding boxes and -// bounding balls. Returns the dominant rank, for which the vertices argument will be -// filled, while all other ranks will have an empty vector. Vertices are de-duplicated to a -// certain floating point precision. -int CollectPointCloudOnRoot(mfem::ParMesh &mesh, const mfem::Array &marker, bool bdr, - std::vector &vertices) -{ - std::set vertex_indices; - if (!mesh.GetNodes()) - { - // Linear mesh, work with element vertices directly. - mfem::Array v; - if (bdr) - { - for (int i = 0; i < mesh.GetNBE(); i++) - { - if (!marker[mesh.GetBdrAttribute(i) - 1]) - { - continue; - } - mesh.GetBdrElementVertices(i, v); - vertex_indices.insert(v.begin(), v.end()); - } - } - else - { - for (int i = 0; i < mesh.GetNE(); i++) - { - if (!marker[mesh.GetAttribute(i) - 1]) - { - continue; - } - mesh.GetElementVertices(i, v); - vertex_indices.insert(v.begin(), v.end()); - } - } - for (auto i : vertex_indices) - { - const auto &vx = mesh.GetVertex(i); - vertices.push_back({vx[0], vx[1], vx[2]}); - } - } - else - { - // Nonlinear mesh, need to process point matrices. - const int ref = mesh.GetNodes()->FESpace()->GetMaxElementOrder(); - mfem::DenseMatrix pointmat; // 3 x N - if (bdr) - { - for (int i = 0; i < mesh.GetNBE(); i++) - { - if (!marker[mesh.GetBdrAttribute(i) - 1]) - { - continue; - } - mfem::ElementTransformation *T = mesh.GetBdrElementTransformation(i); - T->Transform( - mfem::GlobGeometryRefiner.Refine(mesh.GetBdrElementGeometry(i), ref)->RefPts, - pointmat); - for (int j = 0; j < pointmat.Width(); j++) - { - vertices.push_back({pointmat(0, j), pointmat(1, j), pointmat(2, j)}); - } - } - } - else - { - for (int i = 0; i < mesh.GetNE(); i++) - { - if (!marker[mesh.GetAttribute(i) - 1]) - { - continue; - } - mfem::ElementTransformation *T = mesh.GetElementTransformation(i); - T->Transform( - mfem::GlobGeometryRefiner.Refine(mesh.GetElementGeometry(i), ref)->RefPts, - pointmat); - for (int j = 0; j < pointmat.Width(); j++) - { - vertices.push_back({pointmat(0, j), pointmat(1, j), pointmat(2, j)}); - } - } - } - } - - // dominant_rank will perform the calculation. - MPI_Comm comm = mesh.GetComm(); - const auto num_vertices = int(vertices.size()); - const int dominant_rank = [&]() - { - int vert = num_vertices, rank = Mpi::Rank(comm); - Mpi::GlobalMaxLoc(1, &vert, &rank, comm); - return rank; - }(); - std::vector recv_counts(Mpi::Size(comm)), displacements; - std::vector collected_vertices; - MPI_Gather(&num_vertices, 1, MPI_INT, recv_counts.data(), 1, MPI_INT, dominant_rank, - comm); - if (dominant_rank == Mpi::Rank(comm)) - { - // First displacement is zero, then after is the partial sum of recv_counts. - displacements.resize(Mpi::Size(comm)); - displacements[0] = 0; - std::partial_sum(recv_counts.begin(), recv_counts.end() - 1, displacements.begin() + 1); - - // Add on slots at the end of vertices for the incoming data. - collected_vertices.resize(std::accumulate(recv_counts.begin(), recv_counts.end(), 0)); - - // MPI transfer will be done with MPI_DOUBLE, so duplicate all these values. - for (auto &x : displacements) - { - x *= 3; - } - for (auto &x : recv_counts) - { - x *= 3; - } - } - - // Gather the data to the dominant rank. 
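Aside (not part of the patch): the point cloud is gathered onto the dominant rank with MPI_Gather for the per-rank counts followed by MPI_Gatherv for the coordinates, with displacements built by a partial sum, as set up above and completed just below. A standalone sketch of that pattern (GatherPoints is a hypothetical helper; assumes an initialized MPI environment and packed xyz triplets):

#include <mpi.h>
#include <numeric>
#include <vector>

// Gather packed (x, y, z) coordinates from all ranks onto the given root. On every other
// rank the returned vector is empty.
std::vector<double> GatherPoints(MPI_Comm comm, int root, const std::vector<double> &xyz)
{
  int rank, size;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &size);

  // Exchange counts in units of points, then scale counts and displacements to doubles
  // (3 per point) for the variable-length gather.
  int num_points = static_cast<int>(xyz.size()) / 3;
  std::vector<int> recv_counts(size), displacements(size);
  MPI_Gather(&num_points, 1, MPI_INT, recv_counts.data(), 1, MPI_INT, root, comm);

  std::vector<double> collected;
  if (rank == root)
  {
    displacements[0] = 0;
    std::partial_sum(recv_counts.begin(), recv_counts.end() - 1,
                     displacements.begin() + 1);
    collected.resize(3 * std::accumulate(recv_counts.begin(), recv_counts.end(), 0));
    for (auto &x : recv_counts)
    {
      x *= 3;
    }
    for (auto &x : displacements)
    {
      x *= 3;
    }
  }
  MPI_Gatherv(xyz.data(), 3 * num_points, MPI_DOUBLE, collected.data(), recv_counts.data(),
              displacements.data(), MPI_DOUBLE, root, comm);
  return collected;
}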
- static_assert(sizeof(Eigen::Vector3d) == 3 * sizeof(double)); - MPI_Gatherv(vertices.data(), 3 * num_vertices, MPI_DOUBLE, collected_vertices.data(), - recv_counts.data(), displacements.data(), MPI_DOUBLE, dominant_rank, comm); - - // Deduplicate vertices. Given floating point precision, need a tolerance. - if (dominant_rank == Mpi::Rank(comm)) - { - auto vertex_equality = [](const auto &x, const auto &y) - { - constexpr double tolerance = 10.0 * std::numeric_limits::epsilon(); - return std::abs(x[0] - y[0]) < tolerance && std::abs(x[1] - y[1]) < tolerance && - std::abs(x[2] - y[2]) < tolerance; - }; - vertices = std::move(collected_vertices); - std::sort(vertices.begin(), vertices.end(), EigenLE); - vertices.erase(std::unique(vertices.begin(), vertices.end(), vertex_equality), - vertices.end()); - } - else - { - vertices.clear(); - } - - return dominant_rank; -} - -// Calculates a bounding box from a point cloud, result is broadcast across all processes. -BoundingBox BoundingBoxFromPointCloud(MPI_Comm comm, - const std::vector &vertices, - int dominant_rank) -{ - BoundingBox box; - if (dominant_rank == Mpi::Rank(comm)) - { - // Pick a candidate 000 vertex using lexicographic sort. This can be vulnerable to - // floating point precision if the box is axis aligned, but not floating point exact. - // Pick candidate 111 as the furthest from this candidate, then reassign 000 as the - // furthest from 111. Such a pair has to form the diagonal for a point cloud defining a - // box. Verify that p_111 is also the maximum distance from p_000 -> a diagonal is - // found. - MFEM_VERIFY(vertices.size() >= 4, - "A bounding box requires a minimum of four vertices for this algorithm!"); - auto p_000 = std::min_element(vertices.begin(), vertices.end(), EigenLE); - auto DistFromP_000 = [&p_000](const Eigen::Vector3d &x, const Eigen::Vector3d &y) - { return (x - *p_000).norm() < (y - *p_000).norm(); }; - auto p_111 = std::max_element(vertices.begin(), vertices.end(), DistFromP_000); - auto DistFromP_111 = [&p_111](const Eigen::Vector3d &x, const Eigen::Vector3d &y) - { return (x - *p_111).norm() < (y - *p_111).norm(); }; - p_000 = std::max_element(vertices.begin(), vertices.end(), DistFromP_111); - MFEM_VERIFY(std::max_element(vertices.begin(), vertices.end(), DistFromP_000) == p_111, - "p_000 and p_111 must be mutually opposing points!"); - - // Define a diagonal of the ASSUMED cuboid bounding box. Store references as this is - // useful for checking pointers later. - const auto &v_000 = *p_000; - const auto &v_111 = *p_111; - MFEM_VERIFY(&v_000 != &v_111, "Minimum and maximum extents cannot be identical!"); - const auto origin = v_000; - const Eigen::Vector3d n_1 = (v_111 - v_000).normalized(); - - // Compute the distance from the normal axis. Note: everything has been oriented - // relative to v_000 == (0,0,0). - auto PerpendicularDistance = [&n_1, &origin](const Eigen::Vector3d &v) - { return ((v - origin) - (v - origin).dot(n_1) * n_1).norm(); }; - - // Find the vertex furthest from the diagonal axis. We cannot know yet if this defines - // (001) or (011). - const auto &t_0 = - *std::max_element(vertices.begin(), vertices.end(), - [PerpendicularDistance](const auto &x, const auto &y) - { return PerpendicularDistance(x) < PerpendicularDistance(y); }); - MFEM_VERIFY(&t_0 != &v_000, "Vertices are degenerate!"); - MFEM_VERIFY(&t_0 != &v_111, "Vertices are degenerate!"); - - // Use the discovered vertex to define a second direction and thus a plane. 
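Aside (not part of the patch): the second in-plane direction referenced in the comment above is one Gram-Schmidt step, the component of (t_0 - origin) orthogonal to n_1, and the out-of-plane distance of a point is what remains after removing its components along both n_1 and n_2. A standalone sketch using plain arrays instead of Eigen (helper names are illustrative):

#include <array>
#include <cmath>

using Vec3 = std::array<double, 3>;

double Dot(const Vec3 &a, const Vec3 &b)
{
  return a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
}

Vec3 SubtractScaled(const Vec3 &a, const Vec3 &b, double alpha)  // a - alpha * b
{
  return {a[0] - alpha * b[0], a[1] - alpha * b[1], a[2] - alpha * b[2]};
}

Vec3 Normalize(const Vec3 &v)
{
  const double norm = std::sqrt(Dot(v, v));
  return {v[0] / norm, v[1] / norm, v[2] / norm};
}

// Second in-plane direction from a point t_0 off the n_1 axis (all relative to origin).
Vec3 SecondDirection(const Vec3 &t_0, const Vec3 &origin, const Vec3 &n_1)
{
  const Vec3 r = SubtractScaled(t_0, origin, 1.0);
  return Normalize(SubtractScaled(r, n_1, Dot(r, n_1)));
}

// Distance of v from the plane through origin spanned by the unit vectors n_1 and n_2.
double OutOfPlaneDistance(const Vec3 &v, const Vec3 &origin, const Vec3 &n_1,
                          const Vec3 &n_2)
{
  Vec3 r = SubtractScaled(v, origin, 1.0);
  r = SubtractScaled(r, n_1, Dot(r, n_1));
  r = SubtractScaled(r, n_2, Dot(r, n_2));
  return std::sqrt(Dot(r, r));
}

int main()
{
  const Vec3 origin = {0.0, 0.0, 0.0}, n_1 = {1.0, 0.0, 0.0}, t_0 = {0.5, 2.0, 0.0};
  const Vec3 n_2 = SecondDirection(t_0, origin, n_1);  // -> (0, 1, 0)
  return (OutOfPlaneDistance({0.0, 0.0, 3.0}, origin, n_1, n_2) == 3.0) ? 0 : 1;
}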
- const Eigen::Vector3d n_2 = - ((t_0 - origin) - (t_0 - origin).dot(n_1) * n_1).normalized(); - - // n_1 and n_2 now define a planar coordinate system intersecting the main diagonal, and - // two opposite edges of the cuboid. Now look for a component that maximizes distance - // from the planar system: complete the axes with a cross, then use a dot product to - // pick the greatest deviation. - auto OutOfPlaneDistance = [&n_1, &n_2, &origin](const Eigen::Vector3d &v) - { - return ((v - origin) - (v - origin).dot(n_1) * n_1 - (v - origin).dot(n_2) * n_2) - .norm(); - }; - - // Collect the furthest point from the plane. - auto max_distance = OutOfPlaneDistance( - *std::max_element(vertices.begin(), vertices.end(), - [OutOfPlaneDistance](const auto &x, const auto &y) - { return OutOfPlaneDistance(x) < OutOfPlaneDistance(y); })); - - constexpr double rel_tol = 1e-6; - box.planar = max_distance < (rel_tol * (v_111 - v_000).norm()); - - // Given numerical tolerance, collect other points with an almost matching distance. - std::vector vertices_out_of_plane; - const double cooincident_tolerance = rel_tol * max_distance; - std::copy_if( - vertices.begin(), vertices.end(), std::back_inserter(vertices_out_of_plane), - [OutOfPlaneDistance, cooincident_tolerance, max_distance](const auto &v) - { return std::abs(OutOfPlaneDistance(v) - max_distance) < cooincident_tolerance; }); - - // Given candidates t_0 and t_1, the closer to origin defines v_001. - const auto &t_1 = box.planar - ? t_0 - : *std::min_element(vertices_out_of_plane.begin(), - vertices_out_of_plane.end(), DistFromP_000); - const bool t_0_gt_t_1 = - (t_0 - origin).norm() > (t_1 - origin).norm(); // If planar t_1 == t_0 - const auto &v_001 = t_0_gt_t_1 ? t_1 : t_0; - const auto &v_011 = box.planar ? v_111 : (t_0_gt_t_1 ? t_0 : t_1); - - // Compute the center as halfway along the main diagonal. - Vector3dMap(box.center.data()) = 0.5 * (v_000 + v_111); - - // The length in each direction is given by traversing the edges of the cuboid in turn. - Vector3dMap(box.normals[0].data()) = 0.5 * (v_001 - v_000); - Vector3dMap(box.normals[1].data()) = 0.5 * (v_011 - v_001); - Vector3dMap(box.normals[2].data()) = 0.5 * (v_111 - v_011); - - // Make sure the longest dimension comes first. - std::sort(box.normals.begin(), box.normals.end(), - [](const auto &x, const auto &y) - { return CVector3dMap(x.data()).norm() > CVector3dMap(y.data()).norm(); }); - } - - // Broadcast result to all processors. - Mpi::Broadcast(3, box.center.data(), dominant_rank, comm); - Mpi::Broadcast(3 * 3, box.normals.data()->data(), dominant_rank, comm); - Mpi::Broadcast(1, &box.planar, dominant_rank, comm); - - return box; -} - -// Calculates a bounding ball from a point cloud, result is broadcast across all processes. -BoundingBall BoundingBallFromPointCloud(MPI_Comm comm, - const std::vector &vertices, - int dominant_rank) -{ - BoundingBall ball; - if (dominant_rank == Mpi::Rank(comm)) - { - // Pick a candidate 000 vertex using lexicographic sort. This can be vulnerable to - // floating point precision if there is no directly opposed vertex. - // Pick candidate 111 as the furthest from this candidate, then reassign 000 as the - // furthest from 111. Such a pair has to form the diagonal for a point cloud defining a - // ball. Verify that p_111 is also the maximum distance from p_000 -> a diagonal is - // found. 
- MFEM_VERIFY(vertices.size() >= 3, - "A bounding ball requires a minimum of three vertices for this algorithm!"); - auto p_000 = std::min_element(vertices.begin(), vertices.end(), EigenLE); - auto DistFromP_000 = [&p_000](const Eigen::Vector3d &x, const Eigen::Vector3d &y) - { return (x - *p_000).norm() < (y - *p_000).norm(); }; - auto p_111 = std::max_element(vertices.begin(), vertices.end(), DistFromP_000); - auto DistFromP_111 = [&p_111](const Eigen::Vector3d &x, const Eigen::Vector3d &y) - { return (x - *p_111).norm() < (y - *p_111).norm(); }; - p_000 = std::max_element(vertices.begin(), vertices.end(), DistFromP_111); - MFEM_VERIFY(std::max_element(vertices.begin(), vertices.end(), DistFromP_000) == p_111, - "p_000 and p_111 must be mutually opposing points!"); - - const auto &min = *p_000; - const auto &max = *p_111; - Eigen::Vector3d delta = max - min; - ball.radius = 0.5 * delta.norm(); - Vector3dMap(ball.center.data()) = 0.5 * (min + max); - - // Project onto this candidate diameter, and pick a vertex furthest away. Check that - // this resulting distance is less than or equal to the radius, and use the resulting - // direction to compute another in plane vector. Assumes all delta are normalized, and - // applies a common origin as part of the projection. - auto PerpendicularDistance = [min](const std::initializer_list &deltas, - const Eigen::Vector3d &vin) - { - Eigen::Vector3d v = vin - min; - for (const auto &d : deltas) - { - v -= d.dot(v) * d; - } - return v.norm(); - }; - - delta.normalize(); - const auto &perp = *std::max_element( - vertices.begin(), vertices.end(), - [&delta, PerpendicularDistance](const auto &x, const auto &y) - { return PerpendicularDistance({delta}, x) < PerpendicularDistance({delta}, y); }); - constexpr double rel_tol = 1.0e-6; - MFEM_VERIFY(std::abs(PerpendicularDistance({delta}, perp) - ball.radius) <= - rel_tol * ball.radius, - "Furthest point perpendicular must be on the exterior of the ball: " - << PerpendicularDistance({delta}, perp) << " vs. " << ball.radius - << "!"); - - // Compute a perpendicular to the circle using the cross product. - const Eigen::Vector3d n_radial = (perp - CVector3dMap(ball.center.data())).normalized(); - Vector3dMap(ball.planar_normal.data()) = delta.cross(n_radial).normalized(); - - // Compute the point furthest out of the plane discovered. If below tolerance, this - // means the ball is 2D. - const auto &out_of_plane = *std::max_element( - vertices.begin(), vertices.end(), - [&delta, &n_radial, PerpendicularDistance](const auto &x, const auto &y) - { - return PerpendicularDistance({delta, n_radial}, x) < - PerpendicularDistance({delta, n_radial}, y); - }); - - ball.planar = - PerpendicularDistance({delta, n_radial}, out_of_plane) / ball.radius < rel_tol; - if (!ball.planar) - { - // The points are not functionally coplanar, zero out the normal. - MFEM_VERIFY(std::abs(PerpendicularDistance({delta}, perp) - ball.radius) <= - rel_tol * ball.radius, - "Furthest point perpendicular must be on the exterior of the sphere!"); - Vector3dMap(ball.planar_normal.data()) *= 0; - } - } - - // Broadcast result to all processors. 
- Mpi::Broadcast(3, ball.center.data(), dominant_rank, comm); - Mpi::Broadcast(3, ball.planar_normal.data(), dominant_rank, comm); - Mpi::Broadcast(1, &ball.radius, dominant_rank, comm); - Mpi::Broadcast(1, &ball.planar, dominant_rank, comm); - - return ball; -} - -double LengthFromPointCloud(MPI_Comm comm, const std::vector &vertices, - int dominant_rank, const std::array &dir) -{ - double length; - if (dominant_rank == Mpi::Rank(comm)) - { - CVector3dMap direction(dir.data()); - - auto Dot = [&](const auto &x, const auto &y) - { return direction.dot(x) < direction.dot(y); }; - auto p_min = std::min_element(vertices.begin(), vertices.end(), Dot); - auto p_max = std::max_element(vertices.begin(), vertices.end(), Dot); - - length = (*p_max - *p_min).dot(direction.normalized()); - } - Mpi::Broadcast(1, &length, dominant_rank, comm); - return length; -} - -} // namespace - -double GetProjectedLength(mfem::ParMesh &mesh, const mfem::Array &marker, bool bdr, - const std::array &dir) -{ - std::vector vertices; - int dominant_rank = CollectPointCloudOnRoot(mesh, marker, bdr, vertices); - return LengthFromPointCloud(mesh.GetComm(), vertices, dominant_rank, dir); -} - -double GetProjectedLength(mfem::ParMesh &mesh, int attr, bool bdr, - const std::array &dir) -{ - mfem::Array marker(bdr ? mesh.bdr_attributes.Max() : mesh.attributes.Max()); - marker = 0; - marker[attr - 1] = 1; - return GetProjectedLength(mesh, marker, bdr, dir); -} - -BoundingBox GetBoundingBox(mfem::ParMesh &mesh, const mfem::Array &marker, bool bdr) -{ - std::vector vertices; - int dominant_rank = CollectPointCloudOnRoot(mesh, marker, bdr, vertices); - return BoundingBoxFromPointCloud(mesh.GetComm(), vertices, dominant_rank); -} - -BoundingBox GetBoundingBox(mfem::ParMesh &mesh, int attr, bool bdr) -{ - mfem::Array marker(bdr ? mesh.bdr_attributes.Max() : mesh.attributes.Max()); - marker = 0; - marker[attr - 1] = 1; - return GetBoundingBox(mesh, marker, bdr); -} - -BoundingBall GetBoundingBall(mfem::ParMesh &mesh, const mfem::Array &marker, bool bdr) -{ - std::vector vertices; - int dominant_rank = CollectPointCloudOnRoot(mesh, marker, bdr, vertices); - return BoundingBallFromPointCloud(mesh.GetComm(), vertices, dominant_rank); -} - -BoundingBall GetBoundingBall(mfem::ParMesh &mesh, int attr, bool bdr) -{ - mfem::Array marker(bdr ? mesh.bdr_attributes.Max() : mesh.attributes.Max()); - marker = 0; - marker[attr - 1] = 1; - return GetBoundingBall(mesh, marker, bdr); -} - -void GetSurfaceNormal(mfem::ParMesh &mesh, int attr, mfem::Vector &normal) -{ - mfem::Array marker(mesh.bdr_attributes.Max()); - marker = 0; - marker[attr - 1] = 1; - GetSurfaceNormal(mesh, marker, normal); -} - -void GetSurfaceNormal(mfem::ParMesh &mesh, const mfem::Array &marker, - mfem::Vector &normal) -{ - int dim = mesh.SpaceDimension(); - mfem::Vector nor(dim); - normal.SetSize(dim); - normal = 0.0; - bool init = false; - for (int i = 0; i < mesh.GetNBE(); i++) - { - if (!marker[mesh.GetBdrAttribute(i) - 1]) - { - continue; - } - mfem::ElementTransformation *T = mesh.GetBdrElementTransformation(i); - const mfem::IntegrationPoint &ip = - mfem::Geometries.GetCenter(mesh.GetBdrElementGeometry(i)); - T->SetIntPoint(&ip); - mfem::CalcOrtho(T->Jacobian(), nor); - if (!init) - { - normal = nor; - init = true; - } - else - { - // Check orientation and make sure consistent on this process. If a boundary has - // conflicting normal definitions, use the first value. 
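Aside (not part of the patch): LengthFromPointCloud above reduces to taking the extreme projections of the points onto the normalized direction and differencing them. A standalone sketch of that computation (ProjectedLength is a hypothetical helper operating on a local point list, without the MPI broadcast):

#include <algorithm>
#include <array>
#include <cmath>
#include <vector>

double ProjectedLength(const std::vector<std::array<double, 3>> &points,
                       std::array<double, 3> dir)
{
  // Normalize the direction, then take the difference of the extreme projections.
  const double n = std::sqrt(dir[0] * dir[0] + dir[1] * dir[1] + dir[2] * dir[2]);
  for (double &d : dir)
  {
    d /= n;
  }
  auto dot = [&dir](const std::array<double, 3> &p)
  { return p[0] * dir[0] + p[1] * dir[1] + p[2] * dir[2]; };
  const auto [min_it, max_it] =
      std::minmax_element(points.begin(), points.end(),
                          [&dot](const auto &x, const auto &y) { return dot(x) < dot(y); });
  return dot(*max_it) - dot(*min_it);
}

int main()
{
  const std::vector<std::array<double, 3>> points = {{0.0, 0.0, 0.0}, {1.0, 1.0, 0.0},
                                                     {2.0, 0.0, 0.0}};
  // Extent along x is 2.0.
  return (ProjectedLength(points, {1.0, 0.0, 0.0}) == 2.0) ? 0 : 1;
}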
- if (nor * normal < 0.0) - { - normal -= nor; - } - else - { - normal += nor; - } - } - } - // If different processors have different normal orientations, take that from the lowest - // rank processor. - MPI_Comm comm = mesh.GetComm(); - int rank = Mpi::Size(comm); - mfem::Vector glob_normal(dim); - if (init) - { - rank = Mpi::Rank(comm); - } - Mpi::GlobalMin(1, &rank, comm); - if (rank == Mpi::Size(comm)) - { - // No boundary elements of attribute attr. - normal = 0.0; - return; - } - if (rank == Mpi::Rank(comm)) - { - glob_normal = normal; - } - { - auto *GlobNormal = glob_normal.HostReadWrite(); - Mpi::Broadcast(dim, GlobNormal, rank, comm); - } - if (init && normal * glob_normal < 0.0) - { - normal.Neg(); - } - { - auto *Normal = normal.HostReadWrite(); - Mpi::GlobalSum(dim, Normal, comm); - } - normal /= normal.Norml2(); - // if (dim == 3) - // { - // Mpi::Print(comm, " Surface normal {:d} = ({:+.3e}, {:+.3e}, {:+.3e})", attr, - // normal(0), - // normal(1), normal(2)); - // } - // else - // { - // Mpi::Print(comm, " Surface normal {:d} = ({:+.3e}, {:+.3e})", attr, normal(0), - // normal(1)); - // } -} - -double RebalanceMesh(const IoData &iodata, std::unique_ptr &mesh, double tol) -{ - BlockTimer bt0(Timer::REBALANCE); - const bool save_adapt_mesh = iodata.model.refinement.save_adapt_mesh; - std::string serial_mesh_file; - if (save_adapt_mesh) - { - serial_mesh_file = iodata.problem.output; - if (serial_mesh_file.back() != '/') - { - serial_mesh_file += '/'; - } - serial_mesh_file += "serial.mesh"; - } - - MPI_Comm comm = mesh->GetComm(); - if (Mpi::Size(comm) == 1) - { - if (save_adapt_mesh) - { - BlockTimer bt1(Timer::IO); - std::ofstream fo(serial_mesh_file); - fo.precision(MSH_FLT_PRECISION); - mesh::DimensionalizeMesh(*mesh, iodata.GetLengthScale()); - mesh->mfem::Mesh::Print(fo); - mesh::NondimensionalizeMesh(*mesh, iodata.GetLengthScale()); - } - return 1.0; - } - - // If there is more than one processor, may perform rebalancing. - mesh->ExchangeFaceNbrData(); - int min_elem, max_elem; - min_elem = max_elem = mesh->GetNE(); - Mpi::GlobalMin(1, &min_elem, comm); - Mpi::GlobalMax(1, &max_elem, comm); - const double ratio = double(max_elem) / min_elem; - if constexpr (false) - { - Mpi::Print("Rebalancing: max/min elements per processor = {:d}/{:d} (ratio = {:.3e}, " - "tol = {:.3e})\n", - max_elem, min_elem, ratio, tol); - } - if (ratio > tol) - { - if (mesh->Nonconforming() && save_adapt_mesh) - { - // Do not need to duplicate the mesh, as rebalancing will undo this. - mfem::Array serial_partition(mesh->GetNE()); - serial_partition = 0; - mesh->Rebalance(serial_partition); - BlockTimer bt1(Timer::IO); - if (Mpi::Root(comm)) - { - std::ofstream fo(serial_mesh_file); - fo.precision(MSH_FLT_PRECISION); - mesh::DimensionalizeMesh(*mesh, iodata.GetLengthScale()); - mesh->Mesh::Print(fo); - mesh::NondimensionalizeMesh(*mesh, iodata.GetLengthScale()); - } - Mpi::Barrier(comm); - } - if (mesh->Nonconforming()) - { - mesh->Rebalance(); - } - else - { - // Without access to a refinement tree, partitioning must be done on the root - // processor and then redistributed. - RebalanceConformalMesh(mesh, iodata.GetLengthScale(), serial_mesh_file); - } - } - else if (save_adapt_mesh) - { - // Given no rebalancing will be done, need to handle the serial write more carefully. - // This requires creating a separate serial mesh. 
- if (mesh->Nonconforming()) - { - mfem::ParMesh smesh(*mesh); - mfem::Array serial_partition(mesh->GetNE()); - serial_partition = 0; - smesh.Rebalance(serial_partition); - BlockTimer bt1(Timer::IO); - if (Mpi::Root(comm)) - { - std::ofstream fo(serial_mesh_file); - fo.precision(MSH_FLT_PRECISION); - mesh::DimensionalizeMesh(smesh, iodata.GetLengthScale()); - smesh.Mesh::Print(fo); // Do not need to nondimensionalize the temporary mesh - } - Mpi::Barrier(comm); - } - else - { - auto smesh = std::make_unique(mesh->GetSerialMesh(0)); - BlockTimer bt1(Timer::IO); - if (Mpi::Rank(comm) == 0) - { - std::ofstream fo(serial_mesh_file); - fo.precision(MSH_FLT_PRECISION); - mesh::DimensionalizeMesh(*smesh, iodata.GetLengthScale()); - smesh->Print(fo); // Do not need to nondimensionalize the temporary mesh - } - Mpi::Barrier(comm); - } - } - mesh->ExchangeFaceNbrData(); - return ratio; -} - -} // namespace mesh - -namespace -{ - -std::unique_ptr LoadMesh(const std::string &path, bool remove_curvature) -{ - // Read the (serial) mesh from the given mesh file. Handle preparation for refinement and - // orientations here to avoid possible reorientations and reordering later on. MFEM - // supports a native mesh format (.mesh), VTK/VTU, Gmsh, as well as some others. We use - // built-in converters for the types we know, otherwise rely on MFEM to do the conversion - // or error out if not supported. - constexpr bool generate_edges = true, refine = true, fix_orientation = true; - std::unique_ptr mesh; - std::filesystem::path mfile(path); - if (mfile.extension() == ".mphtxt" || mfile.extension() == ".mphbin" || - mfile.extension() == ".nas" || mfile.extension() == ".bdf") - { - // Put translated mesh in temporary string buffer. - std::stringstream fi(std::stringstream::in | std::stringstream::out); - // fi << std::fixed; - fi << std::scientific; - fi.precision(MSH_FLT_PRECISION); - -#if 0 - // Put translated mesh in temporary storage (directory is created and destroyed in - // calling function). - std::string tmp = iodata.problem.output; - if (tmp.back() != '/') - { - tmp += '/'; - } - tmp += "tmp/serial.msh"; - std::ofstream fo(tmp); - // mfem::ofgzstream fo(tmp, true); // Use zlib compression if available - // fo << std::fixed; - fo << std::scientific; - fo.precision(MSH_FLT_PRECISION); -#endif - - if (mfile.extension() == ".mphtxt" || mfile.extension() == ".mphbin") - { - mesh::ConvertMeshComsol(path, fi); - // mesh::ConvertMeshComsol(path, fo); - } - else - { - mesh::ConvertMeshNastran(path, fi); - // mesh::ConvertMeshNastran(path, fo); - } - -#if 0 - std::ifstream fi(tmp); - // mfem::ifgzstream fi(tmp); - if (!fi.good()) - { - MFEM_ABORT("Unable to open translated mesh file \"" << tmp << "\"!"); - } -#endif - - mesh = std::make_unique(fi, generate_edges, refine, fix_orientation); - } - else - { - // Otherwise, just rely on MFEM load the mesh. - std::ifstream fi(path); - if (!fi.good()) - { - MFEM_ABORT("Unable to open mesh file \"" << path << "\"!"); - } - mesh = std::make_unique(fi, generate_edges, refine, fix_orientation); - } - if (remove_curvature) - { - if (mesh->GetNodes()) - { - mfem::GridFunction *nodes = nullptr; - int own_nodes = true; - mesh->SwapNodes(nodes, own_nodes); - if (own_nodes) - { - delete nodes; - } - } - } - else - { - mesh->EnsureNodes(); - } - return mesh; -} - -void ReorderMesh(mfem::Mesh &mesh) -{ - mfem::Array ordering; - - if constexpr (false) - { - // Gecko reordering. 
- mfem::Array tentative; - int outer = 3, inner = 3, window = 4, period = 2; - double best_cost = mfem::infinity(); - for (int i = 0; i < outer; i++) - { - int seed = i + 1; - double cost = - mesh.GetGeckoElementOrdering(tentative, inner, window, period, seed, true); - if (cost < best_cost) - { - ordering = tentative; - best_cost = cost; - } - } - Mpi::Print("Final cost: {:e}\n", best_cost); - } - - // (Faster) Hilbert reordering. - mesh.GetHilbertElementOrdering(ordering); - mesh.ReorderElements(ordering); -} - -std::unique_ptr GetMeshPartitioning(mfem::Mesh &mesh, int size, - const std::string &partition) -{ - MFEM_VERIFY(size <= mesh.GetNE(), "Mesh partitioning must have parts <= mesh elements (" - << size << " vs. " << mesh.GetNE() << ")!"); - if (partition.length() == 0) - { - const int part_method = 1; - std::unique_ptr partitioning(mesh.GeneratePartitioning(size, part_method)); - Mpi::Print("Finished partitioning mesh into {:d} subdomain{}\n", size, - (size > 1) ? "s" : ""); - return partitioning; - } - // User can optionally specify a mesh partitioning file as generated from the MFEM - // mesh-explorer miniapp, for example. It has the format: - // - // number_of_elements - // number_of_processors - // - // ... - // - // - int ne, np; - std::ifstream part_ifs(partition); - part_ifs.ignore(std::numeric_limits::max(), ' '); - part_ifs >> ne; - if (ne != mesh.GetNE()) - { - MFEM_ABORT("Invalid partitioning file (number of elements)!"); - } - part_ifs.ignore(std::numeric_limits::max(), ' '); - part_ifs >> np; - if (np != size) - { - MFEM_ABORT("Invalid partitioning file (number of processors)!"); - } - auto partitioning = std::make_unique(mesh.GetNE()); - int i = 0; - while (i < mesh.GetNE()) - { - part_ifs >> partitioning[i++]; - } - Mpi::Print("Read mesh partitioning into {:d} subdomain{} from disk\n", size, - (size > 1) ? "s" : ""); - return partitioning; -} - -std::map> CheckMesh(mfem::Mesh &orig_mesh, - const std::unique_ptr &partitioning, - const IoData &iodata, bool clean_elem, - bool add_bdr, bool add_subdomain) -{ - // - Check that all external boundaries of the mesh have a corresponding boundary - // condition. - // - If desired, create a new mesh which has added boundary elements for all material - // interfaces if these elements do not yet exist. - // - If desired, create a new mesh which has removed all domain elements which do not have - // an associated material property specified in the input file. - MFEM_VERIFY(orig_mesh.Dimension() == 3 && !orig_mesh.Nonconforming(), - "Nonconforming or 2D meshes have not been tested yet!"); - MFEM_VERIFY(dynamic_cast(&orig_mesh) == nullptr, - "This function does not work for ParMesh"); - mfem::Array mat_marker, bdr_marker; - GetUsedAttributeMarkers( - iodata, orig_mesh.attributes.Size() ? orig_mesh.attributes.Max() : 0, - orig_mesh.bdr_attributes.Size() ? 
orig_mesh.bdr_attributes.Max() : 0, mat_marker, - bdr_marker); - bool warn = false; - for (int be = 0; be < orig_mesh.GetNBE(); be++) - { - int attr = orig_mesh.GetBdrAttribute(be); - if (!bdr_marker[attr - 1]) - { - int f, o, e1, e2; - orig_mesh.GetBdrElementFace(be, &f, &o); - orig_mesh.GetFaceElements(f, &e1, &e2); - if (e1 < 0 || e2 < 0) // Internal boundary elements are allowed to have no BC - { - warn = true; - break; - } - } - } - if (warn) - { - Mpi::Warning("One or more external boundary attributes has no associated boundary " - "condition!\n\"PMC\"/\"ZeroCharge\" condition is assumed!\n"); - } - - // Mapping from new interface boundary attribute tags to vector of neighboring domain - // attributes (when adding new boundary elements). - std::map> new_attr_map; - if (!clean_elem && !add_bdr && !add_subdomain) - { - return new_attr_map; - } - - // Count deleted or added domain and boundary elements. - int new_ne = orig_mesh.GetNE(); - int new_nbdr = orig_mesh.GetNBE(); - mfem::Array elem_delete, bdr_delete; - mfem::Array orig_bdr_faces, add_bdr_faces; - elem_delete.SetSize(orig_mesh.GetNE(), false); - bdr_delete.SetSize(orig_mesh.GetNBE(), false); - orig_bdr_faces.SetSize(orig_mesh.GetNumFaces(), -1); - for (int be = 0; be < orig_mesh.GetNBE(); be++) - { - int f, o; - orig_mesh.GetBdrElementFace(be, &f, &o); - MFEM_VERIFY(orig_bdr_faces[f] < 0, - "Mesh should not define boundary elements multiple times!"); - orig_bdr_faces[f] = be; - } - if (add_bdr || add_subdomain) - { - add_bdr_faces.SetSize(orig_mesh.GetNumFaces(), -1); - } - - if (clean_elem) - { - // Delete domain and boundary elements which have no associated material or BC attribute - // from the mesh. - for (int e = 0; e < orig_mesh.GetNE(); e++) - { - int attr = orig_mesh.GetAttribute(e); - if (!mat_marker[attr - 1]) - { - elem_delete[e] = true; - new_ne--; - } - } - - // Make sure to remove any boundary elements which are no longer attached to elements in - // the domain. - for (int f = 0; f < orig_mesh.GetNumFaces(); f++) - { - const int &be = orig_bdr_faces[f]; - if (be >= 0) - { - int e1, e2; - orig_mesh.GetFaceElements(f, &e1, &e2); - if ((e1 < 0 || elem_delete[e1]) && (e2 < 0 || elem_delete[e2])) - { - // Mpi::Print("Deleting an unattached boundary element!\n"); - bdr_delete[be] = true; - new_nbdr--; - } - } - } - if (new_ne < orig_mesh.GetNE()) - { - Mpi::Print("Removed {:d} unmarked domain elements from the mesh\n", - orig_mesh.GetNE() - new_ne); - } - if (new_nbdr < orig_mesh.GetNBE()) - { - Mpi::Print("Removed {:d} unattached boundary elements from the mesh\n", - orig_mesh.GetNBE() - new_nbdr); - } - } - int new_ne_step1 = new_ne; - int new_nbdr_step1 = new_nbdr; - - if (add_bdr) - { - // Add new boundary elements at material interfaces or on the exterior boundary of the - // simulation domain, if there is not already a boundary element present. 
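Aside (not part of the patch): the partitioning file read by GetMeshPartitioning above consists of a labeled element count, a labeled processor count, and then one 0-based rank per element. A standalone sketch of parsing that layout (ReadPartitioning is a hypothetical helper; errors are reported with exceptions instead of MFEM_ABORT):

#include <fstream>
#include <limits>
#include <stdexcept>
#include <string>
#include <vector>

std::vector<int> ReadPartitioning(const std::string &path, int expected_ne, int expected_np)
{
  std::ifstream fi(path);
  if (!fi.good())
  {
    throw std::runtime_error("Unable to open partitioning file \"" + path + "\"!");
  }
  int ne, np;
  fi.ignore(std::numeric_limits<std::streamsize>::max(), ' ');  // Skip the text label
  fi >> ne;
  fi.ignore(std::numeric_limits<std::streamsize>::max(), ' ');
  fi >> np;
  if (ne != expected_ne || np != expected_np)
  {
    throw std::runtime_error("Partitioning file does not match the mesh or MPI size!");
  }
  std::vector<int> partitioning(ne);
  for (int i = 0; i < ne; i++)
  {
    fi >> partitioning[i];  // One destination rank per element
  }
  return partitioning;
}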
- MFEM_VERIFY(!orig_mesh.Nonconforming(), "Adding material interface boundary elements " - "is not supported for nonconforming meshes!"); - int add_bdr_ext = 0, add_bdr_int = 0; - for (int f = 0; f < orig_mesh.GetNumFaces(); f++) - { - const int &be = orig_bdr_faces[f]; - if (be < 0 && add_bdr_faces[f] < 0) - { - int e1, e2; - orig_mesh.GetFaceElements(f, &e1, &e2); - - bool no_e1 = (e1 < 0 || elem_delete[e1]); - bool no_e2 = (e2 < 0 || elem_delete[e2]); - if ((no_e1 || no_e2) && !(no_e1 && no_e2)) - { - // Mpi::Print("Adding exterior boundary element!\n"); - add_bdr_faces[f] = 1; - add_bdr_ext++; - } - else if (orig_mesh.GetAttribute(e1) != orig_mesh.GetAttribute(e2)) - { - // Add new boundary element at material interface between two domains. - // Mpi::Print("Adding material interface boundary element!\n"); - add_bdr_faces[f] = 1; - add_bdr_int++; - } - } - } - new_nbdr += (add_bdr_ext + add_bdr_int); - if (add_bdr_ext > 0) - { - Mpi::Print("Added {:d} boundary elements for exterior boundaries to the mesh\n", - add_bdr_ext); - } - if (add_bdr_int > 0) - { - Mpi::Print("Added {:d} boundary elements for material interfaces to the mesh\n", - add_bdr_int); - } - } - int new_ne_step2 = new_ne; - int new_nbdr_step2 = new_nbdr; - - if (add_subdomain) - { - // Add new boundary elements at interfaces between elements belonging to different - // subdomains. This uses similar code to mfem::Mesh::PrintWithPartitioning. - MFEM_VERIFY(partitioning, "Cannot add subdomain interface boundary elements without " - "supplied mesh partitioning!"); - MFEM_VERIFY(!orig_mesh.Nonconforming(), "Adding subdomain interface boundary elements " - "is not supported for nonconforming meshes!"); - for (int f = 0; f < orig_mesh.GetNumFaces(); f++) - { - const int &be = orig_bdr_faces[f]; - if (be < 0 && add_bdr_faces[f] < 0) - { - int e1, e2; - - orig_mesh.GetFaceElements(f, &e1, &e2); - bool no_e1 = (e1 < 0 || elem_delete[e1]); - bool no_e2 = (e2 < 0 || elem_delete[e2]); - if (!no_e1 && !no_e2 && partitioning[e1] != partitioning[e2]) - { - // Internal face is connected to two elements belonging to different subdomains - // (this works for conforming meshes). - add_bdr_faces[f] = 2; - new_nbdr += 2; - } - } - // else - // { - // // This face is attached to a boundary element. We could define a new boundary - // // element with opposite orientation to ensure both subdomains in the distributed - // // ParMesh have the boundary element. - // } - } - if (new_nbdr > new_nbdr_step2) - { - Mpi::Print("Added {:d} boundary elements for subdomain interfaces to the mesh\n", - new_nbdr - new_nbdr_step2); - } - } - - // Create the new mesh. - if (new_ne == new_ne_step1 && new_ne_step1 == new_ne_step2 && - new_ne_step2 == orig_mesh.GetNE() && new_nbdr == new_nbdr_step1 && - new_nbdr_step1 == new_nbdr_step2 && new_nbdr_step2 == orig_mesh.GetNBE()) - { - return new_attr_map; - } - mfem::Mesh new_mesh(orig_mesh.Dimension(), orig_mesh.GetNV(), new_ne, new_nbdr, - orig_mesh.SpaceDimension()); - - // Copy vertices and non-deleted domain and boundary elements. 
- for (int v = 0; v < orig_mesh.GetNV(); v++) - { - new_mesh.AddVertex(orig_mesh.GetVertex(v)); - } - for (int e = 0; e < orig_mesh.GetNE(); e++) - { - if (!elem_delete[e]) - { - mfem::Element *el = orig_mesh.GetElement(e)->Duplicate(&new_mesh); - new_mesh.AddElement(el); - } - } - for (int be = 0; be < orig_mesh.GetNBE(); be++) - { - if (!bdr_delete[be]) - { - mfem::Element *el = orig_mesh.GetBdrElement(be)->Duplicate(&new_mesh); - new_mesh.AddBdrElement(el); - } - } - - // Add new boundary elements. - if (add_bdr || add_subdomain) - { - auto FlipVertices = [](mfem::Element *el) - { - mfem::Array v; - el->GetVertices(v); - std::reverse(v.begin(), v.end()); - el->SetVertices(v.HostRead()); - }; - - // 1-based, some boundary attributes may be empty since they were removed from the - // original mesh, but to keep indices the same as config file we don't compact the - // list. - int max_bdr_attr = orig_mesh.bdr_attributes.Size() ? orig_mesh.bdr_attributes.Max() : 0; - for (int f = 0; f < orig_mesh.GetNumFaces(); f++) - { - if (add_bdr_faces[f] > 0) - { - // Assign new unique attribute based on attached elements (we want the material - // properties on the face to average those on the elements). This is used later on - // when integrating the transmission condition on the subdomain interface. Save the - // inverse so that the attributes of e1 and e2 can be easily referenced using the - // new attribute. Since attributes are in 1-based indexing, a, b > 0. - int e1, e2, a = 0, b = 0; - orig_mesh.GetFaceElements(f, &e1, &e2); - bool no_e1 = (e1 < 0 || elem_delete[e1]); - bool no_e2 = (e2 < 0 || elem_delete[e2]); - if (!no_e1 && !no_e2) - { - a = std::max(orig_mesh.GetAttribute(e1), orig_mesh.GetAttribute(e2)); - b = (a == orig_mesh.GetAttribute(e1)) ? orig_mesh.GetAttribute(e2) - : orig_mesh.GetAttribute(e1); - } - else if (!no_e1) - { - a = orig_mesh.GetAttribute(e1); - b = 0; - } - else if (!no_e2) - { - a = orig_mesh.GetAttribute(e2); - b = 0; - } - MFEM_VERIFY(a + b > 0, "Invalid new boundary element attribute!"); - int new_attr = max_bdr_attr + - (b > 0 ? (a * (a - 1)) / 2 + b : a); // At least max_bdr_attr + 1 - if (new_attr_map.find(new_attr) == new_attr_map.end()) - { - new_attr_map.emplace(new_attr, std::array{a, b}); - } - - // Add the boundary elements with the new boundary attribute. - mfem::Element *el = orig_mesh.GetFace(f)->Duplicate(&new_mesh); - el->SetAttribute(new_attr); - new_mesh.AddBdrElement(el); - if (add_bdr_faces[f] > 1) - { - // Flip order of vertices to reverse normal direction of second added element. - el = orig_mesh.GetFace(f)->Duplicate(&new_mesh); - FlipVertices(el); - el->SetAttribute(new_attr); - new_mesh.AddBdrElement(el); - // Mpi::Print("Adding two BE with attr {:d} from elements {:d} and {:d}\n", - // new_attr, a, b); - } - } - } - } - - // Finalize new mesh and replace the old one. If a curved mesh, set up the new mesh by - // projecting nodes onto the new mesh for the non-trimmed vdofs (accounts for new - // boundary elements too since no new dofs are added). See the MFEM trimmer miniapp for - // reference. After we have copied the high-order nodes information, topological changes - // in Mesh::Finalize are OK (with refine = true). 
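Aside (not part of the patch): the new interface boundary attribute above is built from the two neighboring domain attributes a >= b by offsetting past max_bdr_attr with (a*(a-1))/2 + b, or with a alone when the face touches only one kept element, and the map stores the inverse so the neighboring attributes can be recovered later from the new attribute. A standalone sketch of that numbering (InterfaceAttribute is a hypothetical helper):

#include <array>
#include <cassert>
#include <map>

int InterfaceAttribute(int max_bdr_attr, int a, int b,
                       std::map<int, std::array<int, 2>> &new_attr_map)
{
  assert(a >= b && a > 0 && b >= 0);
  // Distinct interface pairs a >= b > 0 get distinct attributes, all > max_bdr_attr.
  const int new_attr = max_bdr_attr + (b > 0 ? (a * (a - 1)) / 2 + b : a);
  new_attr_map.emplace(new_attr, std::array<int, 2>{a, b});
  return new_attr;
}

int main()
{
  std::map<int, std::array<int, 2>> new_attr_map;
  // With max_bdr_attr = 10: interface between domains 3 and 2 -> 10 + 3 + 2 = 15, while an
  // exterior face of domain 4 -> 10 + 4 = 14.
  const int attr_32 = InterfaceAttribute(10, 3, 2, new_attr_map);
  const int attr_40 = InterfaceAttribute(10, 4, 0, new_attr_map);
  return (attr_32 == 15 && attr_40 == 14) ? 0 : 1;
}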
- constexpr bool generate_bdr = false, refine = true, fix_orientation = true; - new_mesh.FinalizeTopology(generate_bdr); - new_mesh.RemoveUnusedVertices(); - if (orig_mesh.GetNodes()) - { - const mfem::GridFunction *nodes = orig_mesh.GetNodes(); - const mfem::FiniteElementSpace *fespace = nodes->FESpace(); - - mfem::Ordering::Type ordering = fespace->GetOrdering(); - int order = fespace->GetMaxElementOrder(); - int sdim = orig_mesh.SpaceDimension(); - bool discont = - dynamic_cast(fespace->FEColl()) != nullptr; - - new_mesh.SetCurvature(order, discont, sdim, ordering); - mfem::GridFunction *new_nodes = new_mesh.GetNodes(); - const mfem::FiniteElementSpace *new_fespace = new_nodes->FESpace(); - - // The element loop works because we know the mapping from old_mesh to new_mesh element - // indices from the insertion order. - mfem::Array vdofs, new_vdofs; - mfem::Vector loc_vec; - int te = 0; - for (int e = 0; e < orig_mesh.GetNE(); e++) - { - if (!elem_delete[e]) - { - fespace->GetElementVDofs(e, vdofs); - nodes->GetSubVector(vdofs, loc_vec); - new_fespace->GetElementVDofs(te, new_vdofs); - new_nodes->SetSubVector(new_vdofs, loc_vec); - te++; - } - } - } - new_mesh.Finalize(refine, fix_orientation); - orig_mesh = std::move(new_mesh); - return new_attr_map; -} - -std::unique_ptr DistributeMesh(MPI_Comm comm, - const std::unique_ptr &smesh, - const std::unique_ptr &partitioning, - const std::string &output_dir) -{ - // Take a serial mesh and partitioning on the root process and construct the global - // parallel mesh. For now, prefer the MPI-based version. When constructing the ParMesh, we - // pass arguments to ensure no topological changes (this isn't required since the serial - // mesh was marked for refinement). - constexpr bool generate_edges = true, refine = true, fix_orientation = true; - if constexpr (false) - { - // Write each processor's component to file. - std::string tmp = output_dir; - if (tmp.back() != '/') - { - tmp += '/'; - } - tmp += "tmp/"; - int width = 1 + static_cast(std::log10(Mpi::Size(comm) - 1)); - if (Mpi::Root(comm)) - { - if (!std::filesystem::exists(tmp)) - { - std::filesystem::create_directories(tmp); - } - mfem::MeshPartitioner partitioner(*smesh, Mpi::Size(comm), partitioning.get()); - for (int i = 0; i < Mpi::Size(comm); i++) - { - mfem::MeshPart part; - partitioner.ExtractPart(i, part); - std::string pfile = mfem::MakeParFilename(tmp + "part.", i, ".mesh", width); - std::ofstream fo(pfile); - // mfem::ofgzstream fo(pfile, true); // Use zlib compression if available - // fo << std::fixed; - fo << std::scientific; - fo.precision(MSH_FLT_PRECISION); - part.Print(fo); - } - } - - // Each process loads its own partitioned mesh file and constructs the parallel mesh. - std::string pfile = - mfem::MakeParFilename(tmp + "part.", Mpi::Rank(comm), ".mesh", width); - int exists = 0; - while (!exists) // Wait for root to finish writing all files - { - exists = std::filesystem::exists(pfile); - Mpi::GlobalMax(1, &exists, comm); - } - std::ifstream fi(pfile); - // mfem::ifgzstream fi(pfile); - if (!fi.good()) - { - MFEM_ABORT("Unable to open partitioned mesh file \"" << pfile << "\"!"); - } - auto pmesh = - std::make_unique(comm, fi, generate_edges, refine, fix_orientation); - Mpi::Barrier(comm); - if (Mpi::Root(comm)) - { - std::filesystem::remove_all(tmp); // Remove the temporary directory - } - return pmesh; - } - else - { - // Send each processor's component as a byte string. 
- std::unique_ptr pmesh; - if (Mpi::Root(comm)) - { - mfem::MeshPartitioner partitioner(*smesh, Mpi::Size(comm), partitioning.get()); - std::vector send_requests(Mpi::Size(comm) - 1, MPI_REQUEST_NULL); - std::vector so; - so.reserve(Mpi::Size(comm)); - for (int i = 0; i < Mpi::Size(comm); i++) - { - mfem::MeshPart part; - partitioner.ExtractPart(i, part); - std::ostringstream fo(std::stringstream::out); - // fo << std::fixed; - fo << std::scientific; - fo.precision(MSH_FLT_PRECISION); - part.Print(fo); - so.push_back(fo.str()); - // so.push_back((i > 0) ? zlib::CompressString(fo.str()) : fo.str()); - if (i > 0) - { - int slen = static_cast(so[i].length()); - MFEM_VERIFY(so[i].length() == (std::size_t)slen, - "Overflow error distributing parallel mesh!"); - MPI_Isend(so[i].data(), slen, MPI_CHAR, i, i, comm, &send_requests[i - 1]); - } - } - std::istringstream fi(so[0]); // This is never compressed - pmesh = std::make_unique(comm, fi, generate_edges, refine, - fix_orientation); - MPI_Waitall(static_cast(send_requests.size()), send_requests.data(), - MPI_STATUSES_IGNORE); - } - else - { - MPI_Status status; - int rlen; - std::string si; - MPI_Probe(0, Mpi::Rank(comm), comm, &status); - MPI_Get_count(&status, MPI_CHAR, &rlen); - si.resize(rlen); - MPI_Recv(si.data(), rlen, MPI_CHAR, 0, Mpi::Rank(comm), comm, MPI_STATUS_IGNORE); - std::istringstream fi(si); - // std::istringstream fi(zlib::DecompressString(si)); - pmesh = std::make_unique(comm, fi, generate_edges, refine, - fix_orientation); - } - return pmesh; - } -} - -void GetUsedAttributeMarkers(const IoData &iodata, int n_mat, int n_bdr, - mfem::Array &mat_marker, mfem::Array &bdr_marker) -{ - mfem::Array mat_attr, bdr_attr; - mat_attr.Reserve(static_cast(iodata.domains.attributes.size())); - for (auto attr : iodata.domains.attributes) - { - mat_attr.Append(attr); - } - bdr_attr.Reserve(static_cast(iodata.boundaries.attributes.size())); - for (auto attr : iodata.boundaries.attributes) - { - bdr_attr.Append(attr); - } - mesh::AttrToMarker(n_mat, mat_attr, mat_marker); - mesh::AttrToMarker(n_bdr, bdr_attr, bdr_marker); -} - -void RebalanceConformalMesh(std::unique_ptr &pmesh, double length_scale, - const std::string &serial_mesh_file) -{ - // Write the parallel mesh to a stream as a serial mesh, then read back in and partition - // using METIS. - MPI_Comm comm = pmesh->GetComm(); - constexpr bool generate_edges = true, refine = true, fix_orientation = true, - generate_bdr = false; - std::unique_ptr smesh; - std::unique_ptr partitioning; - if constexpr (false) - { - // Write the serial mesh to a stream and read that through the Mesh constructor. - std::stringstream fo; - fo.precision(MSH_FLT_PRECISION); - pmesh->PrintAsSerial(fo); - pmesh.reset(); - if (Mpi::Root(comm)) - { - smesh = std::make_unique(fo, generate_edges, refine, fix_orientation); - } - } - else - { - // Directly ingest the generated Mesh and release the no longer needed memory. - smesh = std::make_unique(pmesh->GetSerialMesh(0)); - pmesh.reset(); - if (!Mpi::Root(comm)) - { - smesh.reset(); - } - } - if (Mpi::Root(comm)) - { - smesh->FinalizeTopology(generate_bdr); - smesh->Finalize(refine, fix_orientation); - partitioning = GetMeshPartitioning(*smesh, Mpi::Size(comm)); - } - - // Construct the parallel mesh. 
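Aside (not part of the patch): the serial mesh parts are shipped as raw byte strings, with the root posting nonblocking sends and every other rank probing for its message and sizing the receive buffer with MPI_Get_count. A standalone sketch of that exchange (helper names are illustrative; assumes an initialized MPI environment and omits error handling):

#include <mpi.h>
#include <string>
#include <vector>

// payloads[i] is destined for rank i; payloads[0] stays on the root.
void SendStringsFromRoot(MPI_Comm comm, const std::vector<std::string> &payloads)
{
  std::vector<MPI_Request> requests(payloads.size() - 1, MPI_REQUEST_NULL);
  for (std::size_t i = 1; i < payloads.size(); i++)
  {
    MPI_Isend(payloads[i].data(), static_cast<int>(payloads[i].size()), MPI_CHAR,
              static_cast<int>(i), static_cast<int>(i), comm, &requests[i - 1]);
  }
  MPI_Waitall(static_cast<int>(requests.size()), requests.data(), MPI_STATUSES_IGNORE);
}

// Each non-root rank probes for the message tagged with its own rank, sizes the buffer,
// and receives the bytes.
std::string ReceiveStringFromRoot(MPI_Comm comm, int rank)
{
  MPI_Status status;
  int len;
  MPI_Probe(0, rank, comm, &status);
  MPI_Get_count(&status, MPI_CHAR, &len);
  std::string buffer(len, '\0');
  MPI_Recv(buffer.data(), len, MPI_CHAR, 0, rank, comm, MPI_STATUS_IGNORE);
  return buffer;
}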
- pmesh = DistributeMesh(comm, smesh, partitioning); - if (!serial_mesh_file.empty()) - { - BlockTimer bt(Timer::IO); - if (Mpi::Root(comm)) - { - std::ofstream fo(serial_mesh_file); - fo.precision(MSH_FLT_PRECISION); - mesh::DimensionalizeMesh(*smesh, length_scale); - smesh->Print(fo); // Do not need to nondimensionalize the temporary mesh - } - Mpi::Barrier(comm); - } -} - -} // namespace - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "geodata.hpp" +#include "geodata_impl.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "fem/interpolator.hpp" +#include "utils/communication.hpp" +#include "utils/diagnostic.hpp" +#include "utils/filesystem.hpp" +#include "utils/meshio.hpp" +#include "utils/omp.hpp" +#include "utils/prettyprint.hpp" +#include "utils/timer.hpp" + +namespace palace +{ + +using Vector3dMap = Eigen::Map; +using CVector3dMap = Eigen::Map; + +namespace +{ + +// Floating point precision for mesh IO. This precision is important, make sure nothing is +// lost! +constexpr auto MSH_FLT_PRECISION = std::numeric_limits::max_digits10; + +// Load the serial mesh from disk. +std::unique_ptr LoadMesh(const std::string &, bool, + const config::BoundaryData &); + +// Clean the provided serial mesh by removing unused domain and boundary elements. +void CleanMesh(std::unique_ptr &, const std::vector &); + +// Create a new mesh by splitting all elements of the mesh into simplices or hexes +// (using tet-to-hex). Optionally preserves curvature of the original mesh by interpolating +// the high-order nodes with GSLIB. +void SplitMeshElements(std::unique_ptr &, bool, bool); + +// Optionally reorder mesh elements based on MFEM's internal reordering tools for improved +// cache usage. +void ReorderMeshElements(mfem::Mesh &, bool = true); + +// Check that mesh boundary conditions are given for external boundaries. +std::unordered_map CheckMesh(const mfem::Mesh &, const config::BoundaryData &); + +// Adding boundary elements for material interfaces and exterior boundaries, and "crack" +// desired internal boundary elements to disconnect the elements on either side. +int AddInterfaceBdrElements(IoData &, std::unique_ptr &, + std::unordered_map &, MPI_Comm comm); + +// Generate element-based mesh partitioning, using either a provided file or METIS. +std::unique_ptr GetMeshPartitioning(const mfem::Mesh &, int, + const std::string & = "", bool = true); + +// Given a serial mesh on the root processor and element partitioning, create a parallel +// mesh over the given communicator. The serial mesh is destroyed when no longer needed. +std::unique_ptr DistributeMesh(MPI_Comm, std::unique_ptr &, + const int *, const std::string & = ""); + +// Rebalance a conformal mesh across processor ranks, using the MeshPartitioner. Gathers the +// mesh onto the root rank before scattering the partitioned mesh. +void RebalanceConformalMesh(std::unique_ptr &); + +} // namespace + +namespace mesh +{ + +std::unique_ptr ReadMesh(IoData &iodata, MPI_Comm comm) +{ + // If possible on root, read the serial mesh (converting format if necessary), and do all + // necessary serial preprocessing. When finished, distribute the mesh to all processes. + // Count disk I/O time separately for the mesh read from file. 
+ BlockTimer bt0(Timer::MESH_PREPROCESS); + + // If not doing any local adaptation, or performing conformal adaptation, we can use the + // mesh partitioner. + std::unique_ptr smesh; + const auto &refinement = iodata.model.refinement; + const bool use_amr = (refinement.max_it > 0) || [&refinement]() + { + for (const auto &box : refinement.GetBoxes()) + { + if (box.ref_levels > 0) + { + return true; + } + } + for (const auto &sphere : refinement.GetSpheres()) + { + if (sphere.ref_levels > 0) + { + return true; + } + } + return false; + }(); + + const bool use_mesh_partitioner = [&]() + { + // Root must load the mesh to discover if nonconformal, as a previously adapted mesh + // might be reused for nonadaptive simulations. + BlockTimer bt(Timer::IO); + bool use_mesh_partitioner = !use_amr || !refinement.nonconformal; + if (Mpi::Root(comm)) + { + smesh = LoadMesh(iodata.model.mesh, iodata.model.remove_curvature, iodata.boundaries); + use_mesh_partitioner &= smesh->Conforming(); // The initial mesh must be conformal + } + Mpi::Broadcast(1, &use_mesh_partitioner, 0, comm); + return use_mesh_partitioner; + }(); + + MPI_Comm node_comm; + if (!use_mesh_partitioner) + { + MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, Mpi::Rank(comm), MPI_INFO_NULL, + &node_comm); + } + + { + BlockTimer bt1(Timer::IO); + if (!use_mesh_partitioner && Mpi::Root(node_comm) && !Mpi::Root(comm)) + { + // Only one process per node reads the serial mesh, if not using mesh partitioner. + smesh = LoadMesh(iodata.model.mesh, iodata.model.remove_curvature, iodata.boundaries); + MFEM_VERIFY(!(smesh->Nonconforming() && use_mesh_partitioner), + "Cannot use mesh partitioner on a nonconforming mesh!"); + } + Mpi::Barrier(comm); + } + + // Do some mesh preprocessing, and generate the partitioning. + std::unique_ptr partitioning; + if (smesh) + { + // Check the the AMR specification and the mesh elements are compatible. + const auto element_types = CheckElements(*smesh); + MFEM_VERIFY(!use_amr || iodata.model.make_simplex || !element_types.has_hexahedra || + refinement.nonconformal, + "If there are tensor elements, AMR must be nonconformal!"); + MFEM_VERIFY(!use_amr || iodata.model.make_simplex || !element_types.has_prisms || + refinement.nonconformal, + "If there are wedge elements, AMR must be nonconformal!"); + MFEM_VERIFY(!use_amr || iodata.model.make_simplex || !element_types.has_pyramids || + refinement.nonconformal, + "If there are pyramid elements, AMR must be nonconformal!"); + MFEM_VERIFY( + smesh->Conforming() || !use_amr || refinement.nonconformal, + "The provided mesh is nonconformal, only nonconformal AMR can be performed!"); + + // Clean up unused domain elements from the mesh. + if (iodata.model.clean_unused_elements) + { + std::vector attr_list; + std::merge(iodata.domains.attributes.begin(), iodata.domains.attributes.end(), + iodata.domains.postpro.attributes.begin(), + iodata.domains.postpro.attributes.end(), std::back_inserter(attr_list)); + attr_list.erase(std::unique(attr_list.begin(), attr_list.end()), attr_list.end()); + CleanMesh(smesh, attr_list); + } + + // Optionally convert mesh elements to simplices, for example in order to enable + // conformal mesh refinement, or hexes. + if (iodata.model.make_simplex || iodata.model.make_hex) + { + SplitMeshElements(smesh, iodata.model.make_simplex, iodata.model.make_hex); + } + + // Optionally reorder elements (and vertices) based on spatial location after loading + // the serial mesh. 
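Aside (not part of the patch): the node-level communicator used above groups ranks that share a node via MPI_COMM_TYPE_SHARED, so that only one rank per node has to hold the full serial mesh. A minimal sketch (SplitNodeComm is a hypothetical helper; assumes MPI is initialized):

#include <mpi.h>

MPI_Comm SplitNodeComm(MPI_Comm comm)
{
  int rank;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm node_comm;
  // Ranks able to create a shared-memory region end up in the same node communicator.
  MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, &node_comm);
  return node_comm;  // Caller is responsible for MPI_Comm_free.
}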
+ if (iodata.model.reorder_elements) + { + ReorderMeshElements(*smesh); + } + + // Refine the serial mesh (not typically used, prefer parallel uniform refinement + // instead). + { + int ne = smesh->GetNE(); + for (int l = 0; l < iodata.model.refinement.ser_uniform_ref_levels; l++) + { + smesh->UniformRefinement(); + } + if (iodata.model.refinement.ser_uniform_ref_levels > 0) + { + Mpi::Print("Serial uniform mesh refinement levels added {:d} elements (initial = " + "{:d}, final = {:d})\n", + smesh->GetNE() - ne, ne, smesh->GetNE()); + } + } + + // Check the final mesh, throwing warnings if there are exterior boundaries with no + // associated boundary condition. + if (smesh->Conforming()) + { + auto face_to_be = CheckMesh(*smesh, iodata.boundaries); + + // Add new boundary elements for material interfaces if not present (with new unique + // boundary attributes). Also duplicate internal boundary elements associated with + // cracks if desired. + if ((iodata.model.crack_bdr_elements || iodata.model.add_bdr_elements)) + { + // Split all internal (non periodic) boundary elements for boundary attributes where + // BC are applied (not just postprocessing). + while (AddInterfaceBdrElements(iodata, smesh, face_to_be, comm) != 1) + { + // May require multiple calls due to early exit/retry approach. + } + } + } + else + { + Mpi::Warning("{} is a nonconformal mesh, assumed from previous AMR iteration.\n" + "Skipping mesh modification preprocessing steps!\n\n", + iodata.model.mesh); + } + + // Finally, finalize the serial mesh. Mark tetrahedral meshes for refinement. There + // should be no need to fix orientation as this was done during initial mesh loading + // from disk. + constexpr bool refine = true, fix_orientation = false; + smesh->Finalize(refine, fix_orientation); + + // Generate the mesh partitioning. + partitioning = GetMeshPartitioning(*smesh, Mpi::Size(comm), iodata.model.partitioning); + } + + // Broadcast cracked boundary attributes to other ranks. + if ((iodata.model.crack_bdr_elements || iodata.model.add_bdr_elements)) + { + int size = iodata.boundaries.cracked_attributes.size(); + Mpi::Broadcast(1, &size, 0, comm); + std::vector data; + if (Mpi::Root(comm)) + { + data.assign(iodata.boundaries.cracked_attributes.begin(), + iodata.boundaries.cracked_attributes.end()); + } + else + { + data.resize(size); + } + Mpi::Broadcast(size, data.data(), 0, comm); + + if (!Mpi::Root(comm)) + { + iodata.boundaries.cracked_attributes.clear(); + iodata.boundaries.cracked_attributes.insert(data.begin(), data.end()); + } + } + + // Distribute the mesh. + std::unique_ptr pmesh; + if (use_mesh_partitioner) + { + pmesh = DistributeMesh(comm, smesh, partitioning.get(), iodata.problem.output); + } + else + { + // Send the preprocessed serial mesh and partitioning as a byte string. 
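The cracked-attribute broadcast above and the serialized-mesh broadcast that follows use the same two-step idiom: broadcast the size first so non-root ranks can allocate, then broadcast the payload through a contiguous buffer. A small generic sketch with raw MPI (BroadcastSet is a hypothetical helper, not part of Palace):

#include <mpi.h>
#include <set>
#include <vector>

// Broadcast a std::set<int> from root to all ranks on comm (illustrative only).
std::set<int> BroadcastSet(const std::set<int> &in, int root, MPI_Comm comm)
{
  int rank;
  MPI_Comm_rank(comm, &rank);

  // Step 1: broadcast the size so every rank can allocate a receive buffer.
  int size = (rank == root) ? static_cast<int>(in.size()) : 0;
  MPI_Bcast(&size, 1, MPI_INT, root, comm);

  // Step 2: broadcast the contents through a contiguous buffer.
  std::vector<int> data(size);
  if (rank == root)
  {
    data.assign(in.begin(), in.end());
  }
  MPI_Bcast(data.data(), size, MPI_INT, root, comm);

  return {data.begin(), data.end()};
}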
+ constexpr bool generate_edges = false, refine = true, fix_orientation = false; + std::string so; + int slen = 0; + if (smesh) + { + std::ostringstream fo(std::stringstream::out); + // fo << std::fixed; + fo << std::scientific; + fo.precision(MSH_FLT_PRECISION); + smesh->Print(fo); + smesh.reset(); // Root process needs to rebuild the mesh to ensure consistency with + // the saved serial mesh (refinement marking, for example) + so = fo.str(); + // so = zlib::CompressString(fo.str()); + slen = static_cast(so.size()); + MFEM_VERIFY(so.size() == (std::size_t)slen, "Overflow in stringbuffer size!"); + } + Mpi::Broadcast(1, &slen, 0, node_comm); + if (so.empty()) + { + so.resize(slen); + } + Mpi::Broadcast(slen, so.data(), 0, node_comm); + { + std::istringstream fi(so); + // std::istringstream fi(zlib::DecompressString(so)); + smesh = std::make_unique(fi, generate_edges, refine, fix_orientation); + so.clear(); + } + if (refinement.nonconformal && use_amr) + { + smesh->EnsureNCMesh(true); + } + if (!partitioning) + { + partitioning = std::make_unique(smesh->GetNE()); + } + Mpi::Broadcast(smesh->GetNE(), partitioning.get(), 0, node_comm); + MPI_Comm_free(&node_comm); + pmesh = std::make_unique(comm, *smesh, partitioning.get()); + smesh.reset(); + } + + if constexpr (false) + { + auto tmp = fs::path(iodata.problem.output) / "tmp"; + if (Mpi::Root(comm) && !fs::exists(tmp)) + { + fs::create_directories(tmp); + } + int width = 1 + static_cast(std::log10(Mpi::Size(comm) - 1)); + std::unique_ptr gsmesh = + LoadMesh(iodata.model.mesh, iodata.model.remove_curvature, iodata.boundaries); + std::unique_ptr gpartitioning = GetMeshPartitioning(*gsmesh, Mpi::Size(comm)); + mfem::ParMesh gpmesh(comm, *gsmesh, gpartitioning.get(), 0); + { + std::string pfile = + mfem::MakeParFilename(tmp.string() + "part.", Mpi::Rank(comm), ".mesh", width); + std::ofstream fo(pfile); + // mfem::ofgzstream fo(pfile, true); // Use zlib compression if available + fo.precision(MSH_FLT_PRECISION); + gpmesh.ParPrint(fo); + } + { + std::string pfile = + mfem::MakeParFilename(tmp.string() + "final.", Mpi::Rank(comm), ".mesh", width); + std::ofstream fo(pfile); + // mfem::ofgzstream fo(pfile, true); // Use zlib compression if available + fo.precision(MSH_FLT_PRECISION); + pmesh->ParPrint(fo); + } + } + + return pmesh; +} + +void RefineMesh(const IoData &iodata, std::vector> &mesh) +{ + // Prepare for uniform and region-based refinement. + MFEM_VERIFY(mesh.size() == 1, + "Input mesh vector before refinement has more than a single mesh!"); + int uniform_ref_levels = iodata.model.refinement.uniform_ref_levels; + int max_region_ref_levels = 0; + for (const auto &box : iodata.model.refinement.GetBoxes()) + { + if (max_region_ref_levels < box.ref_levels) + { + max_region_ref_levels = box.ref_levels; + } + } + for (const auto &sphere : iodata.model.refinement.GetSpheres()) + { + if (max_region_ref_levels < sphere.ref_levels) + { + max_region_ref_levels = sphere.ref_levels; + } + } + if (iodata.solver.linear.mg_use_mesh && iodata.solver.linear.mg_max_levels > 1) + { + mesh.reserve(1 + uniform_ref_levels + max_region_ref_levels); + } + + // Prior to MFEM's PR #1046, the tetrahedral mesh required reorientation after all mesh + // refinement in order to define higher-order Nedelec spaces on it. This is technically + // not required after MFEM's PR #1046, but in case you want to be absolutely sure, we + // reorient only the coarse mesh so that the refinements are still true refinements of + // the original mesh (required for geometric multigrid). 
Otherwise, it happens after + // refinement. + if (iodata.model.reorient_tet_mesh && mesh.capacity() > 1) + { + PalacePragmaDiagnosticPush + PalacePragmaDiagnosticDisableDeprecated + mesh[0]->ReorientTetMesh(); + PalacePragmaDiagnosticPop + } + + // Uniformly refine the mesh further in parallel, saving the level meshes for geometric + // coarsening later on if desired. + for (int l = 0; l < uniform_ref_levels; l++) + { + if (mesh.capacity() > 1) + { + mesh.emplace_back(std::make_unique(*mesh.back())); + } + mesh.back()->UniformRefinement(); + } + + // Simplex meshes need to be re-finalized in order to use local refinement (see + // the docstring for mfem::Mesh::UniformRefinement). + const auto element_types = mesh::CheckElements(*mesh.back()); + if (element_types.has_simplices && uniform_ref_levels > 0 && + (max_region_ref_levels > 0 || iodata.model.refinement.max_it > 0)) + { + constexpr bool refine = true, fix_orientation = false; + Mpi::Print("\nFlattening mesh sequence:\n Local mesh refinement will start from the " + "final uniformly-refined mesh\n"); + mesh.erase(mesh.begin(), mesh.end() - 1); + mesh.back()->Finalize(refine, fix_orientation); + } + + // Proceed with region-based refinement, level-by-level for all regions. Currently support + // box and sphere region shapes. Any overlap between regions is ignored (take the union, + // don't double-refine). + MFEM_VERIFY( + max_region_ref_levels == 0 || + !(element_types.has_hexahedra || element_types.has_prisms || + element_types.has_pyramids) || + mesh.back()->Nonconforming(), + "Region-based refinement for non-simplex meshes requires a nonconformal mesh!"); + const bool use_nodes = (mesh.back()->GetNodes() != nullptr); + const int ref = use_nodes ? mesh.back()->GetNodes()->FESpace()->GetMaxElementOrder() : 1; + const int dim = mesh.back()->SpaceDimension(); + int region_ref_level = 0; + while (region_ref_level < max_region_ref_levels) + { + // Mark elements for refinement in all regions. An element is marked for refinement if + // any of its vertices are inside any refinement region for the given level. + mfem::Array refinements; + for (int i = 0; i < mesh.back()->GetNE(); i++) + { + bool refine = false; + mfem::DenseMatrix pointmat; + if (use_nodes) + { + mfem::ElementTransformation &T = *mesh.back()->GetElementTransformation(i); + mfem::RefinedGeometry *RefG = + mfem::GlobGeometryRefiner.Refine(T.GetGeometryType(), ref); + T.Transform(RefG->RefPts, pointmat); + } + else + { + const int *verts = mesh.back()->GetElement(i)->GetVertices(); + const int nv = mesh.back()->GetElement(i)->GetNVertices(); + pointmat.SetSize(dim, nv); + for (int j = 0; j < nv; j++) + { + const double *coord = mesh.back()->GetVertex(verts[j]); + for (int d = 0; d < dim; d++) + { + pointmat(d, j) = coord[d]; + } + } + } + for (const auto &box : iodata.model.refinement.GetBoxes()) + { + if (region_ref_level < box.ref_levels) + { + for (int j = 0; j < pointmat.Width(); j++) + { + // Check if the point is inside the box. + int d = 0; + for (; d < pointmat.Height(); d++) + { + if (pointmat(d, j) < box.bbmin[d] || pointmat(d, j) > box.bbmax[d]) + { + break; + } + } + if (d == dim) + { + refine = true; + break; + } + } + if (refine) + { + break; + } + } + } + if (refine) + { + refinements.Append(mfem::Refinement(i)); + continue; + } + for (const auto &sphere : iodata.model.refinement.GetSpheres()) + { + if (region_ref_level < sphere.ref_levels) + { + for (int j = 0; j < pointmat.Width(); j++) + { + // Check if the point is inside the sphere. 
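The box test above and the sphere test that follows both reduce to simple containment predicates applied to every sample point of an element; an element is marked for refinement as soon as any point passes any active region at the current level. Written out on their own in plain C++ (no MFEM types, illustrative names), under the same conventions of an axis-aligned box and a center/radius sphere:

#include <array>

// True if p lies inside the axis-aligned box [bbmin, bbmax] (inclusive).
bool InsideBox(const std::array<double, 3> &p, const std::array<double, 3> &bbmin,
               const std::array<double, 3> &bbmax)
{
  for (int d = 0; d < 3; d++)
  {
    if (p[d] < bbmin[d] || p[d] > bbmax[d])
    {
      return false;
    }
  }
  return true;
}

// True if p lies inside the sphere of radius r centered at c. Squared distances are
// compared to avoid the square root.
bool InsideSphere(const std::array<double, 3> &p, const std::array<double, 3> &c, double r)
{
  double dist2 = 0.0;
  for (int d = 0; d < 3; d++)
  {
    const double s = p[d] - c[d];
    dist2 += s * s;
  }
  return dist2 <= r * r;
}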
+ double dist = 0.0; + for (int d = 0; d < pointmat.Height(); d++) + { + double s = pointmat(d, j) - sphere.center[d]; + dist += s * s; + } + if (dist <= sphere.r * sphere.r) + { + refine = true; + break; + } + } + if (refine) + { + break; + } + } + } + if (refine) + { + refinements.Append(mfem::Refinement(i)); + } + } + + // Do the refinement. For tensor element meshes, this may make the mesh nonconforming + // (adds hanging nodes). + if (mesh.capacity() > 1) + { + mesh.emplace_back(std::make_unique(*mesh.back())); + } + mesh.back()->GeneralRefinement(refinements, -1); + region_ref_level++; + } + if (max_region_ref_levels > 0 && mesh.capacity() == 1) + { + RebalanceMesh(iodata, mesh[0]); + } + + // Prior to MFEM's PR #1046, the tetrahedral mesh required reorientation after all mesh + // refinement in order to define higher-order Nedelec spaces on it. This is technically + // not required after MFEM's PR #1046, but in case you want to be absolutely sure, we + // reorient only the mesh after refinement if there is a single mesh (doesn't work with + // h-refinement geometric multigrid). + if (iodata.model.reorient_tet_mesh && mesh.capacity() == 1) + { + PalacePragmaDiagnosticPush + PalacePragmaDiagnosticDisableDeprecated + mesh[0]->ReorientTetMesh(); + PalacePragmaDiagnosticPop + } + + // Print some mesh information. + mfem::Vector bbmin, bbmax; + GetAxisAlignedBoundingBox(*mesh[0], bbmin, bbmax); + const double Lc = iodata.units.Dimensionalize(1.0); + Mpi::Print(mesh[0]->GetComm(), "\nMesh curvature order: {}\nMesh bounding box:\n", + mesh[0]->GetNodes() + ? std::to_string(mesh[0]->GetNodes()->FESpace()->GetMaxElementOrder()) + : "None"); + if (mesh[0]->SpaceDimension() == 3) + { + Mpi::Print(mesh[0]->GetComm(), + " (Xmin, Ymin, Zmin) = ({:+.3e}, {:+.3e}, {:+.3e}) m\n" + " (Xmax, Ymax, Zmax) = ({:+.3e}, {:+.3e}, {:+.3e}) m\n", + bbmin[0] * Lc, bbmin[1] * Lc, bbmin[2] * Lc, bbmax[0] * Lc, bbmax[1] * Lc, + bbmax[2] * Lc); + } + else + { + Mpi::Print(mesh[0]->GetComm(), + " (Xmin, Ymin) = ({:+.3e}, {:+.3e}) m\n" + " (Xmax, Ymax) = ({:+.3e}, {:+.3e}) m\n", + bbmin[0] * Lc, bbmin[1] * Lc, bbmax[0] * Lc, bbmax[1] * Lc); + } + Mpi::Print(mesh[0]->GetComm(), "\n{}", (mesh.size() > 1) ? "Coarse " : ""); + mesh[0]->PrintInfo(); + if (mesh.size() > 1) + { + Mpi::Print(mesh[0]->GetComm(), "\nRefined "); + mesh.back()->PrintInfo(); + } +} + +mfem::Mesh MeshTetToHex(const mfem::Mesh &orig_mesh) +{ + // Courtesy of https://gist.github.com/pazner/e9376f77055c0918d7c43e034e9e5888, only + // supports tetrahedral elements for now. Eventually should be expanded to support prism + // and pyramid elements but this mixed mesh support requires a bit more work. + MFEM_VERIFY(orig_mesh.Dimension() == 3, "Tet-to-hex conversion only supports 3D meshes!"); + { + // This checks the local mesh on each process, but the assertion failing on any single + // process will terminate the program. + mfem::Array geoms; + orig_mesh.GetGeometries(3, geoms); + MFEM_VERIFY(geoms.Size() == 1 && geoms[0] == mfem::Geometry::TETRAHEDRON, + "Tet-to-hex conversion only works for pure tetrahedral meshes!"); + } + + // Add new vertices in every edge, face, and volume. Each tet is subdivided into 4 hexes, + // and each triangular face subdivided into 3 quads. 
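The entity bookkeeping above is easy to sanity-check on a single isolated tetrahedron: 4 original vertices plus 6 edge midpoints plus 4 face centroids plus 1 element centroid gives 15 vertices, the single tet becomes 4 hexahedra, and each of its 4 boundary triangles becomes 3 quads. A compile-time restatement of that arithmetic (illustrative only):

// Entity counts for the tet-to-hex subdivision, checked for one isolated tet
// (4 vertices, 6 edges, 4 faces, 1 element, 4 boundary triangles).
constexpr int nv_tet = 4, nedge_tet = 6, nface_tet = 4, ne_tet = 1, nbe_tet = 4;
constexpr int nv = nv_tet + nedge_tet + nface_tet + ne_tet;  // Original + new midpoints
constexpr int ne = 4 * ne_tet;                               // 4 hexes per tet
constexpr int nbe = 3 * nbe_tet;                             // 3 quads per boundary tri
static_assert(nv == 15 && ne == 4 && nbe == 12, "Tet-to-hex subdivision bookkeeping");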
+ const int nv_tet = orig_mesh.GetNV(); + const int nedge_tet = orig_mesh.GetNEdges(); + const int nface_tet = orig_mesh.GetNFaces(); + const int ne_tet = orig_mesh.GetNE(); + const int nbe_tet = orig_mesh.GetNBE(); + const int nv = nv_tet + nedge_tet + nface_tet + ne_tet; + const int ne = 4 * ne_tet; // 4 hex per tet + const int nbe = 3 * nbe_tet; // 3 square per tri + mfem::Mesh hex_mesh(orig_mesh.Dimension(), nv, ne, nbe, orig_mesh.SpaceDimension()); + + // Add original vertices. + for (int v = 0; v < nv_tet; v++) + { + hex_mesh.AddVertex(orig_mesh.GetVertex(v)); + } + + // Add midpoints of edges, faces, and elements. + auto AddCentroid = [&orig_mesh, &hex_mesh](const int *verts, int nv) + { + double coord[3] = {0.0, 0.0, 0.0}; + for (int i = 0; i < nv; i++) + { + for (int d = 0; d < orig_mesh.SpaceDimension(); d++) + { + coord[d] += orig_mesh.GetVertex(verts[i])[d] / nv; + } + } + hex_mesh.AddVertex(coord); + }; + { + mfem::Array verts; + for (int e = 0; e < nedge_tet; ++e) + { + orig_mesh.GetEdgeVertices(e, verts); + AddCentroid(verts.GetData(), verts.Size()); + } + } + for (int f = 0; f < nface_tet; ++f) + { + AddCentroid(orig_mesh.GetFace(f)->GetVertices(), orig_mesh.GetFace(f)->GetNVertices()); + } + for (int e = 0; e < ne_tet; ++e) + { + AddCentroid(orig_mesh.GetElement(e)->GetVertices(), + orig_mesh.GetElement(e)->GetNVertices()); + } + + // Connectivity of tetrahedron vertices to the edges. The vertices of the new mesh are + // ordered so that the original tet vertices are first, then the vertices splitting each + // edge, then the vertices at the center of each triangle face, then the center of the + // tet. Thus the edge/face/element numbers can be used to index into the new array of + // vertices, and the element local edge/face can be used to extract the global edge/face + // index, and thus the corresponding vertex. + constexpr int tet_vertex_edge_map[4 * 3] = {0, 1, 2, 3, 0, 4, 1, 3, 5, 5, 4, 2}; + constexpr int tet_vertex_face_map[4 * 3] = {3, 2, 1, 3, 0, 2, 3, 1, 0, 0, 1, 2}; + constexpr int tri_vertex_edge_map[3 * 2] = {0, 2, 1, 0, 2, 1}; + + // Add four hexahedra for each tetrahedron. + { + mfem::Array edges, faces, orients; + for (int e = 0; e < ne_tet; ++e) + { + const int *verts = orig_mesh.GetElement(e)->GetVertices(); + orig_mesh.GetElementEdges(e, edges, orients); + orig_mesh.GetElementFaces(e, faces, orients); + + // One hex for each vertex of the tet. + for (int i = 0; i < 4; ++i) + { + int hex_v[8]; + hex_v[0] = verts[i]; + hex_v[1] = nv_tet + edges[tet_vertex_edge_map[3 * i + 0]]; + hex_v[2] = nv_tet + nedge_tet + faces[tet_vertex_face_map[3 * i + 0]]; + hex_v[3] = nv_tet + edges[tet_vertex_edge_map[3 * i + 1]]; + hex_v[4] = nv_tet + edges[tet_vertex_edge_map[3 * i + 2]]; + hex_v[5] = nv_tet + nedge_tet + faces[tet_vertex_face_map[3 * i + 1]]; + hex_v[6] = nv_tet + nedge_tet + nface_tet + e; + hex_v[7] = nv_tet + nedge_tet + faces[tet_vertex_face_map[3 * i + 2]]; + hex_mesh.AddHex(hex_v, orig_mesh.GetAttribute(e)); + } + } + } + + // Add the boundary elements. + { + mfem::Array edges, orients; + for (int be = 0; be < nbe_tet; ++be) + { + int f, o; + const int *verts = orig_mesh.GetBdrElement(be)->GetVertices(); + orig_mesh.GetBdrElementEdges(be, edges, orients); + orig_mesh.GetBdrElementFace(be, &f, &o); + + // One quad for each vertex of the tri. 
+ for (int i = 0; i < 3; ++i) + { + int quad_v[4]; + quad_v[0] = verts[i]; + quad_v[1] = nv_tet + edges[tri_vertex_edge_map[2 * i + 0]]; + quad_v[2] = nv_tet + nedge_tet + f; + quad_v[3] = nv_tet + edges[tri_vertex_edge_map[2 * i + 1]]; + hex_mesh.AddBdrQuad(quad_v, orig_mesh.GetBdrAttribute(be)); + } + } + } + + // Finalize the hex mesh topology. The mesh will be marked for refinement later on. + constexpr bool generate_bdr = false; + hex_mesh.FinalizeTopology(generate_bdr); + + // All elements have now been added, can construct the higher order field. + if (orig_mesh.GetNodes()) + { + hex_mesh.EnsureNodes(); + // Higher order associated to vertices are unchanged, and those for + // previously existing edges. DOFs associated to new elements need to be set. + const int sdim = orig_mesh.SpaceDimension(); + auto *orig_fespace = orig_mesh.GetNodes()->FESpace(); + hex_mesh.SetCurvature(orig_fespace->GetMaxElementOrder(), orig_fespace->IsDGSpace(), + orig_mesh.SpaceDimension(), orig_fespace->GetOrdering()); + + // Need to convert the hexahedra local coordinate system into the parent tetrahedra + // system. Each hexahedra spans a different set of the tet's reference coordinates. To + // convert between, define the reference coordinate locations of each of the vertices + // the hexahedra will use, then perform trilinear interpolation in the reference space. + + auto [vert_loc, edge_loc, face_loc] = []() + { + std::array, 4> vert_loc{}; + vert_loc[0] = {0.0, 0.0, 0.0}; + vert_loc[1] = {1.0, 0.0, 0.0}; + vert_loc[2] = {0.0, 1.0, 0.0}; + vert_loc[3] = {0.0, 0.0, 1.0}; + std::array, 6> edge_loc{}; + edge_loc[0] = {0.5, 0.0, 0.0}; + edge_loc[1] = {0.0, 0.5, 0.0}; + edge_loc[2] = {0.0, 0.0, 0.5}; + edge_loc[3] = {0.5, 0.5, 0.0}; + edge_loc[4] = {0.5, 0.0, 0.5}; + edge_loc[5] = {0.0, 0.5, 0.5}; + std::array, 6> face_loc{}; + face_loc[0] = {1.0 / 3, 1.0 / 3, 1.0 / 3}; + face_loc[1] = {0.0, 1.0 / 3, 1.0 / 3}; + face_loc[2] = {1.0 / 3, 0.0, 1.0 / 3}; + face_loc[3] = {1.0 / 3, 1.0 / 3, 0.0}; + return std::make_tuple(vert_loc, edge_loc, face_loc); + }(); + std::array centroid{{0.25, 0.25, 0.25}}; + + // We assume the Nodes field is of a single order, and there is a single tet originally. + // The nodes within the reference hex and parent tet are always the same, so we use the + // typical FE. We then exploit the fact the map between reference spaces is always + // linear, and construct the transformation explicitly. + const auto *orig_FE = orig_mesh.GetNodes()->FESpace()->GetTypicalFE(); + const auto *child_FE = hex_mesh.GetNodes()->FESpace()->GetTypicalFE(); + // Original shape function (i), at new element nodes (j), for each new element (k). + mfem::DenseTensor shape(orig_FE->GetDof(), child_FE->GetDof(), 4); + mfem::Vector col; // For slicing into matrices within shape + for (int i = 0; i < 4; i++) + { + // Collect the vertices of the new hex within the tet. 
+ std::array, 8> hex_verts; + hex_verts[0] = vert_loc[i]; + hex_verts[1] = edge_loc[tet_vertex_edge_map[3 * i + 0]]; + hex_verts[2] = face_loc[tet_vertex_face_map[3 * i + 0]]; + hex_verts[3] = edge_loc[tet_vertex_edge_map[3 * i + 1]]; + hex_verts[4] = edge_loc[tet_vertex_edge_map[3 * i + 2]]; + hex_verts[5] = face_loc[tet_vertex_face_map[3 * i + 1]]; + hex_verts[6] = centroid; + hex_verts[7] = face_loc[tet_vertex_face_map[3 * i + 2]]; + for (int j = 0; j < child_FE->GetNodes().Size(); j++) + { + const auto &cn = child_FE->GetNodes()[j]; + mfem::IntegrationPoint cn_in_orig; + + // Perform trilinear interpolation from (u,v,w) the unit ref coords in the new hex, + // and the corresponding nodes in the containing tet. + // clang-format off + // x component + cn_in_orig.x = + hex_verts[0][0] * (1-cn.x) * (1-cn.y) * (1-cn.z) + + hex_verts[1][0] * cn.x * (1-cn.y) * (1-cn.z) + + hex_verts[2][0] * cn.x * cn.y * (1-cn.z) + + hex_verts[3][0] * (1-cn.x) * cn.y * (1-cn.z) + + hex_verts[4][0] * (1-cn.x) * (1-cn.y) * cn.z + + hex_verts[5][0] * cn.x * (1-cn.y) * cn.z + + hex_verts[6][0] * cn.x * cn.y * cn.z + + hex_verts[7][0] * (1-cn.x) * cn.y * cn.z; + + // y component + cn_in_orig.y = + hex_verts[0][1] * (1-cn.x) * (1-cn.y) * (1-cn.z) + + hex_verts[1][1] * cn.x * (1-cn.y) * (1-cn.z) + + hex_verts[2][1] * cn.x * cn.y * (1-cn.z) + + hex_verts[3][1] * (1-cn.x) * cn.y * (1-cn.z) + + hex_verts[4][1] * (1-cn.x) * (1-cn.y) * cn.z + + hex_verts[5][1] * cn.x * (1-cn.y) * cn.z + + hex_verts[6][1] * cn.x * cn.y * cn.z + + hex_verts[7][1] * (1-cn.x) * cn.y * cn.z; + + // z component + cn_in_orig.z = + hex_verts[0][2] * (1-cn.x) * (1-cn.y) * (1-cn.z) + + hex_verts[1][2] * cn.x * (1-cn.y) * (1-cn.z) + + hex_verts[2][2] * cn.x * cn.y * (1-cn.z) + + hex_verts[3][2] * (1-cn.x) * cn.y * (1-cn.z) + + hex_verts[4][2] * (1-cn.x) * (1-cn.y) * cn.z + + hex_verts[5][2] * cn.x * (1-cn.y) * cn.z + + hex_verts[6][2] * cn.x * cn.y * cn.z + + hex_verts[7][2] * (1-cn.x) * cn.y * cn.z; + // clang-format on + shape(i).GetColumnReference(j, col); + orig_FE->CalcShape(cn_in_orig, col); + } + } + + // Each submatrix of shape tensor now encodes the reference coordinates of each hex + // within the containing tet. Extracting the specific element dof values, and applying + // to the correct shape slice will now give the requisite higher order dofs evaluated at + // the refined elements nodes. + mfem::Array hex_dofs; + mfem::DenseMatrix point_matrix(child_FE->GetDof(), sdim); // nnode_child x sdim + mfem::Vector dof_vals(orig_FE->GetDof() * sdim); + mfem::DenseMatrix dof_vals_mat(dof_vals.GetData(), orig_FE->GetDof(), sdim); + for (int e = 0; e < ne_tet; ++e) + { + // Returns byNODES no matter what, because FiniteElementSpace::GetElementVDofs does. + // Matches the GetElementVDofs call below, which similarly always uses byNODES. 
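The clang-format-off block above is a standard trilinear blend of the eight hex corner locations, evaluated at the unit reference coordinates of each node. The same blend factored into a small helper (plain C++, hypothetical function name), which makes the corner weights explicit; the corner ordering matches the hexahedron vertex ordering used above:

#include <array>

// Trilinear interpolation of 8 corner points at unit-cube coordinates (u, v, w), with
// corners ordered (0,0,0), (1,0,0), (1,1,0), (0,1,0), (0,0,1), (1,0,1), (1,1,1), (0,1,1).
std::array<double, 3> Trilinear(const std::array<std::array<double, 3>, 8> &corner,
                                double u, double v, double w)
{
  const double weight[8] = {(1 - u) * (1 - v) * (1 - w), u * (1 - v) * (1 - w),
                            u * v * (1 - w),             (1 - u) * v * (1 - w),
                            (1 - u) * (1 - v) * w,       u * (1 - v) * w,
                            u * v * w,                   (1 - u) * v * w};
  std::array<double, 3> p = {0.0, 0.0, 0.0};
  for (int i = 0; i < 8; i++)
  {
    for (int d = 0; d < 3; d++)
    {
      p[d] += weight[i] * corner[i][d];
    }
  }
  return p;
}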
+ orig_mesh.GetNodes()->GetElementDofValues(e, dof_vals); + for (int i = 0; i < 4; i++) + { + // shape(i) : orig_FE->GetDof() x hex_FE->GetDof() + // dof_vals_mat : orig_FE->GetDof() x sdim + // point_matrix : child_FE->GetDof() x sdim + MultAtB(shape(i), dof_vals_mat, point_matrix); + hex_mesh.GetNodes()->FESpace()->GetElementVDofs(4 * e + i, hex_dofs); + hex_mesh.GetNodes()->SetSubVector(hex_dofs, point_matrix.GetData()); + } + } + } + + return hex_mesh; +} + +namespace +{ + +void ScaleMesh(mfem::Mesh &mesh, double L) +{ + PalacePragmaOmp(parallel for schedule(static)) + for (int i = 0; i < mesh.GetNV(); i++) + { + double *v = mesh.GetVertex(i); + std::transform(v, v + mesh.SpaceDimension(), v, [L](double val) { return val * L; }); + } + if (auto *pmesh = dynamic_cast(&mesh)) + { + PalacePragmaOmp(parallel for schedule(static)) + for (int i = 0; i < pmesh->face_nbr_vertices.Size(); i++) + { + double *v = pmesh->face_nbr_vertices[i](); + std::transform(v, v + mesh.SpaceDimension(), v, [L](double val) { return val * L; }); + } + } + if (mesh.GetNodes()) + { + *mesh.GetNodes() *= L; + if (auto *pnodes = dynamic_cast(mesh.GetNodes())) + { + pnodes->FaceNbrData() *= L; + } + } +} + +} // namespace + +void DimensionalizeMesh(mfem::Mesh &mesh, double L) +{ + ScaleMesh(mesh, L); +} + +void NondimensionalizeMesh(mfem::Mesh &mesh, double L) +{ + ScaleMesh(mesh, 1.0 / L); +} + +std::vector ElementTypeInfo::GetGeomTypes() const +{ + std::vector geom_types; + if (has_simplices) + { + geom_types.push_back(mfem::Geometry::TETRAHEDRON); + } + if (has_hexahedra) + { + geom_types.push_back(mfem::Geometry::CUBE); + } + if (has_prisms) + { + geom_types.push_back(mfem::Geometry::PRISM); + } + if (has_pyramids) + { + geom_types.push_back(mfem::Geometry::PYRAMID); + } + return geom_types; +} + +ElementTypeInfo CheckElements(const mfem::Mesh &mesh) +{ + // MeshGenerator is reduced over the communicator. This checks for geometries on any + // processor. 
+ auto meshgen = mesh.MeshGenerator(); + return {bool(meshgen & 1), bool(meshgen & 2), bool(meshgen & 4), bool(meshgen & 8)}; +} + +bool CheckRefinementFlags(const mfem::Mesh &mesh) +{ + bool marked = true; + for (int e = 0; e < mesh.GetNE(); e++) + { + const mfem::Element *el = mesh.GetElement(e); + const int geom = el->GetGeometryType(); + if (geom == mfem::Geometry::TETRAHEDRON) + { + const mfem::Tetrahedron *tet = static_cast(el); + if (tet->GetRefinementFlag() == 0) + { + marked = false; + break; + } + } + } + if (const auto *pmesh = dynamic_cast(&mesh)) + { + Mpi::GlobalAnd(1, &marked, pmesh->GetComm()); + } + return marked; +} + +void AttrToMarker(int max_attr, const int *attr_list, int attr_list_size, + mfem::Array &marker, bool skip_invalid) +{ + MFEM_VERIFY(skip_invalid || attr_list_size == 0 || + *std::max_element(attr_list, attr_list + attr_list_size) <= max_attr, + "Invalid attribute number present (" + << *std::max_element(attr_list, attr_list + attr_list_size) << ")!"); + marker.SetSize(max_attr); + if (attr_list_size == 1 && attr_list[0] == -1) + { + marker = 1; + } + else + { + marker = 0; + for (int i = 0; i < attr_list_size; i++) + { + int attr = attr_list[i]; + if ((attr <= 0 || attr > max_attr) && skip_invalid) + { + continue; + } + MFEM_VERIFY(attr > 0, "Attribute number less than one!"); + MFEM_VERIFY(marker[attr - 1] == 0, "Repeated attribute in attribute list!"); + marker[attr - 1] = 1; + } + } +} + +void GetAxisAlignedBoundingBox(const mfem::ParMesh &mesh, const mfem::Array &marker, + bool bdr, mfem::Vector &min, mfem::Vector &max) +{ + int dim = mesh.SpaceDimension(); + min.SetSize(dim); + max.SetSize(dim); + for (int d = 0; d < dim; d++) + { + min(d) = mfem::infinity(); + max(d) = -mfem::infinity(); + } + if (!mesh.GetNodes()) + { + auto BBUpdate = + [&mesh, &dim](const int *v, int nv, mfem::Vector &min, mfem::Vector &max) + { + for (int j = 0; j < nv; j++) + { + const double *coord = mesh.GetVertex(v[j]); + for (int d = 0; d < dim; d++) + { + if (coord[d] < min(d)) + { + min(d) = coord[d]; + } + if (coord[d] > max(d)) + { + max(d) = coord[d]; + } + } + } + }; + PalacePragmaOmp(parallel) + { + mfem::Vector loc_min(dim), loc_max(dim); + for (int d = 0; d < dim; d++) + { + loc_min(d) = mfem::infinity(); + loc_max(d) = -mfem::infinity(); + } + if (bdr) + { + PalacePragmaOmp(for schedule(static)) + for (int i = 0; i < mesh.GetNBE(); i++) + { + if (!marker[mesh.GetBdrAttribute(i) - 1]) + { + continue; + } + const int *verts = mesh.GetBdrElement(i)->GetVertices(); + BBUpdate(verts, mesh.GetBdrElement(i)->GetNVertices(), loc_min, loc_max); + } + } + else + { + PalacePragmaOmp(for schedule(static)) + for (int i = 0; i < mesh.GetNE(); i++) + { + if (!marker[mesh.GetAttribute(i) - 1]) + { + continue; + } + const int *verts = mesh.GetElement(i)->GetVertices(); + BBUpdate(verts, mesh.GetElement(i)->GetNVertices(), loc_min, loc_max); + } + } + PalacePragmaOmp(critical(BBUpdate)) + { + for (int d = 0; d < dim; d++) + { + min(d) = std::min(min(d), loc_min(d)); + max(d) = std::max(max(d), loc_max(d)); + } + } + } + } + else + { + mesh.GetNodes()->HostRead(); + const int ref = mesh.GetNodes()->FESpace()->GetMaxElementOrder(); + auto BBUpdate = [&ref](mfem::GeometryRefiner &refiner, mfem::ElementTransformation &T, + mfem::DenseMatrix &pointmat, mfem::Vector &min, + mfem::Vector &max) + { + mfem::RefinedGeometry *RefG = refiner.Refine(T.GetGeometryType(), ref); + T.Transform(RefG->RefPts, pointmat); + for (int j = 0; j < pointmat.Width(); j++) + { + for (int d = 0; d < 
pointmat.Height(); d++) + { + if (pointmat(d, j) < min(d)) + { + min(d) = pointmat(d, j); + } + if (pointmat(d, j) > max(d)) + { + max(d) = pointmat(d, j); + } + } + } + }; + PalacePragmaOmp(parallel) + { + mfem::Vector loc_min(dim), loc_max(dim); + for (int d = 0; d < dim; d++) + { + loc_min(d) = mfem::infinity(); + loc_max(d) = -mfem::infinity(); + } + mfem::GeometryRefiner refiner; + mfem::IsoparametricTransformation T; + mfem::DenseMatrix pointmat; + if (bdr) + { + PalacePragmaOmp(for schedule(static)) + for (int i = 0; i < mesh.GetNBE(); i++) + { + if (!marker[mesh.GetBdrAttribute(i) - 1]) + { + continue; + } + mesh.GetBdrElementTransformation(i, &T); + BBUpdate(refiner, T, pointmat, loc_min, loc_max); + } + } + else + { + PalacePragmaOmp(for schedule(static)) + for (int i = 0; i < mesh.GetNE(); i++) + { + if (!marker[mesh.GetAttribute(i) - 1]) + { + continue; + } + mesh.GetElementTransformation(i, &T); + BBUpdate(refiner, T, pointmat, loc_min, loc_max); + } + } + PalacePragmaOmp(critical(BBUpdate)) + { + for (int d = 0; d < dim; d++) + { + min(d) = std::min(min(d), loc_min(d)); + max(d) = std::max(max(d), loc_max(d)); + } + } + } + } + Mpi::GlobalMin(dim, min.HostReadWrite(), mesh.GetComm()); + Mpi::GlobalMax(dim, max.HostReadWrite(), mesh.GetComm()); +} + +double BoundingBox::Area() const +{ + return 4.0 * CVector3dMap(axes[0].data()).cross(CVector3dMap(axes[1].data())).norm(); +} + +double BoundingBox::Volume() const +{ + return planar ? 0.0 : 2.0 * CVector3dMap(axes[2].data()).norm() * Area(); +} + +std::array, 3> BoundingBox::Normals() const +{ + std::array, 3> normals = {axes[0], axes[1], axes[2]}; + Vector3dMap(normals[0].data()).normalize(); + Vector3dMap(normals[1].data()).normalize(); + Vector3dMap(normals[2].data()).normalize(); + return normals; +} + +std::array BoundingBox::Lengths() const +{ + return {2.0 * CVector3dMap(axes[0].data()).norm(), + 2.0 * CVector3dMap(axes[1].data()).norm(), + 2.0 * CVector3dMap(axes[2].data()).norm()}; +} + +std::array BoundingBox::Deviations(const std::array &direction) const +{ + const auto eig_dir = CVector3dMap(direction.data()); + std::array deviation_deg; + for (std::size_t i = 0; i < 3; i++) + { + deviation_deg[i] = + std::acos(std::min(1.0, std::abs(eig_dir.normalized().dot( + CVector3dMap(axes[i].data()).normalized())))) * + (180.0 / M_PI); + } + return deviation_deg; +} + +BoundingBox GetBoundingBox(const mfem::ParMesh &mesh, const mfem::Array &marker, + bool bdr) +{ + std::vector vertices; + int dominant_rank = CollectPointCloudOnRoot(mesh, marker, bdr, vertices); + return BoundingBoxFromPointCloud(mesh.GetComm(), vertices, dominant_rank); +} + +BoundingBox GetBoundingBall(const mfem::ParMesh &mesh, const mfem::Array &marker, + bool bdr) +{ + std::vector vertices; + int dominant_rank = CollectPointCloudOnRoot(mesh, marker, bdr, vertices); + return BoundingBallFromPointCloud(mesh.GetComm(), vertices, dominant_rank); +} + +double GetProjectedLength(const mfem::ParMesh &mesh, const mfem::Array &marker, + bool bdr, const std::array &dir) +{ + std::vector vertices; + int dominant_rank = CollectPointCloudOnRoot(mesh, marker, bdr, vertices); + double length; + if (dominant_rank == Mpi::Rank(mesh.GetComm())) + { + CVector3dMap direction(dir.data()); + auto Dot = [&](const auto &x, const auto &y) + { return direction.dot(x) < direction.dot(y); }; + auto p_min = std::min_element(vertices.begin(), vertices.end(), Dot); + auto p_max = std::max_element(vertices.begin(), vertices.end(), Dot); + length = (*p_max - 
*p_min).dot(direction.normalized()); + } + Mpi::Broadcast(1, &length, dominant_rank, mesh.GetComm()); + return length; +} + +double GetDistanceFromPoint(const mfem::ParMesh &mesh, const mfem::Array &marker, + bool bdr, const std::array &origin, bool max) +{ + std::vector vertices; + int dominant_rank = CollectPointCloudOnRoot(mesh, marker, bdr, vertices); + double dist; + if (dominant_rank == Mpi::Rank(mesh.GetComm())) + { + CVector3dMap x0(origin.data()); + auto p = + max ? std::max_element(vertices.begin(), vertices.end(), + [&x0](const Eigen::Vector3d &x, const Eigen::Vector3d &y) + { return (x - x0).norm() < (y - x0).norm(); }) + : std::min_element(vertices.begin(), vertices.end(), + [&x0](const Eigen::Vector3d &x, const Eigen::Vector3d &y) + { return (x - x0).norm() < (y - x0).norm(); }); + dist = (*p - x0).norm(); + } + Mpi::Broadcast(1, &dist, dominant_rank, mesh.GetComm()); + return dist; +} + +// Given a mesh and boundary attribute marker array, compute a normal for the surface. If +// not averaging, use the first entry. +mfem::Vector GetSurfaceNormal(const mfem::ParMesh &mesh, const mfem::Array &marker, + bool average) +{ + int dim = mesh.SpaceDimension(); + mfem::IsoparametricTransformation T; + mfem::Vector loc_normal(dim), normal(dim); + normal = 0.0; + if (mesh.Dimension() == mesh.SpaceDimension()) + { + // Loop over boundary elements. Exit early if not averaging and non-zero normal. + for (int i = 0; i < mesh.GetNBE() && !(!average && normal.Norml2() > 0.0); i++) + { + if (!marker[mesh.GetBdrAttribute(i) - 1]) + { + continue; + } + mesh.GetBdrElementTransformation(i, &T); + mesh::Normal(T, loc_normal, &normal); + normal += loc_normal; + } + } + else + { + // Loop over domain elements. Exit early if not averaging and non-zero normal. + for (int i = 0; i < mesh.GetNE() && !(!average && normal.Norml2() > 0.0); i++) + { + if (!marker[mesh.GetAttribute(i) - 1]) + { + continue; + } + mesh.GetElementTransformation(i, &T); + mesh::Normal(T, loc_normal, &normal); + normal += loc_normal; + } + } + + // If different processors have different normal orientations, take that from the lowest + // rank processor. + MPI_Comm comm = mesh.GetComm(); + int rank = Mpi::Size(comm); + mfem::Vector glob_normal(dim); + if (normal.Norml2() > 0.0) + { + rank = Mpi::Rank(comm); + } + Mpi::GlobalMin(1, &rank, comm); + if (rank == Mpi::Size(comm)) + { + // No boundary elements are marked. 
+ normal = 0.0; + return normal; + } + if (rank == Mpi::Rank(comm)) + { + glob_normal = normal; + } + Mpi::Broadcast(dim, glob_normal.HostReadWrite(), rank, comm); + if (average) + { + if (normal * glob_normal < 0.0) + { + normal.Neg(); + } + Mpi::GlobalSum(dim, normal.HostReadWrite(), comm); + } + else + { + normal = glob_normal; + } + normal /= normal.Norml2(); + + if constexpr (false) + { + Mpi::Print(comm, " Surface normal = ({:+.3e})", fmt::join(normal, ", ")); + } + return normal; +} + +double GetSurfaceArea(const mfem::ParMesh &mesh, const mfem::Array &marker) +{ + double area = 0.0; + PalacePragmaOmp(parallel reduction(+ : area)) + { + mfem::IsoparametricTransformation T; + PalacePragmaOmp(for schedule(static)) + for (int i = 0; i < mesh.GetNBE(); i++) + { + if (!marker[mesh.GetBdrAttribute(i) - 1]) + { + continue; + } + mesh.GetBdrElementTransformation(i, &T); + const mfem::IntegrationRule &ir = mfem::IntRules.Get(T.GetGeometryType(), T.OrderJ()); + for (int j = 0; j < ir.GetNPoints(); j++) + { + const mfem::IntegrationPoint &ip = ir.IntPoint(j); + T.SetIntPoint(&ip); + area += ip.weight * T.Weight(); + } + } + } + Mpi::GlobalSum(1, &area, mesh.GetComm()); + return area; +} + +double GetVolume(const mfem::ParMesh &mesh, const mfem::Array &marker) +{ + double volume = 0.0; + PalacePragmaOmp(parallel reduction(+ : volume)) + { + mfem::IsoparametricTransformation T; + PalacePragmaOmp(for schedule(static)) + for (int i = 0; i < mesh.GetNE(); i++) + { + if (!marker[mesh.GetAttribute(i) - 1]) + { + continue; + } + mesh.GetElementTransformation(i, &T); + const mfem::IntegrationRule &ir = mfem::IntRules.Get(T.GetGeometryType(), T.OrderJ()); + for (int j = 0; j < ir.GetNPoints(); j++) + { + const mfem::IntegrationPoint &ip = ir.IntPoint(j); + T.SetIntPoint(&ip); + volume += ip.weight * T.Weight(); + } + } + } + Mpi::GlobalSum(1, &volume, mesh.GetComm()); + return volume; +} + +double RebalanceMesh(const IoData &iodata, std::unique_ptr &mesh) +{ + BlockTimer bt0(Timer::REBALANCE); + MPI_Comm comm = mesh->GetComm(); + if (iodata.model.refinement.save_adapt_mesh) + { + // Create a separate serial mesh to write to disk. + auto sfile = fs::path(iodata.problem.output) / fs::path(iodata.model.mesh).stem(); + sfile += ".mesh"; + + auto PrintSerial = [&](mfem::Mesh &smesh) + { + BlockTimer bt1(Timer::IO); + if (Mpi::Root(comm)) + { + std::ofstream fo(sfile); + // mfem::ofgzstream fo(sfile, true); // Use zlib compression if available + // fo << std::fixed; + fo << std::scientific; + fo.precision(MSH_FLT_PRECISION); + mesh::DimensionalizeMesh(smesh, iodata.units.GetMeshLengthRelativeScale()); + smesh.Mesh::Print(fo); // Do not need to nondimensionalize the temporary mesh + } + Mpi::Barrier(comm); + }; + + if (mesh->Nonconforming()) + { + mfem::ParMesh smesh(*mesh); + mfem::Array serial_partition(mesh->GetNE()); + serial_partition = 0; + smesh.Rebalance(serial_partition); + PrintSerial(smesh); + } + else + { + mfem::Mesh smesh = mesh->GetSerialMesh(0); + PrintSerial(smesh); + } + } + + // If there is more than one processor, may perform rebalancing. 
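The surface-normal routine above resolves conflicting per-rank data by electing the lowest rank that actually owns marked elements: every rank proposes either its own rank or the communicator size, the global minimum is reduced, and that rank's value is broadcast (with the all-size sentinel meaning nobody had data). The same election pattern in plain MPI, as a hypothetical helper:

#include <mpi.h>

// Broadcast value[0..n) from the lowest rank for which has_data is true. Returns false
// on every rank (and leaves value untouched) if no rank has data.
bool BroadcastFromLowestOwner(double *value, int n, bool has_data, MPI_Comm comm)
{
  int rank, size;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &size);

  // Ranks without data propose `size`, which always loses the MPI_MIN reduction.
  int owner = has_data ? rank : size;
  MPI_Allreduce(MPI_IN_PLACE, &owner, 1, MPI_INT, MPI_MIN, comm);
  if (owner == size)
  {
    return false;  // Nobody has data
  }
  MPI_Bcast(value, n, MPI_DOUBLE, owner, comm);
  return true;
}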
+ if (Mpi::Size(comm) == 1) + { + return 1.0; + } + int min_elem, max_elem; + min_elem = max_elem = mesh->GetNE(); + Mpi::GlobalMin(1, &min_elem, comm); + Mpi::GlobalMax(1, &max_elem, comm); + const double ratio = double(max_elem) / min_elem; + const double tol = iodata.model.refinement.maximum_imbalance; + if constexpr (false) + { + Mpi::Print("Rebalancing: max/min elements per processor = {:d}/{:d} (ratio = {:.3e}, " + "tol = {:.3e})\n", + max_elem, min_elem, ratio, tol); + } + if (ratio > tol) + { + if (mesh->Nonconforming()) + { + mesh->Rebalance(); + } + else + { + // Without access to a refinement tree, partitioning must be done on the root + // processor and then redistributed. + RebalanceConformalMesh(mesh); + } + } + return ratio; +} + +} // namespace mesh + +namespace +{ + +std::unique_ptr LoadMesh(const std::string &mesh_file, bool remove_curvature, + const config::BoundaryData &boundaries) +{ + // Read the (serial) mesh from the given mesh file. Handle preparation for refinement and + // orientations here to avoid possible reorientations and reordering later on. MFEM + // supports a native mesh format (.mesh), VTK/VTU, Gmsh, as well as some others. We use + // built-in converters for the types we know, otherwise rely on MFEM to do the conversion + // or error out if not supported. + constexpr bool generate_edges = false, refine = false, fix_orientation = true; + std::unique_ptr mesh; + fs::path mesh_path(mesh_file); + if (mesh_path.extension() == ".mphtxt" || mesh_path.extension() == ".mphbin" || + mesh_path.extension() == ".nas" || mesh_path.extension() == ".bdf") + { + // Put translated mesh in temporary string buffer. + std::stringstream fi(std::stringstream::in | std::stringstream::out); + // fi << std::fixed; + fi << std::scientific; + fi.precision(MSH_FLT_PRECISION); + if (mesh_path.extension() == ".mphtxt" || mesh_path.extension() == ".mphbin") + { + mesh::ConvertMeshComsol(mesh_file, fi, remove_curvature); + // mesh::ConvertMeshComsol(mesh_file, fo, remove_curvature); + } + else + { + mesh::ConvertMeshNastran(mesh_file, fi, remove_curvature); + // mesh::ConvertMeshNastran(mesh_file, fo, remove_curvature); + } + mesh = std::make_unique(fi, generate_edges, refine, fix_orientation); + } + else + { + // Otherwise, just rely on MFEM load the mesh. + std::ifstream fi(mesh_file); + if (!fi.good()) + { + MFEM_ABORT("Unable to open mesh file \"" << mesh_file << "\"!"); + } + mesh = std::make_unique(fi, generate_edges, refine, fix_orientation); + } + if (remove_curvature) + { + mesh->SetCurvature(-1); + } + else + { + mesh->EnsureNodes(); + } + if (!boundaries.periodic.boundary_pairs.empty()) + { + auto periodic_mesh = std::move(mesh); + + for (const auto &data : boundaries.periodic.boundary_pairs) + { + auto periodic_mapping = mesh::DeterminePeriodicVertexMapping(periodic_mesh, data); + if (!periodic_mapping.empty()) + { + auto p_mesh = std::make_unique( + mfem::Mesh::MakePeriodic(*periodic_mesh, periodic_mapping)); + periodic_mesh = std::move(p_mesh); + } + } + mesh = std::move(periodic_mesh); + } + + return mesh; +} + +template > +void TransferHighOrderNodes(const mfem::Mesh &orig_mesh, mfem::Mesh &new_mesh, + const T *elem_delete_map = nullptr) +{ + // This accounts for new boundary elements too since no new dofs are added. See the MFEM + // trimmer miniapp for reference. 
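The rebalancing trigger above is simply the ratio of the largest to the smallest per-rank element count, compared against the configured maximum imbalance. As a standalone MPI sketch (LoadImbalance is an illustrative name, not Palace API):

#include <mpi.h>

// Ratio of the largest to the smallest per-rank element count; 1.0 means perfectly
// balanced. Rebalancing is warranted when the ratio exceeds the configured tolerance.
double LoadImbalance(int local_ne, MPI_Comm comm)
{
  int min_ne = local_ne, max_ne = local_ne;
  MPI_Allreduce(MPI_IN_PLACE, &min_ne, 1, MPI_INT, MPI_MIN, comm);
  MPI_Allreduce(MPI_IN_PLACE, &max_ne, 1, MPI_INT, MPI_MAX, comm);
  return static_cast<double>(max_ne) / min_ne;
}

// Usage sketch: if (LoadImbalance(mesh_ne, comm) > maximum_imbalance) { /* rebalance */ }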
+ MFEM_VERIFY(orig_mesh.GetNodes(), "No high-order nodes information to transfer!"); + const mfem::GridFunction *nodes = orig_mesh.GetNodes(); + const mfem::FiniteElementSpace *fespace = nodes->FESpace(); + mfem::Ordering::Type ordering = fespace->GetOrdering(); + int order = fespace->GetMaxElementOrder(); + int sdim = orig_mesh.SpaceDimension(); + bool discont = + (dynamic_cast(fespace->FEColl()) != nullptr); + new_mesh.SetCurvature(order, discont, sdim, ordering); + mfem::GridFunction *new_nodes = new_mesh.GetNodes(); + const mfem::FiniteElementSpace *new_fespace = new_nodes->FESpace(); + + // Transfer dofs from the old mesh to the new ones. Either consider all elements (works + // for orientation or numbering changes), or use the prescribed old to new element index + // map. + mfem::Array vdofs; + mfem::Vector loc_vec; + for (int e = 0; e < orig_mesh.GetNE(); e++) + { + if (!elem_delete_map || (*elem_delete_map)[e] >= 0) + { + // No need for DofTransformation here since spaces are H1 or L2. + fespace->GetElementVDofs(e, vdofs); + nodes->GetSubVector(vdofs, loc_vec); + new_fespace->GetElementVDofs(!elem_delete_map ? e : (*elem_delete_map)[e], vdofs); + new_nodes->SetSubVector(vdofs, loc_vec); + } + } +} + +void CleanMesh(std::unique_ptr &orig_mesh, + const std::vector &mat_attr_list) +{ + auto mat_marker = mesh::AttrToMarker( + orig_mesh->attributes.Size() ? orig_mesh->attributes.Max() : 0, mat_attr_list, true); + std::vector elem_delete_map(orig_mesh->GetNE(), -1), + bdr_elem_delete_map(orig_mesh->GetNBE(), -1); + + // Delete domain and boundary elements which have no associated material or BC attribute + // from the mesh. + int new_ne = 0; + for (int e = 0; e < orig_mesh->GetNE(); e++) + { + if (mat_marker[orig_mesh->GetAttribute(e) - 1]) + { + elem_delete_map[e] = new_ne++; + } + } + + // Make sure to remove any boundary elements which are no longer attached to elements in + // the domain. + int new_nbe = 0; + for (int be = 0; be < orig_mesh->GetNBE(); be++) + { + int f, o, e1, e2; + orig_mesh->GetBdrElementFace(be, &f, &o); + orig_mesh->GetFaceElements(f, &e1, &e2); + bool no_e1 = (e1 < 0 || elem_delete_map[e1] < 0); + bool no_e2 = (e2 < 0 || elem_delete_map[e2] < 0); + if (!no_e1 || !no_e2) + { + bdr_elem_delete_map[be] = new_nbe++; + } + else if constexpr (false) + { + Mpi::Print("Deleting an unattached boundary element!\n"); + } + } + if (new_ne < orig_mesh->GetNE()) + { + Mpi::Print("Removed {:d} unmarked domain elements from the mesh\n", + orig_mesh->GetNE() - new_ne); + } + if (new_nbe < orig_mesh->GetNBE()) + { + Mpi::Print("Removed {:d} unattached boundary elements from the mesh\n", + orig_mesh->GetNBE() - new_nbe); + } + + // Create the new mesh. + if (new_ne == orig_mesh->GetNE() && new_nbe == orig_mesh->GetNBE()) + { + return; + } + MFEM_VERIFY(!orig_mesh->Nonconforming(), + "Mesh element cleaning is not supported for nonconforming meshes!"); + auto new_mesh = + std::make_unique(orig_mesh->Dimension(), orig_mesh->GetNV(), new_ne, + new_nbe, orig_mesh->SpaceDimension()); + + // Copy vertices and non-deleted domain and boundary elements. 
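Both the node transfer and the element cleanup above hinge on an old-to-new index map in which kept entries receive consecutive new indices and removed entries are marked with -1. That construction, isolated in plain C++ (hypothetical helper, shown with a keep mask instead of attribute markers):

#include <vector>

// Build a compaction map from a keep mask: kept entries get consecutive new indices,
// removed entries get -1. The number of kept entries is the new container size.
std::vector<int> BuildCompactionMap(const std::vector<bool> &keep)
{
  std::vector<int> old_to_new(keep.size(), -1);
  int next = 0;
  for (std::size_t i = 0; i < keep.size(); i++)
  {
    if (keep[i])
    {
      old_to_new[i] = next++;
    }
  }
  return old_to_new;
}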
+ for (int v = 0; v < orig_mesh->GetNV(); v++) + { + new_mesh->AddVertex(orig_mesh->GetVertex(v)); + } + for (int e = 0; e < orig_mesh->GetNE(); e++) + { + if (elem_delete_map[e] >= 0) + { + mfem::Element *el = orig_mesh->GetElement(e)->Duplicate(new_mesh.get()); + new_mesh->AddElement(el); + } + } + for (int be = 0; be < orig_mesh->GetNBE(); be++) + { + if (bdr_elem_delete_map[be] >= 0) + { + mfem::Element *bdr_el = orig_mesh->GetBdrElement(be)->Duplicate(new_mesh.get()); + new_mesh->AddBdrElement(bdr_el); + } + } + + // Finalize the new mesh topology and replace the old mesh. If a curved mesh, set up the + // new mesh by projecting nodes onto the new mesh for the non-trimmed vdofs. No need to + // mark for refinement or fix orientations, since everything is copied from the previous + // mesh. + constexpr bool generate_bdr = false; + new_mesh->FinalizeTopology(generate_bdr); + new_mesh->RemoveUnusedVertices(); // Remove vertices from the deleted elements + if (orig_mesh->GetNodes()) + { + TransferHighOrderNodes(*orig_mesh, *new_mesh, &elem_delete_map); + } + orig_mesh = std::move(new_mesh); +} + +void SplitMeshElements(std::unique_ptr &orig_mesh, bool make_simplex, + bool make_hex) +{ + if (!make_simplex && !make_hex) + { + return; + } + mfem::Mesh *mesh = orig_mesh.get(); + mfem::Mesh new_mesh; + + // Convert all element types to simplices. + if (make_simplex) + { + const auto element_types = mesh::CheckElements(*mesh); + if (element_types.has_hexahedra || element_types.has_prisms || + element_types.has_pyramids) + { + MFEM_VERIFY(!mesh->Nonconforming(), + "Mesh element splitting is not supported for nonconforming meshes!"); + MFEM_VERIFY( + !element_types.has_pyramids, + "Splitting mesh elements to simplices does not support pyramid elements yet!"); + int ne = mesh->GetNE(); + new_mesh = mfem::Mesh::MakeSimplicial(*mesh); + Mpi::Print("Added {:d} elements to the mesh during conversion to simplices\n", + new_mesh.GetNE() - ne); + mesh = &new_mesh; + } + } + + // Convert all element types to hexahedra (currently only tet-to-hex). + if (make_hex) + { + const auto element_types = mesh::CheckElements(*mesh); + if (element_types.has_simplices || element_types.has_prisms || + element_types.has_pyramids) + { + MFEM_VERIFY(!mesh->Nonconforming(), + "Mesh element splitting is not supported for nonconforming meshes!"); + MFEM_VERIFY(!element_types.has_prisms && !element_types.has_pyramids, + "Splitting mesh elements to hexahedra only supports simplex elements " + "(tetrahedra) for now!"); + int ne = mesh->GetNE(); + new_mesh = mesh::MeshTetToHex(*mesh); + Mpi::Print("Added {:d} elements to the mesh during conversion to hexahedra\n", + new_mesh.GetNE() - ne); + mesh = &new_mesh; + } + } + + // Return if no modifications were made. + if (mesh == orig_mesh.get()) + { + return; + } + orig_mesh = std::make_unique(std::move(new_mesh)); // Call move constructor + orig_mesh->FinalizeTopology(); +} + +void ReorderMeshElements(mfem::Mesh &mesh, bool print) +{ + mfem::Array ordering; + if constexpr (false) + { + // Gecko reordering. + mfem::Array tentative; + int outer = 3, inner = 3, window = 4, period = 2; + double best_cost = mfem::infinity(); + for (int i = 0; i < outer; i++) + { + int seed = i + 1; + double cost = + mesh.GetGeckoElementOrdering(tentative, inner, window, period, seed, true); + if (cost < best_cost) + { + ordering = tentative; + best_cost = cost; + } + } + if (print) + { + Mpi::Print("Final cost: {:e}\n", best_cost); + } + } + else + { + // (Faster) Hilbert reordering. 
+ mesh.GetHilbertElementOrdering(ordering); + mesh.ReorderElements(ordering); + } +} + +std::unordered_map GetFaceToBdrElementMap(const mfem::Mesh &mesh, + const config::BoundaryData &boundaries) +{ + std::unordered_map face_to_be; + face_to_be.reserve(mesh.GetNBE()); + for (int be = 0; be < mesh.GetNBE(); be++) + { + int f, o, e1 = -1, e2 = -1; + mesh.GetBdrElementFace(be, &f, &o); + int attr = mesh.GetBdrAttribute(be); + if (!boundaries.periodic.boundary_pairs.empty()) + { + for (const auto &data : boundaries.periodic.boundary_pairs) + { + const auto &da = data.donor_attributes, &ra = data.receiver_attributes; + auto donor = std::find(da.begin(), da.end(), attr) != da.end(); + auto receiver = std::find(ra.begin(), ra.end(), attr) != ra.end(); + if (donor || receiver) + { + mesh.GetFaceElements(f, &e1, &e2); + MFEM_VERIFY(e1 >= 0 && e2 >= 0, + "Mesh is not periodic on attribute " << attr << "!"); + } + } + } + MFEM_VERIFY((e1 >= 0 && e2 >= 0) || face_to_be.find(f) == face_to_be.end(), + "A non-periodic face (" + << f << ") cannot have multiple boundary elements! Attributes: " << attr + << ' ' << mesh.GetBdrAttribute(face_to_be[f])); + face_to_be[f] = be; + } + return face_to_be; +} + +std::unordered_map CheckMesh(const mfem::Mesh &mesh, + const config::BoundaryData &boundaries) +{ + // Check for: + // (1) Boundary elements with no prescribed boundary condition, and + // (2) Boundary faces which have no boundary element. + auto bdr_marker = + mesh::AttrToMarker(mesh.bdr_attributes.Size() ? mesh.bdr_attributes.Max() : 0, + boundaries.attributes, true); + std::unordered_map face_to_be = GetFaceToBdrElementMap(mesh, boundaries); + std::unordered_set bdr_warn_list; + int bdr_face_warn = 0; + for (int f = 0; f < mesh.GetNumFaces(); f++) + { + int e1, e2; + mesh.GetFaceElements(f, &e1, &e2); + if (e1 >= 0 && e2 >= 0) + { + continue; // Only consider true exterior faces + } + auto it = face_to_be.find(f); + if (it != face_to_be.end()) + { + int attr = mesh.GetBdrAttribute(it->second); + if (!bdr_marker[attr - 1]) + { + // Boundary element with no prescribed boundary condition. + bdr_warn_list.insert(attr); + } + } + else + { + // Boundary face with no attached boundary element. + bdr_face_warn++; + } + } + if (!bdr_warn_list.empty()) + { + Mpi::Warning("One or more external boundary attributes has no associated boundary " + "condition!\n\"PMC\"/\"ZeroCharge\" condition is assumed!\n"); + utils::PrettyPrint(bdr_warn_list, "Boundary attribute list:"); + Mpi::Print("\n"); + } + if (bdr_face_warn) + { + Mpi::Warning("{:d} mesh faces with no associated boundary element exist on the domain " + "boundary!\n", + bdr_face_warn); + } + return face_to_be; +} + +template +class EdgeRefinementMesh : public mfem::Mesh +{ +private: + // Container with keys being pairs of vertex indices (match row/column indices of v_to_v) + // of edges which we desire to be refined. + const T &refinement_edges; + + //void MarkTetMeshForRefinement(const mfem::DSTable &v_to_v) override + //{ + // // The standard tetrahedral refinement in MFEM is to mark the longest edge of each tet + // // for the first refinement. We hack this marking here in order to prioritize refinement + // // of edges which are part of internal boundary faces being marked for refinement. This + // // should hopefully limit the amount of extra refinement required to ensure conformity + // // after the marked elements are refined. Marking will then discover only the longest + // // edges, which are those within the boundary to be cracked. 
+ // mfem::Array lengths; + // GetEdgeLengths2(v_to_v, lengths); + // const auto min_length = 0.01 * lengths.Min(); + // for (int i = 0; i < v_to_v.NumberOfRows(); i++) + // { + // for (mfem::DSTable::RowIterator it(v_to_v, i); !it; ++it) + // { + // int j = it.Column(); + // if (refinement_edges.find({i, j}) == refinement_edges.end()) + // { + // // "Zero" the edge lengths which do not connect vertices on the interface. Avoid + // // zero-length edges just in case. + // lengths[it.Index()] = min_length; + // } + // } + // } + + // // Finish marking (see mfem::Mesh::MarkTetMeshForRefinement). + // mfem::Array indices(NumOfEdges); + // std::iota(indices.begin(), indices.end(), 0); + // for (int i = 0; i < NumOfElements; i++) + // { + // if (elements[i]->GetType() == mfem::Element::TETRAHEDRON) + // { + // MFEM_ASSERT(dynamic_cast(elements[i]), + // "Unexpected non-Tetrahedron element type!"); + // static_cast(elements[i])->MarkEdge(v_to_v, lengths, indices); + // } + // } + // for (int i = 0; i < NumOfBdrElements; i++) + // { + // if (boundary[i]->GetType() == mfem::Element::TRIANGLE) + // { + // MFEM_ASSERT(dynamic_cast(boundary[i]), + // "Unexpected non-Triangle element type!"); + // static_cast(boundary[i])->MarkEdge(v_to_v, lengths, indices); + // } + // } + //} + +public: + EdgeRefinementMesh(mfem::Mesh &&mesh, const T &refinement_edges) + : mfem::Mesh(std::move(mesh)), refinement_edges(refinement_edges) + { + } +}; + +template +struct UnorderedPair +{ + T first, second; + UnorderedPair(T first, T second) : first(first), second(second) {} + bool operator==(const UnorderedPair &v) const + { + return ((v.first == first && v.second == second) || + (v.first == second && v.second == first)); + } +}; + +template +struct UnorderedPairHasher +{ + std::size_t operator()(const UnorderedPair &v) const + { + // Simple hash function for a pair, see https://tinyurl.com/2k4phapb. + return std::hash()(std::min(v.first, v.second)) ^ + std::hash()(std::max(v.first, v.second)) << 1; + } +}; + +int AddInterfaceBdrElements(IoData &iodata, std::unique_ptr &orig_mesh, + std::unordered_map &face_to_be, MPI_Comm comm) +{ + // Exclude some internal boundary conditions for which cracking would give invalid + // results: lumpedports in particular. + const auto crack_boundary_attributes = [&iodata]() + { + auto cba = iodata.boundaries.attributes; + // Remove lumped port attributes. + for (const auto &[idx, data] : iodata.boundaries.lumpedport) + { + for (const auto &e : data.elements) + { + auto attr_in_elem = [&](auto x) + { + return std::find(e.attributes.begin(), e.attributes.end(), x) != + e.attributes.end(); + }; + cba.erase(std::remove_if(cba.begin(), cba.end(), attr_in_elem), cba.end()); + } + } + return cba; + }(); + + // Return if nothing to do. Otherwise, count vertices and boundary elements to add. + if (crack_boundary_attributes.empty() && !iodata.model.add_bdr_elements) + { + return 1; // Success + } + + if (face_to_be.size() != static_cast(orig_mesh->GetNBE())) + { + face_to_be = GetFaceToBdrElementMap(*orig_mesh, iodata.boundaries); + } + int new_nv = orig_mesh->GetNV(); + int new_nbe = orig_mesh->GetNBE(); + + // Duplicate internal boundary elements from the given boundary attribute list, cracking + // the mesh such that domain elements on either side are no longer coupled. Correctly + // handles cracks where more than two domains intersect as well as seams where a crack + // ends and the vertices are not duplicated. 
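The UnorderedPair and UnorderedPairHasher types above make edge keys orientation-independent: equality ignores the order of the two vertex indices, and the hash is built from (min, max) so that {a, b} and {b, a} land in the same bucket. A minimal self-contained usage sketch with simplified names (Edge/EdgeHasher are illustrative restatements, not the Palace types):

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <functional>
#include <unordered_map>

// Unordered edge key: equality and hashing both ignore vertex order.
struct Edge
{
  int a, b;
  bool operator==(const Edge &o) const
  {
    return (a == o.a && b == o.b) || (a == o.b && b == o.a);
  }
};

struct EdgeHasher
{
  std::size_t operator()(const Edge &e) const
  {
    return std::hash<int>()(std::min(e.a, e.b)) ^
           (std::hash<int>()(std::max(e.a, e.b)) << 1);
  }
};

int main()
{
  std::unordered_map<Edge, int, EdgeHasher> visits;
  visits[{7, 3}]++;
  visits[{3, 7}]++;  // Same edge, opposite orientation
  assert(visits.size() == 1 && visits[{3, 7}] == 2);
  return 0;
}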
+ std::unordered_set crack_bdr_elem; + std::unordered_map>>> + crack_vert_duplicates; + std::unique_ptr vert_to_elem; + if (!crack_boundary_attributes.empty() && iodata.model.crack_bdr_elements) + { + auto crack_bdr_marker = mesh::AttrToMarker( + orig_mesh->bdr_attributes.Size() ? orig_mesh->bdr_attributes.Max() : 0, + crack_boundary_attributes, true); + std::unordered_set external_attributes; + for (int be = 0; be < orig_mesh->GetNBE(); be++) + { + if (crack_bdr_marker[orig_mesh->GetBdrAttribute(be) - 1]) + { + int f, o, e1, e2; + orig_mesh->GetBdrElementFace(be, &f, &o); + orig_mesh->GetFaceElements(f, &e1, &e2); + if (e1 >= 0 && e2 >= 0) + { + crack_bdr_elem.insert(be); + iodata.boundaries.cracked_attributes.insert(orig_mesh->GetBdrAttribute(be)); + } + else + { + external_attributes.insert(orig_mesh->GetBdrAttribute(be)); + } + } + } + MFEM_VERIFY(crack_bdr_elem.empty() || !orig_mesh->Nonconforming(), + "Duplicating internal boundary elements for interior boundaries is not " + "supported for nonconforming meshes!"); + std::vector mixed_attributes; + std::set_intersection(iodata.boundaries.cracked_attributes.begin(), + iodata.boundaries.cracked_attributes.end(), + external_attributes.begin(), external_attributes.end(), + std::back_inserter(mixed_attributes)); + if (!mixed_attributes.empty()) + { + MFEM_WARNING("Found boundary attribute with internal and external boundary elements: " + << fmt::format("{}", fmt::join(mixed_attributes, " ")) + << ". Impedance boundary conditions for these attributes will give " + "erroneous results, consider separating into different attributes!"); + } + vert_to_elem.reset(orig_mesh->GetVertexToElementTable()); // Owned by caller + const mfem::Table &elem_to_face = orig_mesh->ElementToFaceTable(); + int new_nv_dups = 0; + for (auto be : crack_bdr_elem) + { + const mfem::Element *bdr_el = orig_mesh->GetBdrElement(be); + const int *verts = bdr_el->GetVertices(); + for (int i = 0; i < bdr_el->GetNVertices(); i++) + { + // Skip vertices we have already processed. + const auto v = verts[i]; + if (crack_vert_duplicates.find(v) != crack_vert_duplicates.end()) + { + continue; + } + + // Collect connected components of elements connected to the vertex. Perform BFS + // on graph of all elements connected to this vertex, where adjacencies are + // determined by face connectivity excluding the crack faces. + std::vector> components; + const int *elems = vert_to_elem->GetRow(v); + std::unordered_set unvisited(elems, elems + vert_to_elem->RowSize(v)); + while (!unvisited.empty()) + { + auto &component = components.emplace_back(); + component.reserve(unvisited.size()); + std::queue que; + que.push(*unvisited.begin()); + unvisited.erase(unvisited.begin()); + while (!que.empty()) + { + // Process the current node. + int e = que.front(); + que.pop(); + component.insert(e); + + // Add neighbors. + const int *faces = elem_to_face.GetRow(e); + for (int j = 0; j < elem_to_face.RowSize(e); j++) + { + const auto f = faces[j]; + { + auto it = face_to_be.find(f); + if (it != face_to_be.end() && + crack_bdr_elem.find(it->second) != crack_bdr_elem.end()) + { + // Skip element-element connectivities which cross the crack. + continue; + } + } + int e1, e2; + orig_mesh->GetFaceElements(f, &e1, &e2); + MFEM_VERIFY( + e == e1 || e == e2, + "Unexpected face-element connectivity in internal boundary cracking!"); + int nbr = (e == e1) ? 
e2 : e1; + if (nbr >= 0) + { + auto it = unvisited.find(nbr); + if (it != unvisited.end()) + { + que.push(nbr); + unvisited.erase(it); + } + } + } + } + } + MFEM_VERIFY( + !components.empty(), + "No connected components found for elements adjacent to a crack vertex!"); +#if defined(MFEM_DEBUG) + { + std::size_t visited_size = 0; + for (const auto &component : components) + { + visited_size += component.size(); + } + MFEM_VERIFY(visited_size == static_cast(vert_to_elem->RowSize(v)), + "Failed to visit all elements in neighborhood of vertex when " + "counting connected components!"); + } +#endif + + // Save mapping from original vertex to duplicate vertices, and the corresponding + // element groupings requiring renumbering. The first group doesn't need + // renumbering so is not saved. We still keep entries for non-duplicated crack + // vertices in the set to track them as processed, however. + auto &vert_components = crack_vert_duplicates.try_emplace(v).first->second; + for (auto it = components.begin() + 1; it != components.end(); ++it) + { + vert_components.emplace_back(-1, std::move(*it)); + new_nv_dups++; + } + } + } + + // After processing all boundary elements, check if there are any elements which need + // refinement in order to successfully decouple both sides. This happens if we have an + // edge interior to the crack which connects to seam vertices (non-duplicated vertices + // attached to crack boundary elements). A previous implementation of refinement + // considered for refinement just all boundary elements with all attached vertices + // lying on the seam, but this doesn't catch all required cases. + if (iodata.model.refine_crack_elements) + { + std::unordered_map, std::vector, UnorderedPairHasher> + coarse_crack_edge_to_be; + for (auto be : crack_bdr_elem) + { + const mfem::Element *bdr_el = orig_mesh->GetBdrElement(be); + const int *verts = bdr_el->GetVertices(); + for (int i = 0; i < bdr_el->GetNEdges(); i++) + { + auto v0 = verts[bdr_el->GetEdgeVertices(i)[0]], + v1 = verts[bdr_el->GetEdgeVertices(i)[1]]; + MFEM_ASSERT(crack_vert_duplicates.find(v0) != crack_vert_duplicates.end() && + crack_vert_duplicates.find(v1) != crack_vert_duplicates.end(), + "Unable to locate crack vertices for an interior boundary element!"); + if (crack_vert_duplicates[v0].empty() && crack_vert_duplicates[v1].empty()) + { + // This is a seam edge, so add the attached boundary element to a list. The + // check for the edge being interior to the crack is indicated by visiting more + // than once. + auto it = coarse_crack_edge_to_be.find({v0, v1}); + auto &adjacent_be = + (it == coarse_crack_edge_to_be.end()) + ? coarse_crack_edge_to_be.try_emplace({v0, v1}).first->second + : it->second; + adjacent_be.push_back(be); + } + } + } + for (auto it = coarse_crack_edge_to_be.begin(); it != coarse_crack_edge_to_be.end();) + { + // Remove all seam edges which are on the "outside" of the crack (visited only + // once). + if (it->second.size() == 1) + { + it = coarse_crack_edge_to_be.erase(it); + } + else + { + ++it; + } + } + // Static reporting variables so can persist across retries. + static int new_ne_ref = 0; + static int new_ref_its = 0; + if (!coarse_crack_edge_to_be.empty()) + { + // Locally refine the mesh using conformal refinement. If necessary, convert the + // mesh to simplices first to enable conforming refinement (this will do nothing + // if the mesh is already a simplex mesh). 
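A minimal, self-contained version of the conforming local refinement step performed in the hunk that follows, mirroring the MFEM calls used there (the marked element indices are placeholders):

#include <vector>
#include <mfem.hpp>

void RefineMarkedElements(mfem::Mesh &mesh, const std::vector<int> &marked)
{
  mfem::Array<mfem::Refinement> refinements;
  refinements.Reserve(static_cast<int>(marked.size()));
  for (int e : marked)
  {
    // X-type (bisection-style) marking adds fewer elements than full octasection.
    refinements.Append(mfem::Refinement(e, mfem::Refinement::X));
  }
  mesh.GeneralRefinement(refinements, 0);  // Second argument 0: conforming refinement.
}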
+ // Note: Eventually we can implement manual conforming face refinement of pairs of + // elements sharing a face for all element types (insert a vertex at the boundary + // element center and connect it to all other element vertices). For now, this adds + // complexity and making use of conformal simplex refinement seems good enough for + // most use cases. + int ne = orig_mesh->GetNE(); + SplitMeshElements(orig_mesh, true, false); + if (ne != orig_mesh->GetNE()) + { + face_to_be.clear(); + return 0; // Mesh was converted to simplices, start over + } + std::unordered_map elem_to_refine; + for (const auto &[edge, adjacent_be] : coarse_crack_edge_to_be) + { + for (auto be : adjacent_be) + { + int f, o, e1, e2; + orig_mesh->GetBdrElementFace(be, &f, &o); + orig_mesh->GetFaceElements(f, &e1, &e2); + MFEM_ASSERT(e1 >= 0 && e2 >= 0, + "Invalid internal boundary element connectivity!"); + elem_to_refine[e1]++; // Value-initialized to 0 at first access + elem_to_refine[e2]++; + } + } + mfem::Array refinements; + refinements.Reserve(elem_to_refine.size()); + for (const auto &[e, count] : elem_to_refine) + { + // Tetrahedral bisection (vs. default octasection) will result in fewer added + // elements at the cost of a potential minor mesh quality degradation. + refinements.Append(mfem::Refinement(e, mfem::Refinement::X)); + // refinements.Append(mfem::Refinement(e, (count > 1) ? mfem::Refinement::XY + // : mfem::Refinement::X)); + } + if (mesh::CheckElements(*orig_mesh).has_simplices) + { + // Mark tetrahedral mesh for refinement before doing local refinement. This is a + // bit of a strange pattern to override the standard conforming refinement of the + // mfem::Mesh class. We want to implement our own edge marking of the tetrahedra, + // so we move the mesh to a constructed derived class object, mark it, and then + // move assign it to the original base class object before refining. All of these + // moves should be cheap without any extra memory allocation. Also, we mark the + // mesh every time to ensure multiple rounds of refinement target the interior + // boundary (we don't care about preserving the refinement hierarchy). + constexpr bool refine = true, fix_orientation = false; + EdgeRefinementMesh ref_mesh(std::move(*orig_mesh), coarse_crack_edge_to_be); + ref_mesh.Finalize(refine, fix_orientation); + *orig_mesh = std::move(ref_mesh); + } + orig_mesh->GeneralRefinement(refinements, 0); + new_ne_ref += orig_mesh->GetNE() - ne; + new_ref_its++; + face_to_be.clear(); + return 0; // Mesh was refined (conformally), start over + } + else if (new_ne_ref > 0) + { + Mpi::Print( + "Added {:d} elements in {:d} iterations of local bisection for under-resolved " + "interior boundaries\n", + new_ne_ref, new_ref_its); + } + } + + new_nv += new_nv_dups; + new_nbe += crack_bdr_elem.size(); + if (crack_bdr_elem.size() > 0) + { + Mpi::Print("Added {:d} duplicate vertices for interior boundaries in the mesh\n", + new_nv_dups); + Mpi::Print( + "Added {:d} duplicate boundary elements for interior boundaries in the mesh\n", + crack_bdr_elem.size()); + } + } + + // Add new boundary elements at material interfaces or on the exterior boundary of the + // simulation domain, if there is not already a boundary element present. 
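In isolation, the face classification that the next hunk performs looks like this: a face with only one neighboring element lies on the exterior boundary, while a face whose two neighbors carry different attributes is a material interface (sketch, not part of the patch):

#include <mfem.hpp>

void CountMissingBdrFaces(mfem::Mesh &mesh, int &num_ext, int &num_int)
{
  num_ext = num_int = 0;
  for (int f = 0; f < mesh.GetNumFaces(); f++)
  {
    int e1, e2;
    mesh.GetFaceElements(f, &e1, &e2);
    if (e1 < 0 || e2 < 0)
    {
      num_ext++;  // Only one neighbor: exterior boundary of the domain.
    }
    else if (mesh.GetAttribute(e1) != mesh.GetAttribute(e2))
    {
      num_int++;  // Two neighbors with different material attributes: interface.
    }
  }
}

The real code additionally skips any face that already has a boundary element, using the face_to_be map.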
+ std::unordered_map new_face_bdr_elem; + if (iodata.model.add_bdr_elements) + { + int new_nbe_ext = 0, new_nbe_int = 0; + for (int f = 0; f < orig_mesh->GetNumFaces(); f++) + { + // Skip all faces which already have an associated boundary element (this includes + // any boundary elements which were duplicated during cracking in the previous step). + if (face_to_be.find(f) != face_to_be.end()) + { + continue; + } + int e1, e2; + orig_mesh->GetFaceElements(f, &e1, &e2); + if (e1 < 0 || e2 < 0) + { + if constexpr (false) + { + Mpi::Print("Adding exterior boundary element!\n"); + } + new_face_bdr_elem[f] = 1; + new_nbe_ext++; + } + else if (orig_mesh->GetAttribute(e1) != orig_mesh->GetAttribute(e2)) + { + if constexpr (false) + { + Mpi::Print("Adding material interface boundary element!\n"); + } + new_face_bdr_elem[f] = 1; + new_nbe_int++; + } + } + MFEM_VERIFY(new_nbe_ext + new_nbe_int == 0 || !orig_mesh->Nonconforming(), + "Adding material interface boundary elements is not supported for " + "nonconforming meshes!"); + + new_nbe += (new_nbe_ext + new_nbe_int); + if (new_nbe_ext > 0) + { + Mpi::Print("Added {:d} boundary elements for exterior boundaries to the mesh\n", + new_nbe_ext); + } + if (new_nbe_int > 0) + { + Mpi::Print("Added {:d} boundary elements for material interfaces to the mesh\n", + new_nbe_int); + } + } + + // Export mesh after pre-processing, before cracking boundary elements. + if (iodata.model.export_prerefined_mesh && Mpi::Root(comm)) + { + auto pos = iodata.model.mesh.find_last_of("."); + std::string meshfile = iodata.model.mesh.substr(0, pos) + "_preprocessed.mesh"; + std::ofstream fo(meshfile); + fo.precision(MSH_FLT_PRECISION); + orig_mesh->Print(fo); + } + + // Create the new mesh. We can't just add the new vertices and boundary elements to the + // original mesh, because we need to keep it around in order to transfer the high-order + // nodes information to the new mesh. + if (new_nv == orig_mesh->GetNV() && new_nbe == orig_mesh->GetNBE()) + { + return 1; // Success + } + auto new_mesh = + std::make_unique(orig_mesh->Dimension(), new_nv, orig_mesh->GetNE(), + new_nbe, orig_mesh->SpaceDimension()); + + // Copy vertices and domain and boundary elements. + for (int v = 0; v < orig_mesh->GetNV(); v++) + { + new_mesh->AddVertex(orig_mesh->GetVertex(v)); + } + for (int e = 0; e < orig_mesh->GetNE(); e++) + { + mfem::Element *el = orig_mesh->GetElement(e)->Duplicate(new_mesh.get()); + new_mesh->AddElement(el); + } + for (int be = 0; be < orig_mesh->GetNBE(); be++) + { + mfem::Element *bdr_el = orig_mesh->GetBdrElement(be)->Duplicate(new_mesh.get()); + new_mesh->AddBdrElement(bdr_el); + } + + // Add duplicated vertices from interior boundary cracking, renumber the vertices of + // domain and boundary elements to tear the mesh, and add new crack boundary elements. + if (!crack_boundary_attributes.empty() && !crack_bdr_elem.empty()) + { + // Add duplicate vertices. We assign the vertex number of the duplicated vertex in order + // to update the element connectivities in the next step. + for (auto &[orig_v, vert_components] : crack_vert_duplicates) + { + for (auto &[dup_v, component] : vert_components) + { + dup_v = new_mesh->AddVertex(orig_mesh->GetVertex(orig_v)); + } + } + + // Renumber the duplicated vertex in the domain elements. 
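The renumbering loop below consumes the per-vertex connected components computed earlier. Stripped of the MFEM tables, that step is a BFS flood fill over element-to-element adjacency with the crack faces removed (standalone sketch; the adjacency map is a stand-in for the element-to-face and face-to-element tables used above and is assumed to have an entry for every element):

#include <queue>
#include <unordered_map>
#include <unordered_set>
#include <vector>

using Components = std::vector<std::unordered_set<int>>;

Components GroupElements(const std::vector<int> &elems_at_vertex,
                         const std::unordered_map<int, std::vector<int>> &elem_neighbors)
{
  Components components;
  std::unordered_set<int> unvisited(elems_at_vertex.begin(), elems_at_vertex.end());
  while (!unvisited.empty())
  {
    auto &component = components.emplace_back();
    std::queue<int> que;
    que.push(*unvisited.begin());
    unvisited.erase(unvisited.begin());
    while (!que.empty())
    {
      int e = que.front();
      que.pop();
      component.insert(e);
      for (int nbr : elem_neighbors.at(e))  // Neighbor lists already exclude crack faces.
      {
        auto it = unvisited.find(nbr);
        if (it != unvisited.end())
        {
          que.push(nbr);
          unvisited.erase(it);
        }
      }
    }
  }
  return components;
}

Every component beyond the first receives its own duplicate of the vertex, which is exactly what the crack_vert_duplicates bookkeeping records.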
+ for (const auto &[orig_v, vert_components] : crack_vert_duplicates) + { + if (vert_components.empty()) + { + continue; // Can skip vertices which were not duplicated + } + const int *elems = vert_to_elem->GetRow(orig_v); + for (int i = 0; i < vert_to_elem->RowSize(orig_v); i++) + { + // Find vertex in the element. + const auto e = elems[i]; + mfem::Element *el = new_mesh->GetElement(e); + int *verts = el->GetVertices(), j; + for (j = 0; j < el->GetNVertices(); j++) + { + if (verts[j] == orig_v) + { + break; + } + } + MFEM_VERIFY(j < el->GetNVertices(), "Unable to locate vertex in element!"); + + // Find the correct duplicate for this vertex. It's OK if the element is not in + // any of the connected components, this indicates that it keeps the original + // vertex and its connectivity is unmodified. + for (const auto &[dup_v, component] : vert_components) + { + if (component.find(e) != component.end()) + { + verts[j] = dup_v; + break; + } + } + } + } + + // Finally, we insert the new duplicate boundary elements for the crack interface and + // also renumber the original boundary elements. To renumber the original boundary + // elements in the mesh, we use the updated vertex connectivity from the torn elements + // in the new mesh (done above). + const mfem::Table &elem_to_face = orig_mesh->ElementToFaceTable(); + for (int be = 0; be < orig_mesh->GetNBE(); be++) + { + // Whether on the crack or not, we renumber the boundary element vertices as needed + // based on the neighboring element. For non-crack boundary elements, both + // neighboring elements must be part of the same connected component. First we find + // the index of the face in the old element, which should match the new element. + int f, o, e1, e2; + orig_mesh->GetBdrElementFace(be, &f, &o); + orig_mesh->GetFaceElements(f, &e1, &e2); + MFEM_VERIFY(e1 >= 0, "Boundary element with no attached elements!"); + const int *faces = elem_to_face.GetRow(e1); + int i; + for (i = 0; i < elem_to_face.RowSize(e1); i++) + { + if (faces[i] == f) + { + break; + } + } + MFEM_VERIFY(i < elem_to_face.RowSize(e1), "Unable to locate face in element!"); + + // Update the boundary element vertices. + mfem::Element *bdr_el = new_mesh->GetBdrElement(be); + const mfem::Element *el = new_mesh->GetElement(e1); + for (int j = 0; j < bdr_el->GetNVertices(); j++) + { + bdr_el->GetVertices()[j] = el->GetVertices()[el->GetFaceVertices(i)[j]]; + } + + // Add the duplicate boundary element for boundary elements on the crack. + if (crack_bdr_elem.find(be) != crack_bdr_elem.end()) + { + faces = elem_to_face.GetRow(e2); + for (i = 0; i < elem_to_face.RowSize(e2); i++) + { + if (faces[i] == f) + { + break; + } + } + MFEM_VERIFY(i < elem_to_face.RowSize(e2), "Unable to locate face in element!"); + + // Add the interface boundary element attached to element 2 (the other part of the + // pair has been attached to element 1 in the previous step). + bdr_el = bdr_el->Duplicate(new_mesh.get()); + el = new_mesh->GetElement(e2); + for (int j = 0; j < bdr_el->GetNVertices(); j++) + { + bdr_el->GetVertices()[j] = el->GetVertices()[el->GetFaceVertices(i)[j]]; + } + new_mesh->AddBdrElement(bdr_el); + } + } + } + + // Add new boundary elements. + if (iodata.model.add_bdr_elements && !new_face_bdr_elem.empty()) + { + // Some (1-based) boundary attributes may be empty since they were removed from the + // original mesh, but to keep attributes the same as config file we don't compress the + // list. 
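The hunk below assigns each newly added boundary element a unique attribute derived from the attributes of its neighboring domain elements via a shifted Cantor pairing. A worked sketch of that map (attribute values illustrative):

#include <cassert>

long long PairAttribute(int bdr_attr_max, int a, int b)
{
  // a, b are the (1-based) domain attributes on either side; b == 0 for exterior faces.
  return bdr_attr_max + ((static_cast<long long>(a + b) * (a + b + 1)) / 2) + a;
}

int main()
{
  // With bdr_attr_max = 10: (a, b) = (2, 1) gives 10 + (3 * 4) / 2 + 2 = 18.
  assert(PairAttribute(10, 2, 1) == 18);
  // Exterior face attached to attribute 3: (a, b) = (3, 0) gives 10 + 6 + 3 = 19.
  assert(PairAttribute(10, 3, 0) == 19);
  return 0;
}

Because a is taken as the larger of the two attributes in the code below, swapping e1 and e2 yields the same value, and the result is always strictly greater than bdr_attr_max.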
+ const mfem::Table &elem_to_face = orig_mesh->ElementToFaceTable(); + int bdr_attr_max = + orig_mesh->bdr_attributes.Size() ? orig_mesh->bdr_attributes.Max() : 0; + for (int f = 0; f < orig_mesh->GetNumFaces(); f++) + { + if (new_face_bdr_elem[f] > 0) + { + // Assign new unique attribute based on attached elements. Save so that the + // attributes of e1 and e2 can be easily referenced using the new attribute. Since + // attributes are in 1-based indexing, a, b > 0. See also + // https://en.wikipedia.org/wiki/Pairing_function. + int e1, e2, a = 0, b = 0; + orig_mesh->GetFaceElements(f, &e1, &e2); + if (e1 >= 0 && e2 >= 0) + { + a = std::max(orig_mesh->GetAttribute(e1), orig_mesh->GetAttribute(e2)); + b = (a == orig_mesh->GetAttribute(e1)) ? orig_mesh->GetAttribute(e2) + : orig_mesh->GetAttribute(e1); + } + else // e1 >= 0 + { + a = orig_mesh->GetAttribute(e1); + b = 0; + } + MFEM_VERIFY(a + b > 0, "Invalid new boundary element attribute!"); + long long int l_new_attr = + bdr_attr_max + (((a + b) * (long long int)(a + b + 1)) / 2) + a; + int new_attr = mfem::internal::to_int(l_new_attr); // At least bdr_attr_max + 1 + + // Add the boundary elements with the new boundary attribute. The element vertices + // may have been renumbered in the new mesh, so the new face is not necessarily + // just a duplicate of the old one. First we find the index of the face in the old + // element, which should match the new element. + const int *faces = elem_to_face.GetRow(e1); + int i; + for (i = 0; i < elem_to_face.RowSize(e1); i++) + { + if (faces[i] == f) + { + break; + } + } + MFEM_VERIFY(i < elem_to_face.RowSize(e1), "Unable to locate face in element!"); + + // Now add the boundary element(s). + mfem::Element *bdr_el = orig_mesh->GetFace(f)->Duplicate(new_mesh.get()); + bdr_el->SetAttribute(new_attr); + const mfem::Element *el = new_mesh->GetElement(e1); + for (int j = 0; j < bdr_el->GetNVertices(); j++) + { + bdr_el->GetVertices()[j] = el->GetVertices()[el->GetFaceVertices(i)[j]]; + } + new_mesh->AddBdrElement(bdr_el); + if constexpr (false) + { + Mpi::Print( + "Adding boundary element with attribute {:d} from elements {:d} and {:d}\n", + new_attr, a, b); + } + if (new_face_bdr_elem[f] > 1) + { + // Flip order of vertices to reverse normal direction of the second added element. + bdr_el = bdr_el->Duplicate(new_mesh.get()); + std::reverse(bdr_el->GetVertices(), + bdr_el->GetVertices() + bdr_el->GetNVertices()); + new_mesh->AddBdrElement(bdr_el); + if constexpr (false) + { + Mpi::Print("Adding second boundary element with attribute {:d} from elements " + "{:d} and {:d}\n", + new_attr, a, b); + } + } + } + } + } + + // Finalize the new mesh topology, and copy mesh curvature information if needed. This + // copies the nodes over correctly accounting for the element topology changes (the number + // of elements in the mesh has not changed, just their connectivity has). + constexpr bool generate_bdr = false; + new_mesh->FinalizeTopology(generate_bdr); + if (orig_mesh->GetNodes()) + { + TransferHighOrderNodes(*orig_mesh, *new_mesh); + } + + // If we have added cracks for interior boundary elements, apply a very very small + // perturbation to separate the duplicated boundary elements on either side and prevent + // them from lying exactly on top of each other. This is mostly just for visualization + // and can be increased in magnitude for debugging. 
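The perturbation code that follows addresses the displacement vector through a small index map covering both MFEM orderings: byVDIM keeps the (x, y, z) components of a node contiguous, while byNODES stores all x components, then all y, then all z. A sketch of the same map with a worked example (values illustrative):

inline int Index(bool by_vdim, int sdim, int nv, int v, int d)
{
  // Component d of node v: sdim * v + d for byVDIM, d * nv + v for byNODES.
  return by_vdim ? sdim * v + d : d * nv + v;
}
// With sdim = 3 and nv = 100, the y-component (d = 1) of node v = 5 sits at
// index 3 * 5 + 1 = 16 for byVDIM and at 1 * 100 + 5 = 105 for byNODES.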
+ if (!crack_boundary_attributes.empty() && !crack_bdr_elem.empty() && + iodata.model.crack_displ_factor > 0.0) + { + // mfem::Mesh::MoveNodes expects byNODES ordering when using vertices. + mfem::GridFunction *nodes = new_mesh->GetNodes(); + mfem::Ordering::Type ordering = + nodes ? nodes->FESpace()->GetOrdering() : mfem::Ordering::byNODES; + int sdim = new_mesh->SpaceDimension(); + int nv = nodes ? nodes->Size() / sdim : new_mesh->GetNV(); + auto Index = [ordering, sdim, nv](int v, int d) + { return (ordering == mfem::Ordering::byVDIM) ? sdim * v + d : d * nv + v; }; + mfem::Vector normal(sdim); + mfem::IsoparametricTransformation T; + mfem::Array dofs; + + // Compute the displacement as the average normal of the attached boundary elements. + mfem::Vector displacements(nv * sdim); + displacements = 0.0; + double h_min = mfem::infinity(); + const mfem::Table &elem_to_face = orig_mesh->ElementToFaceTable(); + const mfem::Table &new_elem_to_face = new_mesh->ElementToFaceTable(); + for (auto be : crack_bdr_elem) + { + // Get the neighboring elements (same indices in the old and new mesh). + int f, o, e1, e2; + orig_mesh->GetBdrElementFace(be, &f, &o); + orig_mesh->GetFaceElements(f, &e1, &e2); + + // Perturb both new boundary elements in opposite directions. + for (auto e : {e1, e2}) + { + // Find the index of the face in the old element, which matches the new element, so + // we can get the list of all vertices or nodes to perturb. + const int *faces = elem_to_face.GetRow(e); + int i; + for (i = 0; i < elem_to_face.RowSize(e); i++) + { + if (faces[i] == f) + { + break; + } + } + MFEM_VERIFY(i < elem_to_face.RowSize(e), "Unable to locate face in element!"); + + // Compute the element normal, oriented to point outward from element 1 initially. + int new_f = new_elem_to_face.GetRow(e)[i]; + if (e == e1) + { + new_mesh->GetFaceTransformation(new_f, &T); + const mfem::IntegrationPoint &ip = + mfem::Geometries.GetCenter(T.GetGeometryType()); + T.SetIntPoint(&ip); + mfem::CalcOrtho(T.Jacobian(), normal); + double s = normal.Norml2(); + h_min = std::min(h_min, std::sqrt(s)); + normal /= -s; // We could also area-weight the average normal + } + else // e == e2 + { + normal *= -1.0; + } + + // For all "nodes" associated with this crack face, update the direction of their + // displacements. + auto NodeUpdate = [&](int v) + { + for (int d = 0; d < sdim; d++) + { + const int idx = Index(v, d); + displacements(idx) += normal(d); + } + }; + if (nodes) + { + nodes->FESpace()->GetFaceDofs(new_f, dofs); + for (int j = 0; j < dofs.Size(); j++) + { + NodeUpdate(dofs[j]); + } + } + else + { + const mfem::Element *el = new_mesh->GetElement(e); + for (int j = 0; j < el->GetNFaceVertices(i); j++) + { + NodeUpdate(el->GetVertices()[el->GetFaceVertices(i)[j]]); + } + } + } + } + for (int v = 0; v < nv; v++) + { + double s = 0.0; + for (int d = 0; d < sdim; d++) + { + const int idx = Index(v, d); + s += displacements(idx) * displacements(idx); + } + if (s > 0.0) + { + s = std::sqrt(s); + for (int d = 0; d < sdim; d++) + { + const int idx = Index(v, d); + displacements(idx) /= s; + } + } + } + + // Scale and apply the displacements. We don't need to do anything special to constrain + // the displacements at seam vertices (and associated high-order nodes on seam edges) to + // to zero, because the normals from both sides will average out to zero. + displacements *= (iodata.model.crack_displ_factor * h_min / + (nodes ? 
nodes->FESpace()->GetMaxElementOrder() : 1)); + new_mesh->MoveNodes(displacements); + } + + orig_mesh = std::move(new_mesh); + return 1; // Success +} + +std::unique_ptr GetMeshPartitioning(const mfem::Mesh &mesh, int size, + const std::string &part_file, bool print) +{ + MFEM_VERIFY(size <= mesh.GetNE(), "Mesh partitioning must have parts <= mesh elements (" + << size << " vs. " << mesh.GetNE() << ")!"); + if (part_file.length() == 0) + { + const int part_method = 1; + std::unique_ptr partitioning( + const_cast(mesh).GeneratePartitioning(size, part_method)); + if (print) + { + Mpi::Print("Finished partitioning mesh into {:d} subdomain{}\n", size, + (size > 1) ? "s" : ""); + } + return partitioning; + } + // User can optionally specify a mesh partitioning file as generated from the MFEM + // mesh-explorer miniapp, for example. It has the format: + // + // number_of_elements + // number_of_processors + // + // ... + // + // + int ne, np; + std::ifstream part_ifs(part_file); + part_ifs.ignore(std::numeric_limits::max(), ' '); + part_ifs >> ne; + if (ne != mesh.GetNE()) + { + MFEM_ABORT("Invalid partitioning file (number of elements)!"); + } + part_ifs.ignore(std::numeric_limits::max(), ' '); + part_ifs >> np; + if (np != size) + { + MFEM_ABORT("Invalid partitioning file (number of processors)!"); + } + auto partitioning = std::make_unique(mesh.GetNE()); + int i = 0; + while (i < mesh.GetNE()) + { + part_ifs >> partitioning[i++]; + } + if (print) + { + Mpi::Print("Read mesh partitioning into {:d} subdomain{} from disk\n", size, + (size > 1) ? "s" : ""); + } + return partitioning; +} + +std::unique_ptr DistributeMesh(MPI_Comm comm, + std::unique_ptr &smesh, + const int *partitioning, + const std::string &output_dir) +{ + // Take a serial mesh and partitioning on the root process and construct the global + // parallel mesh. For now, prefer the MPI-based version to the file IO one. When + // constructing the ParMesh, we mark for refinement since refinement flags are not copied + // from the serial mesh. Beware that mfem::ParMesh constructor argument order is not the + // same as mfem::Mesh! Each processor's component gets sent as a byte string. + constexpr bool generate_edges = false, refine = true, fix_orientation = false; + std::unique_ptr pmesh; + if (Mpi::Root(comm)) + { + mfem::MeshPartitioner partitioner(*smesh, Mpi::Size(comm), + const_cast(partitioning)); + std::vector send_requests(Mpi::Size(comm) - 1, MPI_REQUEST_NULL); + std::vector so; + so.reserve(Mpi::Size(comm)); + for (int i = 0; i < Mpi::Size(comm); i++) + { + mfem::MeshPart part; + partitioner.ExtractPart(i, part); + std::ostringstream fo(std::stringstream::out); + // fo << std::fixed; + fo << std::scientific; + fo.precision(MSH_FLT_PRECISION); + part.Print(fo); + so.push_back(fo.str()); + // so.push_back((i > 0) ? 
zlib::CompressString(fo.str()) : fo.str()); + if (i > 0) + { + int slen = static_cast(so[i].length()); + MFEM_VERIFY(so[i].length() == (std::size_t)slen, + "Overflow error distributing parallel mesh!"); + MPI_Isend(so[i].data(), slen, MPI_CHAR, i, i, comm, &send_requests[i - 1]); + } + } + smesh.reset(); + std::istringstream fi(so[0]); // This is never compressed + pmesh = + std::make_unique(comm, fi, refine, generate_edges, fix_orientation); + MPI_Waitall(static_cast(send_requests.size()), send_requests.data(), + MPI_STATUSES_IGNORE); + } + else + { + MPI_Status status; + int rlen; + std::string si; + MPI_Probe(0, Mpi::Rank(comm), comm, &status); + MPI_Get_count(&status, MPI_CHAR, &rlen); + si.resize(rlen); + MPI_Recv(si.data(), rlen, MPI_CHAR, 0, Mpi::Rank(comm), comm, MPI_STATUS_IGNORE); + std::istringstream fi(si); + // std::istringstream fi(zlib::DecompressString(si)); + pmesh = + std::make_unique(comm, fi, refine, generate_edges, fix_orientation); + } + return pmesh; +} + +void RebalanceConformalMesh(std::unique_ptr &pmesh) +{ + // Write the parallel mesh to a stream as a serial mesh, then read back in and partition + // using METIS. + MPI_Comm comm = pmesh->GetComm(); + constexpr bool generate_edges = false, generate_bdr = false, refine = true, + fix_orientation = false; + std::unique_ptr smesh; + if constexpr (false) + { + // Write the serial mesh to a stream and read that through the Mesh constructor. + std::ostringstream fo(std::stringstream::out); + // fo << std::fixed; + fo << std::scientific; + fo.precision(MSH_FLT_PRECISION); + pmesh->PrintAsSerial(fo); + if (Mpi::Root(comm)) + { + smesh = std::make_unique(fo, generate_edges, refine, fix_orientation); + } + } + else + { + // Directly ingest the generated Mesh and release the no longer needed memory. + smesh = std::make_unique(pmesh->GetSerialMesh(0)); + if (Mpi::Root(comm)) + { + smesh->FinalizeTopology(generate_bdr); + smesh->Finalize(refine, fix_orientation); + } + else + { + smesh.reset(); + } + } + + // (Re)-construct the parallel mesh. + std::unique_ptr partitioning; + if (Mpi::Root(comm)) + { + partitioning = GetMeshPartitioning(*smesh, Mpi::Size(comm), "", false); + } + pmesh = DistributeMesh(comm, smesh, partitioning.get()); +} + +} // namespace + +} // namespace palace diff --git a/palace/utils/geodata.hpp b/palace/utils/geodata.hpp index 2b94963b9c..50fccb13ee 100644 --- a/palace/utils/geodata.hpp +++ b/palace/utils/geodata.hpp @@ -1,149 +1,248 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_UTILS_GEODATA_HPP -#define PALACE_UTILS_GEODATA_HPP - -#include -#include -#include -#include -#include - -#ifndef M_PI -#define M_PI (3.14159265358979323846) -#endif - -namespace mfem -{ - -template -class Array; -class Mesh; -class ParMesh; -class Vector; - -} // namespace mfem - -namespace palace -{ - -class IoData; - -namespace mesh -{ -// -// Functions for mesh related functionality. -// - -// Read and partition a serial mesh from file, returning a pointer to the new parallel mesh -// object, which should be destroyed by the user. -std::unique_ptr ReadMesh(MPI_Comm comm, const IoData &iodata, bool reorder, - bool clean, bool add_bdr, bool unassembled); - -// Refine the provided mesh according to the data in the input file. If levels of refinement -// are requested, the refined meshes are stored in order of increased refinement. Ownership -// of the initial coarse mesh is inherited by the fine meshes and it should not be deleted. 
-// The fine mesh hierarchy is owned by the user. -void RefineMesh(const IoData &iodata, std::vector> &mesh); - -// Dimensionalize a mesh for use in exporting a mesh. Scales vertices and nodes by L. -void DimensionalizeMesh(mfem::Mesh &mesh, double L); - -// Nondimensionalize a mesh for use in the solver. Scales vertices and nodes by 1/L. -void NondimensionalizeMesh(mfem::Mesh &mesh, double L); - -// Helper function to convert a set of attribute numbers to a marker array. The marker array -// will be of size max_attr and it will contain only zeroes and ones. Ones indicate which -// attribute numbers are present in the attrs array. In the special case when attrs has a -// single entry equal to -1 the marker array will contain all ones. -void AttrToMarker(int max_attr, const mfem::Array &attrs, mfem::Array &marker); -void AttrToMarker(int max_attr, const std::vector &attrs, mfem::Array &marker); - -// Helper function to construct the bounding box for all elements with the given attribute. -void GetAxisAlignedBoundingBox(mfem::ParMesh &mesh, int attr, bool bdr, mfem::Vector &min, - mfem::Vector &max); -void GetAxisAlignedBoundingBox(mfem::ParMesh &mesh, const mfem::Array &marker, - bool bdr, mfem::Vector &min, mfem::Vector &max); - -// Struct describing a bounding box in terms of the center and face normals. The normals -// specify the direction from the center of the box. -struct BoundingBox -{ - // The central point of the bounding box. - std::array center; - - // Vectors from center to the midpoint of each face. - std::array, 3> normals; - - // Whether or not this bounding box is two dimensional. - bool planar; - - // Compute the area of the bounding box spanned by the first two normals. - double Area() const; - - // Compute the volume of a 3D bounding box. Returns zero if planar. - double Volume() const; - - // Compute the lengths of each axis. - std::array Lengths() const; - - // Compute the deviation in degrees of a vector from each of the normal directions. - std::array Deviation(const std::array &direction) const; -}; - -// Struct describing a bounding ball in terms of a center and radius. If a ball is two -// dimensional, additionally provides a normal to the plane. -struct BoundingBall -{ - // The centroid of the ball. - std::array center; - - // The radius of the ball from the center. - double radius; - - // If the ball is two dimensional, the normal defining the planar surface. Zero magnitude - // if a sphere. - std::array planar_normal; - - // Whether or not this bounding ball is two dimensional. - bool planar; - - // Compute the area of the bounding box spanned by the first two normals. - double Area() const { return M_PI * std::pow(radius, 2.0); } - - // Compute the volume of a 3D bounding box. Returns zero if planar. - double Volume() const { return planar ? 0.0 : (4 * M_PI / 3) * std::pow(radius, 3.0); } -}; - -// Helper functions for computing bounding boxes from a mesh and markers. -BoundingBox GetBoundingBox(mfem::ParMesh &mesh, const mfem::Array &marker, bool bdr); -BoundingBox GetBoundingBox(mfem::ParMesh &mesh, int attr, bool bdr); - -// Helper function for computing the direction aligned length of a marked group. -double GetProjectedLength(mfem::ParMesh &mesh, const mfem::Array &marker, bool bdr, - const std::array &dir); -double GetProjectedLength(mfem::ParMesh &mesh, int attr, bool bdr, - const std::array &dir); - -// Given a mesh and a marker, compute the diameter of a bounding circle/sphere, assuming -// that the extrema points are in the marked group. 
-BoundingBall GetBoundingBall(mfem::ParMesh &mesh, const mfem::Array &marker, bool bdr); -BoundingBall GetBoundingBall(mfem::ParMesh &mesh, int attr, bool bdr); - -// Helper function to compute the average surface normal for all elements with the given -// attribute. -void GetSurfaceNormal(mfem::ParMesh &mesh, int attr, mfem::Vector &normal); -void GetSurfaceNormal(mfem::ParMesh &mesh, const mfem::Array &marker, - mfem::Vector &normal); - -// Helper function responsible for rebalancing the mesh, and optionally writing meshes from -// the intermediate stages to disk. Returns the imbalance ratio before rebalancing. -double RebalanceMesh(const IoData &iodata, std::unique_ptr &mesh, - double tol); - -} // namespace mesh - -} // namespace palace - -#endif // PALACE_UTILS_GEODATA_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_UTILS_GEODATA_HPP +#define PALACE_UTILS_GEODATA_HPP + +#include +#include +#include +#include +#include + +namespace palace +{ + +class IoData; + +namespace mesh +{ + +// +// Functions for mesh related functionality. +// + +// Read and partition a serial mesh from file, returning a pointer to the new parallel mesh +// object, which should be destroyed by the user. +std::unique_ptr ReadMesh(IoData &iodata, MPI_Comm comm); + +// Refine the provided mesh according to the data in the input file. If levels of refinement +// are requested, the refined meshes are stored in order of increased refinement. Ownership +// of the initial coarse mesh is inherited by the fine meshes and it should not be deleted. +// The fine mesh hierarchy is owned by the user. +void RefineMesh(const IoData &iodata, std::vector> &mesh); + +// Dimensionalize a mesh for use in exporting a mesh. Scales vertices and nodes by L. +void DimensionalizeMesh(mfem::Mesh &mesh, double L); + +// Nondimensionalize a mesh for use in the solver. Scales vertices and nodes by 1/L. +void NondimensionalizeMesh(mfem::Mesh &mesh, double L); + +// Struct containing flags for the (global) mesh element types. +struct ElementTypeInfo +{ + bool has_simplices; + bool has_hexahedra; + bool has_prisms; + bool has_pyramids; + std::vector GetGeomTypes() const; +}; + +// Simplified helper for describing the element types in a (Par)Mesh. +ElementTypeInfo CheckElements(const mfem::Mesh &mesh); + +// Check if a tetrahedral (Par)Mesh is ready for local refinement. +bool CheckRefinementFlags(const mfem::Mesh &mesh); + +// Helper function to convert a set of attribute numbers to a marker array. The marker array +// will be of size max_attr and it will contain only zeroes and ones. Ones indicate which +// attribute numbers are present in the list array. In the special case when list has a +// single entry equal to -1 the marker array will contain all ones. +void AttrToMarker(int max_attr, const int *attr_list, int attr_list_size, + mfem::Array &marker, bool skip_invalid = false); + +template +inline void AttrToMarker(int max_attr, const T &attr_list, mfem::Array &marker, + bool skip_invalid = false) +{ + const auto size = std::distance(attr_list.begin(), attr_list.end()); + AttrToMarker(max_attr, (size > 0) ? 
&attr_list[0] : nullptr, size, marker, skip_invalid); +} + +template +inline mfem::Array AttrToMarker(int max_attr, const T &attr_list, + bool skip_invalid = false) +{ + mfem::Array marker; + AttrToMarker(max_attr, attr_list, marker, skip_invalid); + return marker; +} + +// Helper function to construct the axis-aligned bounding box for all elements with the +// given attribute. +void GetAxisAlignedBoundingBox(const mfem::ParMesh &mesh, const mfem::Array &marker, + bool bdr, mfem::Vector &min, mfem::Vector &max); + +inline void GetAxisAlignedBoundingBox(const mfem::ParMesh &mesh, int attr, bool bdr, + mfem::Vector &min, mfem::Vector &max) +{ + mfem::Array marker(bdr ? mesh.bdr_attributes.Max() : mesh.attributes.Max()); + marker = 0; + marker[attr - 1] = 1; + GetAxisAlignedBoundingBox(mesh, marker, bdr, min, max); +} + +inline void GetAxisAlignedBoundingBox(const mfem::ParMesh &mesh, mfem::Vector &min, + mfem::Vector &max) +{ + mfem::Array marker(mesh.attributes.Max()); + marker = 1; + GetAxisAlignedBoundingBox(mesh, marker, false, min, max); +} + +// Struct describing a bounding box in terms of the center and face normals. The normals +// specify the direction from the center of the box. +struct BoundingBox +{ + // The central point of the bounding box. + std::array center; + + // Vectors from center to the midpoint of each face. + std::array, 3> axes; + + // Whether or not this bounding box is two dimensional. + bool planar; + + // Compute the area of the bounding box spanned by the first two normals. + double Area() const; + + // Compute the volume of the 3D bounding box. Returns zero if planar. + double Volume() const; + + // Compute the normalized axes of the bounding box. + std::array, 3> Normals() const; + + // Compute the lengths along each axis. + std::array Lengths() const; + + // Compute the deviations in degrees of a vector from each of the axis directions. Angles + // are returned in the interval [0, 180]. + std::array Deviations(const std::array &direction) const; +}; + +// Helper functions for computing bounding boxes from a mesh and markers. These do not need +// to be axis-aligned. Note: This function only returns a minimum oriented bounding box for +// points whose convex hull exactly forms a rectangle or rectangular prism, implementing a +// vastly simplified version of QuickHull for this case. For other shapes, the result is +// less predictable, and may not even form a bounding box of the sampled point cloud. +BoundingBox GetBoundingBox(const mfem::ParMesh &mesh, const mfem::Array &marker, + bool bdr); + +inline BoundingBox GetBoundingBox(const mfem::ParMesh &mesh, int attr, bool bdr) +{ + mfem::Array marker(bdr ? mesh.bdr_attributes.Max() : mesh.attributes.Max()); + marker = 0; + marker[attr - 1] = 1; + return GetBoundingBox(mesh, marker, bdr); +} + +// Given a mesh and a marker, compute the bounding circle/sphere of the marked elements. In +// this case the normals of the bounding box object are arbitrary, and the Area and Volume +// members should not be used, but the Lengths function returns the ball diameter. This +// function implements Welzl's algorithm. +BoundingBox GetBoundingBall(const mfem::ParMesh &mesh, const mfem::Array &marker, + bool bdr); + +inline BoundingBox GetBoundingBall(const mfem::ParMesh &mesh, int attr, bool bdr) +{ + mfem::Array marker(bdr ? 
mesh.bdr_attributes.Max() : mesh.attributes.Max()); + marker = 0; + marker[attr - 1] = 1; + return GetBoundingBall(mesh, marker, bdr); +} + +// Helper function for computing the direction aligned length of a marked group. +double GetProjectedLength(const mfem::ParMesh &mesh, const mfem::Array &marker, + bool bdr, const std::array &dir); + +inline double GetProjectedLength(const mfem::ParMesh &mesh, int attr, bool bdr, + const std::array &dir) +{ + mfem::Array marker(bdr ? mesh.bdr_attributes.Max() : mesh.attributes.Max()); + marker = 0; + marker[attr - 1] = 1; + return GetProjectedLength(mesh, marker, bdr, dir); +} + +// Helper function for computing the closest distance of a marked group to a given point, +// by brute force searching over the entire point set. Optionally compute the furthest +// distance instead of the closest. +double GetDistanceFromPoint(const mfem::ParMesh &mesh, const mfem::Array &marker, + bool bdr, const std::array &origin, + bool max = false); + +inline double GetDistanceFromPoint(const mfem::ParMesh &mesh, int attr, bool bdr, + const std::array &dir, bool max = false) +{ + mfem::Array marker(bdr ? mesh.bdr_attributes.Max() : mesh.attributes.Max()); + marker = 0; + marker[attr - 1] = 1; + return GetDistanceFromPoint(mesh, marker, bdr, dir, max); +} + +// Helper function to compute the average surface normal for all elements with the given +// attributes. +mfem::Vector GetSurfaceNormal(const mfem::ParMesh &mesh, const mfem::Array &marker, + bool average = true); + +inline mfem::Vector GetSurfaceNormal(const mfem::ParMesh &mesh, int attr, + bool average = true) +{ + const bool bdr = (mesh.Dimension() == mesh.SpaceDimension()); + mfem::Array marker(bdr ? mesh.bdr_attributes.Max() : mesh.attributes.Max()); + marker = 0; + marker[attr - 1] = 1; + return GetSurfaceNormal(mesh, marker, average); +} + +inline mfem::Vector GetSurfaceNormal(const mfem::ParMesh &mesh, bool average = true) +{ + const bool bdr = (mesh.Dimension() == mesh.SpaceDimension()); + const auto &attributes = bdr ? mesh.bdr_attributes : mesh.attributes; + return GetSurfaceNormal(mesh, AttrToMarker(attributes.Max(), attributes), average); +} + +// Helper functions to compute the volume or area for all domain or boundary elements with +// the given attributes. +double GetSurfaceArea(const mfem::ParMesh &mesh, const mfem::Array &marker); + +inline double GetSurfaceArea(const mfem::ParMesh &mesh, int attr) +{ + mfem::Array marker(mesh.bdr_attributes.Max()); + marker = 0; + marker[attr - 1] = 1; + return GetSurfaceArea(mesh, marker); +} + +double GetVolume(const mfem::ParMesh &mesh, const mfem::Array &marker); + +inline double GetVolume(const mfem::ParMesh &mesh, int attr) +{ + mfem::Array marker(mesh.attributes.Max()); + marker = 0; + marker[attr - 1] = 1; + return GetVolume(mesh, marker); +} + +// Helper function responsible for rebalancing the mesh, and optionally writing meshes from +// the intermediate stages to disk. Returns the imbalance ratio before rebalancing. +double RebalanceMesh(const IoData &iodata, std::unique_ptr &mesh); + +// Helper for creating a hexahedral mesh from a tetrahedral mesh. +mfem::Mesh MeshTetToHex(const mfem::Mesh &orig_mesh); + +} // namespace mesh + +} // namespace palace + +#endif // PALACE_UTILS_GEODATA_HPP diff --git a/palace/utils/geodata_impl.cpp b/palace/utils/geodata_impl.cpp new file mode 100644 index 0000000000..fc3afe6684 --- /dev/null +++ b/palace/utils/geodata_impl.cpp @@ -0,0 +1,958 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#include "geodata.hpp" + +#include "geodata_impl.hpp" + +#include +#include +#include +#include +#include +#include "utils/communication.hpp" +#include "utils/omp.hpp" + +namespace palace::mesh +{ + +using Vector3dMap = Eigen::Map; +using CVector3dMap = Eigen::Map; + +// Compute a lexicographic comparison of Eigen Vector3d. +bool EigenLE(const Eigen::Vector3d &x, const Eigen::Vector3d &y) +{ + return std::lexicographical_compare(x.begin(), x.end(), y.begin(), y.end()); +} + +// Helper for collecting a point cloud from a mesh, used in calculating bounding boxes and +// bounding balls. Returns the dominant rank, for which the vertices argument will be +// filled, while all other ranks will have an empty vector. Vertices are de-duplicated to a +// certain floating point precision. +int CollectPointCloudOnRoot(const mfem::ParMesh &mesh, const mfem::Array &marker, + bool bdr, std::vector &vertices) +{ + if (!mesh.GetNodes()) + { + // Linear mesh, work with element vertices directly. + PalacePragmaOmp(parallel) + { + std::unordered_set vertex_indices; + if (bdr) + { + PalacePragmaOmp(for schedule(static)) + for (int i = 0; i < mesh.GetNBE(); i++) + { + if (!marker[mesh.GetBdrAttribute(i) - 1]) + { + continue; + } + const int *verts = mesh.GetBdrElement(i)->GetVertices(); + vertex_indices.insert(verts, verts + mesh.GetBdrElement(i)->GetNVertices()); + } + } + else + { + PalacePragmaOmp(for schedule(static)) + for (int i = 0; i < mesh.GetNE(); i++) + { + if (!marker[mesh.GetAttribute(i) - 1]) + { + continue; + } + const int *verts = mesh.GetElement(i)->GetVertices(); + vertex_indices.insert(verts, verts + mesh.GetElement(i)->GetNVertices()); + } + } + PalacePragmaOmp(critical(PointCloud)) + { + for (auto i : vertex_indices) + { + const auto &vx = mesh.GetVertex(i); + vertices.emplace_back(vx[0], vx[1], vx[2]); + } + } + } + } + else + { + // Curved mesh, need to process point matrices. + const int ref = mesh.GetNodes()->FESpace()->GetMaxElementOrder(); + auto AddPoints = [&](mfem::GeometryRefiner &refiner, mfem::ElementTransformation &T, + mfem::DenseMatrix &pointmat, + std::vector &loc_vertices) + { + mfem::RefinedGeometry *RefG = refiner.Refine(T.GetGeometryType(), ref); + T.Transform(RefG->RefPts, pointmat); + for (int j = 0; j < pointmat.Width(); j++) + { + loc_vertices.emplace_back(pointmat(0, j), pointmat(1, j), pointmat(2, j)); + } + }; + PalacePragmaOmp(parallel) + { + mfem::GeometryRefiner refiner; + mfem::IsoparametricTransformation T; + mfem::DenseMatrix pointmat; // 3 x N + std::vector loc_vertices; + if (bdr) + { + PalacePragmaOmp(for schedule(static)) + for (int i = 0; i < mesh.GetNBE(); i++) + { + if (!marker[mesh.GetBdrAttribute(i) - 1]) + { + continue; + } + mesh.GetBdrElementTransformation(i, &T); + AddPoints(refiner, T, pointmat, loc_vertices); + } + } + else + { + PalacePragmaOmp(for schedule(static)) + for (int i = 0; i < mesh.GetNE(); i++) + { + if (!marker[mesh.GetAttribute(i) - 1]) + { + continue; + } + mesh.GetElementTransformation(i, &T); + AddPoints(refiner, T, pointmat, loc_vertices); + } + } + PalacePragmaOmp(critical(PointCloud)) + { + for (const auto &v : loc_vertices) + { + vertices.push_back(v); + } + } + } + } + + // dominant_rank will perform the calculation. 
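The gather below first needs every rank to agree on which process holds the most points. The Mpi::GlobalMaxLoc call presumably wraps a max-loc reduction; in raw MPI the same step looks like this (sketch, not part of the patch):

#include <mpi.h>

int DominantRank(MPI_Comm comm, int local_count)
{
  int rank;
  MPI_Comm_rank(comm, &rank);
  struct { int value; int rank; } in = {local_count, rank}, out;
  MPI_Allreduce(&in, &out, 1, MPI_2INT, MPI_MAXLOC, comm);
  return out.rank;  // Every rank learns which process holds the largest count.
}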
+ MPI_Comm comm = mesh.GetComm(); + const auto num_vertices = int(vertices.size()); + const int dominant_rank = [&]() + { + int vert = num_vertices, rank = Mpi::Rank(comm); + Mpi::GlobalMaxLoc(1, &vert, &rank, comm); + return rank; + }(); + std::vector recv_counts(Mpi::Size(comm)), displacements; + std::vector collected_vertices; + MPI_Gather(&num_vertices, 1, MPI_INT, recv_counts.data(), 1, MPI_INT, dominant_rank, + comm); + if (dominant_rank == Mpi::Rank(comm)) + { + // First displacement is zero, then after is the partial sum of recv_counts. + displacements.resize(Mpi::Size(comm)); + displacements[0] = 0; + std::partial_sum(recv_counts.begin(), recv_counts.end() - 1, displacements.begin() + 1); + + // Add on slots at the end of vertices for the incoming data. + collected_vertices.resize(std::accumulate(recv_counts.begin(), recv_counts.end(), 0)); + + // MPI transfer will be done with MPI_DOUBLE, so duplicate all these values. + for (auto &x : displacements) + { + x *= 3; + } + for (auto &x : recv_counts) + { + x *= 3; + } + } + + // Gather the data to the dominant rank. + static_assert(sizeof(Eigen::Vector3d) == 3 * sizeof(double)); + MPI_Gatherv(vertices.data(), 3 * num_vertices, MPI_DOUBLE, collected_vertices.data(), + recv_counts.data(), displacements.data(), MPI_DOUBLE, dominant_rank, comm); + + // Deduplicate vertices. Given floating point precision, need a tolerance. + if (dominant_rank == Mpi::Rank(comm)) + { + auto vertex_equality = [](const auto &x, const auto &y) + { + constexpr double tolerance = 10.0 * std::numeric_limits::epsilon(); + return std::abs(x[0] - y[0]) < tolerance && std::abs(x[1] - y[1]) < tolerance && + std::abs(x[2] - y[2]) < tolerance; + }; + vertices = std::move(collected_vertices); + std::sort(vertices.begin(), vertices.end(), EigenLE); + vertices.erase(std::unique(vertices.begin(), vertices.end(), vertex_equality), + vertices.end()); + } + else + { + vertices.clear(); + } + + return dominant_rank; +} + +// Compute the distance from a point orthogonal to the list of normal axes, relative to +// the given origin. +auto PerpendicularDistance(const std::initializer_list &normals, + const Eigen::Vector3d &origin, const Eigen::Vector3d &v) +{ + Eigen::Vector3d v0 = v - origin; + for (const auto &n : normals) + { + v0 -= n.dot(v0) * n; + } + return v0.norm(); +} + +// Calculates a bounding box from a point cloud, result is broadcast across all processes. +BoundingBox BoundingBoxFromPointCloud(MPI_Comm comm, + const std::vector &vertices, + int dominant_rank) +{ + BoundingBox box; + if (dominant_rank == Mpi::Rank(comm)) + { + // Pick a candidate 000 vertex using lexicographic sort. This can be vulnerable to + // floating point precision if the box is axis aligned, but not floating point exact. + // Pick candidate 111 as the furthest from this candidate, then reassign 000 as the + // furthest from 111. Such a pair has to form the diagonal for a point cloud defining a + // box. Verify that p_111 is also the maximum distance from p_000 -> a diagonal is + // found. 
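Stepping back to the gather just above: the per-rank counts are collected with MPI_Gather, the displacements are their exclusive prefix sum, and both are scaled by three because each Eigen::Vector3d travels as three MPI_DOUBLE values. The same pattern on an already-packed coordinate array, where no scaling is needed (sketch, not part of the patch):

#include <mpi.h>
#include <numeric>
#include <vector>

std::vector<double> GatherPacked(MPI_Comm comm, int root, const std::vector<double> &xyz)
{
  int rank, size;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &size);
  const int count = static_cast<int>(xyz.size());  // Multiple of 3 (packed x, y, z).
  std::vector<int> counts(size), displs(size, 0);
  MPI_Gather(&count, 1, MPI_INT, counts.data(), 1, MPI_INT, root, comm);
  std::vector<double> all;
  if (rank == root)
  {
    std::partial_sum(counts.begin(), counts.end() - 1, displs.begin() + 1);
    all.resize(displs.back() + counts.back());
  }
  MPI_Gatherv(xyz.data(), count, MPI_DOUBLE, all.data(), counts.data(), displs.data(),
              MPI_DOUBLE, root, comm);
  return all;  // Non-empty only on the root rank.
}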
+ MFEM_VERIFY(vertices.size() >= 4, + "A bounding box requires a minimum of four vertices for this algorithm!"); + auto p_000 = std::min_element(vertices.begin(), vertices.end(), EigenLE); + auto p_111 = + std::max_element(vertices.begin(), vertices.end(), + [p_000](const Eigen::Vector3d &x, const Eigen::Vector3d &y) + { return (x - *p_000).norm() < (y - *p_000).norm(); }); + p_000 = std::max_element(vertices.begin(), vertices.end(), + [p_111](const Eigen::Vector3d &x, const Eigen::Vector3d &y) + { return (x - *p_111).norm() < (y - *p_111).norm(); }); + MFEM_ASSERT(std::max_element(vertices.begin(), vertices.end(), + [p_000](const Eigen::Vector3d &x, const Eigen::Vector3d &y) + { return (x - *p_000).norm() < (y - *p_000).norm(); }) == + p_111, + "p_000 and p_111 must be mutually opposing points!"); + + // Define a diagonal of the ASSUMED cuboid bounding box. + const auto &v_000 = *p_000; + const auto &v_111 = *p_111; + MFEM_VERIFY(&v_000 != &v_111, "Minimum and maximum extents cannot be identical!"); + const Eigen::Vector3d origin = v_000; + const Eigen::Vector3d n_1 = (v_111 - v_000).normalized(); + + // Find the vertex furthest from the diagonal axis. We cannot know yet if this defines + // (001) or (011). + const auto &t_0 = *std::max_element(vertices.begin(), vertices.end(), + [&](const auto &x, const auto &y) + { + return PerpendicularDistance({n_1}, origin, x) < + PerpendicularDistance({n_1}, origin, y); + }); + MFEM_VERIFY(&t_0 != &v_000 && &t_0 != &v_111, "Vertices are degenerate!"); + + // Use the discovered vertex to define a second direction and thus a plane. n_1 and n_2 + // now define a planar coordinate system intersecting the main diagonal, and two + // opposite edges of the cuboid. + const Eigen::Vector3d n_2 = + ((t_0 - origin) - ((t_0 - origin).dot(n_1) * n_1)).normalized(); + + // Collect the furthest point from the plane to determine if the box is planar. Look for + // a component that maximizes distance from the planar system: complete the axes with a + // cross, then use a dot product to pick the greatest deviation. + constexpr double rel_tol = 1.0e-6; + auto max_distance = PerpendicularDistance( + {n_1, n_2}, origin, + *std::max_element(vertices.begin(), vertices.end(), + [&](const auto &x, const auto &y) + { + return PerpendicularDistance({n_1, n_2}, origin, x) < + PerpendicularDistance({n_1, n_2}, origin, y); + })); + box.planar = (max_distance < rel_tol * (v_111 - v_000).norm()); + + // For the non-planar case, collect points furthest from the plane and choose the one + // closest to the origin as the next vertex which might be (001) or (011). + const auto &t_1 = [&]() + { + if (box.planar) + { + return t_0; + } + std::vector vertices_out_of_plane; + std::copy_if(vertices.begin(), vertices.end(), + std::back_inserter(vertices_out_of_plane), + [&](const auto &v) + { + return std::abs(PerpendicularDistance({n_1, n_2}, origin, v) - + max_distance) < rel_tol * max_distance; + }); + return *std::min_element(vertices_out_of_plane.begin(), vertices_out_of_plane.end(), + [&](const Eigen::Vector3d &x, const Eigen::Vector3d &y) + { return (x - origin).norm() < (y - origin).norm(); }); + }(); + + // Given candidates t_0 and t_1, the closer to origin defines v_001. + const bool t_0_gt_t_1 = (t_0 - origin).norm() > (t_1 - origin).norm(); + const auto &v_001 = t_0_gt_t_1 ? t_1 : t_0; + const auto &v_011 = box.planar ? v_111 : (t_0_gt_t_1 ? t_0 : t_1); + + // Compute the center as halfway along the main diagonal. 
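The in-plane axis n_2 above is a single Gram–Schmidt step: subtract from (t_0 - origin) its component along n_1 and normalize. A tiny Eigen check of that step (values illustrative):

#include <Eigen/Dense>
#include <cassert>
#include <cmath>

int main()
{
  const Eigen::Vector3d origin(0.0, 0.0, 0.0);
  const Eigen::Vector3d n_1 = Eigen::Vector3d(1.0, 1.0, 0.0).normalized();
  const Eigen::Vector3d t_0(1.0, 0.0, 0.0);
  const Eigen::Vector3d n_2 =
      ((t_0 - origin) - ((t_0 - origin).dot(n_1) * n_1)).normalized();
  assert(std::abs(n_1.dot(n_2)) < 1e-12);  // n_2 is orthogonal to n_1 by construction.
  return 0;
}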
+ Vector3dMap(box.center.data()) = 0.5 * (v_000 + v_111); + + if constexpr (false) + { + fmt::print("box.center {}!\n", box.center); + fmt::print("v_000 {}!\n", v_000); + fmt::print("v_001 {}!\n", v_001); + fmt::print("v_011 {}!\n", v_011); + fmt::print("v_111 {}!\n", v_111); + } + + // Compute the box axes. Using the 4 extremal points, we find the first two axes as the + // edges which are closest to perpendicular. For a perfect rectangular prism point + // cloud, we could instead compute the axes and length in each direction using the + // found edges of the cuboid, but this does not work for non-rectangular prism + // cross-sections or pyramid shapes. + { + const auto [e_0, e_1] = [&v_000, &v_001, &v_011, &v_111]() + { + std::array verts = {&v_000, &v_001, &v_011, &v_111}; + Eigen::Vector3d e_0 = Eigen::Vector3d::Zero(), e_1 = Eigen::Vector3d::Zero(); + double dot_min = mfem::infinity(); + for (int i_0 = 0; i_0 < 4; i_0++) + { + for (int j_0 = i_0 + 1; j_0 < 4; j_0++) + { + for (int i_1 = 0; i_1 < 4; i_1++) + { + for (int j_1 = i_1 + 1; j_1 < 4; j_1++) + { + if ((i_1 == i_0 && j_1 == j_0) || verts[i_0] == verts[j_0] || + verts[i_1] == verts[j_1]) + { + continue; + } + const auto e_ij_0 = (*verts[j_0] - *verts[i_0]).normalized(); + const auto e_ij_1 = (*verts[j_1] - *verts[i_1]).normalized(); + const auto dot = std::abs(e_ij_0.dot(e_ij_1)); + if (dot < dot_min) + { + if constexpr (false) + { + fmt::print("i_0 {} i_1 {} j_0 {} j_1 {}\n", i_0, i_1, j_0, j_1); + fmt::print("e_ij_0 {}, e_ij_1 {}!\n", e_ij_0, e_ij_1); + } + dot_min = dot; + e_0 = e_ij_0; + e_1 = e_ij_1; + if (dot_min < rel_tol) + { + return std::make_pair(e_0, e_1); + } + } + } + } + } + } + return std::make_pair(e_0, e_1); + }(); + + if constexpr (false) + { + fmt::print("e_0 {}, e_1 {}!\n", e_0, e_1); + } + + Vector3dMap(box.axes[0].data()) = e_0; + Vector3dMap(box.axes[1].data()) = e_1; + Vector3dMap(box.axes[2].data()) = + box.planar ? Eigen::Vector3d::Zero() : e_0.cross(e_1); + } + + // Scale axes by length of the box in each direction. + std::array l = {0.0}; + for (const auto &v : {v_000, v_001, v_011, v_111}) + { + const auto v_0 = v - Vector3dMap(box.center.data()); + l[0] = std::max(l[0], std::abs(v_0.dot(Vector3dMap(box.axes[0].data())))); + l[1] = std::max(l[1], std::abs(v_0.dot(Vector3dMap(box.axes[1].data())))); + l[2] = std::max(l[2], std::abs(v_0.dot(Vector3dMap(box.axes[2].data())))); + } + Vector3dMap(box.axes[0].data()) *= l[0]; + Vector3dMap(box.axes[1].data()) *= l[1]; + Vector3dMap(box.axes[2].data()) *= l[2]; + + // Make sure the longest dimension comes first. + std::sort(box.axes.begin(), box.axes.end(), [](const auto &x, const auto &y) + { return CVector3dMap(x.data()).norm() > CVector3dMap(y.data()).norm(); }); + } + + // Broadcast result to all processors. + Mpi::Broadcast(3, box.center.data(), dominant_rank, comm); + Mpi::Broadcast(3 * 3, box.axes.data()->data(), dominant_rank, comm); + Mpi::Broadcast(1, &box.planar, dominant_rank, comm); + + return box; +} + +// Use 4 points to define a sphere in 3D. If the points are coplanar, 3 of them are used to +// define a circle which is interpreted as the equator of the sphere. We assume the points +// are unique and not collinear. +BoundingBall SphereFromPoints(const std::vector &indices, + const std::vector &vertices) +{ + // Given 0 or 1 points, just return a radius of 0. 
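The coplanar branch of SphereFromPoints below uses the standard circumcircle-from-three-points construction. A worked check on a right triangle with legs of length 2, whose circumcenter is the hypotenuse midpoint (1, 1, 0) with radius sqrt(2):

#include <Eigen/Dense>
#include <cassert>
#include <cmath>

int main()
{
  const Eigen::Vector3d A(0.0, 0.0, 0.0), B(2.0, 0.0, 0.0), C(0.0, 2.0, 0.0);
  const Eigen::Vector3d AB = B - A, AC = C - A, ABAC = AB.cross(AC);
  Eigen::Vector3d center = (0.5 / ABAC.squaredNorm()) *
                           (AB.squaredNorm() * AC - AC.squaredNorm() * AB).cross(ABAC);
  const double radius = center.norm();  // Radius measured before shifting by A.
  center += A;
  assert((center - Eigen::Vector3d(1.0, 1.0, 0.0)).norm() < 1e-12);
  assert(std::abs(radius - std::sqrt(2.0)) < 1e-12);
  return 0;
}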
+ MFEM_VERIFY( + indices.size() <= 4, + "Determining a sphere in 3D requires 4 points (and a circle requires 3 points)!"); + BoundingBall ball; + ball.planar = (indices.size() < 4); + if (indices.size() < 2) + { + ball.origin = Eigen::Vector3d::Zero(); + ball.radius = 0.0; + return ball; + } + + // For two points, construct a circle with the segment as its diameter. This could also + // handle the collinear case for more than 2 points. + if (indices.size() == 2) + { + ball.origin = 0.5 * (vertices[indices[0]] + vertices[indices[1]]); + ball.radius = (vertices[indices[0]] - ball.origin).norm(); + return ball; + } + + // Check for coplanarity. + constexpr double rel_tol = 1.0e-6; + const Eigen::Vector3d AB = vertices[indices[1]] - vertices[indices[0]]; + const Eigen::Vector3d AC = vertices[indices[2]] - vertices[indices[0]]; + const Eigen::Vector3d ABAC = AB.cross(AC); + Eigen::Vector3d AD = Eigen::Vector3d::Zero(); + if (!ball.planar) + { + AD = vertices[indices[3]] - vertices[indices[0]]; + ball.planar = (std::abs(AD.dot(ABAC)) < rel_tol * AD.norm() * ABAC.norm()); + } + + // Construct a circle passing through 3 points. + // See: https://en.wikipedia.org/wiki/Circumcircle#Higher_dimensions. + if (ball.planar) + { + ball.origin = (0.5 / ABAC.squaredNorm()) * + ((AB.squaredNorm() * AC) - (AC.squaredNorm() * AB)).cross(ABAC); + ball.radius = ball.origin.norm(); + ball.origin += vertices[indices[0]]; +#if defined(MFEM_DEBUG) + const auto r1 = (vertices[indices[1]] - ball.origin).norm(); + const auto r2 = (vertices[indices[2]] - ball.origin).norm(); + MFEM_VERIFY((1.0 - rel_tol) * ball.radius < r1 && r1 < (1.0 + rel_tol) * ball.radius && + (1.0 - rel_tol) * ball.radius < r2 && + r2 < (1.0 + rel_tol) * ball.radius, + "Invalid circle calculated from 3 points!"); +#endif + return ball; + } + + // Construct a sphere passing through 4 points. + // See: https://steve.hollasch.net/cgindex/geometry/sphere4pts.html. + Eigen::Matrix3d C; + Eigen::Vector3d d; + const auto s = vertices[indices[0]].squaredNorm(); + C.row(0) = AB.transpose(); + C.row(1) = AC.transpose(); + C.row(2) = AD.transpose(); + d(0) = 0.5 * (vertices[indices[1]].squaredNorm() - s); + d(1) = 0.5 * (vertices[indices[2]].squaredNorm() - s); + d(2) = 0.5 * (vertices[indices[3]].squaredNorm() - s); + ball.origin = C.inverse() * d; // 3x3 matrix inverse might be faster than general LU + // if Eigen uses the explicit closed-form solution + ball.radius = (vertices[indices[0]] - ball.origin).norm(); +#if defined(MFEM_DEBUG) + const auto r1 = (vertices[indices[1]] - ball.origin).norm(); + const auto r2 = (vertices[indices[2]] - ball.origin).norm(); + const auto r3 = (vertices[indices[3]] - ball.origin).norm(); + MFEM_VERIFY((1.0 - rel_tol) * ball.radius < r1 && r1 < (1.0 + rel_tol) * ball.radius && + (1.0 - rel_tol) * ball.radius < r2 && + r2 < (1.0 + rel_tol) * ball.radius && + (1.0 - rel_tol) * ball.radius < r3 && r3 < (1.0 + rel_tol) * ball.radius, + "Invalid sphere calculated from 3 points!"); +#endif + return ball; +} + +BoundingBall Welzl(std::vector P, std::vector R, + const std::vector &vertices) +{ + // Base case. + if (R.size() == 4 || P.empty()) + { + return SphereFromPoints(R, vertices); + } + + // Choose a p ∈ P randomly, and recurse for (P \ {p}, R). The set P has already been + // randomized on input. + const std::size_t p = P.back(); + P.pop_back(); + BoundingBall D = Welzl(P, R, vertices); + + // If p is outside the sphere, recurse for (P \ {p}, R U {p}). 
+ constexpr double rel_tol = 1.0e-6; + if ((vertices[p] - D.origin).norm() >= (1.0 + rel_tol) * D.radius) + { + R.push_back(p); + D = Welzl(P, R, vertices); + } + + return D; +} + +// Calculates a bounding ball from a point cloud using Welzl's algorithm, result is +// broadcast across all processes. We don't operate on the convex hull, since the number of +// points should be small enough that operating on the full set should be OK. If only three +// points are provided, the bounding circle is computed (likewise for if the points are +// coplanar). +BoundingBox BoundingBallFromPointCloud(MPI_Comm comm, + const std::vector &vertices, + int dominant_rank) +{ + BoundingBox ball; + if (dominant_rank == Mpi::Rank(comm)) + { + MFEM_VERIFY(vertices.size() >= 3, + "A bounding ball requires a minimum of three vertices for this algorithm!"); + std::vector indices(vertices.size()); + std::iota(indices.begin(), indices.end(), 0); + + // Acceleration from https://informatica.vu.lt/journal/INFORMATICA/article/1251. Allow + // for duplicate points and just add the 4 points to the end of the indices list to be + // considered first. The two points are not necessarily the maximizer of the distance + // between all pairs, but they should be a good estimate. + { + auto p_1 = std::min_element(vertices.begin(), vertices.end(), EigenLE); + auto p_2 = std::max_element(vertices.begin(), vertices.end(), + [p_1](const Eigen::Vector3d &x, const Eigen::Vector3d &y) + { return (x - *p_1).norm() < (y - *p_1).norm(); }); + p_1 = std::max_element(vertices.begin(), vertices.end(), + [p_2](const Eigen::Vector3d &x, const Eigen::Vector3d &y) + { return (x - *p_2).norm() < (y - *p_2).norm(); }); + + // Find the next point as the vertex furthest from the initial axis. + const Eigen::Vector3d n_1 = (*p_2 - *p_1).normalized(); + auto p_3 = std::max_element(vertices.begin(), vertices.end(), + [&](const auto &x, const auto &y) + { + return PerpendicularDistance({n_1}, *p_1, x) < + PerpendicularDistance({n_1}, *p_1, y); + }); + auto p_4 = std::max_element(vertices.begin(), vertices.end(), + [p_3](const Eigen::Vector3d &x, const Eigen::Vector3d &y) + { return (x - *p_3).norm() < (y - *p_3).norm(); }); + MFEM_VERIFY(p_3 != p_1 && p_3 != p_2 && p_4 != p_1 && p_4 != p_2, + "Vertices are degenerate!"); + + // Start search with these points, which should be roughly extremal. With the search + // for p_3 done in an orthogonal direction, p_1, p_2, p_3, and p_4 should all be + // unique. + std::swap(indices[indices.size() - 1], indices[p_1 - vertices.begin()]); + std::swap(indices[indices.size() - 2], indices[p_2 - vertices.begin()]); + std::swap(indices[indices.size() - 3], indices[p_3 - vertices.begin()]); + std::swap(indices[indices.size() - 4], indices[p_4 - vertices.begin()]); + } + + // Randomly permute the point set. + { + std::random_device rd; + std::mt19937 g(rd()); + std::shuffle(indices.begin(), indices.end() - 4, g); + } + + // Compute the bounding ball. + BoundingBall min_ball = Welzl(indices, {}, vertices); + Vector3dMap(ball.center.data()) = min_ball.origin; + Vector3dMap(ball.axes[0].data()) = Eigen::Vector3d(min_ball.radius, 0.0, 0.0); + Vector3dMap(ball.axes[1].data()) = Eigen::Vector3d(0.0, min_ball.radius, 0.0); + Vector3dMap(ball.axes[2].data()) = Eigen::Vector3d(0.0, 0.0, min_ball.radius); + ball.planar = min_ball.planar; + } + + // Broadcast result to all processors. 
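// Editorial sketch (not part of this patch): the seeding heuristic above estimates a
// far-apart pair with two farthest-point sweeps before the remaining indices are
// shuffled for Welzl's algorithm. A standalone version on a hypothetical point set:
#include <Eigen/Dense>
#include <algorithm>
#include <cstdio>
#include <vector>

int main()
{
  const std::vector<Eigen::Vector3d> pts = {
      {0.0, 0.0, 0.0}, {1.0, 0.0, 0.0}, {0.2, 0.9, 0.0}, {3.0, 3.0, 3.0}};
  auto p_1 = pts.begin();  // Stand-in for the lexicographic minimum used in the code.
  const auto p_2 = std::max_element(pts.begin(), pts.end(),
                                    [&](const Eigen::Vector3d &x, const Eigen::Vector3d &y)
                                    { return (x - *p_1).norm() < (y - *p_1).norm(); });
  p_1 = std::max_element(pts.begin(), pts.end(),
                         [&](const Eigen::Vector3d &x, const Eigen::Vector3d &y)
                         { return (x - *p_2).norm() < (y - *p_2).norm(); });
  std::printf("Estimated diameter endpoints: indices %ld and %ld\n",
              static_cast<long>(p_1 - pts.begin()),
              static_cast<long>(p_2 - pts.begin()));  // Expect 0 and 3.
  return 0;
}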
+ Mpi::Broadcast(3, ball.center.data(), dominant_rank, comm); + Mpi::Broadcast(3 * 3, ball.axes.data()->data(), dominant_rank, comm); + Mpi::Broadcast(1, &ball.planar, dominant_rank, comm); + + return ball; +} + +// Compute a normal vector from an element transformation, optionally ensure aligned +// (| normal ⋅ align | > 0) +void Normal(mfem::ElementTransformation &T, mfem::Vector &normal, + const mfem::Vector *const align) +{ + const mfem::IntegrationPoint &ip = mfem::Geometries.GetCenter(T.GetGeometryType()); + T.SetIntPoint(&ip); + mfem::CalcOrtho(T.Jacobian(), normal); + normal /= normal.Norml2(); + if (align && (normal * (*align) < 0)) + { + normal *= -1; + } +} + +// Compute the centroid of a container of vertices. +template +mfem::Vector ComputeCentroid(const std::unique_ptr &mesh, const T &vertidxs) +{ + mfem::Vector centroid(mesh->SpaceDimension()); + centroid = 0.0; + int c = 0; + for (const int v : vertidxs) + { + centroid += mfem::Vector(mesh->GetVertex(v), 3); + c++; + } + return centroid /= c; +} + +// Compute the normal vector for a set of elements. If "inside" is true, normal will +// point inside the mesh, otherwise it will point outside the mesh. +template +mfem::Vector ComputeNormal(const std::unique_ptr &mesh, const T &elem_set, + bool inside, bool check_planar = true) +{ + const int sdim = mesh->SpaceDimension(); + mfem::IsoparametricTransformation trans; + mfem::Vector normal(sdim), last_normal(sdim), align(sdim); + normal = 0.0; + last_normal = 0.0; + mfem::Array vert_bdr; + + // Ensure that the computed normal points "inside" or "outside". + auto Alignment = [&](int el, auto &align) + { + int eladj, info; + mesh->GetBdrElementAdjacentElement(el, eladj, info); + mesh->GetElementCenter(eladj, align); + mesh->GetBdrElementVertices(el, vert_bdr); + align -= ComputeCentroid(mesh, vert_bdr); + if (!inside) // align points inwards + { + align *= -1; + } + }; + + for (auto elem : elem_set) + { + Alignment(elem, align); + mesh->GetBdrElementTransformation(elem, &trans); + mesh::Normal(trans, normal, &align); + if (!check_planar) + { + break; // If not checking planar, use the first. + } + MFEM_VERIFY((last_normal * last_normal == 0.0) || ((last_normal * normal - 1) < 1e-8), + "Periodic boundary mapping is only supported for planar boundaries!"); + last_normal = normal; + } + return normal; +} + +struct Frame +{ + Frame(const mfem::Vector &o) : origin(o) + { + for (auto &x : basis) + { + x.SetSize(3); + x = 0.0; + } + } + mfem::Vector origin; + std::array basis; +}; + +Frame Find3DFrame(std::unique_ptr &mesh, + const std::unordered_set &vertidxs, const mfem::Vector ¢roid, + const mfem::Vector &normal, double mesh_dim) +{ + Frame frame(centroid); + frame.basis[0] = normal; + + // For each point, compute its distance to the centroid. + std::map, std::greater> dist2points; + for (const int v : vertidxs) + { + auto dist = centroid.DistanceTo(mesh->GetVertex(v)); + // Convert dist to integer to avoid floating point differences. + dist2points[std::round(dist / mesh_dim * 1e8)].push_back(v); + } + + for (const auto &[dist, verts] : dist2points) + { + if (verts.size() > 1 || dist == 0) + { + continue; + } + frame.basis[1] = mesh->GetVertex(verts.front()); + frame.basis[1] -= centroid; + frame.basis[1] /= frame.basis[1].Norml2(); + break; + } + + // Define final point by computing the cross product. + frame.basis[0].cross3D(frame.basis[1], frame.basis[2]); + + return frame; +} + +// Calculate the rotation matrix between two vectors. 
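// Editorial sketch (not part of this patch): ComputeRotation below implements the
// vector-alignment form of Rodrigues' formula, R = I + [v]x + [v]x^2 / (1 + c) with
// v = n1 x n2 and c = n1 . n2 (valid when n1 != -n2). A quick Eigen check with
// hypothetical unit vectors:
#include <Eigen/Dense>
#include <cstdio>

int main()
{
  const Eigen::Vector3d n1 = Eigen::Vector3d::UnitX();
  const Eigen::Vector3d n2 = Eigen::Vector3d(0.0, 1.0, 1.0).normalized();
  const Eigen::Vector3d v = n1.cross(n2);
  const double c = n1.dot(n2);
  Eigen::Matrix3d vx;
  vx << 0.0, -v.z(), v.y(), v.z(), 0.0, -v.x(), -v.y(), v.x(), 0.0;
  const Eigen::Matrix3d R = Eigen::Matrix3d::Identity() + vx + vx * vx / (1.0 + c);
  std::printf("|R * n1 - n2| = %.2e\n", (R * n1 - n2).norm());  // Expect ~0.
  return 0;
}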
+void ComputeRotation(const mfem::Vector &normal1, const mfem::Vector &normal2, + mfem::DenseMatrix &transformation) +{ + mfem::DenseMatrix R(3), vx(3), vx2(3); + + mfem::Vector v(normal1.Size()); + normal1.cross3D(normal2, v); + double c = normal1 * normal2; + + vx(0, 1) = -v[2]; + vx(0, 2) = v[1]; + vx(1, 0) = v[2]; + vx(1, 2) = -v[0]; + vx(2, 0) = -v[1]; + vx(2, 1) = v[0]; + + R(0, 0) = R(1, 1) = R(2, 2) = 1.0; + R += vx; + Mult(vx, vx, vx2); + if (std::abs(1.0 + c) > 1e-8) + { + vx2.Set(1.0 / (1.0 + c), vx2); + } + R += vx2; + + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 3; j++) + { + transformation(i, j) = R(i, j); + } + } +} + +mfem::DenseMatrix ComputeAffineTransformationMatrix(const Frame &donor, + const Frame &receiver) +{ + + mfem::DenseMatrix A(4, 4); + A = 0.0; + if (donor.basis[1].Norml2() > 0.0 && receiver.basis[1].Norml2() > 0.0) + { + // Stably compute the rotation matrix from unit vectors. + Eigen::Matrix3d source, target; + Eigen::Vector3d source_centroid, target_centroid; + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 3; j++) + { + source(i, j) = donor.basis[j](i); + target(i, j) = receiver.basis[j](i); + } + source_centroid(i) = donor.origin(i); + target_centroid(i) = receiver.origin(i); + } + Eigen::Matrix3d R = source * target.transpose(); + Eigen::JacobiSVD svd(R, Eigen::ComputeFullU | Eigen::ComputeFullV); + R = svd.matrixV() * svd.matrixU().transpose(); + + // Account for possible reflection in R (det(R) = -1). + Eigen::DiagonalMatrix Diag(1.0, 1.0, R.determinant()); + R = svd.matrixV() * Diag * svd.matrixU().transpose(); + + // Compute translation and form transformation matrix. + const Eigen::Vector3d translation = target_centroid - R * source_centroid; + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 3; j++) + { + A(i, j) = R(i, j); + } + A(i, 3) = translation(i); + } + A(3, 3) = 1.0; + } + else + { + // If the donor or receiver basis is ambiguous, we assume no rotation around the + // normals, and that the rotation comes only from realigning normal vectors. + ComputeRotation(donor.basis[0], receiver.basis[0], A); + for (int i = 0; i < 3; i++) + { + A(i, 3) = receiver.origin(i) - donor.origin(i); + } + A(3, 3) = 1.0; + } + + return A; +} + +// Create the vertex mapping between sets of donor and receiver pts related +// by an affine transformation matrix. +std::vector CreatePeriodicVertexMapping(std::unique_ptr &mesh, + const std::unordered_set &donor_v, + const std::unordered_set &receiver_v, + const mfem::DenseMatrix &transform, + double tol = 1e-6) +{ + // Similar to MFEM's CreatePeriodicVertexMapping, maps from replica to primary vertex. + std::unordered_map replica2primary; + + // KD-tree containing all the receiver points. + mfem::KDTree3D kdtree; + for (const int v : receiver_v) + { + kdtree.AddPoint(mesh->GetVertex(v), v); + } + kdtree.Sort(); + + // Loop over donor points and find the corresponding receiver point. + mfem::Vector from(4), to(4); + for (int vi : donor_v) + { + // TODO: mfem patch to allow SetVector direct from pointer. 
+ std::copy(mesh->GetVertex(vi), mesh->GetVertex(vi) + 3, from.begin()); + from[3] = 1.0; // reset + transform.Mult(from, to); // receiver = transform * donor + + const int vj = kdtree.FindClosestPoint(to.GetData()); + std::copy(mesh->GetVertex(vj), mesh->GetVertex(vj) + 3, from.begin()); + from -= to; // Check that the loaded vertex is identical to the transformed + MFEM_VERIFY(from.Norml2() < tol, + "Could not match points on periodic boundaries, transformed donor point " + "does not correspond to a receiver point!"); + MFEM_VERIFY(replica2primary.find(vj) == replica2primary.end(), + "Could not match points on periodic boundaries, multiple donor points map " + "to the same receiver point!") + replica2primary[vj] = vi; + } + + std::vector v2v(mesh->GetNV()); + std::iota(v2v.begin(), v2v.end(), 0); + for (const auto &[r, p] : replica2primary) + { + v2v[r] = p; + } + return v2v; +} + +// Determine the vertex mapping between donor and receiver boundary attributes. +// Uses the translation vector or affine transformation matrix specified in the +// configuration file. If not provided, attempts to automatically detect the +// affine transformation between donor and receiver boundary vertices. +std::vector +DeterminePeriodicVertexMapping(std::unique_ptr &mesh, + const struct palace::config::PeriodicData &data, + const double tol) +{ + // Get mesh dimensions, will be used to define a reasonable tolerance in mesh units. + mfem::Vector bbmin, bbmax; + mesh->GetBoundingBox(bbmin, bbmax); + bbmax -= bbmin; + const double mesh_dim = bbmax.Norml2(); + const double mesh_tol = tol * mesh_dim; + + // Identify donor and receiver vertices and elements. + const auto &da = data.donor_attributes, &ra = data.receiver_attributes; + std::unordered_set bdr_v_donor, bdr_v_receiver; + std::unordered_set bdr_e_donor, bdr_e_receiver; + for (int be = 0; be < mesh->GetNBE(); be++) + { + int attr = mesh->GetBdrAttribute(be); + auto donor = std::find(da.begin(), da.end(), attr) != da.end(); + auto receiver = std::find(ra.begin(), ra.end(), attr) != ra.end(); + if (donor || receiver) + { + int el, info; + mesh->GetBdrElementAdjacentElement(be, el, info); + mfem::Array vertidxs; + mesh->GetBdrElementVertices(be, vertidxs); + (donor ? bdr_e_donor : bdr_e_receiver).insert(be); + (donor ? bdr_v_donor : bdr_v_receiver).insert(vertidxs.begin(), vertidxs.end()); + } + } + + MFEM_VERIFY(bdr_v_donor.size() == bdr_v_receiver.size(), + "Different number of " + "vertices on donor and receiver boundaries. Cannot create periodic mesh."); + + // Check if mesh has enough elements in periodic direction. MFEM's periodicity + // fails for meshes with <=2 elements in the period direction. + // Compare the number of mesh elements to the number of periodic boundary + // elements. + const int num_periodic_bc_elems = bdr_e_donor.size() + bdr_e_receiver.size(); + mfem::Array geoms; + mesh->GetGeometries(3, geoms); + if (geoms.Size() == 1 && geoms[0] == mfem::Geometry::TETRAHEDRON) + { + // Pure tet mesh. + MFEM_VERIFY(mesh->GetNE() > 3 * num_periodic_bc_elems, + "Not enough mesh elements in periodic direction!"); + } + else + { + // No tets. + MFEM_VERIFY(mesh->GetNE() > num_periodic_bc_elems, + "Not enough mesh elements in periodic direction!"); + } + + // Determine the affine transformation between donor and receiver points. + // Use the translation vector or affine transformation matrix if provided + // in the config file, otherwise automatically detect the transformation. 
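// Editorial sketch (not part of this patch): CreatePeriodicVertexMapping above applies a
// 4x4 homogeneous transform to each donor vertex with a trailing 1, so
// [x' y' z' 1]^T = A [x y z 1]^T, and the config-file matrix is stored row major. The
// matrix below (a pure translation by (0, 0, 2)) is hypothetical.
#include <Eigen/Dense>
#include <array>
#include <cstdio>

int main()
{
  const std::array<double, 16> affine = {1.0, 0.0, 0.0, 0.0,   // Row 0
                                         0.0, 1.0, 0.0, 0.0,   // Row 1
                                         0.0, 0.0, 1.0, 2.0,   // Row 2: translate z by 2
                                         0.0, 0.0, 0.0, 1.0};  // Row 3
  Eigen::Matrix4d A;
  for (int i = 0; i < 4; i++)
  {
    for (int j = 0; j < 4; j++)
    {
      A(i, j) = affine[i * 4 + j];  // Row-major conversion, as in the config parsing.
    }
  }
  const Eigen::Vector4d donor(0.5, -0.25, 0.0, 1.0);
  const Eigen::Vector4d receiver = A * donor;
  std::printf("receiver = (%g, %g, %g)\n", receiver.x(), receiver.y(),
              receiver.z());  // Expect (0.5, -0.25, 2).
  return 0;
}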
+ mfem::DenseMatrix transformation(4); + if (std::any_of(data.affine_transform.begin(), data.affine_transform.end(), + [](auto y) { return std::abs(y) > 0.0; })) + { + // Use user-provided affine transformation matrix. + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + transformation(i, j) = data.affine_transform[i * 4 + j]; // row major conversion + } + } + } + else + { + // Automatically detect transformation. + // Compute the centroid for each boundary. + auto donor_centroid = ComputeCentroid(mesh, bdr_v_donor); + auto receiver_centroid = ComputeCentroid(mesh, bdr_v_receiver); + + // Compute the normal vector for each boundary. + auto donor_normal = ComputeNormal(mesh, bdr_e_donor, true); + auto receiver_normal = ComputeNormal(mesh, bdr_e_receiver, false); + + // Return empty mapping if centroids and normal vectors are the same (up to a sign). + mfem::Vector diff = donor_centroid; + diff -= receiver_centroid; + double dot = donor_normal * receiver_normal; + if (diff.Norml2() < mesh_tol && std::abs(std::abs(dot) - 1.0) < mesh_tol) + { + return {}; + } + + // Compute a frame (origin, normal, and two in plane points) for each boundary. + auto donor_frame = + Find3DFrame(mesh, bdr_v_donor, donor_centroid, donor_normal, mesh_dim); + auto receiver_frame = + Find3DFrame(mesh, bdr_v_receiver, receiver_centroid, receiver_normal, mesh_dim); + + // Compute the affine transformation matrix. + transformation = ComputeAffineTransformationMatrix(donor_frame, receiver_frame); + } + return CreatePeriodicVertexMapping(mesh, bdr_v_donor, bdr_v_receiver, transformation, + mesh_tol); +} + +} // namespace palace::mesh \ No newline at end of file diff --git a/palace/utils/geodata_impl.hpp b/palace/utils/geodata_impl.hpp new file mode 100644 index 0000000000..927f9e55e9 --- /dev/null +++ b/palace/utils/geodata_impl.hpp @@ -0,0 +1,90 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_UTILS_GEODATA_IMPL_HPP +#define PALACE_UTILS_GEODATA_IMPL_HPP + +#include +#include +#include +#include +#include +#include "utils/iodata.hpp" + +namespace palace +{ + +struct BoundingBox; + +namespace mesh +{ + +// +// Implementations Functions for mesh related functionality. Isolated to avoid Eigen +// propagation. +// + +// For the public interface, we can use a BoundingBox as a generalization of a BoundingBall. +// Internally, however, it's nice to work with a specific ball data type. +struct BoundingBall +{ + Eigen::Vector3d origin; + double radius; + bool planar; +}; + +// Helper for collecting a point cloud from a mesh, used in calculating bounding boxes and +// bounding balls. Returns the dominant rank, for which the vertices argument will be +// filled, while all other ranks will have an empty vector. Vertices are de-duplicated to a +// certain floating point precision. +int CollectPointCloudOnRoot(const mfem::ParMesh &mesh, const mfem::Array &marker, + bool bdr, std::vector &vertices); + +// Calculates a bounding box from a point cloud, result is broadcast across all processes. +BoundingBox BoundingBoxFromPointCloud(MPI_Comm comm, + const std::vector &vertices, + int dominant_rank); + +// Compute the distance from a point orthogonal to the list of normal axes, relative to +// the given origin. +auto PerpendicularDistance(const std::initializer_list &normals, + const Eigen::Vector3d &origin, const Eigen::Vector3d &v); + +// Use 4 points to define a sphere in 3D. 
If the points are coplanar, 3 of them are used to +// define a circle which is interpreted as the equator of the sphere. We assume the points +// are unique and not collinear. +BoundingBall SphereFromPoints(const std::vector &indices, + const std::vector &vertices); + +// Implementation of the recursive Welzl algorithm kernel. +BoundingBall Welzl(std::vector P, std::vector R, + const std::vector &vertices); + +// Calculates a bounding ball from a point cloud using Welzl's algorithm, result is +// broadcast across all processes. We don't operate on the convex hull, since the number of +// points should be small enough that operating on the full set should be OK. If only three +// points are provided, the bounding circle is computed (likewise for if the points are +// coplanar). +BoundingBox BoundingBallFromPointCloud(MPI_Comm comm, + const std::vector &vertices, + int dominant_rank); + +// Compute a normal vector from an element transformation, optionally ensure aligned +// (| normal ⋅ align | > 0) +void Normal(mfem::ElementTransformation &T, mfem::Vector &normal, + const mfem::Vector *const align); + +// Determine the vertex mapping between donor and receiver boundary attributes. +// Uses the translation vector or affine transformation matrix specified in the +// configuration file. If not provided, attempts to automatically detect the +// affine transformation between donor and receiver boundary vertices. +std::vector +DeterminePeriodicVertexMapping(std::unique_ptr &mesh, + const struct palace::config::PeriodicData &data, + const double tol = 1e-8); + +} // namespace mesh + +} // namespace palace + +#endif // PALACE_UTILS_GEODATA_IMPL_HPP diff --git a/palace/utils/iodata.cpp b/palace/utils/iodata.cpp index dcd79bb3f6..6a5c695a4f 100644 --- a/palace/utils/iodata.cpp +++ b/palace/utils/iodata.cpp @@ -1,619 +1,585 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "iodata.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "utils/communication.hpp" -#include "utils/constants.hpp" -#include "utils/geodata.hpp" - -namespace palace -{ - -namespace -{ - -std::stringstream PreprocessFile(const char *filename) -{ - // Read configuration file into memory and return as a stringstream. - std::string file; - { - std::ifstream fi(filename); - std::stringstream buf; - if (!fi.is_open()) - { - MFEM_ABORT("Unable to open configuration file \"" << filename << "\"!"); - } - buf << fi.rdbuf(); - fi.close(); - file = buf.str(); - } - - // Strip C and C++ style comments (//, /* */) using regex. Correctly handles comments - // within strings and escaped comment markers (see tinyurl.com/2s3n8dkr). - { - std::regex rgx(R"((([\"'])(?:(?=(\\?))\3.)*?\2))" - R"(|(\/\*([^*]|[\r\n]|(\*+([^*\/]|[\r\n])))*\*+\/))" - R"(|(\/\/.*))"); - file = std::regex_replace(file, rgx, "$1"); - } - - // Also strip whitespace. - { - std::regex rgx(R"((([\"'])(?:(?=(\\?))\3.)*?\2))" - R"(|(\s+))"); - file = std::regex_replace(file, rgx, "$1"); - } - - // Also strip erroneous trailing commas. - { - std::regex rgx(R"((([\"'])(?:(?=(\\?))\3.)*?\2))" - R"(|,+(?=\s*?[\}\]]))"); - file = std::regex_replace(file, rgx, "$1"); - } - - // Perform integer range expansion for arrays ([a - b, c] = [a-b,c] = - // [a,a+1,...,b-1,b,c]). The whole file is now one line and arrays have no spaces after - // whitespace stripping. 
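// Editorial sketch (not part of this patch): the whitespace and trailing-comma passes
// above reuse the string-preserving alternative so that anything inside quoted values
// survives. A standalone run on a hypothetical config fragment:
#include <cstdio>
#include <regex>
#include <string>

int main()
{
  std::string file = "{\"List\": [1, 2, 3,], \"Name\": \"a b\"}";
  const std::regex strip_ws(R"((([\"'])(?:(?=(\\?))\3.)*?\2))" R"(|(\s+))");
  const std::regex strip_comma(R"((([\"'])(?:(?=(\\?))\3.)*?\2))" R"(|,+(?=\s*?[\}\]]))");
  file = std::regex_replace(file, strip_ws, "$1");
  file = std::regex_replace(file, strip_comma, "$1");
  std::printf("%s\n", file.c_str());  // Prints {"List":[1,2,3],"Name":"a b"}
  return 0;
}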
- std::stringstream output; - auto RangeExpand = [](std::string_view str) -> std::string - { - // Handle the given string which is only numeric with possible hyphens. - int num; - auto [ptr, ec] = std::from_chars(str.data(), str.data() + str.length(), num); - MFEM_VERIFY( - ec == std::errc(), - "Invalid integer conversion in range expansion" - << (ec == std::errc::result_out_of_range ? " (integer out of range)!" : "!")); - if (ptr == str.data() + str.length()) - { - return std::string(str); - } - // Range specified, expand the bounds. - int num2; - auto [ptr2, ec2] = std::from_chars(ptr + 1, str.data() + str.length(), num2); - MFEM_VERIFY( - ec2 == std::errc(), - "Invalid integer conversion in range expansion" - << (ec2 == std::errc::result_out_of_range ? " (integer out of range)!" : "!")); - std::string rng; - while (num < num2) - { - rng += std::to_string(num++) + ","; - } - rng += std::to_string(num); - return rng; - }; - { - const std::string range_vals = "-0123456789,"; - auto start = file.begin(); - bool inside = false; - for (auto it = start; it != file.end(); ++it) - { - if (inside) - { - if (*it == ']') - { - // Apply integer range expansion (as needed) to the array, which includes only - // digits, commas, and '-'. Exclude the outer square brackets. - std::string_view str(file.data() + (start - file.cbegin() + 1), it - start - 1); - std::size_t s = 0, pos; - output << '['; - while ((pos = str.find(',', s)) != std::string::npos) - { - output << RangeExpand(str.substr(s, pos - s)) << ','; - s = pos + 1; - } - output << RangeExpand(str.substr(s)) << ']'; - start = it + 1; - inside = false; - } - else if (*it == '[') - { - output << std::string(start, it); - start = it; - } - else if (range_vals.find(*it) == std::string::npos) - { - output << std::string(start, it); - start = it; - inside = false; - } - } - else if (*it == '[') - { - output << std::string(start, it); - start = it; - inside = true; - } - } - output << std::string(start, file.end()); - } - return output; -} - -} // namespace - -using json = nlohmann::json; - -IoData::IoData(const char *filename, bool print) : Lc(1.0), tc(1.0), init(false) -{ - // Open configuration file and preprocess: strip whitespace, comments, and expand integer - // ranges. - std::stringstream buffer = PreprocessFile(filename); - - // Parse the configuration file. Use a callback function to detect and throw errors for - // duplicate keys. - json config; - std::stack> parse_stack; - json::parser_callback_t check_duplicate_keys = - [&](int, json::parse_event_t event, json &parsed) - { - switch (event) - { - case json::parse_event_t::object_start: - parse_stack.push(std::set()); - break; - case json::parse_event_t::object_end: - parse_stack.pop(); - break; - case json::parse_event_t::key: - { - const auto result = parse_stack.top().insert(parsed); - if (!result.second) - { - MFEM_ABORT("Error parsing configuration file!\nDuplicate key " - << parsed << " was already seen in this object!"); - return false; - } - } - break; - default: - break; - } - return true; - }; - try - { - config = json::parse(buffer, check_duplicate_keys); - } - catch (json::parse_error &e) - { - MFEM_ABORT("Error parsing configuration file!\n " << e.what()); - } - if (print) - { - Mpi::Print("\n{}\n", config.dump(2)); - } - - // Set up configuration option data structures. - problem.SetUp(config); - model.SetUp(config); - domains.SetUp(config); - boundaries.SetUp(config); - solver.SetUp(config); - - // Cleanup and error checking. 
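// Editorial sketch (not part of this patch): nlohmann::json keeps only the last value for
// a repeated key, so the parser callback above is what turns duplicates into a hard
// error. A standalone version of the same bookkeeping on a hypothetical input:
#include <nlohmann/json.hpp>
#include <cstdio>
#include <set>
#include <stack>
#include <string>

int main()
{
  using json = nlohmann::json;
  std::stack<std::set<std::string>> keys;
  int duplicates = 0;
  json::parser_callback_t cb = [&](int, json::parse_event_t event, json &parsed)
  {
    switch (event)
    {
      case json::parse_event_t::object_start:
        keys.push({});
        break;
      case json::parse_event_t::object_end:
        keys.pop();
        break;
      case json::parse_event_t::key:
        if (!keys.top().insert(parsed.get<std::string>()).second)
        {
          duplicates++;
        }
        break;
      default:
        break;
    }
    return true;
  };
  json::parse(R"({"Solver": 1, "Solver": 2})", cb);
  std::printf("Found %d duplicate key(s)\n", duplicates);  // Prints 1.
  return 0;
}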
- config.erase("Problem"); - config.erase("Model"); - config.erase("Domains"); - config.erase("Boundaries"); - config.erase("Solver"); - MFEM_VERIFY(config.empty(), "Found an unsupported configuration file section!\n" - << config.dump(2)); - - // Check compatibility of configuration file and problem type. - CheckConfiguration(); -} - -void IoData::CheckConfiguration() -{ - // Check that the provided domain and boundary objects are all supported by the requested - // problem type. - if (problem.type == config::ProblemData::Type::DRIVEN) - { - // No unsupported domain or boundary objects for frequency domain driven simulations. - } - else if (problem.type == config::ProblemData::Type::EIGENMODE) - { - if (!boundaries.conductivity.empty()) - { - Mpi::Warning("Eigenmode problem type does not support surface conductivity boundary " - "conditions!\n"); - } - if (!boundaries.auxpec.empty() || !boundaries.waveport.empty()) - { - Mpi::Warning( - "Eigenmode problem type does not support wave port boundary conditions!\n"); - } - } - else if (problem.type == config::ProblemData::Type::ELECTROSTATIC) - { - if (!boundaries.farfield.empty()) - { - Mpi::Warning( - "Electrostatic problem type does not support absorbing boundary conditions!\n"); - } - if (!boundaries.conductivity.empty()) - { - Mpi::Warning("Electrostatic problem type does not support surface conductivity " - "boundary conditions!\n"); - } - if (!boundaries.impedance.empty()) - { - Mpi::Warning("Electrostatic problem type does not support surface impedance boundary " - "conditions!\n"); - } - if (!boundaries.auxpec.empty() || !boundaries.waveport.empty()) - { - Mpi::Warning( - "Electrostatic problem type does not support wave port boundary conditions!\n"); - } - if (!boundaries.current.empty()) - { - Mpi::Warning( - "Electrostatic problem type does not support surface current excitation!\n"); - } - if (!boundaries.postpro.inductance.empty()) - { - Mpi::Warning("Electrostatic problem type does not support boundary inductance " - "postprocessing!\n"); - } - } - else if (problem.type == config::ProblemData::Type::MAGNETOSTATIC) - { - if (!domains.postpro.dielectric.empty()) - { - Mpi::Warning("Magnetostatic problem type does not support domain bulk dielectric " - "loss postprocessing!\n"); - } - if (!boundaries.farfield.empty()) - { - Mpi::Warning( - "Magnetostatic problem type does not support absorbing boundary conditions!\n"); - } - if (!boundaries.conductivity.empty()) - { - Mpi::Warning("Magnetostatic problem type does not support surface conductivity " - "boundary conditions!\n"); - } - if (!boundaries.impedance.empty()) - { - Mpi::Warning("Magnetostatic problem type does not support surface impedance boundary " - "conditions!\n"); - } - if (!boundaries.lumpedport.empty()) - { - Mpi::Warning( - "Magnetostatic problem type does not support lumped port boundary conditions!\n"); - } - if (!boundaries.auxpec.empty() || !boundaries.waveport.empty()) - { - Mpi::Warning( - "Magnetostatic problem type does not support wave port boundary conditions!\n"); - } - if (!boundaries.postpro.capacitance.empty()) - { - Mpi::Warning("Magnetostatic problem type does not support boundary capacitance " - "postprocessing!\n"); - } - if (!boundaries.postpro.dielectric.empty()) - { - Mpi::Warning("Magnetostatic problem type does not support boundary interface " - "dielectric loss postprocessing!\n"); - } - } - else if (problem.type == config::ProblemData::Type::TRANSIENT) - { - if (!boundaries.conductivity.empty()) - { - Mpi::Warning("Transient problem type does 
not support surface conductivity boundary " - "conditions!\n"); - } - if (!boundaries.auxpec.empty() || !boundaries.waveport.empty()) - { - Mpi::Warning( - "Transient problem type does not support wave port boundary conditions!\n"); - } - } - - // XX TODO: Default value for pa_order_threshold if we want PA enabled by default - // XX TODO: Enable Device::GPU by default if MFEM built with CUDA/HIP support? - - // Resolve default values in configuration file. - if (solver.linear.type == config::LinearSolverData::Type::DEFAULT) - { - if (problem.type == config::ProblemData::Type::ELECTROSTATIC || - (problem.type == config::ProblemData::Type::TRANSIENT && - solver.transient.type == config::TransientSolverData::Type::CENTRAL_DIFF)) - { - solver.linear.type = config::LinearSolverData::Type::BOOMER_AMG; - } - else if (problem.type == config::ProblemData::Type::MAGNETOSTATIC || - problem.type == config::ProblemData::Type::TRANSIENT) - { - solver.linear.type = config::LinearSolverData::Type::AMS; - } - else - { - // Prefer sparse direct solver for frequency domain problems if available. -#if defined(MFEM_USE_SUPERLU) - solver.linear.type = config::LinearSolverData::Type::SUPERLU; -#elif defined(MFEM_USE_STRUMPACK) - solver.linear.type = config::LinearSolverData::Type::STRUMPACK; -#elif defined(MFEM_USE_MUMPS) - solver.linear.type = config::LinearSolverData::Type::MUMPS; -#else - solver.linear.type = config::LinearSolverData::Type::AMS; -#endif - } - } - if (solver.linear.ksp_type == config::LinearSolverData::KspType::DEFAULT) - { - // Problems with SPD operators use CG by default, else GMRES. - if (problem.type == config::ProblemData::Type::ELECTROSTATIC || - problem.type == config::ProblemData::Type::MAGNETOSTATIC || - problem.type == config::ProblemData::Type::TRANSIENT) - { - solver.linear.ksp_type = config::LinearSolverData::KspType::CG; - } - else - { - solver.linear.ksp_type = config::LinearSolverData::KspType::GMRES; - } - } - if (solver.linear.max_size < 0) - { - solver.linear.max_size = solver.linear.max_it; - } - if (solver.linear.initial_guess < 0) - { - if ((problem.type == config::ProblemData::Type::DRIVEN && - solver.driven.adaptive_tol <= 0.0) || - problem.type == config::ProblemData::Type::TRANSIENT || - problem.type == config::ProblemData::Type::ELECTROSTATIC || - problem.type == config::ProblemData::Type::MAGNETOSTATIC) - { - // Default true only driven simulations without adaptive frequency sweep, transient - // simulations, or electrostatic or magnetostatics. - solver.linear.initial_guess = 1; - } - else - { - solver.linear.initial_guess = 0; - } - } - if (solver.linear.pc_mat_shifted < 0) - { - if (problem.type == config::ProblemData::Type::DRIVEN && - solver.linear.type == config::LinearSolverData::Type::AMS) - { - // Default true only driven simulations using AMS (false for most cases). - solver.linear.pc_mat_shifted = 1; - } - else - { - solver.linear.pc_mat_shifted = 0; - } - } - if (solver.linear.mg_smooth_aux < 0) - { - if (problem.type == config::ProblemData::Type::ELECTROSTATIC || - problem.type == config::ProblemData::Type::MAGNETOSTATIC) - { - // Disable auxiliary space smoothing using distributive relaxation by default for - // problems which don't need it. 
- solver.linear.mg_smooth_aux = 0; - } - else - { - solver.linear.mg_smooth_aux = 1; - } - } - if (solver.linear.mg_smooth_order < 0) - { - solver.linear.mg_smooth_order = std::max(2 * solver.order, 4); - } -} - -namespace -{ - -template -constexpr config::SymmetricMatrixData &operator/=(config::SymmetricMatrixData &data, - double s) -{ - for (auto &x : data.s) - { - x /= s; - } - return data; -} - -} // namespace - -void IoData::NondimensionalizeInputs(mfem::ParMesh &mesh) -{ - // Nondimensionalization of the equations is based on a given length Lc in[m], typically - // the largest domain dimension. Configuration file lengths and the mesh coordinates are - // provided with units of model.L0 x [m]. - MFEM_VERIFY(!init, "NondimensionalizeInputs should only be called once!"); - init = true; - - // Calculate the reference length and time. - if (model.Lc > 0.0) - { - // User specified Lc in mesh length units. - Lc = model.Lc * model.L0; // [m] - } - else - { - mfem::Vector bbmin, bbmax; - mesh.GetBoundingBox(bbmin, bbmax); - bbmax -= bbmin; - bbmax *= model.L0; // [m] - Lc = *std::max_element(bbmax.begin(), bbmax.end()); - } - tc = 1.0e9 * Lc / electromagnetics::c0_; // [ns] - - // Mesh refinement parameters. - auto Divides = [this](double val) { return val / (Lc / model.L0); }; - for (auto &box : model.refinement.GetBoxes()) - { - std::transform(box.bbmin.begin(), box.bbmin.end(), box.bbmin.begin(), Divides); - std::transform(box.bbmax.begin(), box.bbmax.end(), box.bbmax.begin(), Divides); - } - for (auto &sphere : model.refinement.GetSpheres()) - { - sphere.r /= Lc / model.L0; - std::transform(sphere.center.begin(), sphere.center.end(), sphere.center.begin(), - Divides); - } - - // Materials: conductivity and London penetration depth. - for (auto &data : domains.materials) - { - data.sigma /= 1.0 / (electromagnetics::Z0_ * Lc); - data.lambda_L /= Lc / model.L0; - } - - // Probe location coordinates. - for (auto &[idx, data] : domains.postpro.probe) - { - data.x /= Lc / model.L0; - data.y /= Lc / model.L0; - data.z /= Lc / model.L0; - } - - // Finite conductivity boundaries. - for (auto &data : boundaries.conductivity) - { - data.sigma /= 1.0 / (electromagnetics::Z0_ * Lc); - data.h /= Lc / model.L0; - } - - // Impedance boundaries and lumped ports. - for (auto &data : boundaries.impedance) - { - data.Rs /= electromagnetics::Z0_; - data.Ls /= electromagnetics::mu0_ * Lc; - data.Cs /= electromagnetics::epsilon0_ * Lc; - } - for (auto &[idx, data] : boundaries.lumpedport) - { - data.R /= electromagnetics::Z0_; - data.L /= electromagnetics::mu0_ * Lc; - data.C /= electromagnetics::epsilon0_ * Lc; - data.Rs /= electromagnetics::Z0_; - data.Ls /= electromagnetics::mu0_ * Lc; - data.Cs /= electromagnetics::epsilon0_ * Lc; - } - - // Wave port offset distance. - for (auto &[idx, data] : boundaries.waveport) - { - data.d_offset /= Lc / model.L0; - } - - // Dielectric interface thickness. 
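// Editorial sketch (not part of this patch): the scaling above is driven by a
// characteristic length Lc [m] and the derived time tc = 1e9 * Lc / c0 [ns]; lengths
// given in mesh units of L0 * [m] are divided by Lc / L0. The numbers below (a
// micrometer mesh with Lc = 100 um) are hypothetical.
#include <cstdio>

int main()
{
  constexpr double c0 = 299792458.0;  // Speed of light [m/s]
  const double L0 = 1.0e-6;           // Mesh length unit [m]
  const double Lc = 100.0 * L0;       // Characteristic length [m]
  const double tc = 1.0e9 * Lc / c0;  // Characteristic time [ns]
  const double d = 250.0;             // A config length in mesh units
  const double d_nd = d / (Lc / L0);  // Its nondimensional value
  std::printf("tc = %.4e ns, d_nd = %.2f\n", tc, d_nd);  // Expect ~3.34e-4 and 2.50.
  return 0;
}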
- for (auto &[idx, data] : boundaries.postpro.dielectric) - { - data.ts /= Lc / model.L0; - } - - // For eigenmode simulations: - solver.eigenmode.target *= 2.0 * M_PI * tc; - solver.eigenmode.feast_contour_ub *= 2.0 * M_PI * tc; - - // For driven simulations: - solver.driven.min_f *= 2.0 * M_PI * tc; - solver.driven.max_f *= 2.0 * M_PI * tc; - solver.driven.delta_f *= 2.0 * M_PI * tc; - - // For transient simulations: - solver.transient.pulse_f *= 2.0 * M_PI * tc; - solver.transient.pulse_tau /= tc; - solver.transient.max_t /= tc; - solver.transient.delta_t /= tc; - - // Scale mesh vertices for correct nondimensionalization. - mesh::NondimensionalizeMesh(mesh, GetLengthScale()); - - // Print some information. - Mpi::Print(mesh.GetComm(), - "\nCharacteristic length and time scales:\n L₀ = {:.3e} m, t₀ = {:.3e} ns\n", - Lc, tc); -} - -template -T IoData::DimensionalizeValue(IoData::ValueType type, T v) const -{ - // Characteristic reference magnetic field strength Hc² = 1 / (Zc * Lc²) A/m (with Ec = - // Hc Zc). Yields Pc = Hc² Zc Lc² = 1 W. - const T Hc = 1.0 / std::sqrt(electromagnetics::Z0_ * Lc * Lc); // [A/m] - T sf = 1.0; - switch (type) - { - case ValueType::TIME: - sf = tc; // [ns] - break; - case ValueType::FREQUENCY: - sf = 1.0 / (2.0 * M_PI * tc); // [GHz/rad] - break; - case ValueType::LENGTH: - sf = Lc; // [m] - break; - case ValueType::IMPEDANCE: - sf = electromagnetics::Z0_; // [Ω] - break; - case ValueType::INDUCTANCE: - sf = electromagnetics::mu0_ * Lc; // [H] - break; - case ValueType::CAPACITANCE: - sf = electromagnetics::epsilon0_ * Lc; // [F] - break; - case ValueType::CONDUCTIVITY: - sf = 1.0 / (electromagnetics::Z0_ * Lc); // [S/m] - break; - case ValueType::VOLTAGE: - sf = Hc * electromagnetics::Z0_ * Lc; // [V] - break; - case ValueType::CURRENT: - sf = Hc * Lc; // [A] - break; - case ValueType::POWER: - sf = Hc * Hc * electromagnetics::Z0_ * Lc * Lc; // [W] - break; - case ValueType::ENERGY: - sf = Hc * Hc * electromagnetics::Z0_ * Lc * Lc * tc; // [J] - break; - case ValueType::FIELD_E: - sf = Hc * electromagnetics::Z0_; // [V/m] - break; - case ValueType::FIELD_D: - sf = electromagnetics::epsilon0_ * Hc * electromagnetics::Z0_; // [C/m²] - break; - case ValueType::FIELD_H: - sf = Hc; // [A/m] - break; - case ValueType::FIELD_B: - sf = electromagnetics::mu0_ * Hc; // [Wb/m²] - break; - } - return v * sf; -} - -template double IoData::DimensionalizeValue(IoData::ValueType, double) const; - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "iodata.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "fem/bilinearform.hpp" +#include "fem/integrator.hpp" +#include "utils/communication.hpp" +#include "utils/geodata.hpp" + +namespace palace +{ + +std::stringstream PreprocessFile(const char *filename) +{ + // Read configuration file into memory and return as a stringstream. + std::string file; + { + std::ifstream fi(filename); + std::stringstream buf; + if (!fi.is_open()) + { + MFEM_ABORT("Unable to open configuration file \"" << filename << "\"!"); + } + buf << fi.rdbuf(); + fi.close(); + file = buf.str(); + } + + // Strip C and C++ style comments (//, /* */) using regex. Correctly handles comments + // within strings and escaped comment markers (see tinyurl.com/2s3n8dkr). 
An alternative + // for the middle line is: R"(|(\/\*([^*]|[\r\n]|(\*+([^*\/]|[\r\n])))*\*+\/))", but this + // seems to sometimes lead to issues with std::regex_replace for long files. + { + std::regex rgx(R"((([\"'])(?:(?=(\\?))\3.)*?\2))" + R"(|(\/\*(.|[\r\n])*?\*\/))" + R"(|(\/\/.*))"); + file = std::regex_replace(file, rgx, "$1"); + } + + // Also strip whitespace. + { + std::regex rgx(R"((([\"'])(?:(?=(\\?))\3.)*?\2))" + R"(|(\s+))"); + file = std::regex_replace(file, rgx, "$1"); + } + + // Also strip erroneous trailing commas. + { + std::regex rgx(R"((([\"'])(?:(?=(\\?))\3.)*?\2))" + R"(|,+(?=\s*?[\}\]]))"); + file = std::regex_replace(file, rgx, "$1"); + } + + // Perform integer range expansion for arrays ([a - b, c] = [a-b,c] = + // [a,a+1,...,b-1,b,c]). The whole file is now one line and arrays have no spaces after + // whitespace stripping. + std::stringstream output; + auto RangeExpand = [](std::string_view str) -> std::string + { + // Handle the given string which is only numeric with possible hyphens. + if (str.empty()) + { + return ""; + } + int num; + auto [ptr, ec] = std::from_chars(str.data(), str.data() + str.length(), num); + MFEM_VERIFY( + ec == std::errc(), + "Invalid integer conversion in range expansion" + << (ec == std::errc::result_out_of_range ? " (integer out of range)!" : "!")); + if (ptr == str.data() + str.length()) + { + return std::string(str); + } + // Range specified, expand the bounds. + int num2; + auto [ptr2, ec2] = std::from_chars(ptr + 1, str.data() + str.length(), num2); + MFEM_VERIFY( + ec2 == std::errc(), + "Invalid integer conversion in range expansion" + << (ec2 == std::errc::result_out_of_range ? " (integer out of range)!" : "!")); + std::string rng; + while (num < num2) + { + rng += std::to_string(num++) + ","; + } + rng += std::to_string(num); + return rng; + }; + { + const std::string range_vals = "-0123456789,"; + auto start = file.begin(); + bool inside = false; + for (auto it = start; it != file.end(); ++it) + { + if (inside) + { + if (*it == ']') + { + // Apply integer range expansion (as needed) to the array, which includes only + // digits, commas, and '-'. Exclude the outer square brackets. + std::string_view str(file.data() + (start - file.cbegin() + 1), it - start - 1); + std::size_t s = 0, pos; + output << '['; + while ((pos = str.find(',', s)) != std::string::npos) + { + output << RangeExpand(str.substr(s, pos - s)) << ','; + s = pos + 1; + } + output << RangeExpand(str.substr(s)) << ']'; + start = it + 1; + inside = false; + } + else if (*it == '[') + { + output << std::string(start, it); + start = it; + } + else if (range_vals.find(*it) == std::string::npos) + { + output << std::string(start, it); + start = it; + inside = false; + } + } + else if (*it == '[') + { + output << std::string(start, it); + start = it; + inside = true; + } + } + output << std::string(start, file.end()); + } + return output; +} + +using json = nlohmann::json; + +IoData::IoData(const char *filename, bool print) : units(1.0, 1.0), init(false) +{ + // Open configuration file and preprocess: strip whitespace, comments, and expand integer + // ranges. + std::stringstream buffer = PreprocessFile(filename); + + // Parse the configuration file. Use a callback function to detect and throw errors for + // duplicate keys. 
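// Editorial sketch (not part of this patch): this distills the RangeExpand step above
// (without the surrounding bracket scan), turning "a-b" into "a,a+1,...,b" with
// std::from_chars after whitespace has been stripped. The input is hypothetical.
#include <charconv>
#include <cstdio>
#include <string>
#include <string_view>

std::string RangeExpand(std::string_view str)
{
  int lo = 0, hi = 0;
  auto [ptr, ec] = std::from_chars(str.data(), str.data() + str.size(), lo);
  if (ec != std::errc() || ptr == str.data() + str.size())
  {
    return std::string(str);  // Empty, single integer, or left untouched on failure.
  }
  std::from_chars(ptr + 1, str.data() + str.size(), hi);  // Skip the '-' separator.
  std::string rng;
  while (lo < hi)
  {
    rng += std::to_string(lo++) + ",";
  }
  rng += std::to_string(lo);
  return rng;
}

int main()
{
  std::printf("%s\n", RangeExpand("2-6").c_str());  // Prints 2,3,4,5,6
  return 0;
}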
+ json config; + std::stack> parse_stack; + json::parser_callback_t check_duplicate_keys = + [&](int, json::parse_event_t event, json &parsed) + { + switch (event) + { + case json::parse_event_t::object_start: + parse_stack.push(std::set()); + break; + case json::parse_event_t::object_end: + parse_stack.pop(); + break; + case json::parse_event_t::key: + { + const auto result = parse_stack.top().insert(parsed); + if (!result.second) + { + MFEM_ABORT("Error parsing configuration file!\nDuplicate key " + << parsed << " was already seen in this object!"); + return false; + } + } + break; + default: + break; + } + return true; + }; + try + { + config = json::parse(buffer, check_duplicate_keys); + } + catch (json::parse_error &e) + { + MFEM_ABORT("Error parsing configuration file!\n " << e.what()); + } + if (print) + { + Mpi::Print("\n{}\n", config.dump(2)); + } + + // Set up configuration option data structures. + problem.SetUp(config); + model.SetUp(config); + domains.SetUp(config); + boundaries.SetUp(config); + solver.SetUp(config); + + // Cleanup and error checking. + config.erase("Problem"); + config.erase("Model"); + config.erase("Domains"); + config.erase("Boundaries"); + config.erase("Solver"); + MFEM_VERIFY(config.empty(), "Found an unsupported configuration file section!\n" + << config.dump(2)); + + // Check compatibility of configuration file and problem type. + CheckConfiguration(); +} + +void IoData::CheckConfiguration() +{ + // Check that the provided domain and boundary objects are all supported by the requested + // problem type. + if (problem.type == ProblemType::DRIVEN) + { + // No unsupported domain or boundary objects for frequency domain driven simulations. + } + else if (problem.type == ProblemType::EIGENMODE) + { + // No unsupported domain or boundary objects for frequency domain driven simulations. 
+ } + else if (problem.type == ProblemType::ELECTROSTATIC) + { + if (!boundaries.farfield.empty()) + { + Mpi::Warning( + "Electrostatic problem type does not support absorbing boundary conditions!\n"); + } + if (!boundaries.conductivity.empty()) + { + Mpi::Warning("Electrostatic problem type does not support surface conductivity " + "boundary conditions!\n"); + } + if (!boundaries.impedance.empty()) + { + Mpi::Warning("Electrostatic problem type does not support surface impedance boundary " + "conditions!\n"); + } + if (!boundaries.auxpec.empty() || !boundaries.waveport.empty()) + { + Mpi::Warning( + "Electrostatic problem type does not support wave port boundary conditions!\n"); + } + if (!boundaries.current.empty()) + { + Mpi::Warning( + "Electrostatic problem type does not support surface current excitation!\n"); + } + } + else if (problem.type == ProblemType::MAGNETOSTATIC) + { + if (!boundaries.farfield.empty()) + { + Mpi::Warning( + "Magnetostatic problem type does not support absorbing boundary conditions!\n"); + } + if (!boundaries.conductivity.empty()) + { + Mpi::Warning("Magnetostatic problem type does not support surface conductivity " + "boundary conditions!\n"); + } + if (!boundaries.impedance.empty()) + { + Mpi::Warning("Magnetostatic problem type does not support surface impedance boundary " + "conditions!\n"); + } + if (!boundaries.lumpedport.empty()) + { + Mpi::Warning( + "Magnetostatic problem type does not support lumped port boundary conditions!\n"); + } + if (!boundaries.auxpec.empty() || !boundaries.waveport.empty()) + { + Mpi::Warning( + "Magnetostatic problem type does not support wave port boundary conditions!\n"); + } + if (!boundaries.postpro.dielectric.empty()) + { + Mpi::Warning("Magnetostatic problem type does not support boundary interface " + "dielectric loss postprocessing!\n"); + } + } + else if (problem.type == ProblemType::TRANSIENT) + { + if (!boundaries.conductivity.empty()) + { + Mpi::Warning("Transient problem type does not support surface conductivity boundary " + "conditions!\n"); + } + if (!boundaries.auxpec.empty() || !boundaries.waveport.empty()) + { + Mpi::Warning( + "Transient problem type does not support wave port boundary conditions!\n"); + } + if (!boundaries.farfield.empty() && boundaries.farfield.order > 1) + { + Mpi::Warning("Transient problem type does not support absorbing boundary conditions " + "with order > 1!\n"); + } + } + + // Resolve default values in configuration file. + if (solver.linear.type == LinearSolver::DEFAULT) + { + if (problem.type == ProblemType::ELECTROSTATIC) + { + solver.linear.type = LinearSolver::BOOMER_AMG; + } + else if (problem.type == ProblemType::MAGNETOSTATIC || + problem.type == ProblemType::TRANSIENT) + { + solver.linear.type = LinearSolver::AMS; + } + else + { + // Prefer sparse direct solver for frequency domain problems if available. +#if defined(MFEM_USE_SUPERLU) + solver.linear.type = LinearSolver::SUPERLU; +#elif defined(MFEM_USE_STRUMPACK) + solver.linear.type = LinearSolver::STRUMPACK; +#elif defined(MFEM_USE_MUMPS) + solver.linear.type = LinearSolver::MUMPS; +#else + solver.linear.type = LinearSolver::AMS; +#endif + } + } + if (solver.linear.krylov_solver == KrylovSolver::DEFAULT) + { + // Problems with SPD operators use CG by default, else GMRES. 
+ if (problem.type == ProblemType::ELECTROSTATIC || + problem.type == ProblemType::MAGNETOSTATIC || + problem.type == ProblemType::TRANSIENT) + { + solver.linear.krylov_solver = KrylovSolver::CG; + } + else + { + solver.linear.krylov_solver = KrylovSolver::GMRES; + } + } + if (solver.linear.max_size < 0) + { + solver.linear.max_size = solver.linear.max_it; + } + if (solver.linear.initial_guess < 0) + { + if ((problem.type == ProblemType::DRIVEN && solver.driven.adaptive_tol <= 0.0) || + problem.type == ProblemType::TRANSIENT || + problem.type == ProblemType::ELECTROSTATIC || + problem.type == ProblemType::MAGNETOSTATIC) + { + // Default true only driven simulations without adaptive frequency sweep, transient + // simulations, electrostatics, or magnetostatics. + solver.linear.initial_guess = 1; + } + else + { + solver.linear.initial_guess = 0; + } + } + if (solver.linear.pc_mat_shifted < 0) + { + if (problem.type == ProblemType::DRIVEN && solver.linear.type == LinearSolver::AMS) + { + // Default true only driven simulations using AMS (false for most cases). + solver.linear.pc_mat_shifted = 1; + } + else + { + solver.linear.pc_mat_shifted = 0; + } + } + if (solver.linear.mg_smooth_aux < 0) + { + if (problem.type == ProblemType::ELECTROSTATIC || + problem.type == ProblemType::MAGNETOSTATIC) + { + // Disable auxiliary space smoothing using distributive relaxation by default for + // problems which don't need it. + solver.linear.mg_smooth_aux = 0; + } + else + { + solver.linear.mg_smooth_aux = 1; + } + } + if (solver.linear.mg_smooth_order < 0) + { + solver.linear.mg_smooth_order = std::max(2 * solver.order, 4); + } + if (solver.linear.ams_singular_op < 0) + { + solver.linear.ams_singular_op = (problem.type == ProblemType::MAGNETOSTATIC); + } + if (solver.linear.amg_agg_coarsen < 0) + { + solver.linear.amg_agg_coarsen = (problem.type == ProblemType::ELECTROSTATIC || + problem.type == ProblemType::MAGNETOSTATIC || + problem.type == ProblemType::TRANSIENT); + } + if (solver.linear.reorder_reuse && solver.linear.drop_small_entries && + solver.linear.complex_coarse_solve && (problem.type == ProblemType::EIGENMODE) && + (!boundaries.waveport.empty() || !boundaries.conductivity.empty() || + (!boundaries.farfield.empty() && boundaries.farfield.order > 1))) + { + // Do not reuse the sparsity pattern for nonlinear eigenmode simulations with complex + // coarse preconditioners when dropping small entries. In those cases, the sparsity + // pattern of the first preconditioner (purely real coefficients) will be different from + // subsequent preconditioners with complex coefficients. + solver.linear.reorder_reuse = false; + } + // Configure settings for quadrature rules and partial assembly. + BilinearForm::pa_order_threshold = solver.pa_order_threshold; + fem::DefaultIntegrationOrder::p_trial = solver.order; + fem::DefaultIntegrationOrder::q_order_jac = solver.q_order_jac; + fem::DefaultIntegrationOrder::q_order_extra_pk = solver.q_order_extra; + fem::DefaultIntegrationOrder::q_order_extra_qk = solver.q_order_extra; +} + +namespace +{ + +template +constexpr config::SymmetricMatrixData &operator/=(config::SymmetricMatrixData &data, + double s) +{ + for (auto &x : data.s) + { + x /= s; + } + return data; +} + +} // namespace + +void IoData::NondimensionalizeInputs(mfem::ParMesh &mesh) +{ + // Nondimensionalization of the equations is based on a given length Lc in [m], typically + // the largest domain dimension. 
Configuration file lengths and the mesh coordinates are + // provided with units of model.L0 x [m]. + MFEM_VERIFY(!init, "NondimensionalizeInputs should only be called once!"); + init = true; + + // Calculate the reference length and time. A user specified model.Lc is in mesh length + // units. + if (model.Lc <= 0.0) + { + mfem::Vector bbmin, bbmax; + mesh::GetAxisAlignedBoundingBox(mesh, bbmin, bbmax); + bbmax -= bbmin; + model.Lc = *std::max_element(bbmax.begin(), bbmax.end()); + } + // Define units now mesh length set. Note: In model field Lc is measured in units of L0. + units = Units(model.L0, model.Lc * model.L0); + + // Mesh refinement parameters. + auto DivideLengthScale = [Lc0 = units.GetMeshLengthRelativeScale()](double val) + { return val / Lc0; }; + for (auto &box : model.refinement.GetBoxes()) + { + std::transform(box.bbmin.begin(), box.bbmin.end(), box.bbmin.begin(), + DivideLengthScale); + std::transform(box.bbmax.begin(), box.bbmax.end(), box.bbmax.begin(), + DivideLengthScale); + } + for (auto &sphere : model.refinement.GetSpheres()) + { + sphere.r /= units.GetMeshLengthRelativeScale(); + std::transform(sphere.center.begin(), sphere.center.end(), sphere.center.begin(), + DivideLengthScale); + } + + // Materials: conductivity and London penetration depth. + for (auto &data : domains.materials) + { + data.sigma /= units.GetScaleFactor(); + data.lambda_L /= units.GetMeshLengthRelativeScale(); + } + + // Probe location coordinates. + for (auto &[idx, data] : domains.postpro.probe) + { + std::transform(data.center.begin(), data.center.end(), data.center.begin(), + DivideLengthScale); + } + + // Finite conductivity boundaries. + for (auto &data : boundaries.conductivity) + { + data.sigma /= units.GetScaleFactor(); + data.h /= units.GetMeshLengthRelativeScale(); + } + + // Impedance boundaries and lumped ports. + for (auto &data : boundaries.impedance) + { + data.Rs /= units.GetScaleFactor(); + data.Ls /= units.GetScaleFactor(); + data.Cs /= units.GetScaleFactor(); + } + for (auto &[idx, data] : boundaries.lumpedport) + { + data.R /= units.GetScaleFactor(); + data.L /= units.GetScaleFactor(); + data.C /= units.GetScaleFactor(); + data.Rs /= units.GetScaleFactor(); + data.Ls /= units.GetScaleFactor(); + data.Cs /= units.GetScaleFactor(); + } + + // Floquet periodic boundaries. + for (auto &k : boundaries.periodic.wave_vector) + { + k *= units.GetMeshLengthRelativeScale(); + } + + // Wave port offset distance. + for (auto &[idx, data] : boundaries.waveport) + { + data.d_offset /= units.GetMeshLengthRelativeScale(); + } + + // Center coordinates for surface flux. + for (auto &[idx, data] : boundaries.postpro.flux) + { + std::transform(data.center.begin(), data.center.end(), data.center.begin(), + DivideLengthScale); + } + + // Dielectric interface thickness. + for (auto &[idx, data] : boundaries.postpro.dielectric) + { + data.t /= units.GetMeshLengthRelativeScale(); + } + + // Convert from GHz to non-dimensional angular frequency (adds the 2pi): + // 1/ns -> rad/ns -> non-dim units. 
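// Editorial sketch (not part of this patch): the conversion above takes a frequency in
// GHz (equivalently 1/ns), multiplies by 2*pi to get rad/ns, and then scales by the
// characteristic time tc [ns], i.e. omega_nd = 2*pi * f * tc. The numbers below are
// hypothetical.
#include <cmath>
#include <cstdio>

int main()
{
  const double tc = 3.3356e-4;                  // Characteristic time [ns]
  const double f = 5.0;                         // Drive frequency [GHz] = [1/ns]
  const double omega_nd = 2.0 * M_PI * f * tc;  // Non-dimensional angular frequency
  std::printf("omega_nd = %.4e\n", omega_nd);   // Expect ~1.05e-2.
  return 0;
}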
+ + // For eigenmode simulations: + solver.eigenmode.target = + 2 * M_PI * + units.Nondimensionalize(solver.eigenmode.target); + solver.eigenmode.target_upper = + 2 * M_PI * + units.Nondimensionalize(solver.eigenmode.target_upper); + + // For driven simulations: + for (auto &f : solver.driven.sample_f) + f = 2 * M_PI * units.Nondimensionalize(f); + + // For transient simulations: + solver.transient.pulse_f = + 2 * M_PI * + units.Nondimensionalize(solver.transient.pulse_f); + solver.transient.pulse_tau = + units.Nondimensionalize(solver.transient.pulse_tau); + solver.transient.max_t = + units.Nondimensionalize(solver.transient.max_t); + solver.transient.delta_t = + units.Nondimensionalize(solver.transient.delta_t); + + // Scale mesh vertices for correct nondimensionalization. + mesh::NondimensionalizeMesh(mesh, units.GetMeshLengthRelativeScale()); + + // Print some information. + Mpi::Print(mesh.GetComm(), + "\nCharacteristic length and time scales:\n Lc = {:.3e} m, tc = {:.3e} ns\n", + units.GetScaleFactor(), + units.GetScaleFactor()); +} + +} // namespace palace diff --git a/palace/utils/iodata.hpp b/palace/utils/iodata.hpp index b57d3082fd..5174362b3e 100644 --- a/palace/utils/iodata.hpp +++ b/palace/utils/iodata.hpp @@ -1,83 +1,56 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_UTILS_IODATA_HPP -#define PALACE_UTILS_IODATA_HPP - -#include -#include "utils/configfile.hpp" - -namespace mfem -{ - -class ParMesh; - -} // namespace mfem - -namespace palace -{ - -// -// A parser class for processing the configuration file which controls runtime options. -// -class IoData -{ -public: - // Configuration file objects. - config::ProblemData problem; - config::ModelData model; - config::DomainData domains; - config::BoundaryData boundaries; - config::SolverData solver; - -private: - // Characteristic reference length [m] and time [ns] for nondimensionalization. - double Lc, tc; - bool init; - - // Check configuration file options and compatibility with requested problem type. - void CheckConfiguration(); - -public: - // Parse command line arguments and override options defaults. - IoData(const char *filename, bool print); - - // Nondimensionalize input values for use in the solver, including the mesh coordinates. - void NondimensionalizeInputs(mfem::ParMesh &mesh); - - // Return the mesh scaling factor in units model.L0 x [m] for mesh IO. - double GetLengthScale() const { return Lc / model.L0; } - - // Redimensionalize values for output. Outputs which depend on the fields assume a - // characteristic reference magnetic field strength Hc such that Pc = 1 W, where Pc is the - // characteristic reference power. - enum class ValueType - { - TIME, // [ns] - FREQUENCY, // [GHz] - LENGTH, // [m] - IMPEDANCE, // [Ω] - INDUCTANCE, // [H] = [Ωs] - CAPACITANCE, // [F] = [s/Ω] - CONDUCTIVITY, // [S/m] - VOLTAGE, // [V] - CURRENT, // [A] - POWER, // [W] - ENERGY, // [J] - FIELD_E, // [V/m] - FIELD_D, // [C/m²] = [A⋅s/m²] - FIELD_H, // [A/m] - FIELD_B // [Wb/m²] = [V⋅s/m²] - }; - template - T DimensionalizeValue(ValueType type, T v) const; - template - std::complex DimensionalizeValue(ValueType type, std::complex v) const - { - return {DimensionalizeValue(type, v.real()), DimensionalizeValue(type, v.imag())}; - } -}; - -} // namespace palace - -#endif // PALACE_UTILS_IODATA_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_UTILS_IODATA_HPP +#define PALACE_UTILS_IODATA_HPP + +#include "utils/configfile.hpp" +#include "utils/units.hpp" + +namespace mfem +{ + +class ParMesh; + +} // namespace mfem + +namespace palace +{ + +std::stringstream PreprocessFile(const char *filename); + +// +// A parser class for processing the configuration file which controls runtime options. +// +class IoData +{ +public: + // Configuration file objects. + config::ProblemData problem; + config::ModelData model; + config::DomainData domains; + config::BoundaryData boundaries; + config::SolverData solver; + + // Class that holds mesh scale and converts between SI quantities and normalized values. + Units units; + +private: + bool init; + + // Check configuration file options and compatibility with requested problem type. + void CheckConfiguration(); + +public: + IoData(const Units &units) : units(units), init(false) {} + + // Parse command line arguments and override options defaults. + IoData(const char *filename, bool print); + + // Nondimensionalize input values for use in the solver, including the mesh coordinates. + void NondimensionalizeInputs(mfem::ParMesh &mesh); +}; + +} // namespace palace + +#endif // PALACE_UTILS_IODATA_HPP diff --git a/palace/utils/labels.hpp b/palace/utils/labels.hpp new file mode 100644 index 0000000000..86fa8a2dd8 --- /dev/null +++ b/palace/utils/labels.hpp @@ -0,0 +1,173 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_UTILS_LABELS_HPP +#define PALACE_UTILS_LABELS_HPP + +namespace palace +{ + +// Usable coordinate systems. +enum class CoordinateSystem : char +{ + CARTESIAN, + CYLINDRICAL +}; + +// The types of problem Palace is able to solve. +enum class ProblemType : char +{ + DRIVEN, + EIGENMODE, + ELECTROSTATIC, + MAGNETOSTATIC, + TRANSIENT +}; + +// Eigenvalue solver type. +enum class EigenSolverBackend : char +{ + DEFAULT, + SLEPC, + ARPACK +}; + +// Nonlinear eigenvalue solver type. +enum class NonlinearEigenSolver : char +{ + HYBRID, + SLP +}; + +// Surface fluxes. +enum class SurfaceFlux : char +{ + ELECTRIC, + MAGNETIC, + POWER +}; + +// Interface dielectrics for computing electric field energy participation ratios. +enum class InterfaceDielectric : char +{ + DEFAULT, + MA, + MS, + SA +}; + +// Frequency sampling schemes. +enum class FrequencySampling : char +{ + LINEAR, + LOG, + POINT, + DEFAULT = LINEAR +}; + +// Time integration scheme type. +enum class TimeSteppingScheme : char +{ + GEN_ALPHA, + RUNGE_KUTTA, + ARKODE, + CVODE, + DEFAULT = GEN_ALPHA +}; + +// Excitation type for port excitation. +enum class Excitation : char +{ + SINUSOIDAL, + GAUSSIAN, + DIFF_GAUSSIAN, + MOD_GAUSSIAN, + RAMP_STEP, + SMOOTH_STEP +}; + +// Possible linear solvers +enum class LinearSolver : char +{ + DEFAULT, + AMS, + BOOMER_AMG, + MUMPS, + SUPERLU, + STRUMPACK, + STRUMPACK_MP, + JACOBI +}; + +// Krylov solvers to use in the linear solver. +enum class KrylovSolver : char +{ + DEFAULT, + CG, + MINRES, + GMRES, + FGMRES, + BICGSTAB +}; + +// Method of coarsening for p-multigrid. +enum class MultigridCoarsening : char +{ + LINEAR, + LOGARITHMIC +}; + +// Preconditioning side. +enum class PreconditionerSide : char +{ + DEFAULT, + RIGHT, + LEFT +}; + +// Column ordering method in the symbolic factorization for sparse direct solvers. 
+enum class SymbolicFactorization : char +{ + DEFAULT, + METIS, + PARMETIS, + SCOTCH, + PTSCOTCH, + PORD, + AMD, + RCM +}; + +// Low-rank and butterfly compression scheme for sparse direct solvers which support it +// (mainly STRUMPACK). +enum class SparseCompression : char +{ + NONE, + BLR, + HSS, + HODLR, + ZFP, + BLR_HODLR, + ZFP_BLR_HODLR +}; + +// Variations of Gram-Schmidt orthogonalization for GMRES/FGMRES iterative solvers and SLEPc +// eigenvalue solver. +enum class Orthogonalization : char +{ + MGS, + CGS, + CGS2 +}; + +// Device used to configure MFEM. +enum class Device : char +{ + CPU, + GPU, + DEBUG +}; + +} // namespace palace + +#endif // PALACE_UTILS_LABELS_HPP diff --git a/palace/utils/meshio.cpp b/palace/utils/meshio.cpp index e275f3df66..d54f274219 100644 --- a/palace/utils/meshio.cpp +++ b/palace/utils/meshio.cpp @@ -1,1093 +1,1127 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#include "meshio.hpp" - -#include -#include -#include - -#define GMSH_BIN // Use binary Gmsh format - -namespace palace -{ - -namespace -{ - -inline int ElemTypeComsol(const std::string &type) -{ - if (!type.compare("tri")) // 3-node triangle - { - return 2; - } - if (!type.compare("quad")) // 4-node quadrangle - { - return 3; - } - if (!type.compare("tet")) // 4-node tetrahedron - { - return 4; - } - if (!type.compare("hex")) // 8-node hexahedron - { - return 5; - } - if (!type.compare("prism")) // 6-node prism - { - return 6; - } - if (!type.compare("pyr")) // 5-node pyramid - { - return 7; - } - if (!type.compare("tri2")) // 6-node triangle - { - return 9; - } - if (!type.compare("quad2")) // 9-node quadrangle - { - return 10; - } - if (!type.compare("tet2")) // 10-node tetrahedron - { - return 11; - } - if (!type.compare("hex2")) // 27-node hexahedron - { - return 12; - } - if (!type.compare("prism2")) // 18-node prism - { - return 13; - } - if (!type.compare("pyr2")) // 14-node pyramid - { - return 14; - } - return 0; // Skip this element type -} - -inline int ElemTypeNastran(const std::string &type) -{ - // Returns only the low-order type for a given keyword. - if (!type.compare(0, 5, "CTRIA")) - { - return 2; - } - if (!type.compare(0, 5, "CQUAD")) - { - return 3; - } - if (!type.compare(0, 6, "CTETRA")) - { - return 4; - } - if (!type.compare(0, 5, "CHEXA")) - { - return 5; - } - if (!type.compare(0, 6, "CPENTA")) - { - return 6; - } - if (!type.compare(0, 6, "CPYRAM")) - { - return 7; - } - return 0; // Skip this element type -} - -inline int HOElemTypeNastran(const int lo_type, const int num_nodes) -{ - // Get high-order element type for corresponding low-order type. 
- if (lo_type == 2 && num_nodes > 3) - { - MFEM_VERIFY(num_nodes == 6, "Invalid high-order Nastran element!"); - return 9; - } - if (lo_type == 3) - { - if (num_nodes == 9) - { - return 10; - } - if (num_nodes == 8) - { - return 16; - } - MFEM_VERIFY(num_nodes == 4, "Invalid high-order Nastran element!"); - return 3; - } - if (lo_type == 4 && num_nodes > 4) - { - MFEM_VERIFY(num_nodes == 10, "Invalid high-order Nastran element!"); - return 11; - } - if (lo_type == 5 && num_nodes > 8) - { - MFEM_VERIFY(num_nodes == 20, "Invalid high-order Nastran element!"); - return 17; - } - if (lo_type == 6 && num_nodes > 6) - { - MFEM_VERIFY(num_nodes == 15, "Invalid high-order Nastran element!"); - return 18; - } - if (lo_type == 7 && num_nodes > 5) - { - MFEM_VERIFY(num_nodes == 13, "Invalid high-order Nastran element!"); - return 19; - } - return lo_type; -} - -constexpr int ElemNumNodes[] = {-1, // 2-node edge - 3, 4, 4, 8, 6, 5, - -1, // 3-node edge - 6, 9, 10, 27, 18, 14, - -1, // 1-node node - 8, 20, 15, 13}; - -// From COMSOL or Nastran to Gmsh ordering. See: -// - https://gmsh.info/doc/texinfo/gmsh.html#Node-ordering -// - https://tinyurl.com/yezswzfv -// - https://tinyurl.com/4d32zxtn -constexpr int SkipElem[] = {-1}; -constexpr int Msh3[] = {0, 1, 2}; -constexpr int Msh4[] = {0, 1, 2, 3}; -constexpr int Msh5[] = {0, 1, 2, 3, 4}; -constexpr int Msh6[] = {0, 1, 2, 3, 4, 5}; -constexpr int Msh8[] = {0, 1, 2, 3, 4, 5, 6, 7}; -constexpr int Msh9[] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; - -constexpr int MphQuad4[] = {0, 1, 3, 2}; -constexpr int MphHex8[] = {0, 1, 3, 2, 4, 5, 7, 6}; -constexpr int MphPyr5[] = {0, 1, 3, 2, 4}; -constexpr int MphTri6[] = {0, 1, 2, 3, 5, 4}; -constexpr int MphQuad9[] = {0, 1, 3, 2, 4, 7, 8, 5, 6}; -constexpr int MphTet10[] = {0, 1, 2, 3, 4, 6, 5, 7, 9, 8}; -constexpr int MphHex27[] = {0, 1, 3, 2, 4, 5, 7, 6, 8, 9, 20, 11, 13, 10, - 21, 12, 22, 26, 23, 15, 24, 14, 16, 17, 25, 18, 19}; -constexpr int MphWdg18[] = {0, 1, 2, 3, 4, 5, 6, 7, 9, 8, 15, 10, 16, 17, 11, 12, 13, 14}; -constexpr int MphPyr14[] = {0, 1, 3, 2, 4, 5, 6, 13, 8, 10, 7, 9, 12, 11}; - -constexpr int NasTet10[] = {0, 1, 2, 3, 4, 5, 6, 7, 9, 8}; -constexpr int NasHex20[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 11, - 13, 9, 10, 12, 14, 15, 16, 18, 19, 17}; -constexpr int NasWdg15[] = {0, 1, 2, 3, 4, 5, 6, 9, 7, 8, 10, 11, 12, 14, 13}; -constexpr int NasPyr13[] = {0, 1, 2, 3, 4, 5, 8, 10, 6, 7, 9, 11, 12}; - -constexpr const int *ElemNodesComsol[] = {SkipElem, Msh3, MphQuad4, Msh4, MphHex8, - Msh6, MphPyr5, SkipElem, MphTri6, MphQuad9, - MphTet10, MphHex27, MphWdg18, MphPyr14, SkipElem, - SkipElem, SkipElem, SkipElem, SkipElem}; -constexpr const int *ElemNodesNastran[] = {SkipElem, Msh3, Msh4, Msh4, Msh8, - Msh6, Msh5, SkipElem, Msh6, Msh9, - NasTet10, SkipElem, SkipElem, SkipElem, SkipElem, - Msh8, NasHex20, NasWdg15, NasPyr13}; - -// Get line, strip comments, leading/trailing whitespace. Should not be called if end of -// file is expected. 
-inline std::string GetLineComsol(std::ifstream &input) -{ - std::string str; - std::getline(input, str); - MFEM_VERIFY(input, "Unexpected read failure parsing mesh file!"); - const auto pos = str.find_first_of('#'); - if (pos != std::string::npos) - { - str.erase(pos); - } - const auto start = str.find_first_not_of(" \t"); - if (start == std::string::npos) - { - return ""; - } - const auto stop = str.find_last_not_of(" \t"); - return str.substr(start, stop - start + 1); -} - -inline std::string GetLineNastran(std::ifstream &input) -{ - std::string str; - std::getline(input, str); - MFEM_VERIFY(input, "Unexpected read failure parsing mesh file!"); - return str[0] == '$' ? "" : str; -} - -// COMSOL strings are parsed as an integer length followed by array of integers for the -// string characters. -inline std::string ReadStringComsol(std::istream &input) -{ - int n; - std::string str; - input >> n >> str; - return str; -} - -inline std::string ReadStringComsolBinary(std::istream &input) -{ - int n; - input.read(reinterpret_cast(&n), sizeof(int)); - std::vector vstr(n); - input.read(reinterpret_cast(vstr.data()), (std::streamsize)(n * sizeof(int))); - return std::string(vstr.begin(), vstr.end()); -} - -// Nastran has a special floating point format: "-7.-1" instead of "-7.E-01" or "2.3+2" -// instead of "2.3E+02". -inline double ConvertDoubleNastran(const std::string &str) -{ - double d; - try - { - d = std::stod(str); - } - catch (const std::invalid_argument &ia) - { - const std::size_t start = str.find_first_not_of(' '); - MFEM_VERIFY(start != std::string::npos, - "Invalid number conversion parsing Nastran mesh!") - std::string fstr = str.substr(start); - std::size_t pos = fstr.find('+', 1); // Skip leading +/- sign - if (pos != std::string::npos) - { - fstr.replace(pos, 1, "E+"); - } - else if ((pos = fstr.find('-', 1)) != std::string::npos) - { - fstr.replace(pos, 1, "E-"); - } - d = std::stod(fstr); - } - return d; -} - -inline void WriteNode(std::ostream &buffer, const int tag, const double *coord) -{ -#if defined(GMSH_BIN) - buffer.write(reinterpret_cast(&tag), sizeof(int)); - buffer.write(reinterpret_cast(coord), 3 * sizeof(double)); - // No newline for binary data -#else - // Always 3D coordinates (user sets floating point format/precision on buffer). - buffer << tag << ' ' << coord[0] << ' ' << coord[1] << ' ' << coord[2] << '\n'; -#endif -} - -inline void WriteElement(std::ostream &buffer, const int tag, const int type, - const int geom, const int nodes[]) -{ -#if defined(GMSH_BIN) - const int data[3] = {tag, geom, geom}; - buffer.write(reinterpret_cast(data), 3 * sizeof(int)); - buffer.write(reinterpret_cast(nodes), - (std::streamsize)(ElemNumNodes[type - 1] * sizeof(int))); - // No newline for binary data -#else - buffer << tag << ' ' << type << " 2 " << geom << ' ' << geom; - for (int i = 0; i < ElemNumNodes[type - 1]; i++) - { - buffer << ' ' << nodes[i]; - } - buffer << '\n'; -#endif -} - -void WriteGmsh(std::ostream &buffer, const std::vector &node_coords, - const std::vector &node_tags, - const std::unordered_map> &elem_nodes) -{ - // Write the Gmsh file header (version 2.2). - buffer << "$MeshFormat\n2.2 " - << -#if defined(GMSH_BIN) - "1 " << -#else - "0 " << -#endif - sizeof(double) << '\n'; -#if defined(GMSH_BIN) - const int one = 1; - buffer.write(reinterpret_cast(&one), sizeof(int)); - buffer << '\n'; -#endif - buffer << "$EndMeshFormat\n"; - - // Write mesh nodes. 
- const int num_nodes = (int)node_coords.size() / 3; - MFEM_VERIFY(num_nodes > 0 && node_coords.size() % 3 == 0, - "Gmsh nodes should always be in 3D space!"); - buffer << "$Nodes\n" << num_nodes << '\n'; - { - if (!node_tags.empty()) - { - // Use input node tags which should be positive but don't need to be contiguous. - MFEM_VERIFY(node_tags.size() == (std::size_t)num_nodes, - "Invalid size for node tags!"); - for (int i = 0; i < num_nodes; i++) - { - WriteNode(buffer, node_tags[i], &node_coords[3 * i]); - } - } - else - { - // Label nodes as contiguous starting at 1. - for (int i = 0; i < num_nodes; i++) - { - WriteNode(buffer, i + 1, &node_coords[3 * i]); - } - } - } -#if defined(GMSH_BIN) - buffer << '\n'; -#endif - buffer << "$EndNodes\n"; - - // Write mesh elements. - int tot_num_elem = 0; - for (const auto &[elem_type, nodes] : elem_nodes) - { - MFEM_VERIFY(elem_type > 0, "Invalid element type writing Gmsh elements!"); - const int &num_elem_nodes = ElemNumNodes[elem_type - 1]; - tot_num_elem += ((int)nodes.size()) / (num_elem_nodes + 1); - MFEM_VERIFY(nodes.size() % (num_elem_nodes + 1) == 0, - "Unexpected data size when writing elements!"); - } - MFEM_VERIFY(tot_num_elem > 0, "No mesh elements parsed from COMSOL mesh file!"); - buffer << "$Elements\n" << tot_num_elem << '\n'; - { - int tag = 1; // Global element tag - for (const auto &[elem_type, nodes] : elem_nodes) - { - const int &num_elem_nodes = ElemNumNodes[elem_type - 1]; - const int num_elem = (int)nodes.size() / (num_elem_nodes + 1); -#if defined(GMSH_BIN) - // For binary output, write the element header for each type. Always have 2 tags - // (physical + geometry) - const int header[3] = {elem_type, num_elem, 2}; - buffer.write(reinterpret_cast(header), 3 * sizeof(int)); -#endif - for (int i = 0; i < num_elem; i++) - { - WriteElement(buffer, tag++, elem_type, - nodes[i * (num_elem_nodes + 1)], // Geometry tag - &nodes[i * (num_elem_nodes + 1) + 1]); // Element nodes - } - } - } -#if defined(GMSH_BIN) - buffer << '\n'; -#endif - buffer << "$EndElements\n"; -} - -} // namespace - -namespace mesh -{ - -void ConvertMeshComsol(const std::string &filename, std::ostream &buffer) -{ - // Read a COMSOL format mesh. - const int comsol_bin = !filename.compare(filename.length() - 7, 7, ".mphbin") || - !filename.compare(filename.length() - 7, 7, ".MPHBIN"); - MFEM_VERIFY(!filename.compare(filename.length() - 7, 7, ".mphtxt") || - !filename.compare(filename.length() - 7, 7, ".MPHTXT") || comsol_bin, - "Invalid file extension for COMSOL mesh format conversion!"); - std::ifstream input(filename); - if (!input.is_open()) - { - MFEM_ABORT("Unable to open mesh file \"" << filename << "\"!"); - } - - // Parse COMSOL header. COMSOL encodes strings as integer-string pairs where the integer - // is the string length. It also allows for blank lines and other whitespace wherever in - // the file. 
- { - int version[2] = {-1, -1}; - int num_tags = -1; - int num_types = -1; - if (!comsol_bin) - { - while (num_types < 0) - { - auto line = GetLineComsol(input); - if (!line.empty()) - { - std::istringstream sline(line); - if (version[0] < 0) - { - sline >> version[0] >> version[1]; - } - else if (num_tags < 0) - { - sline >> num_tags; - int i = 0; - while (i < num_tags) - { - if (!GetLineComsol(input).empty()) - { - i++; - } - } - } - else if (num_types < 0) - { - sline >> num_types; - int i = 0; - while (i < num_types) - { - if (!GetLineComsol(input).empty()) - { - i++; - } - } - } - } - } - } - else - { - input.read(reinterpret_cast(version), 2 * sizeof(int)); - input.read(reinterpret_cast(&num_tags), sizeof(int)); - { - int i = 0; - while (i < num_tags) - { - ReadStringComsolBinary(input); - i++; - } - } - input.read(reinterpret_cast(&num_types), sizeof(int)); - { - int i = 0; - while (i < num_types) - { - ReadStringComsolBinary(input); - i++; - } - } - } - MFEM_VERIFY(version[0] == 0 && version[1] == 1, "Invalid COMSOL file version!"); - } - - // Parse mesh objects until we get to the mesh. Currently only supports a single mesh - // object in the file, and selections are ignored. - while (true) - { - int object[3] = {-1, -1, -1}; - std::string object_class; - if (!comsol_bin) - { - while (object_class.empty()) - { - auto line = GetLineComsol(input); - if (!line.empty()) - { - std::istringstream sline(line); - if (object[0] < 0) - { - sline >> object[0] >> object[1] >> object[2]; - } - else if (object_class.empty()) - { - object_class = ReadStringComsol(sline); - } - } - } - } - else - { - input.read(reinterpret_cast(object), 3 * sizeof(int)); - object_class = ReadStringComsolBinary(input); - } - MFEM_VERIFY(object[0] == 0 && object[1] == 0 && object[2] == 1, - "Invalid COMSOL object version!"); - - // If yes, then ready to parse the mesh. - if (!object_class.compare("Mesh")) - { - break; - } - - // Otherwise, parse over the selection to the next object. - MFEM_VERIFY(!object_class.compare("Selection"), - "COMSOL mesh file only supports Mesh and Selection objects!"); - int version = -1; - std::string label_str; - std::string tag_str; - int sdim = -1; - int num_ent = -1; - if (!comsol_bin) - { - while (num_ent < 0) - { - auto line = GetLineComsol(input); - if (!line.empty()) - { - std::istringstream sline(line); - if (version < 0) - { - sline >> version; - } - else if (label_str.empty()) - { - label_str = ReadStringComsol(sline); - } - else if (tag_str.empty()) - { - tag_str = ReadStringComsol(sline); - } - else if (sdim < 0) - { - sline >> sdim; - } - else if (num_ent < 0) - { - sline >> num_ent; - } - } - } - } - else - { - input.read(reinterpret_cast(&version), sizeof(int)); - label_str = ReadStringComsolBinary(input); - tag_str = ReadStringComsolBinary(input); - input.read(reinterpret_cast(&sdim), sizeof(int)); - input.read(reinterpret_cast(&num_ent), sizeof(int)); - } - - // Parse over the entities in the selection. - int i = 0; - if (!comsol_bin) - { - while (i < num_ent) - { - if (!GetLineComsol(input).empty()) - { - i++; - } - } - } - else - { - while (i < num_ent) - { - int dummy; - input.read(reinterpret_cast(&dummy), sizeof(int)); - i++; - } - } - } // Repeat until Mesh is found - - // Parse the mesh object header. 
- int sdim = -1; - int num_nodes = -1; - int nodes_start = -1; - { - int version = -1; - if (!comsol_bin) - { - while (nodes_start < 0) - { - auto line = GetLineComsol(input); - if (!line.empty()) - { - std::istringstream sline(line); - if (version < 0) - { - sline >> version; - } - else if (sdim < 0) - { - sline >> sdim; - } - else if (num_nodes < 0) - { - sline >> num_nodes; - } - else if (nodes_start < 0) - { - sline >> nodes_start; - } - } - } - } - else - { - input.read(reinterpret_cast(&version), sizeof(int)); - input.read(reinterpret_cast(&sdim), sizeof(int)); - input.read(reinterpret_cast(&num_nodes), sizeof(int)); - input.read(reinterpret_cast(&nodes_start), sizeof(int)); - } - MFEM_VERIFY(version == 4, "Only COMSOL files with Mesh version 4 are supported!"); - MFEM_VERIFY(sdim == 2 || sdim == 3, - "COMSOL mesh nodes are required to be in 2D or 3D space!"); - MFEM_VERIFY(num_nodes > 0, "COMSOL mesh file contains no nodes!"); - MFEM_VERIFY(nodes_start >= 0, "COMSOL mesh nodes have a negative starting tag!"); - } - - // Parse mesh nodes. - std::vector node_coords; - { - // Gmsh nodes are always 3D, so initialize to 0.0 in case z-coordinate isn't set. - node_coords.resize(3 * num_nodes, 0.0); - int i = 0; - if (!comsol_bin) - { - while (i < num_nodes) - { - auto line = GetLineComsol(input); - if (!line.empty()) - { - std::istringstream sline(line); - for (int j = 0; j < sdim; j++) - { - sline >> node_coords[3 * i + j]; - } - i++; - } - } - } - else - { - // Don't read as a single block in case sdim < 3. - while (i < num_nodes) - { - input.read(reinterpret_cast(node_coords.data() + 3 * i), - (std::streamsize)(sdim * sizeof(double))); - i++; - } - } - } - - // Parse mesh elements. Store for each element of each type: [geometry tag, [node tags]]. 
- std::unordered_map> elem_nodes; - { - int num_elem_types = -1; - if (!comsol_bin) - { - while (num_elem_types < 0) - { - auto line = GetLineComsol(input); - if (!line.empty()) - { - std::istringstream sline(line); - if (num_elem_types < 0) - { - sline >> num_elem_types; - } - } - } - } - else - { - input.read(reinterpret_cast(&num_elem_types), sizeof(int)); - } - MFEM_VERIFY(num_elem_types > 0, "COMSOL mesh file contains no elements!"); - - int parsed_types = 0; // COMSOL groups elements by type in file - int elem_type = -1; - int num_elem_nodes = -1; - int num_elem = -1; - int num_elem_geom = -1; - bool skip_type = false; - while (parsed_types < num_elem_types) - { - if (!comsol_bin) - { - auto line = GetLineComsol(input); - if (!line.empty()) - { - std::istringstream sline(line); - if (elem_type < 0) - { - auto elem_str = ReadStringComsol(sline); - MFEM_VERIFY(!elem_str.empty(), - "Unexpected empty element type found in COMSOL mesh file!"); - elem_type = ElemTypeComsol(elem_str); - skip_type = (elem_type == 0); - MFEM_VERIFY(skip_type || elem_nodes.find(elem_type) == elem_nodes.end(), - "Duplicate element types found in COMSOL mesh file!"); - } - else if (num_elem_nodes < 0) - { - sline >> num_elem_nodes; - MFEM_VERIFY(num_elem_nodes > 0, - "COMSOL element type " << elem_type << " has no nodes!"); - MFEM_VERIFY(skip_type || num_elem_nodes == ElemNumNodes[elem_type - 1], - "Mismatch between COMSOL and Gmsh element types!"); - } - else if (num_elem < 0) - { - sline >> num_elem; - MFEM_VERIFY(num_elem > 0, - "COMSOL mesh file has no elements of type " << elem_type << "!"); - std::vector *data = nullptr; - if (!skip_type) - { - data = &elem_nodes[elem_type]; - data->resize(num_elem * (num_elem_nodes + 1)); // Node tags + geometry tag - } - - // Parse all element nodes. - int i = 0; - while (i < num_elem) - { - line = GetLineComsol(input); - if (!line.empty()) - { - if (!skip_type) - { - std::istringstream isline(line); - for (int j = 0; j < num_elem_nodes; j++) - { - // Permute and reset to 1-based node tags. - const int &p = ElemNodesComsol[elem_type - 1][j]; - isline >> (*data)[i * (num_elem_nodes + 1) + 1 + p]; - (*data)[i * (num_elem_nodes + 1) + 1 + p] += (1 - nodes_start); - } - } - i++; - } - } - } - else if (num_elem_geom < 0) - { - sline >> num_elem_geom; - MFEM_VERIFY(num_elem_geom == num_elem, - "COMSOL mesh file should have geometry tags for all elements!"); - std::vector *data = nullptr; - if (!skip_type) - { - MFEM_VERIFY(elem_nodes.find(elem_type) != elem_nodes.end(), - "Can't find expected element type!"); - data = &elem_nodes[elem_type]; - MFEM_VERIFY(data->size() == (std::size_t)num_elem * (num_elem_nodes + 1), - "Unexpected element data size!"); - } - - // Parse all element geometry tags (stored at beginning of element nodes). For - // geometric entites in < 3D, the exported COMSOL tags are 0-based and need - // correcting to 1-based for Gmsh. - int i = 0; - const int geom_start = - (elem_type < 4 || (elem_type > 7 && elem_type < 11)) ? 1 : 0; - while (i < num_elem) - { - line = GetLineComsol(input); - if (!line.empty()) - { - if (!skip_type) - { - std::istringstream ssline(line); - ssline >> (*data)[i * (num_elem_nodes + 1)]; - (*data)[i * (num_elem_nodes + 1)] += geom_start; - } - i++; - } - } - - // Debug - // std::cout << "Finished parsing " << num_elem - // << " elements with type " << elem_type - // << " (parsed types " << parsed_types + 1 << ")\n"; - - // Finished with this element type, on to the next. 
- parsed_types++; - elem_type = num_elem_nodes = num_elem = num_elem_geom = -1; - skip_type = false; - } - } - } - else - { - auto elem_str = ReadStringComsolBinary(input); - MFEM_VERIFY(!elem_str.empty(), - "Unexpected empty element type found in COMSOL mesh file!"); - elem_type = ElemTypeComsol(elem_str); - skip_type = (elem_type == 0); - MFEM_VERIFY(skip_type || elem_nodes.find(elem_type) == elem_nodes.end(), - "Duplicate element types found in COMSOL mesh file!"); - input.read(reinterpret_cast(&num_elem_nodes), sizeof(int)); - MFEM_VERIFY(num_elem_nodes > 0, - "COMSOL element type " << elem_type << " has no nodes!"); - MFEM_VERIFY(skip_type || num_elem_nodes == ElemNumNodes[elem_type - 1], - "Mismatch between COMSOL and Gmsh element types!"); - - // Parse all element nodes. - input.read(reinterpret_cast(&num_elem), sizeof(int)); - MFEM_VERIFY(num_elem > 0, - "COMSOL mesh file has no elements of type " << elem_type << "!"); - std::vector *data = nullptr; - if (!skip_type) - { - data = &elem_nodes[elem_type]; - data->resize(num_elem * (num_elem_nodes + 1)); // Node tags + geometry tag - } - int i = 0; - std::vector nodes(num_elem_nodes); - while (i < num_elem) - { - input.read(reinterpret_cast(nodes.data()), - (std::streamsize)(num_elem_nodes * sizeof(int))); - if (!skip_type) - { - for (int j = 0; j < num_elem_nodes; j++) - { - // Permute and reset to 1-based node tags. - const int &p = ElemNodesComsol[elem_type - 1][j]; - (*data)[i * (num_elem_nodes + 1) + 1 + p] = nodes[j] + (1 - nodes_start); - } - } - i++; - } - - // Parse element geometry tags. - input.read(reinterpret_cast(&num_elem_geom), sizeof(int)); - MFEM_VERIFY(num_elem_geom == num_elem, - "COMSOL mesh file should have geometry tags for all elements!"); - - i = 0; - const int geom_start = (elem_type < 4 || (elem_type > 7 && elem_type < 11)) ? 1 : 0; - int geom_tag; - while (i < num_elem) - { - input.read(reinterpret_cast(&geom_tag), sizeof(int)); - if (!skip_type) - { - (*data)[i * (num_elem_nodes + 1)] = geom_tag + geom_start; - } - i++; - } - - // Debug - // std::cout << "Finished parsing " << num_elem - // << " elements with type " << elem_type - // << " (parsed types " << parsed_types + 1 << ")\n"; - - // Finished with this element type, on to the next. - parsed_types++; - elem_type = num_elem_nodes = num_elem = num_elem_geom = -1; - skip_type = false; - } - } - } - - // Finalize input, write the Gmsh mesh. - input.close(); - std::vector dummy; - WriteGmsh(buffer, node_coords, dummy, elem_nodes); -} - -void ConvertMeshNastran(const std::string &filename, std::ostream &buffer) -{ - // Read a Nastran/BDF format mesh. - MFEM_VERIFY(!filename.compare(filename.length() - 4, 4, ".nas") || - !filename.compare(filename.length() - 4, 4, ".NAS") || - !filename.compare(filename.length() - 4, 4, ".bdf") || - !filename.compare(filename.length() - 4, 4, ".BDF"), - "Invalid file extension for Nastran mesh format conversion!"); - std::ifstream input(filename); - if (!input.is_open()) - { - MFEM_ABORT("Unable to open mesh file \"" << filename << "\"!"); - } - const int NASTRAN_CHUNK = 8; // NASTRAN divides row into 10 columns of 8 spaces - const int MAX_CHUNK = 9; // Never read the 10-th chunk - - // Parse until bulk data starts. - while (true) - { - auto line = GetLineNastran(input); - if (line.length() > 0) - { - if (!line.compare("BEGIN BULK")) - { - break; - } - } - } - - // Parse mesh nodes and elements. It is expected that node tags start at 1 and are - // contiguous. 
Store for each element of each type: [geometry tag, [node tags]]. - std::vector node_coords; - std::vector node_tags; - std::unordered_map> elem_nodes; - int elem_type; - while (true) - { - auto line = GetLineNastran(input); - if (line.length() > 0) - { - if (!line.compare("ENDDATA")) - { - break; // Done parsing file - } - else if (!line.compare(0, 5, "GRID*")) - { - // Coordinates in long field format (8 + 16 * 4 + 8). - auto next = GetLineNastran(input); - MFEM_VERIFY(!next.empty(), "Unexpected empty line parsing Nastran!"); - - node_tags.push_back(std::stoi(line.substr(1 * NASTRAN_CHUNK, 2 * NASTRAN_CHUNK))); - node_coords.insert( - node_coords.end(), - {ConvertDoubleNastran(line.substr(5 * NASTRAN_CHUNK, 2 * NASTRAN_CHUNK)), - ConvertDoubleNastran(line.substr(7 * NASTRAN_CHUNK, 2 * NASTRAN_CHUNK)), - ConvertDoubleNastran(next.substr(1 * NASTRAN_CHUNK, 2 * NASTRAN_CHUNK))}); - } - else if (!line.compare(0, 4, "GRID")) - { - if (line.find_first_of(',') != std::string::npos) - { - // Free field format (comma separated). - std::istringstream sline(line); - - std::string word; - std::getline(sline, word, ','); // Discard "GRID" - - std::getline(sline, word, ','); - node_tags.push_back(std::stoi(word)); - - std::getline(sline, word, ','); // Discard coordinate system - - std::getline(sline, word, ','); - double x = ConvertDoubleNastran(word); - std::getline(sline, word, ','); - double y = ConvertDoubleNastran(word); - std::getline(sline, word, ','); - double z = ConvertDoubleNastran(word); - node_coords.insert(node_coords.end(), {x, y, z}); - } - else - { - // Short format (10 * 8). - node_tags.push_back(std::stoi(line.substr(1 * NASTRAN_CHUNK, NASTRAN_CHUNK))); - node_coords.insert( - node_coords.end(), - {ConvertDoubleNastran(line.substr(3 * NASTRAN_CHUNK, NASTRAN_CHUNK)), - ConvertDoubleNastran(line.substr(4 * NASTRAN_CHUNK, NASTRAN_CHUNK)), - ConvertDoubleNastran(line.substr(5 * NASTRAN_CHUNK, NASTRAN_CHUNK))}); - } - } - else if ((elem_type = ElemTypeNastran(line))) - { - // Prepare to parse the element ID and nodes. - const bool free = (line.find_first_of(',') != std::string::npos); - - // Get the element type, tag, and geometry attribute. Then get the element nodes on - // this line. - std::string elem_str; - // int elem_tag; - int geom_tag; - std::vector nodes; - std::string word; - if (!free) - { - elem_str = line.substr(0 * NASTRAN_CHUNK, NASTRAN_CHUNK); - const std::size_t stop = elem_str.find_last_not_of(' '); - MFEM_VERIFY(stop != std::string::npos, "Invalid element type string!"); - elem_str.resize(stop + 1); - // elem_tag = std::stoi(line.substr(1*NASTRAN_CHUNK, NASTRAN_CHUNK)); - geom_tag = std::stoi(line.substr(2 * NASTRAN_CHUNK, NASTRAN_CHUNK)); - - int i = 3; - while (i < MAX_CHUNK) - { - word = line.substr((i++) * NASTRAN_CHUNK, NASTRAN_CHUNK); - if (word.find_first_not_of(' ') == std::string::npos) - { - break; - } - nodes.push_back(std::stoi(word)); - } - } - else - { - std::istringstream sline(line); - std::getline(sline, elem_str, ','); - std::getline(sline, word, ','); - // elem_tag = std::stoi(word); - std::getline(sline, word, ','); - geom_tag = std::stoi(word); - - int i = 3; - while (i < MAX_CHUNK) - { - std::getline(sline, word, ','); - if (word.find_first_not_of(' ') == std::string::npos) - { - break; - } - nodes.push_back(std::stoi(word)); - i++; - } - } - - // Handle line continuation. 
- while (input.peek() == '+') - { - auto next = GetLineNastran(input); - MFEM_VERIFY(!next.empty(), "Unexpected empty line parsing Nastran!"); - - if (!free) - { - int i = 1; - while (i < MAX_CHUNK) - { - word = next.substr((i++) * NASTRAN_CHUNK, NASTRAN_CHUNK); - if (word.find_first_not_of(' ') == std::string::npos) - { - break; - } - nodes.push_back(std::stoi(word)); - } - } - else - { - std::istringstream snext(next); - int i = 1; - while (i < MAX_CHUNK) - { - std::getline(snext, word, ','); - if (word.find_first_not_of(' ') == std::string::npos) - { - break; - } - nodes.push_back(std::stoi(word)); - i++; - } - } - } - - // Save the element and its geometry tag. - elem_type = HOElemTypeNastran(elem_type, (int)nodes.size()); - const int &num_elem_nodes = ElemNumNodes[elem_type - 1]; - MFEM_VERIFY((std::size_t)num_elem_nodes == nodes.size(), - "Mismatch between Nastran and Gmsh element types!"); - std::vector &data = elem_nodes[elem_type]; - const int i = (int)data.size(); - data.resize(i + 1 + num_elem_nodes); - data[i] = geom_tag; - for (int j = 0; j < num_elem_nodes; j++) - { - // Permute back to Gmsh ordering. - const int &p = ElemNodesNastran[elem_type - 1][j]; - data[i + 1 + p] = nodes[j]; - } - } - } - } - - // Finalize input, write the Gmsh mesh. - input.close(); - WriteGmsh(buffer, node_coords, node_tags, elem_nodes); -} - -} // namespace mesh - -} // namespace palace +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#include "meshio.hpp" + +#include +#include +#include + +#define GMSH_BIN // Use binary Gmsh format + +namespace palace +{ + +namespace +{ + +inline int ElemTypeComsol(const std::string &type) +{ + if (!type.compare("tri")) // 3-node triangle + { + return 2; + } + if (!type.compare("quad")) // 4-node quadrangle + { + return 3; + } + if (!type.compare("tet")) // 4-node tetrahedron + { + return 4; + } + if (!type.compare("hex")) // 8-node hexahedron + { + return 5; + } + if (!type.compare("prism")) // 6-node prism + { + return 6; + } + if (!type.compare("pyr")) // 5-node pyramid + { + return 7; + } + if (!type.compare("tri2")) // 6-node triangle + { + return 9; + } + if (!type.compare("quad2")) // 9-node quadrangle + { + return 10; + } + if (!type.compare("tet2")) // 10-node tetrahedron + { + return 11; + } + if (!type.compare("hex2")) // 27-node hexahedron + { + return 12; + } + if (!type.compare("prism2")) // 18-node prism + { + return 13; + } + if (!type.compare("pyr2")) // 14-node pyramid + { + return 14; + } + return 0; // Skip this element type +} + +inline int ElemTypeNastran(const std::string &type) +{ + // Returns only the low-order type for a given keyword. + if (!type.compare(0, 5, "CTRIA")) // 3-node triangle + { + return 2; + } + if (!type.compare(0, 5, "CQUAD")) // 4-node quadrangle + { + return 3; + } + if (!type.compare(0, 6, "CTETRA")) // 4-node tetrahedron + { + return 4; + } + if (!type.compare(0, 5, "CHEXA")) // 8-node hexahedron + { + return 5; + } + if (!type.compare(0, 6, "CPENTA")) // 6-node prism + { + return 6; + } + if (!type.compare(0, 6, "CPYRAM")) // 5-node pyramid + { + return 7; + } + return 0; // Skip this element type +} + +inline int HOElemTypeNastran(const int lo_type, const int num_nodes) +{ + // Get high-order element type for corresponding low-order type. 
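+  // For example, a CTETRA card carrying 10 nodes maps from the low-order Gmsh type 4
+  // (4-node tetrahedron) to type 11 (10-node tetrahedron), while one carrying only 4
+  // nodes keeps its low-order type (example mapping; see the checks below).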
+ if (lo_type == 2 && num_nodes > 3) // 6-node triangle + { + MFEM_VERIFY(num_nodes == 6, "Invalid high-order Nastran element!"); + return 9; + } + if (lo_type == 3) + { + if (num_nodes == 9) // 9-node quadrangle + { + return 10; + } + if (num_nodes == 8) // 8-node quadrangle + { + return 16; + } + MFEM_VERIFY(num_nodes == 4, "Invalid high-order Nastran element!"); + return lo_type; + } + if (lo_type == 4 && num_nodes > 4) // 10-node tetrahedron + { + MFEM_VERIFY(num_nodes == 10, "Invalid high-order Nastran element!"); + return 11; + } + if (lo_type == 5 && num_nodes > 8) // 20-node hexahedron + { + MFEM_VERIFY(num_nodes == 20, "Invalid high-order Nastran element!"); + return 17; + } + if (lo_type == 6 && num_nodes > 6) // 15-node prism + { + MFEM_VERIFY(num_nodes == 15, "Invalid high-order Nastran element!"); + return 18; + } + if (lo_type == 7 && num_nodes > 5) // 13-node pyramid + { + MFEM_VERIFY(num_nodes == 13, "Invalid high-order Nastran element!"); + return 19; + } + return lo_type; +} + +inline int LOElemTypeGmsh(int ho_type) +{ + if (ho_type == 9) // 6-node triangle + { + return 2; + } + if (ho_type == 10 || ho_type == 16) // 9- or 8-node quadrangle + { + return 3; + } + if (ho_type == 11) // 10-node tetrahedron + { + return 4; + } + if (ho_type == 12 || ho_type == 17) // 27- or 20-node hexahedron + { + return 5; + } + if (ho_type == 13 || ho_type == 18) // 18- or 15-node prism + { + return 6; + } + if (ho_type == 14 || ho_type == 19) // 14- or 13-node pyramid + { + return 7; + } + return ho_type; +} + +constexpr int ElemNumNodes[] = {-1, // 2-node edge + 3, 4, 4, 8, 6, 5, + -1, // 3-node edge + 6, 9, 10, 27, 18, 14, + -1, // 1-node node + 8, 20, 15, 13}; + +// From COMSOL or Nastran to Gmsh ordering. See: +// - https://gmsh.info/doc/texinfo/gmsh.html#Node-ordering +// - https://tinyurl.com/yezswzfv +// - https://tinyurl.com/4d32zxtn +constexpr int SkipElem[] = {-1}; +constexpr int Msh3[] = {0, 1, 2}; +constexpr int Msh4[] = {0, 1, 2, 3}; +constexpr int Msh5[] = {0, 1, 2, 3, 4}; +constexpr int Msh6[] = {0, 1, 2, 3, 4, 5}; +constexpr int Msh8[] = {0, 1, 2, 3, 4, 5, 6, 7}; +constexpr int Msh9[] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; + +constexpr int MphQuad4[] = {0, 1, 3, 2}; +constexpr int MphHex8[] = {0, 1, 3, 2, 4, 5, 7, 6}; +constexpr int MphPyr5[] = {0, 1, 3, 2, 4}; +constexpr int MphTri6[] = {0, 1, 2, 3, 5, 4}; +constexpr int MphQuad9[] = {0, 1, 3, 2, 4, 7, 8, 5, 6}; +constexpr int MphTet10[] = {0, 1, 2, 3, 4, 6, 5, 7, 9, 8}; +constexpr int MphHex27[] = {0, 1, 3, 2, 4, 5, 7, 6, 8, 9, 20, 11, 13, 10, + 21, 12, 22, 26, 23, 15, 24, 14, 16, 17, 25, 18, 19}; +constexpr int MphWdg18[] = {0, 1, 2, 3, 4, 5, 6, 7, 9, 8, 15, 10, 16, 17, 11, 12, 13, 14}; +constexpr int MphPyr14[] = {0, 1, 3, 2, 4, 5, 6, 13, 8, 10, 7, 9, 12, 11}; + +constexpr int NasTet10[] = {0, 1, 2, 3, 4, 5, 6, 7, 9, 8}; +constexpr int NasHex20[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 11, + 13, 9, 10, 12, 14, 15, 16, 18, 19, 17}; +constexpr int NasWdg15[] = {0, 1, 2, 3, 4, 5, 6, 9, 7, 8, 10, 11, 12, 14, 13}; +constexpr int NasPyr13[] = {0, 1, 2, 3, 4, 5, 8, 10, 6, 7, 9, 11, 12}; + +constexpr const int *ElemNodesComsol[] = {SkipElem, Msh3, MphQuad4, Msh4, MphHex8, + Msh6, MphPyr5, SkipElem, MphTri6, MphQuad9, + MphTet10, MphHex27, MphWdg18, MphPyr14, SkipElem, + SkipElem, SkipElem, SkipElem, SkipElem}; +constexpr const int *ElemNodesNastran[] = {SkipElem, Msh3, Msh4, Msh4, Msh8, + Msh6, Msh5, SkipElem, Msh6, Msh9, + NasTet10, SkipElem, SkipElem, SkipElem, SkipElem, + Msh8, NasHex20, NasWdg15, NasPyr13}; + +// Get line, strip comments, 
leading/trailing whitespace. Should not be called if end of +// file is expected. +inline std::string GetLineComsol(std::ifstream &input) +{ + std::string str; + std::getline(input, str); + MFEM_VERIFY(input, "Unexpected read failure parsing mesh file!"); + const auto pos = str.find_first_of('#'); + if (pos != std::string::npos) + { + str.erase(pos); + } + const auto start = str.find_first_not_of(" \t"); + if (start == std::string::npos) + { + return ""; + } + const auto stop = str.find_last_not_of(" \t"); + return str.substr(start, stop - start + 1); +} + +inline std::string GetLineNastran(std::ifstream &input) +{ + std::string str; + std::getline(input, str); + MFEM_VERIFY(input.good(), "Unexpected read failure parsing mesh file!"); + str.erase(std::remove(str.begin(), str.end(), '\r'), str.end()); + return str[0] == '$' ? "" : str; +} + +// COMSOL strings are parsed as an integer length followed by array of integers for the +// string characters. +inline std::string ReadStringComsol(std::istream &input) +{ + int n; + std::string str; + input >> n >> str; + return str; +} + +inline std::string ReadStringComsolBinary(std::istream &input) +{ + int n; + input.read(reinterpret_cast(&n), sizeof(int)); + std::vector vstr(n); + input.read(reinterpret_cast(vstr.data()), (std::streamsize)(n * sizeof(int))); + return std::string(vstr.begin(), vstr.end()); +} + +// Nastran has a special floating point format: "-7.-1" instead of "-7.E-01" or "2.3+2" +// instead of "2.3E+02". +inline double ConvertDoubleNastran(const std::string &str) +{ + double d; + try + { + d = std::stod(str); + } + catch (const std::invalid_argument &ia) + { + const std::size_t start = str.find_first_not_of(' '); + MFEM_VERIFY(start != std::string::npos, + "Invalid number conversion parsing Nastran mesh!") + std::string fstr = str.substr(start); + std::size_t pos = fstr.find('+', 1); // Skip leading +/- sign + if (pos != std::string::npos) + { + fstr.replace(pos, 1, "E+"); + } + else if ((pos = fstr.find('-', 1)) != std::string::npos) + { + fstr.replace(pos, 1, "E-"); + } + d = std::stod(fstr); + } + return d; +} + +inline void WriteNode(std::ostream &buffer, const int tag, const double *coord) +{ +#if defined(GMSH_BIN) + buffer.write(reinterpret_cast(&tag), sizeof(int)); + buffer.write(reinterpret_cast(coord), 3 * sizeof(double)); + // No newline for binary data. +#else + // Always 3D coordinates (user sets floating point format/precision on buffer). + buffer << tag << ' ' << coord[0] << ' ' << coord[1] << ' ' << coord[2] << '\n'; +#endif +} + +inline void WriteElement(std::ostream &buffer, const int tag, const int type, + const int geom, const int nodes[]) +{ +#if defined(GMSH_BIN) + const int data[3] = {tag, geom, geom}; + buffer.write(reinterpret_cast(data), 3 * sizeof(int)); + buffer.write(reinterpret_cast(nodes), + (std::streamsize)(ElemNumNodes[type - 1] * sizeof(int))); + // No newline for binary data. +#else + buffer << tag << ' ' << type << " 2 " << geom << ' ' << geom; + for (int i = 0; i < ElemNumNodes[type - 1]; i++) + { + buffer << ' ' << nodes[i]; + } + buffer << '\n'; +#endif +} + +void WriteGmsh(std::ostream &buffer, const std::vector &node_coords, + const std::vector &node_tags, + const std::unordered_map> &elem_nodes, + const bool use_lo_type) +{ + // Write the Gmsh file header (version 2.2). 
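+  // For reference, the ASCII variant of this header is simply
+  //   $MeshFormat
+  //   2.2 0 8
+  //   $EndMeshFormat
+  // while the binary variant written below uses file-type 1 and additionally emits the
+  // integer 1 so readers can detect endianness (8 here is sizeof(double)).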
+ buffer << "$MeshFormat\n2.2 " + << +#if defined(GMSH_BIN) + "1 " << +#else + "0 " << +#endif + sizeof(double) << '\n'; +#if defined(GMSH_BIN) + const int one = 1; + buffer.write(reinterpret_cast(&one), sizeof(int)); + buffer << '\n'; +#endif + buffer << "$EndMeshFormat\n"; + + // Write mesh nodes. + const int num_nodes = (int)node_coords.size() / 3; + MFEM_VERIFY(num_nodes > 0 && node_coords.size() % 3 == 0, + "Gmsh nodes should always be in 3D space!"); + buffer << "$Nodes\n" << num_nodes << '\n'; + { + if (!node_tags.empty()) + { + // Use input node tags which should be positive but don't need to be contiguous. + MFEM_VERIFY(node_tags.size() == (std::size_t)num_nodes, + "Invalid size for node tags!"); + for (int i = 0; i < num_nodes; i++) + { + WriteNode(buffer, node_tags[i], &node_coords[3 * i]); + } + } + else + { + // Label nodes as contiguous starting at 1. + for (int i = 0; i < num_nodes; i++) + { + WriteNode(buffer, i + 1, &node_coords[3 * i]); + } + } + } +#if defined(GMSH_BIN) + buffer << '\n'; +#endif + buffer << "$EndNodes\n"; + + // Write mesh elements. + int tot_num_elem = 0; + for (const auto &[elem_type, nodes] : elem_nodes) + { + MFEM_VERIFY(elem_type > 0, "Invalid element type writing Gmsh elements!"); + const int &num_elem_nodes = ElemNumNodes[elem_type - 1]; + tot_num_elem += ((int)nodes.size()) / (num_elem_nodes + 1); + MFEM_VERIFY(nodes.size() % (num_elem_nodes + 1) == 0, + "Unexpected data size when writing elements!"); + } + MFEM_VERIFY(tot_num_elem > 0, "No mesh elements parsed from COMSOL mesh file!"); + buffer << "$Elements\n" << tot_num_elem << '\n'; + { + int tag = 1; // Global element tag + for (const auto &[elem_type, nodes] : elem_nodes) + { + const int elem_type_w = use_lo_type ? LOElemTypeGmsh(elem_type) : elem_type; + const int &num_elem_nodes = ElemNumNodes[elem_type - 1]; + const int num_elem = (int)nodes.size() / (num_elem_nodes + 1); +#if defined(GMSH_BIN) + // For binary output, write the element header for each type. Always have 2 tags + // (physical + geometry). + const int header[3] = {elem_type_w, num_elem, 2}; + buffer.write(reinterpret_cast(header), 3 * sizeof(int)); +#endif + for (int i = 0; i < num_elem; i++) + { + WriteElement(buffer, tag++, elem_type_w, + nodes[i * (num_elem_nodes + 1)], // Geometry tag + &nodes[i * (num_elem_nodes + 1) + 1]); // Element nodes + } + } + } +#if defined(GMSH_BIN) + buffer << '\n'; +#endif + buffer << "$EndElements\n"; +} + +} // namespace + +namespace mesh +{ + +void ConvertMeshComsol(const std::string &filename, std::ostream &buffer, + bool remove_curvature) +{ + // Read a COMSOL format mesh. + const int comsol_bin = !filename.compare(filename.length() - 7, 7, ".mphbin") || + !filename.compare(filename.length() - 7, 7, ".MPHBIN"); + MFEM_VERIFY(!filename.compare(filename.length() - 7, 7, ".mphtxt") || + !filename.compare(filename.length() - 7, 7, ".MPHTXT") || comsol_bin, + "Invalid file extension for COMSOL mesh format conversion!"); + std::ifstream input(filename); + if (!input.is_open()) + { + MFEM_ABORT("Unable to open mesh file \"" << filename << "\"!"); + } + + // Parse COMSOL header. COMSOL encodes strings as integer-string pairs where the integer + // is the string length. It also allows for blank lines and other whitespace wherever in + // the file. 
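+  // As a rough illustration, the start of an ASCII .mphtxt file has the shape
+  //   0 1              # version
+  //   1                # number of tags
+  //   5 mesh1          # length-prefixed tag string
+  //   1                # number of types
+  //   3 obj            # length-prefixed type string
+  // (the tag and type names here are placeholders, not taken from a particular export).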
+ { + int version[2] = {-1, -1}; + int num_tags = -1; + int num_types = -1; + if (!comsol_bin) + { + while (num_types < 0) + { + auto line = GetLineComsol(input); + if (!line.empty()) + { + std::istringstream sline(line); + if (version[0] < 0) + { + sline >> version[0] >> version[1]; + } + else if (num_tags < 0) + { + sline >> num_tags; + int i = 0; + while (i < num_tags) + { + if (!GetLineComsol(input).empty()) + { + i++; + } + } + } + else if (num_types < 0) + { + sline >> num_types; + int i = 0; + while (i < num_types) + { + if (!GetLineComsol(input).empty()) + { + i++; + } + } + } + } + } + } + else + { + input.read(reinterpret_cast(version), 2 * sizeof(int)); + input.read(reinterpret_cast(&num_tags), sizeof(int)); + { + int i = 0; + while (i < num_tags) + { + ReadStringComsolBinary(input); + i++; + } + } + input.read(reinterpret_cast(&num_types), sizeof(int)); + { + int i = 0; + while (i < num_types) + { + ReadStringComsolBinary(input); + i++; + } + } + } + MFEM_VERIFY(version[0] == 0 && version[1] == 1, "Invalid COMSOL file version!"); + } + + // Parse mesh objects until we get to the mesh. Currently only supports a single mesh + // object in the file, and selections are ignored. + while (true) + { + int object[3] = {-1, -1, -1}; + std::string object_class; + if (!comsol_bin) + { + while (object_class.empty()) + { + auto line = GetLineComsol(input); + if (!line.empty()) + { + std::istringstream sline(line); + if (object[0] < 0) + { + sline >> object[0] >> object[1] >> object[2]; + } + else if (object_class.empty()) + { + object_class = ReadStringComsol(sline); + } + } + } + } + else + { + input.read(reinterpret_cast(object), 3 * sizeof(int)); + object_class = ReadStringComsolBinary(input); + } + MFEM_VERIFY(object[0] == 0 && object[1] == 0 && object[2] == 1, + "Invalid COMSOL object version!"); + + // If yes, then ready to parse the mesh. + if (!object_class.compare(0, 4, "Mesh")) + { + break; + } + + // Otherwise, parse over the selection to the next object. + MFEM_VERIFY(!object_class.compare(0, 9, "Selection"), + "COMSOL mesh file only supports Mesh and Selection objects!"); + int version = -1; + std::string label_str; + std::string tag_str; + int sdim = -1; + int num_ent = -1; + if (!comsol_bin) + { + while (num_ent < 0) + { + auto line = GetLineComsol(input); + if (!line.empty()) + { + std::istringstream sline(line); + if (version < 0) + { + sline >> version; + } + else if (label_str.empty()) + { + label_str = ReadStringComsol(sline); + } + else if (tag_str.empty()) + { + tag_str = ReadStringComsol(sline); + } + else if (sdim < 0) + { + sline >> sdim; + } + else if (num_ent < 0) + { + sline >> num_ent; + } + } + } + } + else + { + input.read(reinterpret_cast(&version), sizeof(int)); + label_str = ReadStringComsolBinary(input); + tag_str = ReadStringComsolBinary(input); + input.read(reinterpret_cast(&sdim), sizeof(int)); + input.read(reinterpret_cast(&num_ent), sizeof(int)); + } + + // Parse over the entities in the selection. + int i = 0; + if (!comsol_bin) + { + while (i < num_ent) + { + if (!GetLineComsol(input).empty()) + { + i++; + } + } + } + else + { + while (i < num_ent) + { + int dummy; + input.read(reinterpret_cast(&dummy), sizeof(int)); + i++; + } + } + } // Repeat until Mesh is found + + // Parse the mesh object header. 
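+  // In the ASCII format this header is four integers (values below are illustrative):
+  //   4                # mesh version (only version 4 is supported)
+  //   3                # sdim
+  //   2475             # number of mesh vertices
+  //   0                # lowest mesh vertex index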
+ int sdim = -1; + int num_nodes = -1; + int nodes_start = -1; + { + int version = -1; + if (!comsol_bin) + { + while (nodes_start < 0) + { + auto line = GetLineComsol(input); + if (!line.empty()) + { + std::istringstream sline(line); + if (version < 0) + { + sline >> version; + } + else if (sdim < 0) + { + sline >> sdim; + } + else if (num_nodes < 0) + { + sline >> num_nodes; + } + else if (nodes_start < 0) + { + sline >> nodes_start; + } + } + } + } + else + { + input.read(reinterpret_cast(&version), sizeof(int)); + input.read(reinterpret_cast(&sdim), sizeof(int)); + input.read(reinterpret_cast(&num_nodes), sizeof(int)); + input.read(reinterpret_cast(&nodes_start), sizeof(int)); + } + MFEM_VERIFY(version == 4, "Only COMSOL files with Mesh version 4 are supported!"); + MFEM_VERIFY(sdim == 2 || sdim == 3, + "COMSOL mesh nodes are required to be in 2D or 3D space!"); + MFEM_VERIFY(num_nodes > 0, "COMSOL mesh file contains no nodes!"); + MFEM_VERIFY(nodes_start >= 0, "COMSOL mesh nodes have a negative starting tag!"); + } + + // Parse mesh nodes. + std::vector node_coords; + { + // Gmsh nodes are always 3D, so initialize to 0.0 in case z-coordinate isn't set. + node_coords.resize(3 * num_nodes, 0.0); + int i = 0; + if (!comsol_bin) + { + while (i < num_nodes) + { + auto line = GetLineComsol(input); + if (!line.empty()) + { + std::istringstream sline(line); + for (int j = 0; j < sdim; j++) + { + sline >> node_coords[3 * i + j]; + } + i++; + } + } + } + else + { + // Don't read as a single block in case sdim < 3. + while (i < num_nodes) + { + input.read(reinterpret_cast(node_coords.data() + 3 * i), + (std::streamsize)(sdim * sizeof(double))); + i++; + } + } + } + + // Parse mesh elements. Store for each element of each type: [geometry tag, [node tags]]. 
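+  // The per-type buffer is flat: each element contributes (1 + num_elem_nodes) entries.
+  // For example, two 4-node tetrahedra with geometry tags g0 and g1 are stored as
+  //   elem_nodes[4] = {g0, n00, n01, n02, n03, g1, n10, n11, n12, n13};
+  // with node tags already permuted to Gmsh ordering and shifted to be 1-based.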
+ std::unordered_map> elem_nodes; + { + int num_elem_types = -1; + if (!comsol_bin) + { + while (num_elem_types < 0) + { + auto line = GetLineComsol(input); + if (!line.empty()) + { + std::istringstream sline(line); + if (num_elem_types < 0) + { + sline >> num_elem_types; + } + } + } + } + else + { + input.read(reinterpret_cast(&num_elem_types), sizeof(int)); + } + MFEM_VERIFY(num_elem_types > 0, "COMSOL mesh file contains no elements!"); + + int parsed_types = 0; // COMSOL groups elements by type in file + int elem_type = -1; + int num_elem_nodes = -1; + int num_elem = -1; + int num_elem_geom = -1; + bool skip_type = false; + while (parsed_types < num_elem_types) + { + if (!comsol_bin) + { + auto line = GetLineComsol(input); + if (!line.empty()) + { + std::istringstream sline(line); + if (elem_type < 0) + { + auto elem_str = ReadStringComsol(sline); + MFEM_VERIFY(!elem_str.empty(), + "Unexpected empty element type found in COMSOL mesh file!"); + elem_type = ElemTypeComsol(elem_str); + skip_type = (elem_type == 0); + MFEM_VERIFY(skip_type || elem_nodes.find(elem_type) == elem_nodes.end(), + "Duplicate element types found in COMSOL mesh file!"); + } + else if (num_elem_nodes < 0) + { + sline >> num_elem_nodes; + MFEM_VERIFY(num_elem_nodes > 0, + "COMSOL element type " << elem_type << " has no nodes!"); + MFEM_VERIFY(skip_type || num_elem_nodes == ElemNumNodes[elem_type - 1], + "Mismatch between COMSOL and Gmsh element types!"); + } + else if (num_elem < 0) + { + sline >> num_elem; + MFEM_VERIFY(num_elem > 0, + "COMSOL mesh file has no elements of type " << elem_type << "!"); + std::vector *data = nullptr; + if (!skip_type) + { + data = &elem_nodes[elem_type]; + data->resize(num_elem * (num_elem_nodes + 1)); // Node tags + geometry tag + } + + // Parse all element nodes. + int i = 0; + while (i < num_elem) + { + line = GetLineComsol(input); + if (!line.empty()) + { + if (!skip_type) + { + std::istringstream isline(line); + for (int j = 0; j < num_elem_nodes; j++) + { + // Permute and reset to 1-based node tags. + const int &p = ElemNodesComsol[elem_type - 1][j]; + isline >> (*data)[i * (num_elem_nodes + 1) + 1 + p]; + (*data)[i * (num_elem_nodes + 1) + 1 + p] += (1 - nodes_start); + } + } + i++; + } + } + } + else if (num_elem_geom < 0) + { + sline >> num_elem_geom; + MFEM_VERIFY(num_elem_geom == num_elem, + "COMSOL mesh file should have geometry tags for all elements!"); + std::vector *data = nullptr; + if (!skip_type) + { + MFEM_VERIFY(elem_nodes.find(elem_type) != elem_nodes.end(), + "Can't find expected element type!"); + data = &elem_nodes[elem_type]; + MFEM_VERIFY(data->size() == (std::size_t)num_elem * (num_elem_nodes + 1), + "Unexpected element data size!"); + } + + // Parse all element geometry tags (stored at beginning of element nodes). For + // geometric entities in < 3D, the exported COMSOL tags are 0-based and need + // correcting to 1-based for Gmsh. + int i = 0; + const int geom_start = + (elem_type < 4 || (elem_type > 7 && elem_type < 11)) ? 1 : 0; + while (i < num_elem) + { + line = GetLineComsol(input); + if (!line.empty()) + { + if (!skip_type) + { + std::istringstream ssline(line); + ssline >> (*data)[i * (num_elem_nodes + 1)]; + (*data)[i * (num_elem_nodes + 1)] += geom_start; + } + i++; + } + } + + // Debug + // std::cout << "Finished parsing " << num_elem + // << " elements with type " << elem_type + // << " (parsed types " << parsed_types + 1 << ")\n"; + + // Finished with this element type, on to the next. 
+ parsed_types++; + elem_type = num_elem_nodes = num_elem = num_elem_geom = -1; + skip_type = false; + } + } + } + else + { + auto elem_str = ReadStringComsolBinary(input); + MFEM_VERIFY(!elem_str.empty(), + "Unexpected empty element type found in COMSOL mesh file!"); + elem_type = ElemTypeComsol(elem_str); + skip_type = (elem_type == 0); + MFEM_VERIFY(skip_type || elem_nodes.find(elem_type) == elem_nodes.end(), + "Duplicate element types found in COMSOL mesh file!"); + input.read(reinterpret_cast(&num_elem_nodes), sizeof(int)); + MFEM_VERIFY(num_elem_nodes > 0, + "COMSOL element type " << elem_type << " has no nodes!"); + MFEM_VERIFY(skip_type || num_elem_nodes == ElemNumNodes[elem_type - 1], + "Mismatch between COMSOL and Gmsh element types!"); + + // Parse all element nodes. + input.read(reinterpret_cast(&num_elem), sizeof(int)); + MFEM_VERIFY(num_elem > 0, + "COMSOL mesh file has no elements of type " << elem_type << "!"); + std::vector *data = nullptr; + if (!skip_type) + { + data = &elem_nodes[elem_type]; + data->resize(num_elem * (num_elem_nodes + 1)); // Node tags + geometry tag + } + int i = 0; + std::vector nodes(num_elem_nodes); + while (i < num_elem) + { + input.read(reinterpret_cast(nodes.data()), + (std::streamsize)(num_elem_nodes * sizeof(int))); + if (!skip_type) + { + for (int j = 0; j < num_elem_nodes; j++) + { + // Permute and reset to 1-based node tags. + const int &p = ElemNodesComsol[elem_type - 1][j]; + (*data)[i * (num_elem_nodes + 1) + 1 + p] = nodes[j] + (1 - nodes_start); + } + } + i++; + } + + // Parse element geometry tags. + input.read(reinterpret_cast(&num_elem_geom), sizeof(int)); + MFEM_VERIFY(num_elem_geom == num_elem, + "COMSOL mesh file should have geometry tags for all elements!"); + + i = 0; + const int geom_start = (elem_type < 4 || (elem_type > 7 && elem_type < 11)) ? 1 : 0; + int geom_tag; + while (i < num_elem) + { + input.read(reinterpret_cast(&geom_tag), sizeof(int)); + if (!skip_type) + { + (*data)[i * (num_elem_nodes + 1)] = geom_tag + geom_start; + } + i++; + } + + // Debug + // std::cout << "Finished parsing " << num_elem + // << " elements with type " << elem_type + // << " (parsed types " << parsed_types + 1 << ")\n"; + + // Finished with this element type, on to the next. + parsed_types++; + elem_type = num_elem_nodes = num_elem = num_elem_geom = -1; + skip_type = false; + } + } + } + + // Finalize input, write the Gmsh mesh. + input.close(); + std::vector dummy; + WriteGmsh(buffer, node_coords, dummy, elem_nodes, remove_curvature); +} + +void ConvertMeshNastran(const std::string &filename, std::ostream &buffer, + bool remove_curvature) +{ + // Read a Nastran/BDF format mesh. + MFEM_VERIFY(!filename.compare(filename.length() - 4, 4, ".nas") || + !filename.compare(filename.length() - 4, 4, ".NAS") || + !filename.compare(filename.length() - 4, 4, ".bdf") || + !filename.compare(filename.length() - 4, 4, ".BDF"), + "Invalid file extension for Nastran mesh format conversion!"); + std::ifstream input(filename); + if (!input.is_open()) + { + MFEM_ABORT("Unable to open mesh file \"" << filename << "\"!"); + } + const int NASTRAN_CHUNK = 8; // NASTRAN divides row into 10 columns of 8 spaces + const int MAX_CHUNK = 9; // Never read the 10-th chunk + + // Parse until bulk data starts. + while (true) + { + auto line = GetLineNastran(input); + if (line.length() > 0) + { + if (!line.compare(0, 10, "BEGIN BULK")) + { + break; + } + } + } + + // Parse mesh nodes and elements. It is expected that node tags start at 1 and are + // contiguous. 
Store for each element of each type: [geometry tag, [node tags]]. + std::vector node_coords; + std::vector node_tags; + std::unordered_map> elem_nodes; + int elem_type; + while (true) + { + auto line = GetLineNastran(input); + if (line.length() > 0 && !input.eof()) + { + if (!line.compare(0, 7, "ENDDATA")) + { + break; // Done parsing file + } + else if (!line.compare(0, 5, "GRID*")) + { + // Coordinates in long field format (8 + 16 * 4 + 8). + auto next = GetLineNastran(input); + MFEM_VERIFY(!next.empty(), "Unexpected empty line parsing Nastran!"); + + node_tags.push_back(std::stoi(line.substr(1 * NASTRAN_CHUNK, 2 * NASTRAN_CHUNK))); + node_coords.insert( + node_coords.end(), + {ConvertDoubleNastran(line.substr(5 * NASTRAN_CHUNK, 2 * NASTRAN_CHUNK)), + ConvertDoubleNastran(line.substr(7 * NASTRAN_CHUNK, 2 * NASTRAN_CHUNK)), + ConvertDoubleNastran(next.substr(1 * NASTRAN_CHUNK, 2 * NASTRAN_CHUNK))}); + } + else if (!line.compare(0, 4, "GRID")) + { + if (line.find_first_of(',') != std::string::npos) + { + // Free field format (comma separated). + std::istringstream sline(line); + + std::string word; + std::getline(sline, word, ','); // Discard "GRID" + + std::getline(sline, word, ','); + node_tags.push_back(std::stoi(word)); + + std::getline(sline, word, ','); // Discard coordinate system + + std::getline(sline, word, ','); + double x = ConvertDoubleNastran(word); + std::getline(sline, word, ','); + double y = ConvertDoubleNastran(word); + std::getline(sline, word, ','); + double z = ConvertDoubleNastran(word); + node_coords.insert(node_coords.end(), {x, y, z}); + } + else + { + // Short format (10 * 8). + node_tags.push_back(std::stoi(line.substr(1 * NASTRAN_CHUNK, NASTRAN_CHUNK))); + node_coords.insert( + node_coords.end(), + {ConvertDoubleNastran(line.substr(3 * NASTRAN_CHUNK, NASTRAN_CHUNK)), + ConvertDoubleNastran(line.substr(4 * NASTRAN_CHUNK, NASTRAN_CHUNK)), + ConvertDoubleNastran(line.substr(5 * NASTRAN_CHUNK, NASTRAN_CHUNK))}); + } + } + else if ((elem_type = ElemTypeNastran(line))) + { + // Prepare to parse the element ID and nodes. + const bool free = (line.find_first_of(',') != std::string::npos); + + // Get the element type, tag, and geometry attribute. Then get the element nodes on + // this line. + std::string elem_str; + // int elem_tag; + int geom_tag; + std::vector nodes; + std::string word; + if (!free) + { + elem_str = line.substr(0 * NASTRAN_CHUNK, NASTRAN_CHUNK); + const std::size_t stop = elem_str.find_last_not_of(' '); + MFEM_VERIFY(stop != std::string::npos, "Invalid element type string!"); + elem_str.resize(stop + 1); + // elem_tag = std::stoi(line.substr(1*NASTRAN_CHUNK, NASTRAN_CHUNK)); + geom_tag = std::stoi(line.substr(2 * NASTRAN_CHUNK, NASTRAN_CHUNK)); + + int i = 3; + while (i < MAX_CHUNK) + { + word = line.substr((i++) * NASTRAN_CHUNK, NASTRAN_CHUNK); + if (word.find_first_not_of(' ') == std::string::npos) + { + break; + } + nodes.push_back(std::stoi(word)); + } + } + else + { + std::istringstream sline(line); + std::getline(sline, elem_str, ','); + std::getline(sline, word, ','); + // elem_tag = std::stoi(word); + std::getline(sline, word, ','); + geom_tag = std::stoi(word); + + int i = 3; + while (i < MAX_CHUNK) + { + std::getline(sline, word, ','); + if (word.find_first_not_of(' ') == std::string::npos) + { + break; + } + nodes.push_back(std::stoi(word)); + i++; + } + } + + // Handle line continuation. 
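+        // For example, a 10-node CTETRA in small-field format spans two lines, with the
+        // continuation line starting with '+' (IDs below are illustrative):
+        //   CTETRA  1       1       101     102     103     104     105     106
+        //   +       107     108     109     110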
+ while (input.peek() == '+') + { + auto next = GetLineNastran(input); + MFEM_VERIFY(!next.empty(), "Unexpected empty line parsing Nastran!"); + + if (!free) + { + int i = 1; + while (i < MAX_CHUNK) + { + word = next.substr((i++) * NASTRAN_CHUNK, NASTRAN_CHUNK); + if (word.find_first_not_of(' ') == std::string::npos) + { + break; + } + nodes.push_back(std::stoi(word)); + } + } + else + { + std::istringstream snext(next); + int i = 1; + while (i < MAX_CHUNK) + { + std::getline(snext, word, ','); + if (word.find_first_not_of(' ') == std::string::npos) + { + break; + } + nodes.push_back(std::stoi(word)); + i++; + } + } + } + + // Save the element and its geometry tag. + elem_type = HOElemTypeNastran(elem_type, (int)nodes.size()); + const int &num_elem_nodes = ElemNumNodes[elem_type - 1]; + MFEM_VERIFY((std::size_t)num_elem_nodes == nodes.size(), + "Mismatch between Nastran and Gmsh element types!"); + std::vector &data = elem_nodes[elem_type]; + const int i = (int)data.size(); + data.resize(i + 1 + num_elem_nodes); + data[i] = geom_tag; + for (int j = 0; j < num_elem_nodes; j++) + { + // Permute back to Gmsh ordering. + const int &p = ElemNodesNastran[elem_type - 1][j]; + data[i + 1 + p] = nodes[j]; + } + } + } + } + + // Finalize input, write the Gmsh mesh. + input.close(); + WriteGmsh(buffer, node_coords, node_tags, elem_nodes, remove_curvature); +} + +} // namespace mesh + +} // namespace palace diff --git a/palace/utils/meshio.hpp b/palace/utils/meshio.hpp index bfdb27f2da..e8398423c6 100644 --- a/palace/utils/meshio.hpp +++ b/palace/utils/meshio.hpp @@ -1,27 +1,29 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 - -#ifndef PALACE_UTILS_MESH_IO_HPP -#define PALACE_UTILS_MESH_IO_HPP - -#include -#include - -namespace palace::mesh -{ - -// -// Functions for mesh format conversion to Gmsh format, which is supported by MFEM. In both -// cases, the user should configure the buffer for the desired floating point -// format/precision for writing node coordinates. -// - -// Convert a binary or ASCII COMSOL (.mphbin/.mphtxt) mesh to Gmsh v2.2. -void ConvertMeshComsol(const std::string &filename, std::ostream &buffer); - -// Convert an ASCII NASTRAN (.nas/.bdf) mesh to Gmsh v2.2. -void ConvertMeshNastran(const std::string &filename, std::ostream &buffer); - -} // namespace palace::mesh - -#endif // PALACE_UTILS_MESH_IO_HPP +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +#ifndef PALACE_UTILS_MESH_IO_HPP +#define PALACE_UTILS_MESH_IO_HPP + +#include +#include + +namespace palace::mesh +{ + +// +// Functions for mesh format conversion to Gmsh format, which is supported by MFEM. In both +// cases, the user should configure the buffer for the desired floating point +// format/precision for writing node coordinates. +// + +// Convert a binary or ASCII COMSOL (.mphbin/.mphtxt) mesh to Gmsh v2.2. +void ConvertMeshComsol(const std::string &filename, std::ostream &buffer, + bool remove_curvature = false); + +// Convert an ASCII NASTRAN (.nas/.bdf) mesh to Gmsh v2.2. +void ConvertMeshNastran(const std::string &filename, std::ostream &buffer, + bool remove_curvature = false); + +} // namespace palace::mesh + +#endif // PALACE_UTILS_MESH_IO_HPP diff --git a/palace/utils/omp.cpp b/palace/utils/omp.cpp new file mode 100644 index 0000000000..2945e5dd11 --- /dev/null +++ b/palace/utils/omp.cpp @@ -0,0 +1,76 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
diff --git a/palace/utils/omp.cpp b/palace/utils/omp.cpp
new file mode 100644
index 0000000000..2945e5dd11
--- /dev/null
+++ b/palace/utils/omp.cpp
@@ -0,0 +1,76 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#include "omp.hpp"
+
+#if defined(MFEM_USE_OPENMP)
+#include <omp.h>
+#endif
+
+namespace palace::utils
+{
+
+void SetNumThreads(int nt)
+{
+#if defined(MFEM_USE_OPENMP)
+  omp_set_num_threads(nt);
+#endif
+}
+
+int GetMaxThreads()
+{
+#if defined(MFEM_USE_OPENMP)
+  return omp_get_max_threads();
+#else
+  return 1;
+#endif
+}
+
+int GetNumActiveThreads()
+{
+#if defined(MFEM_USE_OPENMP)
+  return omp_get_num_threads();
+#else
+  return 1;
+#endif
+}
+
+int GetThreadNum()
+{
+#if defined(MFEM_USE_OPENMP)
+  return omp_get_thread_num();
+#else
+  return 0;
+#endif
+}
+
+int InParallel()
+{
+#if defined(MFEM_USE_OPENMP)
+  return omp_in_parallel();
+#else
+  return 0;
+#endif
+}
+
+int ConfigureOmp()
+{
+#if defined(MFEM_USE_OPENMP)
+  int nt;
+  const char *env = std::getenv("OMP_NUM_THREADS");
+  if (env)
+  {
+    std::sscanf(env, "%d", &nt);
+  }
+  else
+  {
+    nt = 1;
+  }
+  utils::SetNumThreads(nt);
+  return nt;
+#else
+  return 0;
+#endif
+}
+
+} // namespace palace::utils
diff --git a/palace/utils/omp.hpp b/palace/utils/omp.hpp
index bd453be6b2..4bf26a7173 100644
--- a/palace/utils/omp.hpp
+++ b/palace/utils/omp.hpp
@@ -1,16 +1,39 @@
-// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
-// SPDX-License-Identifier: Apache-2.0
-
-#ifndef PALACE_UTILS_OMP_HPP
-#define PALACE_UTILS_OMP_HPP
-
-#include <mfem.hpp>
-
-#if defined(MFEM_USE_OPENMP)
-#define PalacePragmaOmpHelper(x) _Pragma(#x)
-#define PalacePragmaOmp(x) PalacePragmaOmpHelper(omp x)
-#else
-#define PalacePragmaOmp(x)
-#endif
-
-#endif // PALACE_UTILS_OMP_HPP
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef PALACE_UTILS_OMP_HPP
+#define PALACE_UTILS_OMP_HPP
+
+#include <mfem.hpp>
+
+#if defined(MFEM_USE_OPENMP)
+#define PalacePragmaOmpHelper(x) _Pragma(#x)
+#define PalacePragmaOmp(x) PalacePragmaOmpHelper(omp x)
+#else
+#define PalacePragmaOmp(x)
+#endif
+
+namespace palace::utils
+{
+
+// Set the number of OpenMP threads to be used for parallel regions.
+void SetNumThreads(int nt);
+
+// Return maximum number of OpenMP threads.
+int GetMaxThreads();
+
+// Return number of active OpenMP threads.
+int GetNumActiveThreads();
+
+// Return the current thread ID.
+int GetThreadNum();
+
+// Return whether or not the current scope is inside a parallel OpenMP region.
+int InParallel();
+
+// Set and return the number of OpenMP threads depending on OMP_NUM_THREADS.
+int ConfigureOmp();
+
+} // namespace palace::utils
+
+#endif // PALACE_UTILS_OMP_HPP
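A hypothetical usage sketch for the helpers declared above follows; the include path, function, and array are made up. PalacePragmaOmp(parallel for) expands to "#pragma omp parallel for" when MFEM_USE_OPENMP is defined and to nothing otherwise, so the loop compiles and runs serially in non-OpenMP builds.

    #include <cstdio>
    #include <vector>

    #include "utils/omp.hpp"  // Include path assumed

    static void Scale(std::vector<double> &x, double alpha)
    {
      // Expands to "#pragma omp parallel for" only when MFEM_USE_OPENMP is defined.
      PalacePragmaOmp(parallel for)
      for (int i = 0; i < static_cast<int>(x.size()); i++)
      {
        x[i] *= alpha;
      }
    }

    int main()
    {
      // Pick the thread count up from OMP_NUM_THREADS (defaults to a single thread).
      palace::utils::ConfigureOmp();
      std::printf("Max threads: %d\n", palace::utils::GetMaxThreads());
      std::vector<double> x(1000, 1.0);
      Scale(x, 2.0);
      return 0;
    }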
diff --git a/palace/utils/outputdir.hpp b/palace/utils/outputdir.hpp
new file mode 100644
index 0000000000..c4510abf74
--- /dev/null
+++ b/palace/utils/outputdir.hpp
@@ -0,0 +1,82 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef PALACE_UTILS_OUTPUTDIR_HPP
+#define PALACE_UTILS_OUTPUTDIR_HPP
+
+#include <fmt/format.h>
+#include <fmt/os.h>
+#include "communication.hpp"
+#include "filesystem.hpp"
+#include "iodata.hpp"
+#include "timer.hpp"
+
+namespace palace
+{
+
+inline void MakeOutputFolder(IoData &iodata, MPI_Comm &comm)
+{
+  BlockTimer bt(Timer::IO);
+  // Validate and make folder on root.
+  auto root = Mpi::Root(comm);
+  auto &output_str = iodata.problem.output;
+  if (root)
+  {
+    MFEM_VERIFY(!output_str.empty(),
+                fmt::format("Invalid output directory, got empty string \"\"."))
+    // Remove any trailing "/" to get folder name.
+    if (output_str.back() == '/')
+    {
+      output_str.erase(output_str.end() - 1);
+    }
+    auto output_path = fs::path(output_str);
+    // Make folder if it does not exist.
+    if (!fs::exists(output_path))
+    {
+      MFEM_VERIFY(fs::create_directories(output_path),
+                  fmt::format("Error std::filesystem could not create a directory at {}",
+                              output_path.string()));
+    }
+    else
+    {
+      MFEM_VERIFY(fs::is_directory(output_path),
+                  fmt::format("Output path already exists but is not a directory: {}",
+                              output_path.string()));
+      if (!fs::is_empty(output_path))
+      {
+        Mpi::Warning("Output folder is not empty; program will overwrite content! ({})",
+                     output_path.string());
+      }
+    }
+    // Ensure we can write to folder by making test file.
+    {
+      fs::path tmp_ = output_path / "tmp_test_file.txt";
+      auto file_buf = fmt::output_file(
+          tmp_.string(), fmt::file::WRONLY | fmt::file::CREATE | fmt::file::TRUNC);
+      file_buf.print("Test Print");
+      file_buf.close();
+      MFEM_VERIFY(
+          fs::exists(tmp_) && fs::is_regular_file(tmp_),
+          fmt::format("Error creating test file in output folder: {}", tmp_.string()));
+      fs::remove(tmp_);
+    }
+    output_str = output_path.string();
+  }
+
+  // Broadcast new output_str to all ranks.
+  if (Mpi::Size(comm) > 1)
+  {
+    int str_len = static_cast<int>(output_str.size());
+    if (root)
+    {
+      MFEM_VERIFY(output_str.size() == std::size_t(str_len),
+                  "Overflow in stringbuffer size!");
+    }
+    Mpi::Broadcast(1, &str_len, 0, comm);
+    output_str.resize(str_len);
+    Mpi::Broadcast(str_len, output_str.data(), 0, comm);
+  }
+}
+
+} // namespace palace
+#endif // PALACE_UTILS_OUTPUTDIR_HPP
\ No newline at end of file
diff --git a/palace/utils/prettyprint.hpp b/palace/utils/prettyprint.hpp
index 7c547cd592..6ba619a4c7 100644
--- a/palace/utils/prettyprint.hpp
+++ b/palace/utils/prettyprint.hpp
@@ -1,137 +1,116 @@
-// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
-// SPDX-License-Identifier: Apache-2.0
-
-#ifndef PALACE_UTILS_PRETTY_PRINT_HPP
-#define PALACE_UTILS_PRETTY_PRINT_HPP
-
-#include
-#include
-#include
-#include
-#include "utils/communication.hpp"
-
-namespace palace::utils
-{
-
-//
-// Utility functions for formatted printing.
-//
-
-namespace internal
-{
-
-constexpr std::size_t max_width = 60;
-
-template <typename T>
-inline std::size_t GetSize(const T &v)
-{
-  return v.size();
-}
-
-template <typename T>
-inline std::size_t GetSize(const mfem::Array<T> &v)
-{
-  return v.Size();
-}
-
-inline std::size_t PrePrint(MPI_Comm comm, std::size_t w, std::size_t wv, std::size_t lead)
-{
-  auto end = w + 2 + wv + 1;  // Consider space for trailing comma
-  if (w > 0 && end > max_width - lead)
-  {
-    Mpi::Print(comm, ",\n{}", std::string(lead, ' '));  // Line break
-    w = 0;
-  }
-  if (w)
-  {
-    Mpi::Print(comm, ", ");
-    return w + 2;
-  }
-  else
-  {
-    Mpi::Print(comm, " ");
-    return w + 1;
-  }
-}
-
-} // namespace internal
-
-// Fixed column width wrapped printing with range notation for the contents of a marker
-// array.
-template
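The wrapping rule in internal::PrePrint above can be exercised outside of MPI with a small stand-in sketch (illustrative only; std::printf replaces Mpi::Print, and the item list and leading indentation are made up). A line break is inserted whenever the next item, its ", " separator, and a possible trailing comma would push past max_width minus the leading indentation.

    #include <cstddef>
    #include <cstdio>
    #include <string>
    #include <vector>

    int main()
    {
      constexpr std::size_t max_width = 60;  // Same limit as internal::max_width above
      const std::size_t lead = 10;           // Width of the leading label already printed
      const std::vector<std::string> items = {"1-4",   "7",  "9-12",  "15", "20-32", "40-59",
                                              "60", "61-75", "80", "81-99", "100-120"};
      std::printf("%*s", static_cast<int>(lead), "");
      std::size_t w = 0;
      for (const auto &item : items)
      {
        const std::size_t wv = item.length();
        // Break if the item plus ", " and a possible trailing comma would overflow.
        if (w > 0 && w + 2 + wv + 1 > max_width - lead)
        {
          std::printf(",\n%*s", static_cast<int>(lead), "");
          w = 0;
        }
        if (w > 0)
        {
          std::printf(", ");
          w += 2;
        }
        else
        {
          std::printf(" ");
          w += 1;
        }
        std::printf("%s", item.c_str());
        w += wv;  // The caller of PrePrint accounts for the item width itself
      }
      std::printf("\n");
      return 0;
    }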